diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,56146 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.39489923788056713, + "eval_steps": 500, + "global_step": 400848, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 4.925797782208806e-05, + "grad_norm": 4.725747585296631, + "learning_rate": 4.9000000000000005e-06, + "loss": 10.8905, + "step": 50 + }, + { + "epoch": 9.851595564417613e-05, + "grad_norm": 2.1797287464141846, + "learning_rate": 9.9e-06, + "loss": 9.8832, + "step": 100 + }, + { + "epoch": 0.0001477739334662642, + "grad_norm": 1.7530624866485596, + "learning_rate": 9.999903452461176e-06, + "loss": 9.1544, + "step": 150 + }, + { + "epoch": 0.00019703191128835225, + "grad_norm": 1.458598256111145, + "learning_rate": 9.999804934564413e-06, + "loss": 8.6751, + "step": 200 + }, + { + "epoch": 0.00024628988911044034, + "grad_norm": 1.4195055961608887, + "learning_rate": 9.999706416667653e-06, + "loss": 8.2598, + "step": 250 + }, + { + "epoch": 0.0002955478669325284, + "grad_norm": 1.2028930187225342, + "learning_rate": 9.999607898770891e-06, + "loss": 7.9452, + "step": 300 + }, + { + "epoch": 0.00034480584475461647, + "grad_norm": 1.1207659244537354, + "learning_rate": 9.99950938087413e-06, + "loss": 7.7246, + "step": 350 + }, + { + "epoch": 0.0003940638225767045, + "grad_norm": 0.9927027225494385, + "learning_rate": 9.999410862977369e-06, + "loss": 7.5687, + "step": 400 + }, + { + "epoch": 0.0004433218003987926, + "grad_norm": 0.7077513933181763, + "learning_rate": 9.999312345080608e-06, + "loss": 7.4449, + "step": 450 + }, + { + "epoch": 0.0004925797782208807, + "grad_norm": 1.172118902206421, + "learning_rate": 9.999213827183846e-06, + "loss": 7.3723, + "step": 500 + }, + { + "epoch": 0.0005418377560429688, + "grad_norm": 
0.7955384254455566, + "learning_rate": 9.999115309287086e-06, + "loss": 7.2582, + "step": 550 + }, + { + "epoch": 0.0005910957338650568, + "grad_norm": 1.0187832117080688, + "learning_rate": 9.999016791390325e-06, + "loss": 7.2036, + "step": 600 + }, + { + "epoch": 0.0006403537116871448, + "grad_norm": 1.025915265083313, + "learning_rate": 9.998918273493563e-06, + "loss": 7.1192, + "step": 650 + }, + { + "epoch": 0.0006896116895092329, + "grad_norm": 1.271194338798523, + "learning_rate": 9.998819755596803e-06, + "loss": 7.0922, + "step": 700 + }, + { + "epoch": 0.000738869667331321, + "grad_norm": 1.4047073125839233, + "learning_rate": 9.998721237700041e-06, + "loss": 7.0221, + "step": 750 + }, + { + "epoch": 0.000788127645153409, + "grad_norm": 1.6794075965881348, + "learning_rate": 9.99862271980328e-06, + "loss": 6.9713, + "step": 800 + }, + { + "epoch": 0.0008373856229754971, + "grad_norm": 1.4108102321624756, + "learning_rate": 9.998524201906518e-06, + "loss": 6.9433, + "step": 850 + }, + { + "epoch": 0.0008866436007975852, + "grad_norm": 1.5267258882522583, + "learning_rate": 9.998425684009758e-06, + "loss": 6.9022, + "step": 900 + }, + { + "epoch": 0.0009359015786196733, + "grad_norm": 1.275080919265747, + "learning_rate": 9.998327166112996e-06, + "loss": 6.8109, + "step": 950 + }, + { + "epoch": 0.0009851595564417614, + "grad_norm": 3.133509635925293, + "learning_rate": 9.998228648216236e-06, + "loss": 6.7884, + "step": 1000 + }, + { + "epoch": 0.0010344175342638495, + "grad_norm": 1.3738231658935547, + "learning_rate": 9.998130130319475e-06, + "loss": 6.7652, + "step": 1050 + }, + { + "epoch": 0.0010836755120859375, + "grad_norm": 1.615710973739624, + "learning_rate": 9.998031612422713e-06, + "loss": 6.7464, + "step": 1100 + }, + { + "epoch": 0.0011329334899080254, + "grad_norm": 1.3281502723693848, + "learning_rate": 9.997933094525953e-06, + "loss": 6.6762, + "step": 1150 + }, + { + "epoch": 0.0011821914677301135, + "grad_norm": 1.5175045728683472, + 
"learning_rate": 9.997834576629192e-06, + "loss": 6.6743, + "step": 1200 + }, + { + "epoch": 0.0012314494455522016, + "grad_norm": 1.5772520303726196, + "learning_rate": 9.99773605873243e-06, + "loss": 6.6188, + "step": 1250 + }, + { + "epoch": 0.0012807074233742897, + "grad_norm": 1.855621576309204, + "learning_rate": 9.997637540835668e-06, + "loss": 6.5788, + "step": 1300 + }, + { + "epoch": 0.0013299654011963778, + "grad_norm": 2.193713665008545, + "learning_rate": 9.997539022938908e-06, + "loss": 6.5943, + "step": 1350 + }, + { + "epoch": 0.0013792233790184659, + "grad_norm": 1.454984188079834, + "learning_rate": 9.997440505042146e-06, + "loss": 6.5445, + "step": 1400 + }, + { + "epoch": 0.001428481356840554, + "grad_norm": 1.607843041419983, + "learning_rate": 9.997341987145386e-06, + "loss": 6.516, + "step": 1450 + }, + { + "epoch": 0.001477739334662642, + "grad_norm": 2.0937602519989014, + "learning_rate": 9.997243469248623e-06, + "loss": 6.4946, + "step": 1500 + }, + { + "epoch": 0.00152699731248473, + "grad_norm": 1.8227548599243164, + "learning_rate": 9.997144951351863e-06, + "loss": 6.4632, + "step": 1550 + }, + { + "epoch": 0.001576255290306818, + "grad_norm": 1.6639997959136963, + "learning_rate": 9.997046433455103e-06, + "loss": 6.4916, + "step": 1600 + }, + { + "epoch": 0.001625513268128906, + "grad_norm": 1.8181363344192505, + "learning_rate": 9.996947915558342e-06, + "loss": 6.4132, + "step": 1650 + }, + { + "epoch": 0.0016747712459509942, + "grad_norm": 1.9472105503082275, + "learning_rate": 9.99684939766158e-06, + "loss": 6.3929, + "step": 1700 + }, + { + "epoch": 0.0017240292237730823, + "grad_norm": 2.028184175491333, + "learning_rate": 9.99675087976482e-06, + "loss": 6.3564, + "step": 1750 + }, + { + "epoch": 0.0017732872015951704, + "grad_norm": 1.6994073390960693, + "learning_rate": 9.996652361868058e-06, + "loss": 6.3804, + "step": 1800 + }, + { + "epoch": 0.0018225451794172585, + "grad_norm": 2.174935817718506, + "learning_rate": 
9.996553843971296e-06, + "loss": 6.357, + "step": 1850 + }, + { + "epoch": 0.0018718031572393466, + "grad_norm": 2.056065797805786, + "learning_rate": 9.996455326074535e-06, + "loss": 6.3348, + "step": 1900 + }, + { + "epoch": 0.0019210611350614346, + "grad_norm": 1.9429932832717896, + "learning_rate": 9.996356808177773e-06, + "loss": 6.3153, + "step": 1950 + }, + { + "epoch": 0.0019703191128835227, + "grad_norm": 1.999122142791748, + "learning_rate": 9.996258290281013e-06, + "loss": 6.345, + "step": 2000 + }, + { + "epoch": 0.002019577090705611, + "grad_norm": 1.873927116394043, + "learning_rate": 9.996159772384253e-06, + "loss": 6.2666, + "step": 2050 + }, + { + "epoch": 0.002068835068527699, + "grad_norm": 2.1296603679656982, + "learning_rate": 9.99606125448749e-06, + "loss": 6.2563, + "step": 2100 + }, + { + "epoch": 0.002118093046349787, + "grad_norm": 1.8446933031082153, + "learning_rate": 9.99596273659073e-06, + "loss": 6.2273, + "step": 2150 + }, + { + "epoch": 0.002167351024171875, + "grad_norm": 1.9267503023147583, + "learning_rate": 9.99586421869397e-06, + "loss": 6.2162, + "step": 2200 + }, + { + "epoch": 0.0022166090019939627, + "grad_norm": 2.406846523284912, + "learning_rate": 9.995765700797208e-06, + "loss": 6.1871, + "step": 2250 + }, + { + "epoch": 0.002265866979816051, + "grad_norm": 2.456120729446411, + "learning_rate": 9.995667182900447e-06, + "loss": 6.2566, + "step": 2300 + }, + { + "epoch": 0.002315124957638139, + "grad_norm": 1.8963744640350342, + "learning_rate": 9.995568665003685e-06, + "loss": 6.1969, + "step": 2350 + }, + { + "epoch": 0.002364382935460227, + "grad_norm": 2.4525437355041504, + "learning_rate": 9.995470147106923e-06, + "loss": 6.1614, + "step": 2400 + }, + { + "epoch": 0.002413640913282315, + "grad_norm": 2.212376832962036, + "learning_rate": 9.995371629210163e-06, + "loss": 6.1044, + "step": 2450 + }, + { + "epoch": 0.002462898891104403, + "grad_norm": 1.9281485080718994, + "learning_rate": 9.995273111313402e-06, + 
"loss": 6.0902, + "step": 2500 + }, + { + "epoch": 0.0025121568689264913, + "grad_norm": 2.088820695877075, + "learning_rate": 9.99517459341664e-06, + "loss": 6.0916, + "step": 2550 + }, + { + "epoch": 0.0025614148467485794, + "grad_norm": 2.277050018310547, + "learning_rate": 9.99507607551988e-06, + "loss": 6.0823, + "step": 2600 + }, + { + "epoch": 0.0026106728245706675, + "grad_norm": 2.2391433715820312, + "learning_rate": 9.99497755762312e-06, + "loss": 6.0875, + "step": 2650 + }, + { + "epoch": 0.0026599308023927556, + "grad_norm": 2.389378547668457, + "learning_rate": 9.994879039726358e-06, + "loss": 6.068, + "step": 2700 + }, + { + "epoch": 0.0027091887802148436, + "grad_norm": 2.047518491744995, + "learning_rate": 9.994780521829597e-06, + "loss": 6.0431, + "step": 2750 + }, + { + "epoch": 0.0027584467580369317, + "grad_norm": 2.395382881164551, + "learning_rate": 9.994682003932835e-06, + "loss": 6.0413, + "step": 2800 + }, + { + "epoch": 0.00280770473585902, + "grad_norm": 2.185030221939087, + "learning_rate": 9.994583486036075e-06, + "loss": 6.0214, + "step": 2850 + }, + { + "epoch": 0.002856962713681108, + "grad_norm": 1.9633649587631226, + "learning_rate": 9.994484968139313e-06, + "loss": 5.9573, + "step": 2900 + }, + { + "epoch": 0.002906220691503196, + "grad_norm": 2.1540608406066895, + "learning_rate": 9.99438645024255e-06, + "loss": 5.9708, + "step": 2950 + }, + { + "epoch": 0.002955478669325284, + "grad_norm": 2.28556227684021, + "learning_rate": 9.99428793234579e-06, + "loss": 5.9707, + "step": 3000 + }, + { + "epoch": 0.003004736647147372, + "grad_norm": 2.1143901348114014, + "learning_rate": 9.99418941444903e-06, + "loss": 5.9658, + "step": 3050 + }, + { + "epoch": 0.00305399462496946, + "grad_norm": 2.667074680328369, + "learning_rate": 9.99409089655227e-06, + "loss": 5.9224, + "step": 3100 + }, + { + "epoch": 0.003103252602791548, + "grad_norm": 2.3364250659942627, + "learning_rate": 9.993992378655507e-06, + "loss": 5.9221, + "step": 3150 + }, 
+ { + "epoch": 0.003152510580613636, + "grad_norm": 2.3552770614624023, + "learning_rate": 9.993893860758747e-06, + "loss": 5.9139, + "step": 3200 + }, + { + "epoch": 0.003201768558435724, + "grad_norm": 2.1598031520843506, + "learning_rate": 9.993795342861985e-06, + "loss": 5.9354, + "step": 3250 + }, + { + "epoch": 0.003251026536257812, + "grad_norm": 2.305964708328247, + "learning_rate": 9.993696824965225e-06, + "loss": 5.9267, + "step": 3300 + }, + { + "epoch": 0.0033002845140799003, + "grad_norm": 2.497861623764038, + "learning_rate": 9.993598307068463e-06, + "loss": 5.9537, + "step": 3350 + }, + { + "epoch": 0.0033495424919019884, + "grad_norm": 2.5343267917633057, + "learning_rate": 9.993499789171702e-06, + "loss": 5.9309, + "step": 3400 + }, + { + "epoch": 0.0033988004697240765, + "grad_norm": 2.1720359325408936, + "learning_rate": 9.99340127127494e-06, + "loss": 5.8551, + "step": 3450 + }, + { + "epoch": 0.0034480584475461646, + "grad_norm": 2.4061872959136963, + "learning_rate": 9.99330275337818e-06, + "loss": 5.8667, + "step": 3500 + }, + { + "epoch": 0.0034973164253682527, + "grad_norm": 2.1035633087158203, + "learning_rate": 9.99320423548142e-06, + "loss": 5.8721, + "step": 3550 + }, + { + "epoch": 0.0035465744031903407, + "grad_norm": 2.323695659637451, + "learning_rate": 9.993105717584657e-06, + "loss": 5.8444, + "step": 3600 + }, + { + "epoch": 0.003595832381012429, + "grad_norm": 2.7823591232299805, + "learning_rate": 9.993007199687897e-06, + "loss": 5.9007, + "step": 3650 + }, + { + "epoch": 0.003645090358834517, + "grad_norm": 2.4761390686035156, + "learning_rate": 9.992908681791135e-06, + "loss": 5.8517, + "step": 3700 + }, + { + "epoch": 0.003694348336656605, + "grad_norm": 2.1898834705352783, + "learning_rate": 9.992810163894374e-06, + "loss": 5.8172, + "step": 3750 + }, + { + "epoch": 0.003743606314478693, + "grad_norm": 2.3213911056518555, + "learning_rate": 9.992711645997612e-06, + "loss": 5.8537, + "step": 3800 + }, + { + "epoch": 
0.003792864292300781, + "grad_norm": 2.354266405105591, + "learning_rate": 9.992613128100852e-06, + "loss": 5.7941, + "step": 3850 + }, + { + "epoch": 0.0038421222701228693, + "grad_norm": 2.4953129291534424, + "learning_rate": 9.99251461020409e-06, + "loss": 5.8024, + "step": 3900 + }, + { + "epoch": 0.0038913802479449574, + "grad_norm": 2.5142929553985596, + "learning_rate": 9.99241609230733e-06, + "loss": 5.7685, + "step": 3950 + }, + { + "epoch": 0.0039406382257670455, + "grad_norm": 2.313856601715088, + "learning_rate": 9.992317574410567e-06, + "loss": 5.8136, + "step": 4000 + }, + { + "epoch": 0.003989896203589133, + "grad_norm": 2.510972738265991, + "learning_rate": 9.992219056513807e-06, + "loss": 5.7491, + "step": 4050 + }, + { + "epoch": 0.004039154181411222, + "grad_norm": 2.5576400756835938, + "learning_rate": 9.992120538617047e-06, + "loss": 5.697, + "step": 4100 + }, + { + "epoch": 0.004088412159233309, + "grad_norm": 2.7585551738739014, + "learning_rate": 9.992022020720285e-06, + "loss": 5.7728, + "step": 4150 + }, + { + "epoch": 0.004137670137055398, + "grad_norm": 2.6345133781433105, + "learning_rate": 9.991923502823524e-06, + "loss": 5.7841, + "step": 4200 + }, + { + "epoch": 0.0041869281148774855, + "grad_norm": 2.1408374309539795, + "learning_rate": 9.991824984926762e-06, + "loss": 5.7751, + "step": 4250 + }, + { + "epoch": 0.004236186092699574, + "grad_norm": 2.6891562938690186, + "learning_rate": 9.991726467030002e-06, + "loss": 5.6952, + "step": 4300 + }, + { + "epoch": 0.004285444070521662, + "grad_norm": 2.546168804168701, + "learning_rate": 9.99162794913324e-06, + "loss": 5.6943, + "step": 4350 + }, + { + "epoch": 0.00433470204834375, + "grad_norm": 2.212948799133301, + "learning_rate": 9.99152943123648e-06, + "loss": 5.6808, + "step": 4400 + }, + { + "epoch": 0.004383960026165838, + "grad_norm": 2.2889039516448975, + "learning_rate": 9.991430913339717e-06, + "loss": 5.6614, + "step": 4450 + }, + { + "epoch": 0.0044332180039879255, + 
"grad_norm": 2.9201114177703857, + "learning_rate": 9.991332395442957e-06, + "loss": 5.7015, + "step": 4500 + }, + { + "epoch": 0.004482475981810014, + "grad_norm": 2.0735507011413574, + "learning_rate": 9.991233877546197e-06, + "loss": 5.6942, + "step": 4550 + }, + { + "epoch": 0.004531733959632102, + "grad_norm": 2.483539581298828, + "learning_rate": 9.991135359649435e-06, + "loss": 5.668, + "step": 4600 + }, + { + "epoch": 0.00458099193745419, + "grad_norm": 2.944549798965454, + "learning_rate": 9.991036841752674e-06, + "loss": 5.6572, + "step": 4650 + }, + { + "epoch": 0.004630249915276278, + "grad_norm": 2.0549709796905518, + "learning_rate": 9.990938323855912e-06, + "loss": 5.6971, + "step": 4700 + }, + { + "epoch": 0.004679507893098366, + "grad_norm": 2.51755952835083, + "learning_rate": 9.990839805959152e-06, + "loss": 5.6556, + "step": 4750 + }, + { + "epoch": 0.004728765870920454, + "grad_norm": 2.2347500324249268, + "learning_rate": 9.99074128806239e-06, + "loss": 5.635, + "step": 4800 + }, + { + "epoch": 0.0047780238487425426, + "grad_norm": 2.4430387020111084, + "learning_rate": 9.99064277016563e-06, + "loss": 5.6139, + "step": 4850 + }, + { + "epoch": 0.00482728182656463, + "grad_norm": 2.5289666652679443, + "learning_rate": 9.990544252268867e-06, + "loss": 5.6439, + "step": 4900 + }, + { + "epoch": 0.004876539804386719, + "grad_norm": 2.662947177886963, + "learning_rate": 9.990445734372107e-06, + "loss": 5.6426, + "step": 4950 + }, + { + "epoch": 0.004925797782208806, + "grad_norm": 2.731626272201538, + "learning_rate": 9.990347216475346e-06, + "loss": 5.5945, + "step": 5000 + }, + { + "epoch": 0.004975055760030895, + "grad_norm": 2.1274847984313965, + "learning_rate": 9.990248698578584e-06, + "loss": 5.6053, + "step": 5050 + }, + { + "epoch": 0.005024313737852983, + "grad_norm": 2.5701067447662354, + "learning_rate": 9.990150180681824e-06, + "loss": 5.6039, + "step": 5100 + }, + { + "epoch": 0.005073571715675071, + "grad_norm": 2.9836716651916504, + 
"learning_rate": 9.990051662785062e-06, + "loss": 5.5815, + "step": 5150 + }, + { + "epoch": 0.005122829693497159, + "grad_norm": 2.1876139640808105, + "learning_rate": 9.989953144888302e-06, + "loss": 5.5223, + "step": 5200 + }, + { + "epoch": 0.005172087671319247, + "grad_norm": 2.5413544178009033, + "learning_rate": 9.98985462699154e-06, + "loss": 5.5988, + "step": 5250 + }, + { + "epoch": 0.005221345649141335, + "grad_norm": 2.636404514312744, + "learning_rate": 9.989756109094779e-06, + "loss": 5.6048, + "step": 5300 + }, + { + "epoch": 0.005270603626963423, + "grad_norm": 3.6950316429138184, + "learning_rate": 9.989657591198017e-06, + "loss": 5.6094, + "step": 5350 + }, + { + "epoch": 0.005319861604785511, + "grad_norm": 2.6820759773254395, + "learning_rate": 9.989559073301257e-06, + "loss": 5.514, + "step": 5400 + }, + { + "epoch": 0.005369119582607599, + "grad_norm": 2.682157278060913, + "learning_rate": 9.989460555404495e-06, + "loss": 5.6015, + "step": 5450 + }, + { + "epoch": 0.005418377560429687, + "grad_norm": 2.904618740081787, + "learning_rate": 9.989362037507734e-06, + "loss": 5.5666, + "step": 5500 + }, + { + "epoch": 0.005467635538251775, + "grad_norm": 2.792703628540039, + "learning_rate": 9.989263519610974e-06, + "loss": 5.5716, + "step": 5550 + }, + { + "epoch": 0.0055168935160738635, + "grad_norm": 2.512916326522827, + "learning_rate": 9.989165001714214e-06, + "loss": 5.5335, + "step": 5600 + }, + { + "epoch": 0.005566151493895951, + "grad_norm": 2.446918487548828, + "learning_rate": 9.989066483817451e-06, + "loss": 5.5122, + "step": 5650 + }, + { + "epoch": 0.00561540947171804, + "grad_norm": 2.86995530128479, + "learning_rate": 9.98896796592069e-06, + "loss": 5.5549, + "step": 5700 + }, + { + "epoch": 0.005664667449540127, + "grad_norm": 2.0753886699676514, + "learning_rate": 9.988869448023929e-06, + "loss": 5.4771, + "step": 5750 + }, + { + "epoch": 0.005713925427362216, + "grad_norm": 2.271014451980591, + "learning_rate": 
9.988770930127167e-06, + "loss": 5.5639, + "step": 5800 + }, + { + "epoch": 0.0057631834051843035, + "grad_norm": 2.598052978515625, + "learning_rate": 9.988672412230407e-06, + "loss": 5.489, + "step": 5850 + }, + { + "epoch": 0.005812441383006392, + "grad_norm": 2.9920969009399414, + "learning_rate": 9.988573894333645e-06, + "loss": 5.5314, + "step": 5900 + }, + { + "epoch": 0.00586169936082848, + "grad_norm": 2.37494158744812, + "learning_rate": 9.988475376436884e-06, + "loss": 5.5744, + "step": 5950 + }, + { + "epoch": 0.005910957338650568, + "grad_norm": 3.0005249977111816, + "learning_rate": 9.988376858540124e-06, + "loss": 5.4652, + "step": 6000 + }, + { + "epoch": 0.005960215316472656, + "grad_norm": 2.7213375568389893, + "learning_rate": 9.988278340643363e-06, + "loss": 5.4926, + "step": 6050 + }, + { + "epoch": 0.006009473294294744, + "grad_norm": 2.5693137645721436, + "learning_rate": 9.988179822746601e-06, + "loss": 5.4675, + "step": 6100 + }, + { + "epoch": 0.006058731272116832, + "grad_norm": 2.4643566608428955, + "learning_rate": 9.98808130484984e-06, + "loss": 5.4316, + "step": 6150 + }, + { + "epoch": 0.00610798924993892, + "grad_norm": 2.843510150909424, + "learning_rate": 9.987982786953079e-06, + "loss": 5.4439, + "step": 6200 + }, + { + "epoch": 0.006157247227761008, + "grad_norm": 2.2801809310913086, + "learning_rate": 9.987884269056317e-06, + "loss": 5.4392, + "step": 6250 + }, + { + "epoch": 0.006206505205583096, + "grad_norm": 2.228440046310425, + "learning_rate": 9.987785751159556e-06, + "loss": 5.4408, + "step": 6300 + }, + { + "epoch": 0.006255763183405184, + "grad_norm": 2.654935598373413, + "learning_rate": 9.987687233262794e-06, + "loss": 5.4031, + "step": 6350 + }, + { + "epoch": 0.006305021161227272, + "grad_norm": 2.7862391471862793, + "learning_rate": 9.987588715366034e-06, + "loss": 5.4508, + "step": 6400 + }, + { + "epoch": 0.006354279139049361, + "grad_norm": 3.0758538246154785, + "learning_rate": 9.987490197469274e-06, + "loss": 
5.4921, + "step": 6450 + }, + { + "epoch": 0.006403537116871448, + "grad_norm": 2.832315683364868, + "learning_rate": 9.987391679572512e-06, + "loss": 5.462, + "step": 6500 + }, + { + "epoch": 0.006452795094693537, + "grad_norm": 2.451906204223633, + "learning_rate": 9.987293161675751e-06, + "loss": 5.4419, + "step": 6550 + }, + { + "epoch": 0.006502053072515624, + "grad_norm": 2.3465466499328613, + "learning_rate": 9.98719464377899e-06, + "loss": 5.4235, + "step": 6600 + }, + { + "epoch": 0.006551311050337713, + "grad_norm": 2.684234619140625, + "learning_rate": 9.987096125882229e-06, + "loss": 5.4586, + "step": 6650 + }, + { + "epoch": 0.006600569028159801, + "grad_norm": 2.792478084564209, + "learning_rate": 9.986997607985467e-06, + "loss": 5.428, + "step": 6700 + }, + { + "epoch": 0.006649827005981889, + "grad_norm": 2.363053560256958, + "learning_rate": 9.986899090088706e-06, + "loss": 5.4461, + "step": 6750 + }, + { + "epoch": 0.006699084983803977, + "grad_norm": 2.9717445373535156, + "learning_rate": 9.986800572191944e-06, + "loss": 5.3866, + "step": 6800 + }, + { + "epoch": 0.006748342961626065, + "grad_norm": 3.2148444652557373, + "learning_rate": 9.986702054295184e-06, + "loss": 5.3826, + "step": 6850 + }, + { + "epoch": 0.006797600939448153, + "grad_norm": 2.63067626953125, + "learning_rate": 9.986603536398422e-06, + "loss": 5.4061, + "step": 6900 + }, + { + "epoch": 0.0068468589172702415, + "grad_norm": 2.5444979667663574, + "learning_rate": 9.986505018501661e-06, + "loss": 5.3423, + "step": 6950 + }, + { + "epoch": 0.006896116895092329, + "grad_norm": 2.4615836143493652, + "learning_rate": 9.986406500604901e-06, + "loss": 5.3855, + "step": 7000 + }, + { + "epoch": 0.006945374872914418, + "grad_norm": 2.4069976806640625, + "learning_rate": 9.98630798270814e-06, + "loss": 5.3411, + "step": 7050 + }, + { + "epoch": 0.006994632850736505, + "grad_norm": 2.8284878730773926, + "learning_rate": 9.986209464811379e-06, + "loss": 5.3785, + "step": 7100 + }, + { + 
"epoch": 0.007043890828558593, + "grad_norm": 2.9697582721710205, + "learning_rate": 9.986110946914618e-06, + "loss": 5.3951, + "step": 7150 + }, + { + "epoch": 0.0070931488063806815, + "grad_norm": 3.478571891784668, + "learning_rate": 9.986012429017856e-06, + "loss": 5.3656, + "step": 7200 + }, + { + "epoch": 0.007142406784202769, + "grad_norm": 2.390310525894165, + "learning_rate": 9.985913911121094e-06, + "loss": 5.3476, + "step": 7250 + }, + { + "epoch": 0.007191664762024858, + "grad_norm": 3.101663112640381, + "learning_rate": 9.985815393224334e-06, + "loss": 5.3678, + "step": 7300 + }, + { + "epoch": 0.007240922739846945, + "grad_norm": 2.633324146270752, + "learning_rate": 9.985716875327572e-06, + "loss": 5.3972, + "step": 7350 + }, + { + "epoch": 0.007290180717669034, + "grad_norm": 2.532758951187134, + "learning_rate": 9.985618357430811e-06, + "loss": 5.2801, + "step": 7400 + }, + { + "epoch": 0.0073394386954911215, + "grad_norm": 2.296886444091797, + "learning_rate": 9.985519839534051e-06, + "loss": 5.3737, + "step": 7450 + }, + { + "epoch": 0.00738869667331321, + "grad_norm": 2.2987313270568848, + "learning_rate": 9.98542132163729e-06, + "loss": 5.3501, + "step": 7500 + }, + { + "epoch": 0.007437954651135298, + "grad_norm": 2.866952419281006, + "learning_rate": 9.985322803740528e-06, + "loss": 5.4083, + "step": 7550 + }, + { + "epoch": 0.007487212628957386, + "grad_norm": 2.6010262966156006, + "learning_rate": 9.985224285843768e-06, + "loss": 5.2717, + "step": 7600 + }, + { + "epoch": 0.007536470606779474, + "grad_norm": 2.413188934326172, + "learning_rate": 9.985125767947006e-06, + "loss": 5.3104, + "step": 7650 + }, + { + "epoch": 0.007585728584601562, + "grad_norm": 2.554912805557251, + "learning_rate": 9.985027250050246e-06, + "loss": 5.4004, + "step": 7700 + }, + { + "epoch": 0.00763498656242365, + "grad_norm": 2.739072561264038, + "learning_rate": 9.984928732153484e-06, + "loss": 5.3099, + "step": 7750 + }, + { + "epoch": 0.007684244540245739, + 
"grad_norm": 2.6271185874938965, + "learning_rate": 9.984830214256722e-06, + "loss": 5.2716, + "step": 7800 + }, + { + "epoch": 0.007733502518067826, + "grad_norm": 2.8631014823913574, + "learning_rate": 9.984731696359961e-06, + "loss": 5.3577, + "step": 7850 + }, + { + "epoch": 0.007782760495889915, + "grad_norm": 2.8305814266204834, + "learning_rate": 9.9846331784632e-06, + "loss": 5.3704, + "step": 7900 + }, + { + "epoch": 0.007832018473712002, + "grad_norm": 2.467615842819214, + "learning_rate": 9.984534660566439e-06, + "loss": 5.2781, + "step": 7950 + }, + { + "epoch": 0.007881276451534091, + "grad_norm": 2.7488834857940674, + "learning_rate": 9.984436142669678e-06, + "loss": 5.3023, + "step": 8000 + }, + { + "epoch": 0.007930534429356179, + "grad_norm": 2.5197978019714355, + "learning_rate": 9.984337624772918e-06, + "loss": 5.289, + "step": 8050 + }, + { + "epoch": 0.007979792407178266, + "grad_norm": 3.241250991821289, + "learning_rate": 9.984239106876156e-06, + "loss": 5.2463, + "step": 8100 + }, + { + "epoch": 0.008029050385000354, + "grad_norm": 2.873776435852051, + "learning_rate": 9.984140588979396e-06, + "loss": 5.2921, + "step": 8150 + }, + { + "epoch": 0.008078308362822443, + "grad_norm": 2.737545967102051, + "learning_rate": 9.984042071082633e-06, + "loss": 5.3002, + "step": 8200 + }, + { + "epoch": 0.008127566340644531, + "grad_norm": 2.891793727874756, + "learning_rate": 9.983943553185873e-06, + "loss": 5.2682, + "step": 8250 + }, + { + "epoch": 0.008176824318466619, + "grad_norm": 3.688206672668457, + "learning_rate": 9.983845035289111e-06, + "loss": 5.2973, + "step": 8300 + }, + { + "epoch": 0.008226082296288706, + "grad_norm": 3.1400790214538574, + "learning_rate": 9.98374651739235e-06, + "loss": 5.2762, + "step": 8350 + }, + { + "epoch": 0.008275340274110796, + "grad_norm": 2.4480161666870117, + "learning_rate": 9.983647999495589e-06, + "loss": 5.3078, + "step": 8400 + }, + { + "epoch": 0.008324598251932883, + "grad_norm": 2.787797689437866, + 
"learning_rate": 9.983549481598828e-06, + "loss": 5.2399, + "step": 8450 + }, + { + "epoch": 0.008373856229754971, + "grad_norm": 2.967879295349121, + "learning_rate": 9.983450963702068e-06, + "loss": 5.271, + "step": 8500 + }, + { + "epoch": 0.008423114207577059, + "grad_norm": 2.8270304203033447, + "learning_rate": 9.983352445805306e-06, + "loss": 5.2504, + "step": 8550 + }, + { + "epoch": 0.008472372185399148, + "grad_norm": 3.0329606533050537, + "learning_rate": 9.983253927908545e-06, + "loss": 5.1802, + "step": 8600 + }, + { + "epoch": 0.008521630163221236, + "grad_norm": 3.035229444503784, + "learning_rate": 9.983155410011783e-06, + "loss": 5.2257, + "step": 8650 + }, + { + "epoch": 0.008570888141043323, + "grad_norm": 3.1413824558258057, + "learning_rate": 9.983056892115023e-06, + "loss": 5.218, + "step": 8700 + }, + { + "epoch": 0.008620146118865411, + "grad_norm": 2.30039644241333, + "learning_rate": 9.982958374218261e-06, + "loss": 5.1838, + "step": 8750 + }, + { + "epoch": 0.0086694040966875, + "grad_norm": 2.7453882694244385, + "learning_rate": 9.9828598563215e-06, + "loss": 5.2439, + "step": 8800 + }, + { + "epoch": 0.008718662074509588, + "grad_norm": 2.842217445373535, + "learning_rate": 9.982761338424738e-06, + "loss": 5.2346, + "step": 8850 + }, + { + "epoch": 0.008767920052331676, + "grad_norm": 2.731189250946045, + "learning_rate": 9.982662820527978e-06, + "loss": 5.2316, + "step": 8900 + }, + { + "epoch": 0.008817178030153763, + "grad_norm": 2.762725830078125, + "learning_rate": 9.982564302631218e-06, + "loss": 5.2461, + "step": 8950 + }, + { + "epoch": 0.008866436007975851, + "grad_norm": 3.2438535690307617, + "learning_rate": 9.982465784734456e-06, + "loss": 5.202, + "step": 9000 + }, + { + "epoch": 0.00891569398579794, + "grad_norm": 2.3306050300598145, + "learning_rate": 9.982367266837695e-06, + "loss": 5.225, + "step": 9050 + }, + { + "epoch": 0.008964951963620028, + "grad_norm": 2.7309937477111816, + "learning_rate": 9.982268748940933e-06, 
+ "loss": 5.1254, + "step": 9100 + }, + { + "epoch": 0.009014209941442116, + "grad_norm": 2.607841730117798, + "learning_rate": 9.982170231044173e-06, + "loss": 5.2148, + "step": 9150 + }, + { + "epoch": 0.009063467919264203, + "grad_norm": 2.752448320388794, + "learning_rate": 9.98207171314741e-06, + "loss": 5.1724, + "step": 9200 + }, + { + "epoch": 0.009112725897086293, + "grad_norm": 2.578098773956299, + "learning_rate": 9.98197319525065e-06, + "loss": 5.1401, + "step": 9250 + }, + { + "epoch": 0.00916198387490838, + "grad_norm": 2.506199598312378, + "learning_rate": 9.981874677353888e-06, + "loss": 5.222, + "step": 9300 + }, + { + "epoch": 0.009211241852730468, + "grad_norm": 2.8980226516723633, + "learning_rate": 9.981776159457128e-06, + "loss": 5.2026, + "step": 9350 + }, + { + "epoch": 0.009260499830552556, + "grad_norm": 2.5668675899505615, + "learning_rate": 9.981677641560366e-06, + "loss": 5.1399, + "step": 9400 + }, + { + "epoch": 0.009309757808374645, + "grad_norm": 2.9774155616760254, + "learning_rate": 9.981579123663605e-06, + "loss": 5.1841, + "step": 9450 + }, + { + "epoch": 0.009359015786196733, + "grad_norm": 2.784179449081421, + "learning_rate": 9.981480605766845e-06, + "loss": 5.1706, + "step": 9500 + }, + { + "epoch": 0.00940827376401882, + "grad_norm": 3.276780128479004, + "learning_rate": 9.981382087870083e-06, + "loss": 5.2006, + "step": 9550 + }, + { + "epoch": 0.009457531741840908, + "grad_norm": 2.45902156829834, + "learning_rate": 9.981283569973323e-06, + "loss": 5.1438, + "step": 9600 + }, + { + "epoch": 0.009506789719662997, + "grad_norm": 3.334418535232544, + "learning_rate": 9.98118505207656e-06, + "loss": 5.1962, + "step": 9650 + }, + { + "epoch": 0.009556047697485085, + "grad_norm": 2.549914836883545, + "learning_rate": 9.9810865341798e-06, + "loss": 5.2424, + "step": 9700 + }, + { + "epoch": 0.009605305675307173, + "grad_norm": 3.135145664215088, + "learning_rate": 9.980988016283038e-06, + "loss": 5.1309, + "step": 9750 + }, + { 
+ "epoch": 0.00965456365312926, + "grad_norm": 2.222324848175049, + "learning_rate": 9.980889498386278e-06, + "loss": 5.1401, + "step": 9800 + }, + { + "epoch": 0.009703821630951348, + "grad_norm": 2.6519627571105957, + "learning_rate": 9.980790980489516e-06, + "loss": 5.1435, + "step": 9850 + }, + { + "epoch": 0.009753079608773437, + "grad_norm": 3.206404209136963, + "learning_rate": 9.980692462592755e-06, + "loss": 5.1377, + "step": 9900 + }, + { + "epoch": 0.009802337586595525, + "grad_norm": 2.6864330768585205, + "learning_rate": 9.980593944695995e-06, + "loss": 5.0888, + "step": 9950 + }, + { + "epoch": 0.009851595564417613, + "grad_norm": 3.300452947616577, + "learning_rate": 9.980495426799233e-06, + "loss": 5.1518, + "step": 10000 + }, + { + "epoch": 0.0099008535422397, + "grad_norm": 2.7210471630096436, + "learning_rate": 9.980396908902473e-06, + "loss": 5.2172, + "step": 10050 + }, + { + "epoch": 0.00995011152006179, + "grad_norm": 3.0081121921539307, + "learning_rate": 9.98029839100571e-06, + "loss": 5.1176, + "step": 10100 + }, + { + "epoch": 0.009999369497883877, + "grad_norm": 2.973844289779663, + "learning_rate": 9.98019987310895e-06, + "loss": 5.1191, + "step": 10150 + }, + { + "epoch": 0.010048627475705965, + "grad_norm": 3.0125725269317627, + "learning_rate": 9.980101355212188e-06, + "loss": 5.119, + "step": 10200 + }, + { + "epoch": 0.010097885453528053, + "grad_norm": 2.811295747756958, + "learning_rate": 9.980002837315428e-06, + "loss": 5.1327, + "step": 10250 + }, + { + "epoch": 0.010147143431350142, + "grad_norm": 3.0833497047424316, + "learning_rate": 9.979904319418666e-06, + "loss": 5.0468, + "step": 10300 + }, + { + "epoch": 0.01019640140917223, + "grad_norm": 2.67097544670105, + "learning_rate": 9.979805801521905e-06, + "loss": 5.1153, + "step": 10350 + }, + { + "epoch": 0.010245659386994318, + "grad_norm": 2.8564915657043457, + "learning_rate": 9.979707283625145e-06, + "loss": 5.1419, + "step": 10400 + }, + { + "epoch": 
0.010294917364816405, + "grad_norm": 2.9845869541168213, + "learning_rate": 9.979608765728383e-06, + "loss": 5.1225, + "step": 10450 + }, + { + "epoch": 0.010344175342638495, + "grad_norm": 3.286940574645996, + "learning_rate": 9.979510247831622e-06, + "loss": 5.1463, + "step": 10500 + }, + { + "epoch": 0.010393433320460582, + "grad_norm": 2.777956962585449, + "learning_rate": 9.97941172993486e-06, + "loss": 5.1447, + "step": 10550 + }, + { + "epoch": 0.01044269129828267, + "grad_norm": 3.1841282844543457, + "learning_rate": 9.9793132120381e-06, + "loss": 5.0844, + "step": 10600 + }, + { + "epoch": 0.010491949276104758, + "grad_norm": 2.8072447776794434, + "learning_rate": 9.979214694141338e-06, + "loss": 5.0622, + "step": 10650 + }, + { + "epoch": 0.010541207253926845, + "grad_norm": 2.525725841522217, + "learning_rate": 9.979116176244577e-06, + "loss": 5.103, + "step": 10700 + }, + { + "epoch": 0.010590465231748935, + "grad_norm": 2.84185528755188, + "learning_rate": 9.979017658347815e-06, + "loss": 5.078, + "step": 10750 + }, + { + "epoch": 0.010639723209571022, + "grad_norm": 2.6805336475372314, + "learning_rate": 9.978919140451055e-06, + "loss": 5.1111, + "step": 10800 + }, + { + "epoch": 0.01068898118739311, + "grad_norm": 2.621917247772217, + "learning_rate": 9.978820622554295e-06, + "loss": 5.1216, + "step": 10850 + }, + { + "epoch": 0.010738239165215198, + "grad_norm": 3.3322038650512695, + "learning_rate": 9.978722104657533e-06, + "loss": 5.1474, + "step": 10900 + }, + { + "epoch": 0.010787497143037287, + "grad_norm": 2.7139110565185547, + "learning_rate": 9.978623586760772e-06, + "loss": 5.1347, + "step": 10950 + }, + { + "epoch": 0.010836755120859375, + "grad_norm": 2.7151482105255127, + "learning_rate": 9.978525068864012e-06, + "loss": 5.0979, + "step": 11000 + }, + { + "epoch": 0.010886013098681462, + "grad_norm": 2.9608211517333984, + "learning_rate": 9.97842655096725e-06, + "loss": 5.0786, + "step": 11050 + }, + { + "epoch": 0.01093527107650355, + 
"grad_norm": 2.5886597633361816, + "learning_rate": 9.978328033070488e-06, + "loss": 5.0645, + "step": 11100 + }, + { + "epoch": 0.01098452905432564, + "grad_norm": 2.6722819805145264, + "learning_rate": 9.978229515173727e-06, + "loss": 5.0679, + "step": 11150 + }, + { + "epoch": 0.011033787032147727, + "grad_norm": 2.6561288833618164, + "learning_rate": 9.978130997276965e-06, + "loss": 5.0565, + "step": 11200 + }, + { + "epoch": 0.011083045009969815, + "grad_norm": 3.1985936164855957, + "learning_rate": 9.978032479380205e-06, + "loss": 5.0584, + "step": 11250 + }, + { + "epoch": 0.011132302987791902, + "grad_norm": 3.015976905822754, + "learning_rate": 9.977933961483443e-06, + "loss": 5.0309, + "step": 11300 + }, + { + "epoch": 0.011181560965613992, + "grad_norm": 2.7622482776641846, + "learning_rate": 9.977835443586682e-06, + "loss": 5.0371, + "step": 11350 + }, + { + "epoch": 0.01123081894343608, + "grad_norm": 2.7675328254699707, + "learning_rate": 9.977736925689922e-06, + "loss": 5.0582, + "step": 11400 + }, + { + "epoch": 0.011280076921258167, + "grad_norm": 2.6566014289855957, + "learning_rate": 9.977638407793162e-06, + "loss": 5.0632, + "step": 11450 + }, + { + "epoch": 0.011329334899080255, + "grad_norm": 2.5212440490722656, + "learning_rate": 9.9775398898964e-06, + "loss": 5.0305, + "step": 11500 + }, + { + "epoch": 0.011378592876902342, + "grad_norm": 3.1915969848632812, + "learning_rate": 9.97744137199964e-06, + "loss": 5.0243, + "step": 11550 + }, + { + "epoch": 0.011427850854724432, + "grad_norm": 2.747652053833008, + "learning_rate": 9.977342854102877e-06, + "loss": 5.0453, + "step": 11600 + }, + { + "epoch": 0.01147710883254652, + "grad_norm": 3.1639904975891113, + "learning_rate": 9.977244336206115e-06, + "loss": 5.0106, + "step": 11650 + }, + { + "epoch": 0.011526366810368607, + "grad_norm": 2.8592841625213623, + "learning_rate": 9.977145818309355e-06, + "loss": 5.0245, + "step": 11700 + }, + { + "epoch": 0.011575624788190695, + "grad_norm": 
2.4575510025024414, + "learning_rate": 9.977047300412593e-06, + "loss": 5.0459, + "step": 11750 + }, + { + "epoch": 0.011624882766012784, + "grad_norm": 2.752303123474121, + "learning_rate": 9.976948782515832e-06, + "loss": 5.0125, + "step": 11800 + }, + { + "epoch": 0.011674140743834872, + "grad_norm": 2.4985873699188232, + "learning_rate": 9.976850264619072e-06, + "loss": 5.0599, + "step": 11850 + }, + { + "epoch": 0.01172339872165696, + "grad_norm": 2.650714159011841, + "learning_rate": 9.97675174672231e-06, + "loss": 5.005, + "step": 11900 + }, + { + "epoch": 0.011772656699479047, + "grad_norm": 2.8964450359344482, + "learning_rate": 9.97665322882555e-06, + "loss": 5.0384, + "step": 11950 + }, + { + "epoch": 0.011821914677301136, + "grad_norm": 2.7454681396484375, + "learning_rate": 9.976554710928789e-06, + "loss": 4.9572, + "step": 12000 + }, + { + "epoch": 0.011871172655123224, + "grad_norm": 4.804060459136963, + "learning_rate": 9.976456193032027e-06, + "loss": 4.9991, + "step": 12050 + }, + { + "epoch": 0.011920430632945312, + "grad_norm": 2.895745277404785, + "learning_rate": 9.976357675135267e-06, + "loss": 5.0209, + "step": 12100 + }, + { + "epoch": 0.0119696886107674, + "grad_norm": 3.107665777206421, + "learning_rate": 9.976259157238505e-06, + "loss": 5.0109, + "step": 12150 + }, + { + "epoch": 0.012018946588589489, + "grad_norm": 3.312875986099243, + "learning_rate": 9.976160639341743e-06, + "loss": 5.0715, + "step": 12200 + }, + { + "epoch": 0.012068204566411576, + "grad_norm": 2.6124379634857178, + "learning_rate": 9.976062121444982e-06, + "loss": 5.0115, + "step": 12250 + }, + { + "epoch": 0.012117462544233664, + "grad_norm": 2.636563301086426, + "learning_rate": 9.975963603548222e-06, + "loss": 5.011, + "step": 12300 + }, + { + "epoch": 0.012166720522055752, + "grad_norm": 2.8454091548919678, + "learning_rate": 9.97586508565146e-06, + "loss": 4.9804, + "step": 12350 + }, + { + "epoch": 0.01221597849987784, + "grad_norm": 2.3826451301574707, + 
"learning_rate": 9.9757665677547e-06, + "loss": 5.0208, + "step": 12400 + }, + { + "epoch": 0.012265236477699929, + "grad_norm": 2.883273124694824, + "learning_rate": 9.975668049857939e-06, + "loss": 5.0381, + "step": 12450 + }, + { + "epoch": 0.012314494455522016, + "grad_norm": 3.6011040210723877, + "learning_rate": 9.975569531961177e-06, + "loss": 4.9563, + "step": 12500 + }, + { + "epoch": 0.012363752433344104, + "grad_norm": 8.041337013244629, + "learning_rate": 9.975471014064417e-06, + "loss": 4.9735, + "step": 12550 + }, + { + "epoch": 0.012413010411166192, + "grad_norm": 2.9154751300811768, + "learning_rate": 9.975372496167654e-06, + "loss": 4.9955, + "step": 12600 + }, + { + "epoch": 0.012462268388988281, + "grad_norm": 3.00058650970459, + "learning_rate": 9.975273978270894e-06, + "loss": 4.9427, + "step": 12650 + }, + { + "epoch": 0.012511526366810369, + "grad_norm": 3.041849136352539, + "learning_rate": 9.975175460374132e-06, + "loss": 4.9327, + "step": 12700 + }, + { + "epoch": 0.012560784344632456, + "grad_norm": 2.5672147274017334, + "learning_rate": 9.97507694247737e-06, + "loss": 5.0007, + "step": 12750 + }, + { + "epoch": 0.012610042322454544, + "grad_norm": 2.7387607097625732, + "learning_rate": 9.97497842458061e-06, + "loss": 4.9586, + "step": 12800 + }, + { + "epoch": 0.012659300300276633, + "grad_norm": 2.7960760593414307, + "learning_rate": 9.97487990668385e-06, + "loss": 4.9412, + "step": 12850 + }, + { + "epoch": 0.012708558278098721, + "grad_norm": 2.8948588371276855, + "learning_rate": 9.974781388787089e-06, + "loss": 5.0998, + "step": 12900 + }, + { + "epoch": 0.012757816255920809, + "grad_norm": 2.80606746673584, + "learning_rate": 9.974682870890327e-06, + "loss": 4.9639, + "step": 12950 + }, + { + "epoch": 0.012807074233742896, + "grad_norm": 2.617866039276123, + "learning_rate": 9.974584352993566e-06, + "loss": 4.9566, + "step": 13000 + }, + { + "epoch": 0.012856332211564986, + "grad_norm": 2.61118483543396, + "learning_rate": 
9.974485835096804e-06, + "loss": 4.9398, + "step": 13050 + }, + { + "epoch": 0.012905590189387074, + "grad_norm": 3.115861654281616, + "learning_rate": 9.974387317200044e-06, + "loss": 4.9141, + "step": 13100 + }, + { + "epoch": 0.012954848167209161, + "grad_norm": 3.14267635345459, + "learning_rate": 9.974288799303282e-06, + "loss": 4.9804, + "step": 13150 + }, + { + "epoch": 0.013004106145031249, + "grad_norm": 2.8500235080718994, + "learning_rate": 9.97419028140652e-06, + "loss": 4.9494, + "step": 13200 + }, + { + "epoch": 0.013053364122853336, + "grad_norm": 2.4847493171691895, + "learning_rate": 9.97409176350976e-06, + "loss": 4.8897, + "step": 13250 + }, + { + "epoch": 0.013102622100675426, + "grad_norm": 2.5797436237335205, + "learning_rate": 9.973993245612999e-06, + "loss": 4.9435, + "step": 13300 + }, + { + "epoch": 0.013151880078497514, + "grad_norm": 2.6346473693847656, + "learning_rate": 9.973894727716239e-06, + "loss": 4.9417, + "step": 13350 + }, + { + "epoch": 0.013201138056319601, + "grad_norm": 2.570680618286133, + "learning_rate": 9.973796209819477e-06, + "loss": 4.9819, + "step": 13400 + }, + { + "epoch": 0.013250396034141689, + "grad_norm": 2.6807267665863037, + "learning_rate": 9.973697691922716e-06, + "loss": 4.9395, + "step": 13450 + }, + { + "epoch": 0.013299654011963778, + "grad_norm": 3.142136573791504, + "learning_rate": 9.973599174025954e-06, + "loss": 4.9729, + "step": 13500 + }, + { + "epoch": 0.013348911989785866, + "grad_norm": 2.413271427154541, + "learning_rate": 9.973500656129194e-06, + "loss": 4.927, + "step": 13550 + }, + { + "epoch": 0.013398169967607954, + "grad_norm": 2.633683681488037, + "learning_rate": 9.973402138232432e-06, + "loss": 4.9453, + "step": 13600 + }, + { + "epoch": 0.013447427945430041, + "grad_norm": 2.6855924129486084, + "learning_rate": 9.973303620335671e-06, + "loss": 4.9406, + "step": 13650 + }, + { + "epoch": 0.01349668592325213, + "grad_norm": 2.8274478912353516, + "learning_rate": 9.97320510243891e-06, 
+ "loss": 4.9191, + "step": 13700 + }, + { + "epoch": 0.013545943901074218, + "grad_norm": 2.3939945697784424, + "learning_rate": 9.973106584542149e-06, + "loss": 4.9049, + "step": 13750 + }, + { + "epoch": 0.013595201878896306, + "grad_norm": 2.7994632720947266, + "learning_rate": 9.973008066645387e-06, + "loss": 4.9746, + "step": 13800 + }, + { + "epoch": 0.013644459856718394, + "grad_norm": 2.558964490890503, + "learning_rate": 9.972909548748627e-06, + "loss": 4.8646, + "step": 13850 + }, + { + "epoch": 0.013693717834540483, + "grad_norm": 2.6282644271850586, + "learning_rate": 9.972811030851866e-06, + "loss": 4.9234, + "step": 13900 + }, + { + "epoch": 0.01374297581236257, + "grad_norm": 2.561087131500244, + "learning_rate": 9.972712512955104e-06, + "loss": 4.9091, + "step": 13950 + }, + { + "epoch": 0.013792233790184658, + "grad_norm": 2.9112539291381836, + "learning_rate": 9.972613995058344e-06, + "loss": 4.9158, + "step": 14000 + }, + { + "epoch": 0.013841491768006746, + "grad_norm": 2.4566030502319336, + "learning_rate": 9.972515477161582e-06, + "loss": 4.9312, + "step": 14050 + }, + { + "epoch": 0.013890749745828835, + "grad_norm": 2.7782633304595947, + "learning_rate": 9.972416959264821e-06, + "loss": 4.9544, + "step": 14100 + }, + { + "epoch": 0.013940007723650923, + "grad_norm": 2.9850313663482666, + "learning_rate": 9.97231844136806e-06, + "loss": 4.8732, + "step": 14150 + }, + { + "epoch": 0.01398926570147301, + "grad_norm": 3.229119062423706, + "learning_rate": 9.972219923471299e-06, + "loss": 4.9412, + "step": 14200 + }, + { + "epoch": 0.014038523679295098, + "grad_norm": 2.7228994369506836, + "learning_rate": 9.972121405574537e-06, + "loss": 4.9139, + "step": 14250 + }, + { + "epoch": 0.014087781657117186, + "grad_norm": 2.9174795150756836, + "learning_rate": 9.972022887677776e-06, + "loss": 4.8683, + "step": 14300 + }, + { + "epoch": 0.014137039634939275, + "grad_norm": 2.9443442821502686, + "learning_rate": 9.971924369781016e-06, + "loss": 
4.8991, + "step": 14350 + }, + { + "epoch": 0.014186297612761363, + "grad_norm": 3.1987316608428955, + "learning_rate": 9.971825851884254e-06, + "loss": 4.8839, + "step": 14400 + }, + { + "epoch": 0.01423555559058345, + "grad_norm": 3.0276005268096924, + "learning_rate": 9.971727333987494e-06, + "loss": 4.9147, + "step": 14450 + }, + { + "epoch": 0.014284813568405538, + "grad_norm": 3.049004316329956, + "learning_rate": 9.971628816090732e-06, + "loss": 4.8882, + "step": 14500 + }, + { + "epoch": 0.014334071546227628, + "grad_norm": 3.058237314224243, + "learning_rate": 9.971530298193971e-06, + "loss": 4.919, + "step": 14550 + }, + { + "epoch": 0.014383329524049715, + "grad_norm": 2.641164541244507, + "learning_rate": 9.971431780297209e-06, + "loss": 4.9429, + "step": 14600 + }, + { + "epoch": 0.014432587501871803, + "grad_norm": 2.5120842456817627, + "learning_rate": 9.971333262400449e-06, + "loss": 4.8806, + "step": 14650 + }, + { + "epoch": 0.01448184547969389, + "grad_norm": 2.727837324142456, + "learning_rate": 9.971234744503687e-06, + "loss": 4.9088, + "step": 14700 + }, + { + "epoch": 0.01453110345751598, + "grad_norm": 2.8373379707336426, + "learning_rate": 9.971136226606926e-06, + "loss": 4.8997, + "step": 14750 + }, + { + "epoch": 0.014580361435338068, + "grad_norm": 2.5231852531433105, + "learning_rate": 9.971037708710166e-06, + "loss": 4.8688, + "step": 14800 + }, + { + "epoch": 0.014629619413160155, + "grad_norm": 2.866428852081299, + "learning_rate": 9.970939190813404e-06, + "loss": 4.8645, + "step": 14850 + }, + { + "epoch": 0.014678877390982243, + "grad_norm": 3.116302490234375, + "learning_rate": 9.970840672916643e-06, + "loss": 4.9037, + "step": 14900 + }, + { + "epoch": 0.014728135368804332, + "grad_norm": 2.9273884296417236, + "learning_rate": 9.970742155019881e-06, + "loss": 4.8069, + "step": 14950 + }, + { + "epoch": 0.01477739334662642, + "grad_norm": 2.5544352531433105, + "learning_rate": 9.970643637123121e-06, + "loss": 4.8998, + "step": 
15000 + }, + { + "epoch": 0.014826651324448508, + "grad_norm": 2.3461215496063232, + "learning_rate": 9.970545119226359e-06, + "loss": 4.8263, + "step": 15050 + }, + { + "epoch": 0.014875909302270595, + "grad_norm": 2.9023947715759277, + "learning_rate": 9.970446601329599e-06, + "loss": 4.8763, + "step": 15100 + }, + { + "epoch": 0.014925167280092683, + "grad_norm": 2.755872964859009, + "learning_rate": 9.970348083432836e-06, + "loss": 4.8345, + "step": 15150 + }, + { + "epoch": 0.014974425257914772, + "grad_norm": 3.4369239807128906, + "learning_rate": 9.970249565536076e-06, + "loss": 4.8348, + "step": 15200 + }, + { + "epoch": 0.01502368323573686, + "grad_norm": 2.513559341430664, + "learning_rate": 9.970151047639314e-06, + "loss": 4.8278, + "step": 15250 + }, + { + "epoch": 0.015072941213558948, + "grad_norm": 2.857710838317871, + "learning_rate": 9.970052529742554e-06, + "loss": 4.8655, + "step": 15300 + }, + { + "epoch": 0.015122199191381035, + "grad_norm": 2.9492580890655518, + "learning_rate": 9.969954011845793e-06, + "loss": 4.836, + "step": 15350 + }, + { + "epoch": 0.015171457169203125, + "grad_norm": 2.8338775634765625, + "learning_rate": 9.969855493949031e-06, + "loss": 4.832, + "step": 15400 + }, + { + "epoch": 0.015220715147025212, + "grad_norm": 2.7897562980651855, + "learning_rate": 9.969756976052271e-06, + "loss": 4.8706, + "step": 15450 + }, + { + "epoch": 0.0152699731248473, + "grad_norm": 2.9784884452819824, + "learning_rate": 9.969658458155509e-06, + "loss": 4.8397, + "step": 15500 + }, + { + "epoch": 0.015319231102669388, + "grad_norm": 2.539855718612671, + "learning_rate": 9.969559940258748e-06, + "loss": 4.7733, + "step": 15550 + }, + { + "epoch": 0.015368489080491477, + "grad_norm": 2.8186769485473633, + "learning_rate": 9.969461422361986e-06, + "loss": 4.8489, + "step": 15600 + }, + { + "epoch": 0.015417747058313565, + "grad_norm": 2.567842483520508, + "learning_rate": 9.969362904465226e-06, + "loss": 4.7802, + "step": 15650 + }, + { + 
"epoch": 0.015467005036135652, + "grad_norm": 2.962284564971924, + "learning_rate": 9.969264386568464e-06, + "loss": 4.7659, + "step": 15700 + }, + { + "epoch": 0.01551626301395774, + "grad_norm": 2.9043500423431396, + "learning_rate": 9.969165868671704e-06, + "loss": 4.8493, + "step": 15750 + }, + { + "epoch": 0.01556552099177983, + "grad_norm": 2.8906874656677246, + "learning_rate": 9.969067350774943e-06, + "loss": 4.832, + "step": 15800 + }, + { + "epoch": 0.015614778969601917, + "grad_norm": 2.635098457336426, + "learning_rate": 9.968968832878183e-06, + "loss": 4.7984, + "step": 15850 + }, + { + "epoch": 0.015664036947424003, + "grad_norm": 2.9309582710266113, + "learning_rate": 9.96887031498142e-06, + "loss": 4.8289, + "step": 15900 + }, + { + "epoch": 0.015713294925246094, + "grad_norm": 2.7389392852783203, + "learning_rate": 9.968771797084659e-06, + "loss": 4.8491, + "step": 15950 + }, + { + "epoch": 0.015762552903068182, + "grad_norm": 2.7155375480651855, + "learning_rate": 9.968673279187898e-06, + "loss": 4.817, + "step": 16000 + }, + { + "epoch": 0.01581181088089027, + "grad_norm": 3.205986976623535, + "learning_rate": 9.968574761291136e-06, + "loss": 4.7906, + "step": 16050 + }, + { + "epoch": 0.015861068858712357, + "grad_norm": 2.7359631061553955, + "learning_rate": 9.968476243394376e-06, + "loss": 4.7927, + "step": 16100 + }, + { + "epoch": 0.015910326836534445, + "grad_norm": 2.73769474029541, + "learning_rate": 9.968377725497614e-06, + "loss": 4.8115, + "step": 16150 + }, + { + "epoch": 0.015959584814356532, + "grad_norm": 2.722442388534546, + "learning_rate": 9.968279207600853e-06, + "loss": 4.8113, + "step": 16200 + }, + { + "epoch": 0.01600884279217862, + "grad_norm": 3.076115846633911, + "learning_rate": 9.968180689704093e-06, + "loss": 4.8335, + "step": 16250 + }, + { + "epoch": 0.016058100770000708, + "grad_norm": 3.232151985168457, + "learning_rate": 9.968082171807331e-06, + "loss": 4.7885, + "step": 16300 + }, + { + "epoch": 
0.0161073587478228, + "grad_norm": 2.7500979900360107, + "learning_rate": 9.96798365391057e-06, + "loss": 4.8502, + "step": 16350 + }, + { + "epoch": 0.016156616725644887, + "grad_norm": 3.8684093952178955, + "learning_rate": 9.96788513601381e-06, + "loss": 4.8037, + "step": 16400 + }, + { + "epoch": 0.016205874703466974, + "grad_norm": 3.7351253032684326, + "learning_rate": 9.967786618117048e-06, + "loss": 4.7569, + "step": 16450 + }, + { + "epoch": 0.016255132681289062, + "grad_norm": 2.939333438873291, + "learning_rate": 9.967688100220286e-06, + "loss": 4.8306, + "step": 16500 + }, + { + "epoch": 0.01630439065911115, + "grad_norm": 3.126208543777466, + "learning_rate": 9.967589582323526e-06, + "loss": 4.8278, + "step": 16550 + }, + { + "epoch": 0.016353648636933237, + "grad_norm": 2.5775349140167236, + "learning_rate": 9.967491064426764e-06, + "loss": 4.838, + "step": 16600 + }, + { + "epoch": 0.016402906614755325, + "grad_norm": 2.6388967037200928, + "learning_rate": 9.967392546530003e-06, + "loss": 4.7967, + "step": 16650 + }, + { + "epoch": 0.016452164592577413, + "grad_norm": 3.136052131652832, + "learning_rate": 9.967294028633241e-06, + "loss": 4.8092, + "step": 16700 + }, + { + "epoch": 0.016501422570399504, + "grad_norm": 2.507441520690918, + "learning_rate": 9.96719551073648e-06, + "loss": 4.8197, + "step": 16750 + }, + { + "epoch": 0.01655068054822159, + "grad_norm": 2.833686351776123, + "learning_rate": 9.96709699283972e-06, + "loss": 4.7749, + "step": 16800 + }, + { + "epoch": 0.01659993852604368, + "grad_norm": 3.32494854927063, + "learning_rate": 9.96699847494296e-06, + "loss": 4.7853, + "step": 16850 + }, + { + "epoch": 0.016649196503865767, + "grad_norm": 2.7150046825408936, + "learning_rate": 9.966899957046198e-06, + "loss": 4.7526, + "step": 16900 + }, + { + "epoch": 0.016698454481687854, + "grad_norm": 2.52634859085083, + "learning_rate": 9.966801439149438e-06, + "loss": 4.7693, + "step": 16950 + }, + { + "epoch": 0.016747712459509942, + 
"grad_norm": 3.0011000633239746, + "learning_rate": 9.966702921252676e-06, + "loss": 4.8027, + "step": 17000 + }, + { + "epoch": 0.01679697043733203, + "grad_norm": 3.262936592102051, + "learning_rate": 9.966604403355913e-06, + "loss": 4.7472, + "step": 17050 + }, + { + "epoch": 0.016846228415154117, + "grad_norm": 2.754255771636963, + "learning_rate": 9.966505885459153e-06, + "loss": 4.7705, + "step": 17100 + }, + { + "epoch": 0.016895486392976205, + "grad_norm": 3.0748441219329834, + "learning_rate": 9.966407367562391e-06, + "loss": 4.7362, + "step": 17150 + }, + { + "epoch": 0.016944744370798296, + "grad_norm": 2.397820472717285, + "learning_rate": 9.96630884966563e-06, + "loss": 4.7954, + "step": 17200 + }, + { + "epoch": 0.016994002348620384, + "grad_norm": 2.6865358352661133, + "learning_rate": 9.96621033176887e-06, + "loss": 4.7928, + "step": 17250 + }, + { + "epoch": 0.01704326032644247, + "grad_norm": 2.6370294094085693, + "learning_rate": 9.96611181387211e-06, + "loss": 4.7236, + "step": 17300 + }, + { + "epoch": 0.01709251830426456, + "grad_norm": 2.90108060836792, + "learning_rate": 9.966013295975348e-06, + "loss": 4.756, + "step": 17350 + }, + { + "epoch": 0.017141776282086647, + "grad_norm": 2.5440845489501953, + "learning_rate": 9.965914778078587e-06, + "loss": 4.7545, + "step": 17400 + }, + { + "epoch": 0.017191034259908734, + "grad_norm": 2.8412258625030518, + "learning_rate": 9.965816260181825e-06, + "loss": 4.8121, + "step": 17450 + }, + { + "epoch": 0.017240292237730822, + "grad_norm": 2.730078935623169, + "learning_rate": 9.965717742285065e-06, + "loss": 4.7686, + "step": 17500 + }, + { + "epoch": 0.01728955021555291, + "grad_norm": 3.051495313644409, + "learning_rate": 9.965619224388303e-06, + "loss": 4.7846, + "step": 17550 + }, + { + "epoch": 0.017338808193375, + "grad_norm": 3.127047300338745, + "learning_rate": 9.965520706491541e-06, + "loss": 4.7092, + "step": 17600 + }, + { + "epoch": 0.01738806617119709, + "grad_norm": 
2.6953890323638916, + "learning_rate": 9.96542218859478e-06, + "loss": 4.7411, + "step": 17650 + }, + { + "epoch": 0.017437324149019176, + "grad_norm": 2.555135488510132, + "learning_rate": 9.96532367069802e-06, + "loss": 4.7444, + "step": 17700 + }, + { + "epoch": 0.017486582126841264, + "grad_norm": 2.8850784301757812, + "learning_rate": 9.965225152801258e-06, + "loss": 4.7264, + "step": 17750 + }, + { + "epoch": 0.01753584010466335, + "grad_norm": 3.1482346057891846, + "learning_rate": 9.965126634904498e-06, + "loss": 4.7424, + "step": 17800 + }, + { + "epoch": 0.01758509808248544, + "grad_norm": 2.9815826416015625, + "learning_rate": 9.965028117007737e-06, + "loss": 4.7988, + "step": 17850 + }, + { + "epoch": 0.017634356060307527, + "grad_norm": 2.593313455581665, + "learning_rate": 9.964929599110975e-06, + "loss": 4.8004, + "step": 17900 + }, + { + "epoch": 0.017683614038129614, + "grad_norm": 2.5208353996276855, + "learning_rate": 9.964831081214215e-06, + "loss": 4.7578, + "step": 17950 + }, + { + "epoch": 0.017732872015951702, + "grad_norm": 2.3754539489746094, + "learning_rate": 9.964732563317453e-06, + "loss": 4.7409, + "step": 18000 + }, + { + "epoch": 0.017782129993773793, + "grad_norm": 3.008375644683838, + "learning_rate": 9.964634045420692e-06, + "loss": 4.784, + "step": 18050 + }, + { + "epoch": 0.01783138797159588, + "grad_norm": 3.5501465797424316, + "learning_rate": 9.96453552752393e-06, + "loss": 4.7189, + "step": 18100 + }, + { + "epoch": 0.01788064594941797, + "grad_norm": 2.732070207595825, + "learning_rate": 9.96443700962717e-06, + "loss": 4.6844, + "step": 18150 + }, + { + "epoch": 0.017929903927240056, + "grad_norm": 2.8019258975982666, + "learning_rate": 9.964338491730408e-06, + "loss": 4.6937, + "step": 18200 + }, + { + "epoch": 0.017979161905062144, + "grad_norm": 2.3495283126831055, + "learning_rate": 9.964239973833648e-06, + "loss": 4.7496, + "step": 18250 + }, + { + "epoch": 0.01802841988288423, + "grad_norm": 2.951258897781372, + 
"learning_rate": 9.964141455936887e-06, + "loss": 4.7037, + "step": 18300 + }, + { + "epoch": 0.01807767786070632, + "grad_norm": 2.8172523975372314, + "learning_rate": 9.964042938040125e-06, + "loss": 4.7194, + "step": 18350 + }, + { + "epoch": 0.018126935838528407, + "grad_norm": 2.6205809116363525, + "learning_rate": 9.963944420143365e-06, + "loss": 4.7109, + "step": 18400 + }, + { + "epoch": 0.018176193816350498, + "grad_norm": 2.723106861114502, + "learning_rate": 9.963845902246603e-06, + "loss": 4.7064, + "step": 18450 + }, + { + "epoch": 0.018225451794172585, + "grad_norm": 2.867903709411621, + "learning_rate": 9.963747384349842e-06, + "loss": 4.7304, + "step": 18500 + }, + { + "epoch": 0.018274709771994673, + "grad_norm": 2.6120004653930664, + "learning_rate": 9.96364886645308e-06, + "loss": 4.6865, + "step": 18550 + }, + { + "epoch": 0.01832396774981676, + "grad_norm": 2.9803898334503174, + "learning_rate": 9.96355034855632e-06, + "loss": 4.6462, + "step": 18600 + }, + { + "epoch": 0.01837322572763885, + "grad_norm": 2.8633246421813965, + "learning_rate": 9.963451830659558e-06, + "loss": 4.643, + "step": 18650 + }, + { + "epoch": 0.018422483705460936, + "grad_norm": 2.9035658836364746, + "learning_rate": 9.963353312762797e-06, + "loss": 4.6612, + "step": 18700 + }, + { + "epoch": 0.018471741683283024, + "grad_norm": 3.018044948577881, + "learning_rate": 9.963254794866037e-06, + "loss": 4.7231, + "step": 18750 + }, + { + "epoch": 0.01852099966110511, + "grad_norm": 3.025055408477783, + "learning_rate": 9.963156276969275e-06, + "loss": 4.7165, + "step": 18800 + }, + { + "epoch": 0.0185702576389272, + "grad_norm": 3.1283817291259766, + "learning_rate": 9.963057759072515e-06, + "loss": 4.6978, + "step": 18850 + }, + { + "epoch": 0.01861951561674929, + "grad_norm": 2.801934003829956, + "learning_rate": 9.962959241175753e-06, + "loss": 4.6834, + "step": 18900 + }, + { + "epoch": 0.018668773594571378, + "grad_norm": 2.7277987003326416, + "learning_rate": 
9.962860723278992e-06, + "loss": 4.6815, + "step": 18950 + }, + { + "epoch": 0.018718031572393466, + "grad_norm": 2.6831300258636475, + "learning_rate": 9.96276220538223e-06, + "loss": 4.6825, + "step": 19000 + }, + { + "epoch": 0.018767289550215553, + "grad_norm": 3.4934778213500977, + "learning_rate": 9.96266368748547e-06, + "loss": 4.7283, + "step": 19050 + }, + { + "epoch": 0.01881654752803764, + "grad_norm": 2.8354766368865967, + "learning_rate": 9.962565169588708e-06, + "loss": 4.664, + "step": 19100 + }, + { + "epoch": 0.01886580550585973, + "grad_norm": 2.8361477851867676, + "learning_rate": 9.962466651691947e-06, + "loss": 4.7284, + "step": 19150 + }, + { + "epoch": 0.018915063483681816, + "grad_norm": 2.7482402324676514, + "learning_rate": 9.962368133795185e-06, + "loss": 4.691, + "step": 19200 + }, + { + "epoch": 0.018964321461503904, + "grad_norm": 3.1883468627929688, + "learning_rate": 9.962269615898425e-06, + "loss": 4.6753, + "step": 19250 + }, + { + "epoch": 0.019013579439325995, + "grad_norm": 2.7762362957000732, + "learning_rate": 9.962171098001664e-06, + "loss": 4.7052, + "step": 19300 + }, + { + "epoch": 0.019062837417148083, + "grad_norm": 2.7757973670959473, + "learning_rate": 9.962072580104902e-06, + "loss": 4.6868, + "step": 19350 + }, + { + "epoch": 0.01911209539497017, + "grad_norm": 3.002589702606201, + "learning_rate": 9.961974062208142e-06, + "loss": 4.7017, + "step": 19400 + }, + { + "epoch": 0.019161353372792258, + "grad_norm": 3.270817279815674, + "learning_rate": 9.96187554431138e-06, + "loss": 4.6161, + "step": 19450 + }, + { + "epoch": 0.019210611350614346, + "grad_norm": 2.584317922592163, + "learning_rate": 9.96177702641462e-06, + "loss": 4.6854, + "step": 19500 + }, + { + "epoch": 0.019259869328436433, + "grad_norm": 2.954659938812256, + "learning_rate": 9.961678508517858e-06, + "loss": 4.7057, + "step": 19550 + }, + { + "epoch": 0.01930912730625852, + "grad_norm": 2.62088942527771, + "learning_rate": 9.961579990621097e-06, + 
"loss": 4.631, + "step": 19600 + }, + { + "epoch": 0.01935838528408061, + "grad_norm": 2.6912055015563965, + "learning_rate": 9.961481472724335e-06, + "loss": 4.6033, + "step": 19650 + }, + { + "epoch": 0.019407643261902696, + "grad_norm": 3.1379692554473877, + "learning_rate": 9.961382954827575e-06, + "loss": 4.6644, + "step": 19700 + }, + { + "epoch": 0.019456901239724787, + "grad_norm": 2.9732141494750977, + "learning_rate": 9.961284436930814e-06, + "loss": 4.6448, + "step": 19750 + }, + { + "epoch": 0.019506159217546875, + "grad_norm": 3.068929672241211, + "learning_rate": 9.961185919034052e-06, + "loss": 4.611, + "step": 19800 + }, + { + "epoch": 0.019555417195368963, + "grad_norm": 3.177074432373047, + "learning_rate": 9.961087401137292e-06, + "loss": 4.5738, + "step": 19850 + }, + { + "epoch": 0.01960467517319105, + "grad_norm": 2.8266665935516357, + "learning_rate": 9.96098888324053e-06, + "loss": 4.6535, + "step": 19900 + }, + { + "epoch": 0.019653933151013138, + "grad_norm": 2.889270782470703, + "learning_rate": 9.96089036534377e-06, + "loss": 4.6278, + "step": 19950 + }, + { + "epoch": 0.019703191128835226, + "grad_norm": 2.630666971206665, + "learning_rate": 9.960791847447007e-06, + "loss": 4.6061, + "step": 20000 + }, + { + "epoch": 0.019752449106657313, + "grad_norm": 2.609053611755371, + "learning_rate": 9.960693329550247e-06, + "loss": 4.6736, + "step": 20050 + }, + { + "epoch": 0.0198017070844794, + "grad_norm": 2.826141357421875, + "learning_rate": 9.960594811653485e-06, + "loss": 4.6866, + "step": 20100 + }, + { + "epoch": 0.019850965062301492, + "grad_norm": 2.97489333152771, + "learning_rate": 9.960496293756725e-06, + "loss": 4.6179, + "step": 20150 + }, + { + "epoch": 0.01990022304012358, + "grad_norm": 2.913787364959717, + "learning_rate": 9.960397775859964e-06, + "loss": 4.6686, + "step": 20200 + }, + { + "epoch": 0.019949481017945667, + "grad_norm": 2.4863719940185547, + "learning_rate": 9.960299257963202e-06, + "loss": 4.6685, + "step": 
20250 + }, + { + "epoch": 0.019998738995767755, + "grad_norm": 2.8387584686279297, + "learning_rate": 9.960200740066442e-06, + "loss": 4.6156, + "step": 20300 + }, + { + "epoch": 0.020047996973589843, + "grad_norm": 2.749788284301758, + "learning_rate": 9.96010222216968e-06, + "loss": 4.648, + "step": 20350 + }, + { + "epoch": 0.02009725495141193, + "grad_norm": 2.7358319759368896, + "learning_rate": 9.96000370427292e-06, + "loss": 4.591, + "step": 20400 + }, + { + "epoch": 0.020146512929234018, + "grad_norm": 2.9210987091064453, + "learning_rate": 9.959905186376157e-06, + "loss": 4.6453, + "step": 20450 + }, + { + "epoch": 0.020195770907056106, + "grad_norm": 2.8509223461151123, + "learning_rate": 9.959806668479397e-06, + "loss": 4.6533, + "step": 20500 + }, + { + "epoch": 0.020245028884878193, + "grad_norm": 2.7001538276672363, + "learning_rate": 9.959708150582635e-06, + "loss": 4.5922, + "step": 20550 + }, + { + "epoch": 0.020294286862700284, + "grad_norm": 3.0613067150115967, + "learning_rate": 9.959609632685874e-06, + "loss": 4.6449, + "step": 20600 + }, + { + "epoch": 0.020343544840522372, + "grad_norm": 2.8858015537261963, + "learning_rate": 9.959511114789114e-06, + "loss": 4.6368, + "step": 20650 + }, + { + "epoch": 0.02039280281834446, + "grad_norm": 2.850172281265259, + "learning_rate": 9.959412596892352e-06, + "loss": 4.5766, + "step": 20700 + }, + { + "epoch": 0.020442060796166547, + "grad_norm": 4.303860187530518, + "learning_rate": 9.959314078995592e-06, + "loss": 4.6691, + "step": 20750 + }, + { + "epoch": 0.020491318773988635, + "grad_norm": 3.1398849487304688, + "learning_rate": 9.95921556109883e-06, + "loss": 4.6266, + "step": 20800 + }, + { + "epoch": 0.020540576751810723, + "grad_norm": 3.1535465717315674, + "learning_rate": 9.95911704320207e-06, + "loss": 4.6393, + "step": 20850 + }, + { + "epoch": 0.02058983472963281, + "grad_norm": 2.909069299697876, + "learning_rate": 9.959018525305307e-06, + "loss": 4.6261, + "step": 20900 + }, + { + 
"epoch": 0.020639092707454898, + "grad_norm": 3.134897470474243, + "learning_rate": 9.958920007408547e-06, + "loss": 4.6733, + "step": 20950 + }, + { + "epoch": 0.02068835068527699, + "grad_norm": 3.3675618171691895, + "learning_rate": 9.958821489511785e-06, + "loss": 4.6452, + "step": 21000 + }, + { + "epoch": 0.020737608663099077, + "grad_norm": 3.3791799545288086, + "learning_rate": 9.958722971615024e-06, + "loss": 4.5812, + "step": 21050 + }, + { + "epoch": 0.020786866640921164, + "grad_norm": 3.0720391273498535, + "learning_rate": 9.958624453718262e-06, + "loss": 4.5883, + "step": 21100 + }, + { + "epoch": 0.020836124618743252, + "grad_norm": 3.6765329837799072, + "learning_rate": 9.958525935821502e-06, + "loss": 4.604, + "step": 21150 + }, + { + "epoch": 0.02088538259656534, + "grad_norm": 2.7886910438537598, + "learning_rate": 9.958427417924742e-06, + "loss": 4.6039, + "step": 21200 + }, + { + "epoch": 0.020934640574387427, + "grad_norm": 2.8640899658203125, + "learning_rate": 9.958328900027981e-06, + "loss": 4.6267, + "step": 21250 + }, + { + "epoch": 0.020983898552209515, + "grad_norm": 2.540313243865967, + "learning_rate": 9.958230382131219e-06, + "loss": 4.5353, + "step": 21300 + }, + { + "epoch": 0.021033156530031603, + "grad_norm": 2.679269790649414, + "learning_rate": 9.958131864234457e-06, + "loss": 4.6805, + "step": 21350 + }, + { + "epoch": 0.02108241450785369, + "grad_norm": 2.53497314453125, + "learning_rate": 9.958033346337697e-06, + "loss": 4.6009, + "step": 21400 + }, + { + "epoch": 0.02113167248567578, + "grad_norm": 2.8101539611816406, + "learning_rate": 9.957934828440935e-06, + "loss": 4.5344, + "step": 21450 + }, + { + "epoch": 0.02118093046349787, + "grad_norm": 2.8896231651306152, + "learning_rate": 9.957836310544174e-06, + "loss": 4.6157, + "step": 21500 + }, + { + "epoch": 0.021230188441319957, + "grad_norm": 2.7836410999298096, + "learning_rate": 9.957737792647412e-06, + "loss": 4.5424, + "step": 21550 + }, + { + "epoch": 
0.021279446419142044, + "grad_norm": 3.245649576187134, + "learning_rate": 9.957639274750652e-06, + "loss": 4.5937, + "step": 21600 + }, + { + "epoch": 0.021328704396964132, + "grad_norm": 3.2713537216186523, + "learning_rate": 9.957540756853891e-06, + "loss": 4.5765, + "step": 21650 + }, + { + "epoch": 0.02137796237478622, + "grad_norm": 2.6175053119659424, + "learning_rate": 9.95744223895713e-06, + "loss": 4.5474, + "step": 21700 + }, + { + "epoch": 0.021427220352608307, + "grad_norm": 2.647395372390747, + "learning_rate": 9.957343721060369e-06, + "loss": 4.6384, + "step": 21750 + }, + { + "epoch": 0.021476478330430395, + "grad_norm": 2.63905930519104, + "learning_rate": 9.957245203163609e-06, + "loss": 4.6055, + "step": 21800 + }, + { + "epoch": 0.021525736308252486, + "grad_norm": 3.3471617698669434, + "learning_rate": 9.957146685266846e-06, + "loss": 4.6197, + "step": 21850 + }, + { + "epoch": 0.021574994286074574, + "grad_norm": 2.6453027725219727, + "learning_rate": 9.957048167370084e-06, + "loss": 4.5625, + "step": 21900 + }, + { + "epoch": 0.02162425226389666, + "grad_norm": 2.775284767150879, + "learning_rate": 9.956949649473324e-06, + "loss": 4.5531, + "step": 21950 + }, + { + "epoch": 0.02167351024171875, + "grad_norm": 2.6043055057525635, + "learning_rate": 9.956851131576562e-06, + "loss": 4.6084, + "step": 22000 + }, + { + "epoch": 0.021722768219540837, + "grad_norm": 2.8373444080352783, + "learning_rate": 9.956752613679802e-06, + "loss": 4.617, + "step": 22050 + }, + { + "epoch": 0.021772026197362924, + "grad_norm": 2.9421136379241943, + "learning_rate": 9.956654095783041e-06, + "loss": 4.5574, + "step": 22100 + }, + { + "epoch": 0.021821284175185012, + "grad_norm": 2.6762993335723877, + "learning_rate": 9.95655557788628e-06, + "loss": 4.6044, + "step": 22150 + }, + { + "epoch": 0.0218705421530071, + "grad_norm": 2.8368875980377197, + "learning_rate": 9.956457059989519e-06, + "loss": 4.5705, + "step": 22200 + }, + { + "epoch": 0.021919800130829187, + 
"grad_norm": 2.7534544467926025, + "learning_rate": 9.956358542092758e-06, + "loss": 4.5357, + "step": 22250 + }, + { + "epoch": 0.02196905810865128, + "grad_norm": 2.546405792236328, + "learning_rate": 9.956260024195996e-06, + "loss": 4.5745, + "step": 22300 + }, + { + "epoch": 0.022018316086473366, + "grad_norm": 2.8086631298065186, + "learning_rate": 9.956161506299236e-06, + "loss": 4.5397, + "step": 22350 + }, + { + "epoch": 0.022067574064295454, + "grad_norm": 3.1287665367126465, + "learning_rate": 9.956062988402474e-06, + "loss": 4.6081, + "step": 22400 + }, + { + "epoch": 0.02211683204211754, + "grad_norm": 2.7218003273010254, + "learning_rate": 9.955964470505712e-06, + "loss": 4.6214, + "step": 22450 + }, + { + "epoch": 0.02216609001993963, + "grad_norm": 2.703472375869751, + "learning_rate": 9.955865952608951e-06, + "loss": 4.5923, + "step": 22500 + }, + { + "epoch": 0.022215347997761717, + "grad_norm": 2.7269394397735596, + "learning_rate": 9.95576743471219e-06, + "loss": 4.5588, + "step": 22550 + }, + { + "epoch": 0.022264605975583805, + "grad_norm": 2.916447877883911, + "learning_rate": 9.955668916815429e-06, + "loss": 4.5693, + "step": 22600 + }, + { + "epoch": 0.022313863953405892, + "grad_norm": 2.8139445781707764, + "learning_rate": 9.955570398918669e-06, + "loss": 4.5775, + "step": 22650 + }, + { + "epoch": 0.022363121931227983, + "grad_norm": 2.853302478790283, + "learning_rate": 9.955471881021908e-06, + "loss": 4.5705, + "step": 22700 + }, + { + "epoch": 0.02241237990905007, + "grad_norm": 2.9814963340759277, + "learning_rate": 9.955373363125146e-06, + "loss": 4.5372, + "step": 22750 + }, + { + "epoch": 0.02246163788687216, + "grad_norm": 3.0024280548095703, + "learning_rate": 9.955274845228386e-06, + "loss": 4.5645, + "step": 22800 + }, + { + "epoch": 0.022510895864694246, + "grad_norm": 3.3001363277435303, + "learning_rate": 9.955176327331624e-06, + "loss": 4.5601, + "step": 22850 + }, + { + "epoch": 0.022560153842516334, + "grad_norm": 
2.813891887664795, + "learning_rate": 9.955077809434863e-06, + "loss": 4.548, + "step": 22900 + }, + { + "epoch": 0.02260941182033842, + "grad_norm": 2.678762674331665, + "learning_rate": 9.954979291538101e-06, + "loss": 4.5673, + "step": 22950 + }, + { + "epoch": 0.02265866979816051, + "grad_norm": 2.853184938430786, + "learning_rate": 9.95488077364134e-06, + "loss": 4.5617, + "step": 23000 + }, + { + "epoch": 0.022707927775982597, + "grad_norm": 3.1897239685058594, + "learning_rate": 9.954782255744579e-06, + "loss": 4.5146, + "step": 23050 + }, + { + "epoch": 0.022757185753804685, + "grad_norm": 2.755086898803711, + "learning_rate": 9.954683737847819e-06, + "loss": 4.5292, + "step": 23100 + }, + { + "epoch": 0.022806443731626776, + "grad_norm": 2.6911087036132812, + "learning_rate": 9.954585219951058e-06, + "loss": 4.5316, + "step": 23150 + }, + { + "epoch": 0.022855701709448863, + "grad_norm": 3.1968307495117188, + "learning_rate": 9.954486702054296e-06, + "loss": 4.5349, + "step": 23200 + }, + { + "epoch": 0.02290495968727095, + "grad_norm": 3.1381263732910156, + "learning_rate": 9.954388184157536e-06, + "loss": 4.5069, + "step": 23250 + }, + { + "epoch": 0.02295421766509304, + "grad_norm": 2.8112380504608154, + "learning_rate": 9.954289666260774e-06, + "loss": 4.4765, + "step": 23300 + }, + { + "epoch": 0.023003475642915126, + "grad_norm": 2.9694907665252686, + "learning_rate": 9.954191148364013e-06, + "loss": 4.5732, + "step": 23350 + }, + { + "epoch": 0.023052733620737214, + "grad_norm": 2.9523050785064697, + "learning_rate": 9.954092630467251e-06, + "loss": 4.4865, + "step": 23400 + }, + { + "epoch": 0.0231019915985593, + "grad_norm": 3.0660581588745117, + "learning_rate": 9.95399411257049e-06, + "loss": 4.558, + "step": 23450 + }, + { + "epoch": 0.02315124957638139, + "grad_norm": 2.924361228942871, + "learning_rate": 9.953895594673729e-06, + "loss": 4.5436, + "step": 23500 + }, + { + "epoch": 0.02320050755420348, + "grad_norm": 2.704207181930542, + 
"learning_rate": 9.953797076776968e-06, + "loss": 4.5111, + "step": 23550 + }, + { + "epoch": 0.023249765532025568, + "grad_norm": 2.8359827995300293, + "learning_rate": 9.953698558880206e-06, + "loss": 4.5091, + "step": 23600 + }, + { + "epoch": 0.023299023509847656, + "grad_norm": 2.9842169284820557, + "learning_rate": 9.953600040983446e-06, + "loss": 4.5456, + "step": 23650 + }, + { + "epoch": 0.023348281487669743, + "grad_norm": 3.054476499557495, + "learning_rate": 9.953501523086686e-06, + "loss": 4.5192, + "step": 23700 + }, + { + "epoch": 0.02339753946549183, + "grad_norm": 3.146836280822754, + "learning_rate": 9.953403005189923e-06, + "loss": 4.538, + "step": 23750 + }, + { + "epoch": 0.02344679744331392, + "grad_norm": 3.0552377700805664, + "learning_rate": 9.953304487293163e-06, + "loss": 4.5192, + "step": 23800 + }, + { + "epoch": 0.023496055421136006, + "grad_norm": 2.7679977416992188, + "learning_rate": 9.953205969396401e-06, + "loss": 4.5015, + "step": 23850 + }, + { + "epoch": 0.023545313398958094, + "grad_norm": 2.6137256622314453, + "learning_rate": 9.95310745149964e-06, + "loss": 4.4609, + "step": 23900 + }, + { + "epoch": 0.02359457137678018, + "grad_norm": 3.070784091949463, + "learning_rate": 9.953008933602879e-06, + "loss": 4.5464, + "step": 23950 + }, + { + "epoch": 0.023643829354602273, + "grad_norm": 2.8737852573394775, + "learning_rate": 9.952910415706118e-06, + "loss": 4.5648, + "step": 24000 + }, + { + "epoch": 0.02369308733242436, + "grad_norm": 2.717620372772217, + "learning_rate": 9.952811897809356e-06, + "loss": 4.5248, + "step": 24050 + }, + { + "epoch": 0.023742345310246448, + "grad_norm": 2.733323097229004, + "learning_rate": 9.952713379912596e-06, + "loss": 4.5215, + "step": 24100 + }, + { + "epoch": 0.023791603288068536, + "grad_norm": 2.8634002208709717, + "learning_rate": 9.952614862015835e-06, + "loss": 4.501, + "step": 24150 + }, + { + "epoch": 0.023840861265890623, + "grad_norm": 3.025319814682007, + "learning_rate": 
9.952516344119073e-06, + "loss": 4.5525, + "step": 24200 + }, + { + "epoch": 0.02389011924371271, + "grad_norm": 2.849571466445923, + "learning_rate": 9.952417826222313e-06, + "loss": 4.5105, + "step": 24250 + }, + { + "epoch": 0.0239393772215348, + "grad_norm": 2.958639621734619, + "learning_rate": 9.952319308325551e-06, + "loss": 4.4538, + "step": 24300 + }, + { + "epoch": 0.023988635199356886, + "grad_norm": 3.1624443531036377, + "learning_rate": 9.95222079042879e-06, + "loss": 4.5027, + "step": 24350 + }, + { + "epoch": 0.024037893177178977, + "grad_norm": 2.622002124786377, + "learning_rate": 9.952122272532028e-06, + "loss": 4.4158, + "step": 24400 + }, + { + "epoch": 0.024087151155001065, + "grad_norm": 3.0104432106018066, + "learning_rate": 9.952023754635268e-06, + "loss": 4.4742, + "step": 24450 + }, + { + "epoch": 0.024136409132823153, + "grad_norm": 2.6320149898529053, + "learning_rate": 9.951925236738506e-06, + "loss": 4.5087, + "step": 24500 + }, + { + "epoch": 0.02418566711064524, + "grad_norm": 2.9587395191192627, + "learning_rate": 9.951826718841746e-06, + "loss": 4.4056, + "step": 24550 + }, + { + "epoch": 0.024234925088467328, + "grad_norm": 2.920125722885132, + "learning_rate": 9.951728200944985e-06, + "loss": 4.4541, + "step": 24600 + }, + { + "epoch": 0.024284183066289416, + "grad_norm": 3.174835205078125, + "learning_rate": 9.951629683048223e-06, + "loss": 4.5304, + "step": 24650 + }, + { + "epoch": 0.024333441044111503, + "grad_norm": 2.8356573581695557, + "learning_rate": 9.951531165151463e-06, + "loss": 4.4722, + "step": 24700 + }, + { + "epoch": 0.02438269902193359, + "grad_norm": 2.919393301010132, + "learning_rate": 9.9514326472547e-06, + "loss": 4.505, + "step": 24750 + }, + { + "epoch": 0.02443195699975568, + "grad_norm": 2.974968194961548, + "learning_rate": 9.95133412935794e-06, + "loss": 4.4704, + "step": 24800 + }, + { + "epoch": 0.02448121497757777, + "grad_norm": 3.164583683013916, + "learning_rate": 9.951235611461178e-06, + 
"loss": 4.461, + "step": 24850 + }, + { + "epoch": 0.024530472955399858, + "grad_norm": 2.5640127658843994, + "learning_rate": 9.951137093564418e-06, + "loss": 4.4747, + "step": 24900 + }, + { + "epoch": 0.024579730933221945, + "grad_norm": 2.9221560955047607, + "learning_rate": 9.951038575667656e-06, + "loss": 4.4707, + "step": 24950 + }, + { + "epoch": 0.024628988911044033, + "grad_norm": 2.738352060317993, + "learning_rate": 9.950940057770896e-06, + "loss": 4.4911, + "step": 25000 + }, + { + "epoch": 0.02467824688886612, + "grad_norm": 2.7208471298217773, + "learning_rate": 9.950841539874133e-06, + "loss": 4.4562, + "step": 25050 + }, + { + "epoch": 0.024727504866688208, + "grad_norm": 3.376863718032837, + "learning_rate": 9.950743021977373e-06, + "loss": 4.4455, + "step": 25100 + }, + { + "epoch": 0.024776762844510296, + "grad_norm": 2.95296049118042, + "learning_rate": 9.950644504080613e-06, + "loss": 4.4928, + "step": 25150 + }, + { + "epoch": 0.024826020822332383, + "grad_norm": 3.044597625732422, + "learning_rate": 9.95054598618385e-06, + "loss": 4.418, + "step": 25200 + }, + { + "epoch": 0.024875278800154475, + "grad_norm": 3.21134614944458, + "learning_rate": 9.95044746828709e-06, + "loss": 4.5426, + "step": 25250 + }, + { + "epoch": 0.024924536777976562, + "grad_norm": 2.8330888748168945, + "learning_rate": 9.950348950390328e-06, + "loss": 4.4309, + "step": 25300 + }, + { + "epoch": 0.02497379475579865, + "grad_norm": 2.7223148345947266, + "learning_rate": 9.950250432493568e-06, + "loss": 4.5098, + "step": 25350 + }, + { + "epoch": 0.025023052733620738, + "grad_norm": 2.5765719413757324, + "learning_rate": 9.950151914596806e-06, + "loss": 4.4615, + "step": 25400 + }, + { + "epoch": 0.025072310711442825, + "grad_norm": 2.6913654804229736, + "learning_rate": 9.950053396700045e-06, + "loss": 4.4401, + "step": 25450 + }, + { + "epoch": 0.025121568689264913, + "grad_norm": 2.719801664352417, + "learning_rate": 9.949954878803283e-06, + "loss": 4.481, + "step": 
25500 + }, + { + "epoch": 0.025170826667087, + "grad_norm": 2.844313383102417, + "learning_rate": 9.949856360906523e-06, + "loss": 4.4655, + "step": 25550 + }, + { + "epoch": 0.025220084644909088, + "grad_norm": 2.9843645095825195, + "learning_rate": 9.949757843009763e-06, + "loss": 4.462, + "step": 25600 + }, + { + "epoch": 0.025269342622731176, + "grad_norm": 3.145681142807007, + "learning_rate": 9.949659325113002e-06, + "loss": 4.4722, + "step": 25650 + }, + { + "epoch": 0.025318600600553267, + "grad_norm": 2.826963424682617, + "learning_rate": 9.94956080721624e-06, + "loss": 4.3509, + "step": 25700 + }, + { + "epoch": 0.025367858578375355, + "grad_norm": 2.685471773147583, + "learning_rate": 9.949462289319478e-06, + "loss": 4.4905, + "step": 25750 + }, + { + "epoch": 0.025417116556197442, + "grad_norm": 2.5837581157684326, + "learning_rate": 9.949363771422718e-06, + "loss": 4.4478, + "step": 25800 + }, + { + "epoch": 0.02546637453401953, + "grad_norm": 2.8016183376312256, + "learning_rate": 9.949265253525956e-06, + "loss": 4.4359, + "step": 25850 + }, + { + "epoch": 0.025515632511841618, + "grad_norm": 3.0903573036193848, + "learning_rate": 9.949166735629195e-06, + "loss": 4.4128, + "step": 25900 + }, + { + "epoch": 0.025564890489663705, + "grad_norm": 2.566577434539795, + "learning_rate": 9.949068217732433e-06, + "loss": 4.4265, + "step": 25950 + }, + { + "epoch": 0.025614148467485793, + "grad_norm": 2.682568073272705, + "learning_rate": 9.948969699835673e-06, + "loss": 4.5022, + "step": 26000 + }, + { + "epoch": 0.02566340644530788, + "grad_norm": 2.708730459213257, + "learning_rate": 9.948871181938912e-06, + "loss": 4.4241, + "step": 26050 + }, + { + "epoch": 0.02571266442312997, + "grad_norm": 2.6141726970672607, + "learning_rate": 9.94877266404215e-06, + "loss": 4.4263, + "step": 26100 + }, + { + "epoch": 0.02576192240095206, + "grad_norm": 2.7675726413726807, + "learning_rate": 9.94867414614539e-06, + "loss": 4.3811, + "step": 26150 + }, + { + "epoch": 
0.025811180378774147, + "grad_norm": 2.8282270431518555, + "learning_rate": 9.94857562824863e-06, + "loss": 4.4519, + "step": 26200 + }, + { + "epoch": 0.025860438356596235, + "grad_norm": 2.713754892349243, + "learning_rate": 9.948477110351868e-06, + "loss": 4.3615, + "step": 26250 + }, + { + "epoch": 0.025909696334418322, + "grad_norm": 2.759967088699341, + "learning_rate": 9.948378592455105e-06, + "loss": 4.3905, + "step": 26300 + }, + { + "epoch": 0.02595895431224041, + "grad_norm": 3.0386972427368164, + "learning_rate": 9.948280074558345e-06, + "loss": 4.4516, + "step": 26350 + }, + { + "epoch": 0.026008212290062498, + "grad_norm": 2.6274845600128174, + "learning_rate": 9.948181556661583e-06, + "loss": 4.3733, + "step": 26400 + }, + { + "epoch": 0.026057470267884585, + "grad_norm": 2.746767044067383, + "learning_rate": 9.948083038764823e-06, + "loss": 4.3587, + "step": 26450 + }, + { + "epoch": 0.026106728245706673, + "grad_norm": 2.960312843322754, + "learning_rate": 9.94798452086806e-06, + "loss": 4.4111, + "step": 26500 + }, + { + "epoch": 0.026155986223528764, + "grad_norm": 2.8746421337127686, + "learning_rate": 9.9478860029713e-06, + "loss": 4.3949, + "step": 26550 + }, + { + "epoch": 0.02620524420135085, + "grad_norm": 2.835115432739258, + "learning_rate": 9.94778748507454e-06, + "loss": 4.4458, + "step": 26600 + }, + { + "epoch": 0.02625450217917294, + "grad_norm": 2.494474411010742, + "learning_rate": 9.94768896717778e-06, + "loss": 4.3916, + "step": 26650 + }, + { + "epoch": 0.026303760156995027, + "grad_norm": 2.898473024368286, + "learning_rate": 9.947590449281017e-06, + "loss": 4.4364, + "step": 26700 + }, + { + "epoch": 0.026353018134817115, + "grad_norm": 2.974879503250122, + "learning_rate": 9.947491931384257e-06, + "loss": 4.4252, + "step": 26750 + }, + { + "epoch": 0.026402276112639202, + "grad_norm": 3.0099704265594482, + "learning_rate": 9.947393413487495e-06, + "loss": 4.433, + "step": 26800 + }, + { + "epoch": 0.02645153409046129, + 
"grad_norm": 2.701627254486084, + "learning_rate": 9.947294895590733e-06, + "loss": 4.427, + "step": 26850 + }, + { + "epoch": 0.026500792068283378, + "grad_norm": 2.887014865875244, + "learning_rate": 9.947196377693973e-06, + "loss": 4.4071, + "step": 26900 + }, + { + "epoch": 0.02655005004610547, + "grad_norm": 2.961686134338379, + "learning_rate": 9.94709785979721e-06, + "loss": 4.3608, + "step": 26950 + }, + { + "epoch": 0.026599308023927556, + "grad_norm": 2.853254556655884, + "learning_rate": 9.94699934190045e-06, + "loss": 4.4153, + "step": 27000 + }, + { + "epoch": 0.026648566001749644, + "grad_norm": 2.50703763961792, + "learning_rate": 9.94690082400369e-06, + "loss": 4.3889, + "step": 27050 + }, + { + "epoch": 0.026697823979571732, + "grad_norm": 2.8789684772491455, + "learning_rate": 9.94680230610693e-06, + "loss": 4.3725, + "step": 27100 + }, + { + "epoch": 0.02674708195739382, + "grad_norm": 2.7770514488220215, + "learning_rate": 9.946703788210167e-06, + "loss": 4.3919, + "step": 27150 + }, + { + "epoch": 0.026796339935215907, + "grad_norm": 3.4222967624664307, + "learning_rate": 9.946605270313407e-06, + "loss": 4.3957, + "step": 27200 + }, + { + "epoch": 0.026845597913037995, + "grad_norm": 2.6575770378112793, + "learning_rate": 9.946506752416645e-06, + "loss": 4.3981, + "step": 27250 + }, + { + "epoch": 0.026894855890860082, + "grad_norm": 2.8452625274658203, + "learning_rate": 9.946408234519884e-06, + "loss": 4.3767, + "step": 27300 + }, + { + "epoch": 0.02694411386868217, + "grad_norm": 3.266728639602661, + "learning_rate": 9.946309716623122e-06, + "loss": 4.436, + "step": 27350 + }, + { + "epoch": 0.02699337184650426, + "grad_norm": 2.686967134475708, + "learning_rate": 9.94621119872636e-06, + "loss": 4.3606, + "step": 27400 + }, + { + "epoch": 0.02704262982432635, + "grad_norm": 3.0076820850372314, + "learning_rate": 9.9461126808296e-06, + "loss": 4.4213, + "step": 27450 + }, + { + "epoch": 0.027091887802148436, + "grad_norm": 3.0195279121398926, 
+ "learning_rate": 9.94601416293284e-06, + "loss": 4.374, + "step": 27500 + }, + { + "epoch": 0.027141145779970524, + "grad_norm": 3.0727953910827637, + "learning_rate": 9.945915645036077e-06, + "loss": 4.325, + "step": 27550 + }, + { + "epoch": 0.027190403757792612, + "grad_norm": 2.797142505645752, + "learning_rate": 9.945817127139317e-06, + "loss": 4.3611, + "step": 27600 + }, + { + "epoch": 0.0272396617356147, + "grad_norm": 2.734865665435791, + "learning_rate": 9.945718609242557e-06, + "loss": 4.3325, + "step": 27650 + }, + { + "epoch": 0.027288919713436787, + "grad_norm": 2.6199657917022705, + "learning_rate": 9.945620091345795e-06, + "loss": 4.4078, + "step": 27700 + }, + { + "epoch": 0.027338177691258875, + "grad_norm": 2.6894443035125732, + "learning_rate": 9.945521573449034e-06, + "loss": 4.3877, + "step": 27750 + }, + { + "epoch": 0.027387435669080966, + "grad_norm": 2.8690712451934814, + "learning_rate": 9.945423055552272e-06, + "loss": 4.3637, + "step": 27800 + }, + { + "epoch": 0.027436693646903054, + "grad_norm": 2.703543186187744, + "learning_rate": 9.945324537655512e-06, + "loss": 4.3674, + "step": 27850 + }, + { + "epoch": 0.02748595162472514, + "grad_norm": 3.5251989364624023, + "learning_rate": 9.94522601975875e-06, + "loss": 4.392, + "step": 27900 + }, + { + "epoch": 0.02753520960254723, + "grad_norm": 2.43642520904541, + "learning_rate": 9.94512750186199e-06, + "loss": 4.3497, + "step": 27950 + }, + { + "epoch": 0.027584467580369317, + "grad_norm": 3.0210397243499756, + "learning_rate": 9.945028983965227e-06, + "loss": 4.325, + "step": 28000 + }, + { + "epoch": 0.027633725558191404, + "grad_norm": 2.75996470451355, + "learning_rate": 9.944930466068467e-06, + "loss": 4.3611, + "step": 28050 + }, + { + "epoch": 0.027682983536013492, + "grad_norm": 2.9363393783569336, + "learning_rate": 9.944831948171707e-06, + "loss": 4.3001, + "step": 28100 + }, + { + "epoch": 0.02773224151383558, + "grad_norm": 3.142714738845825, + "learning_rate": 
9.944733430274945e-06, + "loss": 4.3954, + "step": 28150 + }, + { + "epoch": 0.02778149949165767, + "grad_norm": 2.8299307823181152, + "learning_rate": 9.944634912378184e-06, + "loss": 4.2651, + "step": 28200 + }, + { + "epoch": 0.027830757469479758, + "grad_norm": 2.5900282859802246, + "learning_rate": 9.944536394481422e-06, + "loss": 4.3741, + "step": 28250 + }, + { + "epoch": 0.027880015447301846, + "grad_norm": 3.699406623840332, + "learning_rate": 9.944437876584662e-06, + "loss": 4.3389, + "step": 28300 + }, + { + "epoch": 0.027929273425123934, + "grad_norm": 3.636477470397949, + "learning_rate": 9.9443393586879e-06, + "loss": 4.3641, + "step": 28350 + }, + { + "epoch": 0.02797853140294602, + "grad_norm": 4.3456130027771, + "learning_rate": 9.944240840791138e-06, + "loss": 4.4359, + "step": 28400 + }, + { + "epoch": 0.02802778938076811, + "grad_norm": 2.626979351043701, + "learning_rate": 9.944142322894377e-06, + "loss": 4.2957, + "step": 28450 + }, + { + "epoch": 0.028077047358590197, + "grad_norm": 2.863654375076294, + "learning_rate": 9.944043804997617e-06, + "loss": 4.3753, + "step": 28500 + }, + { + "epoch": 0.028126305336412284, + "grad_norm": 3.0978944301605225, + "learning_rate": 9.943945287100856e-06, + "loss": 4.3368, + "step": 28550 + }, + { + "epoch": 0.028175563314234372, + "grad_norm": 2.9103078842163086, + "learning_rate": 9.943846769204094e-06, + "loss": 4.3345, + "step": 28600 + }, + { + "epoch": 0.028224821292056463, + "grad_norm": 2.957070827484131, + "learning_rate": 9.943748251307334e-06, + "loss": 4.2989, + "step": 28650 + }, + { + "epoch": 0.02827407926987855, + "grad_norm": 2.9758403301239014, + "learning_rate": 9.943649733410572e-06, + "loss": 4.3536, + "step": 28700 + }, + { + "epoch": 0.02832333724770064, + "grad_norm": 2.878831624984741, + "learning_rate": 9.943551215513812e-06, + "loss": 4.3593, + "step": 28750 + }, + { + "epoch": 0.028372595225522726, + "grad_norm": 2.9945123195648193, + "learning_rate": 9.94345269761705e-06, + 
"loss": 4.3802, + "step": 28800 + }, + { + "epoch": 0.028421853203344814, + "grad_norm": 2.517561197280884, + "learning_rate": 9.943354179720289e-06, + "loss": 4.292, + "step": 28850 + }, + { + "epoch": 0.0284711111811669, + "grad_norm": 3.021172285079956, + "learning_rate": 9.943255661823527e-06, + "loss": 4.3913, + "step": 28900 + }, + { + "epoch": 0.02852036915898899, + "grad_norm": 2.8447377681732178, + "learning_rate": 9.943157143926767e-06, + "loss": 4.3048, + "step": 28950 + }, + { + "epoch": 0.028569627136811077, + "grad_norm": 2.4886667728424072, + "learning_rate": 9.943058626030005e-06, + "loss": 4.3455, + "step": 29000 + }, + { + "epoch": 0.028618885114633168, + "grad_norm": 2.483572483062744, + "learning_rate": 9.942960108133244e-06, + "loss": 4.2838, + "step": 29050 + }, + { + "epoch": 0.028668143092455255, + "grad_norm": 2.782280206680298, + "learning_rate": 9.942861590236484e-06, + "loss": 4.3446, + "step": 29100 + }, + { + "epoch": 0.028717401070277343, + "grad_norm": 2.9190547466278076, + "learning_rate": 9.942763072339722e-06, + "loss": 4.3156, + "step": 29150 + }, + { + "epoch": 0.02876665904809943, + "grad_norm": 3.011664390563965, + "learning_rate": 9.942664554442961e-06, + "loss": 4.315, + "step": 29200 + }, + { + "epoch": 0.02881591702592152, + "grad_norm": 3.0116662979125977, + "learning_rate": 9.9425660365462e-06, + "loss": 4.3606, + "step": 29250 + }, + { + "epoch": 0.028865175003743606, + "grad_norm": 2.777803659439087, + "learning_rate": 9.942467518649439e-06, + "loss": 4.3271, + "step": 29300 + }, + { + "epoch": 0.028914432981565694, + "grad_norm": 2.8867990970611572, + "learning_rate": 9.942369000752677e-06, + "loss": 4.305, + "step": 29350 + }, + { + "epoch": 0.02896369095938778, + "grad_norm": 2.7769391536712646, + "learning_rate": 9.942270482855917e-06, + "loss": 4.3109, + "step": 29400 + }, + { + "epoch": 0.02901294893720987, + "grad_norm": 2.857863187789917, + "learning_rate": 9.942171964959155e-06, + "loss": 4.3277, + "step": 
29450 + }, + { + "epoch": 0.02906220691503196, + "grad_norm": 2.756167411804199, + "learning_rate": 9.942073447062394e-06, + "loss": 4.3733, + "step": 29500 + }, + { + "epoch": 0.029111464892854048, + "grad_norm": 3.189030647277832, + "learning_rate": 9.941974929165634e-06, + "loss": 4.3219, + "step": 29550 + }, + { + "epoch": 0.029160722870676135, + "grad_norm": 2.9630534648895264, + "learning_rate": 9.941876411268872e-06, + "loss": 4.267, + "step": 29600 + }, + { + "epoch": 0.029209980848498223, + "grad_norm": 2.941082239151001, + "learning_rate": 9.941777893372111e-06, + "loss": 4.2504, + "step": 29650 + }, + { + "epoch": 0.02925923882632031, + "grad_norm": 2.5006723403930664, + "learning_rate": 9.94167937547535e-06, + "loss": 4.3189, + "step": 29700 + }, + { + "epoch": 0.0293084968041424, + "grad_norm": 3.473111152648926, + "learning_rate": 9.941580857578589e-06, + "loss": 4.29, + "step": 29750 + }, + { + "epoch": 0.029357754781964486, + "grad_norm": 3.0181171894073486, + "learning_rate": 9.941482339681827e-06, + "loss": 4.3411, + "step": 29800 + }, + { + "epoch": 0.029407012759786574, + "grad_norm": 2.606924057006836, + "learning_rate": 9.941383821785066e-06, + "loss": 4.3287, + "step": 29850 + }, + { + "epoch": 0.029456270737608665, + "grad_norm": 2.7728309631347656, + "learning_rate": 9.941285303888304e-06, + "loss": 4.2773, + "step": 29900 + }, + { + "epoch": 0.029505528715430752, + "grad_norm": 3.063476085662842, + "learning_rate": 9.941186785991544e-06, + "loss": 4.3055, + "step": 29950 + }, + { + "epoch": 0.02955478669325284, + "grad_norm": 2.701085090637207, + "learning_rate": 9.941088268094784e-06, + "loss": 4.3183, + "step": 30000 + }, + { + "epoch": 0.029604044671074928, + "grad_norm": 2.775357723236084, + "learning_rate": 9.940989750198022e-06, + "loss": 4.3379, + "step": 30050 + }, + { + "epoch": 0.029653302648897015, + "grad_norm": 2.993272304534912, + "learning_rate": 9.940891232301261e-06, + "loss": 4.2457, + "step": 30100 + }, + { + "epoch": 
0.029702560626719103, + "grad_norm": 3.1036784648895264, + "learning_rate": 9.940792714404499e-06, + "loss": 4.3579, + "step": 30150 + }, + { + "epoch": 0.02975181860454119, + "grad_norm": 3.0962276458740234, + "learning_rate": 9.940694196507739e-06, + "loss": 4.3211, + "step": 30200 + }, + { + "epoch": 0.02980107658236328, + "grad_norm": 3.196259021759033, + "learning_rate": 9.940595678610977e-06, + "loss": 4.2959, + "step": 30250 + }, + { + "epoch": 0.029850334560185366, + "grad_norm": 2.890075445175171, + "learning_rate": 9.940497160714216e-06, + "loss": 4.2774, + "step": 30300 + }, + { + "epoch": 0.029899592538007457, + "grad_norm": 2.652261972427368, + "learning_rate": 9.940398642817454e-06, + "loss": 4.2812, + "step": 30350 + }, + { + "epoch": 0.029948850515829545, + "grad_norm": 2.8092494010925293, + "learning_rate": 9.940300124920694e-06, + "loss": 4.3164, + "step": 30400 + }, + { + "epoch": 0.029998108493651632, + "grad_norm": 2.6249241828918457, + "learning_rate": 9.940201607023933e-06, + "loss": 4.324, + "step": 30450 + }, + { + "epoch": 0.03004736647147372, + "grad_norm": 2.8325743675231934, + "learning_rate": 9.940103089127171e-06, + "loss": 4.2736, + "step": 30500 + }, + { + "epoch": 0.030096624449295808, + "grad_norm": 2.9740049839019775, + "learning_rate": 9.940004571230411e-06, + "loss": 4.2682, + "step": 30550 + }, + { + "epoch": 0.030145882427117895, + "grad_norm": 2.6152381896972656, + "learning_rate": 9.939906053333649e-06, + "loss": 4.3285, + "step": 30600 + }, + { + "epoch": 0.030195140404939983, + "grad_norm": 3.0052435398101807, + "learning_rate": 9.939807535436889e-06, + "loss": 4.163, + "step": 30650 + }, + { + "epoch": 0.03024439838276207, + "grad_norm": 2.832188606262207, + "learning_rate": 9.939709017540127e-06, + "loss": 4.2838, + "step": 30700 + }, + { + "epoch": 0.030293656360584162, + "grad_norm": 3.2174715995788574, + "learning_rate": 9.939610499643366e-06, + "loss": 4.2741, + "step": 30750 + }, + { + "epoch": 0.03034291433840625, 
+ "grad_norm": 3.045895576477051, + "learning_rate": 9.939511981746604e-06, + "loss": 4.3186, + "step": 30800 + }, + { + "epoch": 0.030392172316228337, + "grad_norm": 2.9008102416992188, + "learning_rate": 9.939413463849844e-06, + "loss": 4.2653, + "step": 30850 + }, + { + "epoch": 0.030441430294050425, + "grad_norm": 3.228961706161499, + "learning_rate": 9.939314945953082e-06, + "loss": 4.263, + "step": 30900 + }, + { + "epoch": 0.030490688271872513, + "grad_norm": 2.837714195251465, + "learning_rate": 9.939216428056321e-06, + "loss": 4.2651, + "step": 30950 + }, + { + "epoch": 0.0305399462496946, + "grad_norm": 2.987471580505371, + "learning_rate": 9.939117910159561e-06, + "loss": 4.2813, + "step": 31000 + }, + { + "epoch": 0.030589204227516688, + "grad_norm": 2.773286819458008, + "learning_rate": 9.9390193922628e-06, + "loss": 4.2788, + "step": 31050 + }, + { + "epoch": 0.030638462205338775, + "grad_norm": 5.4688825607299805, + "learning_rate": 9.938920874366038e-06, + "loss": 4.2912, + "step": 31100 + }, + { + "epoch": 0.030687720183160863, + "grad_norm": 2.573085308074951, + "learning_rate": 9.938822356469276e-06, + "loss": 4.2504, + "step": 31150 + }, + { + "epoch": 0.030736978160982954, + "grad_norm": 3.1248066425323486, + "learning_rate": 9.938723838572516e-06, + "loss": 4.279, + "step": 31200 + }, + { + "epoch": 0.030786236138805042, + "grad_norm": 2.7262656688690186, + "learning_rate": 9.938625320675754e-06, + "loss": 4.2661, + "step": 31250 + }, + { + "epoch": 0.03083549411662713, + "grad_norm": 2.6642582416534424, + "learning_rate": 9.938526802778994e-06, + "loss": 4.2884, + "step": 31300 + }, + { + "epoch": 0.030884752094449217, + "grad_norm": 2.5909652709960938, + "learning_rate": 9.938428284882232e-06, + "loss": 4.2218, + "step": 31350 + }, + { + "epoch": 0.030934010072271305, + "grad_norm": 3.19968843460083, + "learning_rate": 9.938329766985471e-06, + "loss": 4.2665, + "step": 31400 + }, + { + "epoch": 0.030983268050093393, + "grad_norm": 
2.6201486587524414, + "learning_rate": 9.93823124908871e-06, + "loss": 4.2467, + "step": 31450 + }, + { + "epoch": 0.03103252602791548, + "grad_norm": 2.6821377277374268, + "learning_rate": 9.93813273119195e-06, + "loss": 4.2395, + "step": 31500 + }, + { + "epoch": 0.031081784005737568, + "grad_norm": 2.692659616470337, + "learning_rate": 9.938034213295188e-06, + "loss": 4.27, + "step": 31550 + }, + { + "epoch": 0.03113104198355966, + "grad_norm": 2.770172357559204, + "learning_rate": 9.937935695398428e-06, + "loss": 4.2525, + "step": 31600 + }, + { + "epoch": 0.031180299961381747, + "grad_norm": 2.8454737663269043, + "learning_rate": 9.937837177501666e-06, + "loss": 4.3315, + "step": 31650 + }, + { + "epoch": 0.031229557939203834, + "grad_norm": 2.8132808208465576, + "learning_rate": 9.937738659604904e-06, + "loss": 4.2667, + "step": 31700 + }, + { + "epoch": 0.03127881591702592, + "grad_norm": 3.2848119735717773, + "learning_rate": 9.937640141708143e-06, + "loss": 4.2363, + "step": 31750 + }, + { + "epoch": 0.031328073894848006, + "grad_norm": 2.7837979793548584, + "learning_rate": 9.937541623811381e-06, + "loss": 4.2621, + "step": 31800 + }, + { + "epoch": 0.0313773318726701, + "grad_norm": 3.278162956237793, + "learning_rate": 9.937443105914621e-06, + "loss": 4.2258, + "step": 31850 + }, + { + "epoch": 0.03142658985049219, + "grad_norm": 3.696763038635254, + "learning_rate": 9.93734458801786e-06, + "loss": 4.2227, + "step": 31900 + }, + { + "epoch": 0.03147584782831427, + "grad_norm": 3.0131752490997314, + "learning_rate": 9.937246070121099e-06, + "loss": 4.1841, + "step": 31950 + }, + { + "epoch": 0.031525105806136364, + "grad_norm": 2.6339046955108643, + "learning_rate": 9.937147552224338e-06, + "loss": 4.2495, + "step": 32000 + }, + { + "epoch": 0.03157436378395845, + "grad_norm": 2.831568956375122, + "learning_rate": 9.937049034327578e-06, + "loss": 4.2901, + "step": 32050 + }, + { + "epoch": 0.03162362176178054, + "grad_norm": 3.0567262172698975, + 
"learning_rate": 9.936950516430816e-06, + "loss": 4.2202, + "step": 32100 + }, + { + "epoch": 0.03167287973960262, + "grad_norm": 2.884934663772583, + "learning_rate": 9.936851998534055e-06, + "loss": 4.2218, + "step": 32150 + }, + { + "epoch": 0.031722137717424714, + "grad_norm": 2.675477981567383, + "learning_rate": 9.936753480637293e-06, + "loss": 4.2183, + "step": 32200 + }, + { + "epoch": 0.031771395695246805, + "grad_norm": 2.8141536712646484, + "learning_rate": 9.936654962740531e-06, + "loss": 4.2555, + "step": 32250 + }, + { + "epoch": 0.03182065367306889, + "grad_norm": 2.88698148727417, + "learning_rate": 9.936556444843771e-06, + "loss": 4.2358, + "step": 32300 + }, + { + "epoch": 0.03186991165089098, + "grad_norm": 2.5037083625793457, + "learning_rate": 9.936457926947009e-06, + "loss": 4.266, + "step": 32350 + }, + { + "epoch": 0.031919169628713065, + "grad_norm": 2.6572444438934326, + "learning_rate": 9.936359409050248e-06, + "loss": 4.2571, + "step": 32400 + }, + { + "epoch": 0.031968427606535156, + "grad_norm": 2.7781388759613037, + "learning_rate": 9.936260891153488e-06, + "loss": 4.2409, + "step": 32450 + }, + { + "epoch": 0.03201768558435724, + "grad_norm": 3.4566988945007324, + "learning_rate": 9.936162373256728e-06, + "loss": 4.2466, + "step": 32500 + }, + { + "epoch": 0.03206694356217933, + "grad_norm": 3.0501184463500977, + "learning_rate": 9.936063855359966e-06, + "loss": 4.2242, + "step": 32550 + }, + { + "epoch": 0.032116201540001416, + "grad_norm": 2.704448699951172, + "learning_rate": 9.935965337463205e-06, + "loss": 4.263, + "step": 32600 + }, + { + "epoch": 0.03216545951782351, + "grad_norm": 2.9793906211853027, + "learning_rate": 9.935866819566443e-06, + "loss": 4.2366, + "step": 32650 + }, + { + "epoch": 0.0322147174956456, + "grad_norm": 2.798672676086426, + "learning_rate": 9.935768301669683e-06, + "loss": 4.1261, + "step": 32700 + }, + { + "epoch": 0.03226397547346768, + "grad_norm": 2.760988235473633, + "learning_rate": 
9.93566978377292e-06, + "loss": 4.2116, + "step": 32750 + }, + { + "epoch": 0.03231323345128977, + "grad_norm": 2.995474338531494, + "learning_rate": 9.935571265876159e-06, + "loss": 4.2328, + "step": 32800 + }, + { + "epoch": 0.03236249142911186, + "grad_norm": 2.649573802947998, + "learning_rate": 9.935472747979398e-06, + "loss": 4.2374, + "step": 32850 + }, + { + "epoch": 0.03241174940693395, + "grad_norm": 3.1019508838653564, + "learning_rate": 9.935374230082638e-06, + "loss": 4.1998, + "step": 32900 + }, + { + "epoch": 0.03246100738475603, + "grad_norm": 2.9016926288604736, + "learning_rate": 9.935275712185878e-06, + "loss": 4.2147, + "step": 32950 + }, + { + "epoch": 0.032510265362578124, + "grad_norm": 2.883260488510132, + "learning_rate": 9.935177194289115e-06, + "loss": 4.2134, + "step": 33000 + }, + { + "epoch": 0.03255952334040021, + "grad_norm": 3.6488003730773926, + "learning_rate": 9.935078676392355e-06, + "loss": 4.1728, + "step": 33050 + }, + { + "epoch": 0.0326087813182223, + "grad_norm": 2.7432384490966797, + "learning_rate": 9.934980158495593e-06, + "loss": 4.2468, + "step": 33100 + }, + { + "epoch": 0.03265803929604439, + "grad_norm": 2.9957826137542725, + "learning_rate": 9.934881640598833e-06, + "loss": 4.2498, + "step": 33150 + }, + { + "epoch": 0.032707297273866474, + "grad_norm": 2.6980412006378174, + "learning_rate": 9.93478312270207e-06, + "loss": 4.1658, + "step": 33200 + }, + { + "epoch": 0.032756555251688566, + "grad_norm": 2.9208524227142334, + "learning_rate": 9.93468460480531e-06, + "loss": 4.204, + "step": 33250 + }, + { + "epoch": 0.03280581322951065, + "grad_norm": 2.606506586074829, + "learning_rate": 9.934586086908548e-06, + "loss": 4.169, + "step": 33300 + }, + { + "epoch": 0.03285507120733274, + "grad_norm": 2.5529887676239014, + "learning_rate": 9.934487569011788e-06, + "loss": 4.2361, + "step": 33350 + }, + { + "epoch": 0.032904329185154825, + "grad_norm": 3.1340713500976562, + "learning_rate": 9.934389051115026e-06, + 
"loss": 4.1706, + "step": 33400 + }, + { + "epoch": 0.032953587162976916, + "grad_norm": 2.99807071685791, + "learning_rate": 9.934290533218265e-06, + "loss": 4.2163, + "step": 33450 + }, + { + "epoch": 0.03300284514079901, + "grad_norm": 3.128819465637207, + "learning_rate": 9.934192015321505e-06, + "loss": 4.1475, + "step": 33500 + }, + { + "epoch": 0.03305210311862109, + "grad_norm": 2.877211570739746, + "learning_rate": 9.934093497424743e-06, + "loss": 4.2145, + "step": 33550 + }, + { + "epoch": 0.03310136109644318, + "grad_norm": 2.9433438777923584, + "learning_rate": 9.933994979527983e-06, + "loss": 4.1927, + "step": 33600 + }, + { + "epoch": 0.03315061907426527, + "grad_norm": 3.1519973278045654, + "learning_rate": 9.93389646163122e-06, + "loss": 4.2248, + "step": 33650 + }, + { + "epoch": 0.03319987705208736, + "grad_norm": 3.2136318683624268, + "learning_rate": 9.93379794373446e-06, + "loss": 4.1946, + "step": 33700 + }, + { + "epoch": 0.03324913502990944, + "grad_norm": 2.62773060798645, + "learning_rate": 9.933699425837698e-06, + "loss": 4.212, + "step": 33750 + }, + { + "epoch": 0.03329839300773153, + "grad_norm": 2.5957818031311035, + "learning_rate": 9.933600907940938e-06, + "loss": 4.2211, + "step": 33800 + }, + { + "epoch": 0.03334765098555362, + "grad_norm": 2.8922276496887207, + "learning_rate": 9.933502390044176e-06, + "loss": 4.1547, + "step": 33850 + }, + { + "epoch": 0.03339690896337571, + "grad_norm": 2.91373348236084, + "learning_rate": 9.933403872147415e-06, + "loss": 4.128, + "step": 33900 + }, + { + "epoch": 0.0334461669411978, + "grad_norm": 3.1384732723236084, + "learning_rate": 9.933305354250655e-06, + "loss": 4.1551, + "step": 33950 + }, + { + "epoch": 0.033495424919019884, + "grad_norm": 3.0681543350219727, + "learning_rate": 9.933206836353893e-06, + "loss": 4.1817, + "step": 34000 + }, + { + "epoch": 0.033544682896841975, + "grad_norm": 2.733565092086792, + "learning_rate": 9.933108318457132e-06, + "loss": 4.188, + "step": 34050 + 
}, + { + "epoch": 0.03359394087466406, + "grad_norm": 2.9783501625061035, + "learning_rate": 9.93300980056037e-06, + "loss": 4.1784, + "step": 34100 + }, + { + "epoch": 0.03364319885248615, + "grad_norm": 3.0292632579803467, + "learning_rate": 9.93291128266361e-06, + "loss": 4.2044, + "step": 34150 + }, + { + "epoch": 0.033692456830308234, + "grad_norm": 2.689316749572754, + "learning_rate": 9.932812764766848e-06, + "loss": 4.1782, + "step": 34200 + }, + { + "epoch": 0.033741714808130326, + "grad_norm": 2.7311851978302, + "learning_rate": 9.932714246870087e-06, + "loss": 4.212, + "step": 34250 + }, + { + "epoch": 0.03379097278595241, + "grad_norm": 3.307145595550537, + "learning_rate": 9.932615728973325e-06, + "loss": 4.2233, + "step": 34300 + }, + { + "epoch": 0.0338402307637745, + "grad_norm": 2.899125576019287, + "learning_rate": 9.932517211076565e-06, + "loss": 4.1904, + "step": 34350 + }, + { + "epoch": 0.03388948874159659, + "grad_norm": 2.9214344024658203, + "learning_rate": 9.932418693179805e-06, + "loss": 4.1458, + "step": 34400 + }, + { + "epoch": 0.033938746719418676, + "grad_norm": 3.1509246826171875, + "learning_rate": 9.932320175283043e-06, + "loss": 4.2167, + "step": 34450 + }, + { + "epoch": 0.03398800469724077, + "grad_norm": 2.7806036472320557, + "learning_rate": 9.932221657386282e-06, + "loss": 4.1926, + "step": 34500 + }, + { + "epoch": 0.03403726267506285, + "grad_norm": 2.930236339569092, + "learning_rate": 9.93212313948952e-06, + "loss": 4.111, + "step": 34550 + }, + { + "epoch": 0.03408652065288494, + "grad_norm": 3.0260918140411377, + "learning_rate": 9.93202462159276e-06, + "loss": 4.1488, + "step": 34600 + }, + { + "epoch": 0.03413577863070703, + "grad_norm": 2.7561604976654053, + "learning_rate": 9.931926103695998e-06, + "loss": 4.2183, + "step": 34650 + }, + { + "epoch": 0.03418503660852912, + "grad_norm": 2.8858392238616943, + "learning_rate": 9.931827585799237e-06, + "loss": 4.2273, + "step": 34700 + }, + { + "epoch": 
0.0342342945863512, + "grad_norm": 2.719752550125122, + "learning_rate": 9.931729067902475e-06, + "loss": 4.1792, + "step": 34750 + }, + { + "epoch": 0.03428355256417329, + "grad_norm": 2.6967825889587402, + "learning_rate": 9.931630550005715e-06, + "loss": 4.1824, + "step": 34800 + }, + { + "epoch": 0.034332810541995384, + "grad_norm": 2.947338342666626, + "learning_rate": 9.931532032108953e-06, + "loss": 4.1162, + "step": 34850 + }, + { + "epoch": 0.03438206851981747, + "grad_norm": 3.062485933303833, + "learning_rate": 9.931433514212192e-06, + "loss": 4.2112, + "step": 34900 + }, + { + "epoch": 0.03443132649763956, + "grad_norm": 2.715425968170166, + "learning_rate": 9.931334996315432e-06, + "loss": 4.1605, + "step": 34950 + }, + { + "epoch": 0.034480584475461644, + "grad_norm": 2.655729293823242, + "learning_rate": 9.93123647841867e-06, + "loss": 4.1399, + "step": 35000 + }, + { + "epoch": 0.034529842453283735, + "grad_norm": 2.7333297729492188, + "learning_rate": 9.93113796052191e-06, + "loss": 4.1808, + "step": 35050 + }, + { + "epoch": 0.03457910043110582, + "grad_norm": 2.8066625595092773, + "learning_rate": 9.931039442625148e-06, + "loss": 4.2149, + "step": 35100 + }, + { + "epoch": 0.03462835840892791, + "grad_norm": 2.4432833194732666, + "learning_rate": 9.930940924728387e-06, + "loss": 4.1546, + "step": 35150 + }, + { + "epoch": 0.03467761638675, + "grad_norm": 2.8933513164520264, + "learning_rate": 9.930842406831625e-06, + "loss": 4.1255, + "step": 35200 + }, + { + "epoch": 0.034726874364572086, + "grad_norm": 2.8086421489715576, + "learning_rate": 9.930743888934865e-06, + "loss": 4.163, + "step": 35250 + }, + { + "epoch": 0.03477613234239418, + "grad_norm": 3.1362359523773193, + "learning_rate": 9.930645371038103e-06, + "loss": 4.1517, + "step": 35300 + }, + { + "epoch": 0.03482539032021626, + "grad_norm": 2.80601167678833, + "learning_rate": 9.930546853141342e-06, + "loss": 4.1405, + "step": 35350 + }, + { + "epoch": 0.03487464829803835, + 
"grad_norm": 2.7374587059020996, + "learning_rate": 9.930448335244582e-06, + "loss": 4.1281, + "step": 35400 + }, + { + "epoch": 0.034923906275860436, + "grad_norm": 2.8682756423950195, + "learning_rate": 9.93034981734782e-06, + "loss": 4.1333, + "step": 35450 + }, + { + "epoch": 0.03497316425368253, + "grad_norm": 3.0086607933044434, + "learning_rate": 9.93025129945106e-06, + "loss": 4.1201, + "step": 35500 + }, + { + "epoch": 0.03502242223150461, + "grad_norm": 2.7637526988983154, + "learning_rate": 9.930152781554297e-06, + "loss": 4.1463, + "step": 35550 + }, + { + "epoch": 0.0350716802093267, + "grad_norm": 3.0745182037353516, + "learning_rate": 9.930054263657537e-06, + "loss": 4.1661, + "step": 35600 + }, + { + "epoch": 0.035120938187148794, + "grad_norm": 2.8573169708251953, + "learning_rate": 9.929955745760775e-06, + "loss": 4.1086, + "step": 35650 + }, + { + "epoch": 0.03517019616497088, + "grad_norm": 2.6791625022888184, + "learning_rate": 9.929857227864015e-06, + "loss": 4.1214, + "step": 35700 + }, + { + "epoch": 0.03521945414279297, + "grad_norm": 3.7227554321289062, + "learning_rate": 9.929758709967253e-06, + "loss": 4.111, + "step": 35750 + }, + { + "epoch": 0.03526871212061505, + "grad_norm": 2.7368383407592773, + "learning_rate": 9.929660192070492e-06, + "loss": 4.103, + "step": 35800 + }, + { + "epoch": 0.035317970098437144, + "grad_norm": 3.2191810607910156, + "learning_rate": 9.929561674173732e-06, + "loss": 4.0781, + "step": 35850 + }, + { + "epoch": 0.03536722807625923, + "grad_norm": 2.909776449203491, + "learning_rate": 9.92946315627697e-06, + "loss": 4.0883, + "step": 35900 + }, + { + "epoch": 0.03541648605408132, + "grad_norm": 2.6579201221466064, + "learning_rate": 9.92936463838021e-06, + "loss": 4.1208, + "step": 35950 + }, + { + "epoch": 0.035465744031903404, + "grad_norm": 2.800654172897339, + "learning_rate": 9.929266120483447e-06, + "loss": 4.1171, + "step": 36000 + }, + { + "epoch": 0.035515002009725495, + "grad_norm": 
2.6417629718780518, + "learning_rate": 9.929167602586687e-06, + "loss": 4.1728, + "step": 36050 + }, + { + "epoch": 0.035564259987547586, + "grad_norm": 2.5177643299102783, + "learning_rate": 9.929069084689925e-06, + "loss": 4.1347, + "step": 36100 + }, + { + "epoch": 0.03561351796536967, + "grad_norm": 2.960146903991699, + "learning_rate": 9.928970566793165e-06, + "loss": 4.1211, + "step": 36150 + }, + { + "epoch": 0.03566277594319176, + "grad_norm": 2.9792191982269287, + "learning_rate": 9.928872048896402e-06, + "loss": 4.2129, + "step": 36200 + }, + { + "epoch": 0.035712033921013846, + "grad_norm": 2.6205337047576904, + "learning_rate": 9.928773530999642e-06, + "loss": 4.1066, + "step": 36250 + }, + { + "epoch": 0.03576129189883594, + "grad_norm": 2.703212261199951, + "learning_rate": 9.928675013102882e-06, + "loss": 4.1596, + "step": 36300 + }, + { + "epoch": 0.03581054987665802, + "grad_norm": 2.9238173961639404, + "learning_rate": 9.92857649520612e-06, + "loss": 4.1132, + "step": 36350 + }, + { + "epoch": 0.03585980785448011, + "grad_norm": 2.7000010013580322, + "learning_rate": 9.92847797730936e-06, + "loss": 4.1297, + "step": 36400 + }, + { + "epoch": 0.035909065832302196, + "grad_norm": 3.70847749710083, + "learning_rate": 9.928379459412599e-06, + "loss": 4.1398, + "step": 36450 + }, + { + "epoch": 0.03595832381012429, + "grad_norm": 3.3755671977996826, + "learning_rate": 9.928280941515837e-06, + "loss": 4.0694, + "step": 36500 + }, + { + "epoch": 0.03600758178794638, + "grad_norm": 3.498547077178955, + "learning_rate": 9.928182423619075e-06, + "loss": 4.1035, + "step": 36550 + }, + { + "epoch": 0.03605683976576846, + "grad_norm": 2.8539059162139893, + "learning_rate": 9.928083905722314e-06, + "loss": 4.2017, + "step": 36600 + }, + { + "epoch": 0.036106097743590554, + "grad_norm": 2.741032123565674, + "learning_rate": 9.927985387825552e-06, + "loss": 4.1154, + "step": 36650 + }, + { + "epoch": 0.03615535572141264, + "grad_norm": 2.9323079586029053, + 
"learning_rate": 9.927886869928792e-06, + "loss": 4.1154, + "step": 36700 + }, + { + "epoch": 0.03620461369923473, + "grad_norm": 2.8776886463165283, + "learning_rate": 9.92778835203203e-06, + "loss": 4.0792, + "step": 36750 + }, + { + "epoch": 0.03625387167705681, + "grad_norm": 2.8153183460235596, + "learning_rate": 9.92768983413527e-06, + "loss": 4.1233, + "step": 36800 + }, + { + "epoch": 0.036303129654878905, + "grad_norm": 2.682251214981079, + "learning_rate": 9.927591316238509e-06, + "loss": 4.1254, + "step": 36850 + }, + { + "epoch": 0.036352387632700996, + "grad_norm": 3.0005993843078613, + "learning_rate": 9.998702870716563e-06, + "loss": 4.0933, + "step": 36900 + }, + { + "epoch": 0.03640164561052308, + "grad_norm": 2.998979330062866, + "learning_rate": 9.998699343571177e-06, + "loss": 4.0736, + "step": 36950 + }, + { + "epoch": 0.03645090358834517, + "grad_norm": 3.270879030227661, + "learning_rate": 9.998695811637427e-06, + "loss": 4.0595, + "step": 37000 + }, + { + "epoch": 0.036500161566167255, + "grad_norm": 2.8358864784240723, + "learning_rate": 9.998692274915318e-06, + "loss": 4.0795, + "step": 37050 + }, + { + "epoch": 0.036549419543989346, + "grad_norm": 2.8862786293029785, + "learning_rate": 9.998688733404855e-06, + "loss": 4.0613, + "step": 37100 + }, + { + "epoch": 0.03659867752181143, + "grad_norm": 2.930955648422241, + "learning_rate": 9.998685187106038e-06, + "loss": 4.0911, + "step": 37150 + }, + { + "epoch": 0.03664793549963352, + "grad_norm": 2.6923890113830566, + "learning_rate": 9.998681636018872e-06, + "loss": 4.1247, + "step": 37200 + }, + { + "epoch": 0.036697193477455606, + "grad_norm": 2.752511501312256, + "learning_rate": 9.99867808014336e-06, + "loss": 4.0601, + "step": 37250 + }, + { + "epoch": 0.0367464514552777, + "grad_norm": 2.8299922943115234, + "learning_rate": 9.998674519479508e-06, + "loss": 4.0436, + "step": 37300 + }, + { + "epoch": 0.03679570943309979, + "grad_norm": 2.979118585586548, + "learning_rate": 
9.998670954027316e-06, + "loss": 4.152, + "step": 37350 + }, + { + "epoch": 0.03684496741092187, + "grad_norm": 2.846278190612793, + "learning_rate": 9.998667383786789e-06, + "loss": 4.0301, + "step": 37400 + }, + { + "epoch": 0.03689422538874396, + "grad_norm": 2.6260616779327393, + "learning_rate": 9.998663808757929e-06, + "loss": 4.0717, + "step": 37450 + }, + { + "epoch": 0.03694348336656605, + "grad_norm": 2.721881628036499, + "learning_rate": 9.998660228940741e-06, + "loss": 4.0508, + "step": 37500 + }, + { + "epoch": 0.03699274134438814, + "grad_norm": 2.9743435382843018, + "learning_rate": 9.998656644335228e-06, + "loss": 4.0843, + "step": 37550 + }, + { + "epoch": 0.03704199932221022, + "grad_norm": 2.772188901901245, + "learning_rate": 9.998653054941392e-06, + "loss": 4.1388, + "step": 37600 + }, + { + "epoch": 0.037091257300032314, + "grad_norm": 2.910201072692871, + "learning_rate": 9.99864946075924e-06, + "loss": 4.1406, + "step": 37650 + }, + { + "epoch": 0.0371405152778544, + "grad_norm": 2.79813551902771, + "learning_rate": 9.998645861788772e-06, + "loss": 4.0817, + "step": 37700 + }, + { + "epoch": 0.03718977325567649, + "grad_norm": 2.7184360027313232, + "learning_rate": 9.998642258029994e-06, + "loss": 4.1197, + "step": 37750 + }, + { + "epoch": 0.03723903123349858, + "grad_norm": 2.999150037765503, + "learning_rate": 9.998638649482907e-06, + "loss": 4.1145, + "step": 37800 + }, + { + "epoch": 0.037288289211320665, + "grad_norm": 2.551774024963379, + "learning_rate": 9.998635036147516e-06, + "loss": 4.0886, + "step": 37850 + }, + { + "epoch": 0.037337547189142756, + "grad_norm": 2.7964155673980713, + "learning_rate": 9.998631418023823e-06, + "loss": 4.0857, + "step": 37900 + }, + { + "epoch": 0.03738680516696484, + "grad_norm": 2.9535751342773438, + "learning_rate": 9.998627795111835e-06, + "loss": 4.1155, + "step": 37950 + }, + { + "epoch": 0.03743606314478693, + "grad_norm": 3.12491512298584, + "learning_rate": 9.99862416741155e-06, + "loss": 
4.1098, + "step": 38000 + }, + { + "epoch": 0.037485321122609015, + "grad_norm": 2.8504207134246826, + "learning_rate": 9.998620534922976e-06, + "loss": 4.101, + "step": 38050 + }, + { + "epoch": 0.037534579100431106, + "grad_norm": 2.9374349117279053, + "learning_rate": 9.998616897646115e-06, + "loss": 4.027, + "step": 38100 + }, + { + "epoch": 0.03758383707825319, + "grad_norm": 2.603107213973999, + "learning_rate": 9.99861325558097e-06, + "loss": 4.1027, + "step": 38150 + }, + { + "epoch": 0.03763309505607528, + "grad_norm": 2.8215765953063965, + "learning_rate": 9.998609608727545e-06, + "loss": 4.1212, + "step": 38200 + }, + { + "epoch": 0.03768235303389737, + "grad_norm": 2.7805447578430176, + "learning_rate": 9.998605957085846e-06, + "loss": 4.0814, + "step": 38250 + }, + { + "epoch": 0.03773161101171946, + "grad_norm": 2.9574592113494873, + "learning_rate": 9.998602300655871e-06, + "loss": 4.0814, + "step": 38300 + }, + { + "epoch": 0.03778086898954155, + "grad_norm": 2.7715060710906982, + "learning_rate": 9.998598639437627e-06, + "loss": 4.1118, + "step": 38350 + }, + { + "epoch": 0.03783012696736363, + "grad_norm": 2.58461332321167, + "learning_rate": 9.998594973431117e-06, + "loss": 4.1173, + "step": 38400 + }, + { + "epoch": 0.03787938494518572, + "grad_norm": 2.791839361190796, + "learning_rate": 9.998591302636344e-06, + "loss": 4.0453, + "step": 38450 + }, + { + "epoch": 0.03792864292300781, + "grad_norm": 2.861480712890625, + "learning_rate": 9.998587627053311e-06, + "loss": 4.0709, + "step": 38500 + }, + { + "epoch": 0.0379779009008299, + "grad_norm": 2.8120946884155273, + "learning_rate": 9.998583946682024e-06, + "loss": 4.0649, + "step": 38550 + }, + { + "epoch": 0.03802715887865199, + "grad_norm": 2.9638471603393555, + "learning_rate": 9.998580261522486e-06, + "loss": 4.0673, + "step": 38600 + }, + { + "epoch": 0.038076416856474074, + "grad_norm": 2.6576764583587646, + "learning_rate": 9.998576571574698e-06, + "loss": 4.1153, + "step": 38650 + }, 
+ { + "epoch": 0.038125674834296165, + "grad_norm": 2.9432432651519775, + "learning_rate": 9.998572876838664e-06, + "loss": 4.066, + "step": 38700 + }, + { + "epoch": 0.03817493281211825, + "grad_norm": 3.0570549964904785, + "learning_rate": 9.99856917731439e-06, + "loss": 4.0585, + "step": 38750 + }, + { + "epoch": 0.03822419078994034, + "grad_norm": 3.075031042098999, + "learning_rate": 9.998565473001879e-06, + "loss": 4.1043, + "step": 38800 + }, + { + "epoch": 0.038273448767762425, + "grad_norm": 2.788705825805664, + "learning_rate": 9.998561763901134e-06, + "loss": 4.0385, + "step": 38850 + }, + { + "epoch": 0.038322706745584516, + "grad_norm": 2.8631489276885986, + "learning_rate": 9.998558050012155e-06, + "loss": 4.0498, + "step": 38900 + }, + { + "epoch": 0.0383719647234066, + "grad_norm": 2.827277898788452, + "learning_rate": 9.99855433133495e-06, + "loss": 4.0671, + "step": 38950 + }, + { + "epoch": 0.03842122270122869, + "grad_norm": 2.749055862426758, + "learning_rate": 9.998550607869525e-06, + "loss": 4.0899, + "step": 39000 + }, + { + "epoch": 0.03847048067905078, + "grad_norm": 2.762758731842041, + "learning_rate": 9.998546879615876e-06, + "loss": 4.0084, + "step": 39050 + }, + { + "epoch": 0.038519738656872866, + "grad_norm": 2.815336227416992, + "learning_rate": 9.998543146574012e-06, + "loss": 4.1087, + "step": 39100 + }, + { + "epoch": 0.03856899663469496, + "grad_norm": 2.723120927810669, + "learning_rate": 9.998539408743935e-06, + "loss": 4.0246, + "step": 39150 + }, + { + "epoch": 0.03861825461251704, + "grad_norm": 3.1131174564361572, + "learning_rate": 9.998535666125648e-06, + "loss": 4.0957, + "step": 39200 + }, + { + "epoch": 0.03866751259033913, + "grad_norm": 2.836998224258423, + "learning_rate": 9.998531918719157e-06, + "loss": 4.0328, + "step": 39250 + }, + { + "epoch": 0.03871677056816122, + "grad_norm": 2.7810873985290527, + "learning_rate": 9.99852816652446e-06, + "loss": 4.0695, + "step": 39300 + }, + { + "epoch": 
0.03876602854598331, + "grad_norm": 2.840212345123291, + "learning_rate": 9.998524409541568e-06, + "loss": 4.0878, + "step": 39350 + }, + { + "epoch": 0.03881528652380539, + "grad_norm": 2.7455148696899414, + "learning_rate": 9.99852064777048e-06, + "loss": 4.0332, + "step": 39400 + }, + { + "epoch": 0.038864544501627483, + "grad_norm": 2.9797582626342773, + "learning_rate": 9.9985168812112e-06, + "loss": 4.0475, + "step": 39450 + }, + { + "epoch": 0.038913802479449575, + "grad_norm": 3.239074945449829, + "learning_rate": 9.998513109863734e-06, + "loss": 4.0432, + "step": 39500 + }, + { + "epoch": 0.03896306045727166, + "grad_norm": 3.2542123794555664, + "learning_rate": 9.998509333728083e-06, + "loss": 4.079, + "step": 39550 + }, + { + "epoch": 0.03901231843509375, + "grad_norm": 2.9895012378692627, + "learning_rate": 9.99850555280425e-06, + "loss": 4.0879, + "step": 39600 + }, + { + "epoch": 0.039061576412915834, + "grad_norm": 2.6337368488311768, + "learning_rate": 9.998501767092241e-06, + "loss": 4.0793, + "step": 39650 + }, + { + "epoch": 0.039110834390737925, + "grad_norm": 2.9258251190185547, + "learning_rate": 9.99849797659206e-06, + "loss": 3.9796, + "step": 39700 + }, + { + "epoch": 0.03916009236856001, + "grad_norm": 2.780320882797241, + "learning_rate": 9.998494181303709e-06, + "loss": 4.118, + "step": 39750 + }, + { + "epoch": 0.0392093503463821, + "grad_norm": 2.6904499530792236, + "learning_rate": 9.99849038122719e-06, + "loss": 4.0857, + "step": 39800 + }, + { + "epoch": 0.039258608324204185, + "grad_norm": 2.6314303874969482, + "learning_rate": 9.998486576362511e-06, + "loss": 4.0408, + "step": 39850 + }, + { + "epoch": 0.039307866302026276, + "grad_norm": 2.7986602783203125, + "learning_rate": 9.998482766709672e-06, + "loss": 4.0728, + "step": 39900 + }, + { + "epoch": 0.03935712427984837, + "grad_norm": 2.933122396469116, + "learning_rate": 9.998478952268677e-06, + "loss": 4.0494, + "step": 39950 + }, + { + "epoch": 0.03940638225767045, + 
"grad_norm": 2.844691753387451, + "learning_rate": 9.998475133039534e-06, + "loss": 4.0467, + "step": 40000 + }, + { + "epoch": 0.03945564023549254, + "grad_norm": 2.870483160018921, + "learning_rate": 9.998471309022239e-06, + "loss": 4.1035, + "step": 40050 + }, + { + "epoch": 0.039504898213314626, + "grad_norm": 2.829791307449341, + "learning_rate": 9.998467480216802e-06, + "loss": 4.0624, + "step": 40100 + }, + { + "epoch": 0.03955415619113672, + "grad_norm": 2.665922164916992, + "learning_rate": 9.998463646623225e-06, + "loss": 4.0059, + "step": 40150 + }, + { + "epoch": 0.0396034141689588, + "grad_norm": 2.625274181365967, + "learning_rate": 9.998459808241512e-06, + "loss": 4.0042, + "step": 40200 + }, + { + "epoch": 0.03965267214678089, + "grad_norm": 2.7577829360961914, + "learning_rate": 9.998455965071664e-06, + "loss": 4.0405, + "step": 40250 + }, + { + "epoch": 0.039701930124602984, + "grad_norm": 3.1029510498046875, + "learning_rate": 9.998452117113687e-06, + "loss": 4.0802, + "step": 40300 + }, + { + "epoch": 0.03975118810242507, + "grad_norm": 2.890066385269165, + "learning_rate": 9.998448264367584e-06, + "loss": 3.9937, + "step": 40350 + }, + { + "epoch": 0.03980044608024716, + "grad_norm": 2.9326283931732178, + "learning_rate": 9.998444406833361e-06, + "loss": 4.0449, + "step": 40400 + }, + { + "epoch": 0.039849704058069244, + "grad_norm": 2.78877329826355, + "learning_rate": 9.998440544511017e-06, + "loss": 3.9658, + "step": 40450 + }, + { + "epoch": 0.039898962035891335, + "grad_norm": 2.731938362121582, + "learning_rate": 9.99843667740056e-06, + "loss": 4.0374, + "step": 40500 + }, + { + "epoch": 0.03994822001371342, + "grad_norm": 2.751181125640869, + "learning_rate": 9.998432805501992e-06, + "loss": 4.0805, + "step": 40550 + }, + { + "epoch": 0.03999747799153551, + "grad_norm": 2.676797389984131, + "learning_rate": 9.998428928815316e-06, + "loss": 3.9896, + "step": 40600 + }, + { + "epoch": 0.040046735969357594, + "grad_norm": 2.81471586227417, 
+ "learning_rate": 9.998425047340537e-06, + "loss": 3.9935, + "step": 40650 + }, + { + "epoch": 0.040095993947179685, + "grad_norm": 2.654538631439209, + "learning_rate": 9.998421161077658e-06, + "loss": 4.021, + "step": 40700 + }, + { + "epoch": 0.040145251925001776, + "grad_norm": 3.0738894939422607, + "learning_rate": 9.998417270026684e-06, + "loss": 4.0458, + "step": 40750 + }, + { + "epoch": 0.04019450990282386, + "grad_norm": 2.681791305541992, + "learning_rate": 9.998413374187616e-06, + "loss": 4.0371, + "step": 40800 + }, + { + "epoch": 0.04024376788064595, + "grad_norm": 2.538512706756592, + "learning_rate": 9.99840947356046e-06, + "loss": 4.0911, + "step": 40850 + }, + { + "epoch": 0.040293025858468036, + "grad_norm": 3.414949417114258, + "learning_rate": 9.99840556814522e-06, + "loss": 4.0768, + "step": 40900 + }, + { + "epoch": 0.04034228383629013, + "grad_norm": 2.793264150619507, + "learning_rate": 9.998401657941897e-06, + "loss": 3.9975, + "step": 40950 + }, + { + "epoch": 0.04039154181411221, + "grad_norm": 2.9819114208221436, + "learning_rate": 9.998397742950498e-06, + "loss": 4.0483, + "step": 41000 + }, + { + "epoch": 0.0404407997919343, + "grad_norm": 3.319779634475708, + "learning_rate": 9.998393823171024e-06, + "loss": 3.9783, + "step": 41050 + }, + { + "epoch": 0.04049005776975639, + "grad_norm": 3.429198741912842, + "learning_rate": 9.998389898603482e-06, + "loss": 4.0345, + "step": 41100 + }, + { + "epoch": 0.04053931574757848, + "grad_norm": 2.9804835319519043, + "learning_rate": 9.998385969247872e-06, + "loss": 3.9826, + "step": 41150 + }, + { + "epoch": 0.04058857372540057, + "grad_norm": 2.816620111465454, + "learning_rate": 9.998382035104201e-06, + "loss": 4.007, + "step": 41200 + }, + { + "epoch": 0.04063783170322265, + "grad_norm": 3.1493327617645264, + "learning_rate": 9.998378096172472e-06, + "loss": 4.0055, + "step": 41250 + }, + { + "epoch": 0.040687089681044744, + "grad_norm": 2.6561801433563232, + "learning_rate": 
9.998374152452688e-06, + "loss": 4.041, + "step": 41300 + }, + { + "epoch": 0.04073634765886683, + "grad_norm": 2.9765233993530273, + "learning_rate": 9.998370203944851e-06, + "loss": 4.0281, + "step": 41350 + }, + { + "epoch": 0.04078560563668892, + "grad_norm": 2.6032211780548096, + "learning_rate": 9.998366250648968e-06, + "loss": 4.0591, + "step": 41400 + }, + { + "epoch": 0.040834863614511004, + "grad_norm": 3.0853097438812256, + "learning_rate": 9.998362292565043e-06, + "loss": 4.0403, + "step": 41450 + }, + { + "epoch": 0.040884121592333095, + "grad_norm": 2.9708056449890137, + "learning_rate": 9.998358329693076e-06, + "loss": 3.972, + "step": 41500 + }, + { + "epoch": 0.04093337957015518, + "grad_norm": 2.916759967803955, + "learning_rate": 9.998354362033074e-06, + "loss": 4.0309, + "step": 41550 + }, + { + "epoch": 0.04098263754797727, + "grad_norm": 2.687000036239624, + "learning_rate": 9.99835038958504e-06, + "loss": 4.0237, + "step": 41600 + }, + { + "epoch": 0.04103189552579936, + "grad_norm": 2.680605173110962, + "learning_rate": 9.998346412348977e-06, + "loss": 4.0151, + "step": 41650 + }, + { + "epoch": 0.041081153503621445, + "grad_norm": 2.9819421768188477, + "learning_rate": 9.998342430324893e-06, + "loss": 4.0057, + "step": 41700 + }, + { + "epoch": 0.041130411481443536, + "grad_norm": 3.719841957092285, + "learning_rate": 9.998338443512785e-06, + "loss": 4.0621, + "step": 41750 + }, + { + "epoch": 0.04117966945926562, + "grad_norm": 2.829129934310913, + "learning_rate": 9.998334451912659e-06, + "loss": 4.077, + "step": 41800 + }, + { + "epoch": 0.04122892743708771, + "grad_norm": 2.880561351776123, + "learning_rate": 9.998330455524523e-06, + "loss": 3.9817, + "step": 41850 + }, + { + "epoch": 0.041278185414909796, + "grad_norm": 2.7640485763549805, + "learning_rate": 9.998326454348376e-06, + "loss": 4.0404, + "step": 41900 + }, + { + "epoch": 0.04132744339273189, + "grad_norm": 2.9133119583129883, + "learning_rate": 9.998322448384225e-06, + 
"loss": 4.0292, + "step": 41950 + }, + { + "epoch": 0.04137670137055398, + "grad_norm": 2.625391721725464, + "learning_rate": 9.99831843763207e-06, + "loss": 4.0255, + "step": 42000 + }, + { + "epoch": 0.04142595934837606, + "grad_norm": 2.824221134185791, + "learning_rate": 9.99831442209192e-06, + "loss": 4.08, + "step": 42050 + }, + { + "epoch": 0.041475217326198154, + "grad_norm": 2.952834129333496, + "learning_rate": 9.998310401763776e-06, + "loss": 4.0051, + "step": 42100 + }, + { + "epoch": 0.04152447530402024, + "grad_norm": 2.7279679775238037, + "learning_rate": 9.998306376647642e-06, + "loss": 4.0249, + "step": 42150 + }, + { + "epoch": 0.04157373328184233, + "grad_norm": 2.78761625289917, + "learning_rate": 9.99830234674352e-06, + "loss": 4.0253, + "step": 42200 + }, + { + "epoch": 0.04162299125966441, + "grad_norm": 3.137406349182129, + "learning_rate": 9.998298312051417e-06, + "loss": 3.8607, + "step": 42250 + }, + { + "epoch": 0.041672249237486504, + "grad_norm": 2.8308167457580566, + "learning_rate": 9.998294272571336e-06, + "loss": 3.9479, + "step": 42300 + }, + { + "epoch": 0.04172150721530859, + "grad_norm": 4.4350056648254395, + "learning_rate": 9.99829022830328e-06, + "loss": 3.9547, + "step": 42350 + }, + { + "epoch": 0.04177076519313068, + "grad_norm": 2.8853492736816406, + "learning_rate": 9.998286179247253e-06, + "loss": 3.9796, + "step": 42400 + }, + { + "epoch": 0.04182002317095277, + "grad_norm": 3.0448482036590576, + "learning_rate": 9.99828212540326e-06, + "loss": 4.0104, + "step": 42450 + }, + { + "epoch": 0.041869281148774855, + "grad_norm": 2.9808459281921387, + "learning_rate": 9.998278066771303e-06, + "loss": 4.0083, + "step": 42500 + }, + { + "epoch": 0.041918539126596946, + "grad_norm": 3.439976692199707, + "learning_rate": 9.998274003351389e-06, + "loss": 4.0077, + "step": 42550 + }, + { + "epoch": 0.04196779710441903, + "grad_norm": 3.0069916248321533, + "learning_rate": 9.998269935143519e-06, + "loss": 3.9632, + "step": 42600 + 
}, + { + "epoch": 0.04201705508224112, + "grad_norm": 2.7382593154907227, + "learning_rate": 9.998265862147698e-06, + "loss": 4.0104, + "step": 42650 + }, + { + "epoch": 0.042066313060063205, + "grad_norm": 2.9899399280548096, + "learning_rate": 9.998261784363928e-06, + "loss": 4.0595, + "step": 42700 + }, + { + "epoch": 0.0421155710378853, + "grad_norm": 2.828990936279297, + "learning_rate": 9.998257701792216e-06, + "loss": 3.9882, + "step": 42750 + }, + { + "epoch": 0.04216482901570738, + "grad_norm": 2.753329038619995, + "learning_rate": 9.998253614432565e-06, + "loss": 3.9874, + "step": 42800 + }, + { + "epoch": 0.04221408699352947, + "grad_norm": 2.9647271633148193, + "learning_rate": 9.998249522284977e-06, + "loss": 4.0674, + "step": 42850 + }, + { + "epoch": 0.04226334497135156, + "grad_norm": 2.7331337928771973, + "learning_rate": 9.998245425349458e-06, + "loss": 3.9639, + "step": 42900 + }, + { + "epoch": 0.04231260294917365, + "grad_norm": 2.717965841293335, + "learning_rate": 9.998241323626011e-06, + "loss": 3.9842, + "step": 42950 + }, + { + "epoch": 0.04236186092699574, + "grad_norm": 2.9853599071502686, + "learning_rate": 9.998237217114641e-06, + "loss": 3.9829, + "step": 43000 + }, + { + "epoch": 0.04241111890481782, + "grad_norm": 3.0525457859039307, + "learning_rate": 9.998233105815351e-06, + "loss": 3.9696, + "step": 43050 + }, + { + "epoch": 0.042460376882639914, + "grad_norm": 2.831200361251831, + "learning_rate": 9.998228989728145e-06, + "loss": 4.0234, + "step": 43100 + }, + { + "epoch": 0.042509634860462, + "grad_norm": 2.6662328243255615, + "learning_rate": 9.998224868853025e-06, + "loss": 3.9821, + "step": 43150 + }, + { + "epoch": 0.04255889283828409, + "grad_norm": 3.6581227779388428, + "learning_rate": 9.99822074319e-06, + "loss": 4.0025, + "step": 43200 + }, + { + "epoch": 0.04260815081610617, + "grad_norm": 3.1624743938446045, + "learning_rate": 9.99821661273907e-06, + "loss": 3.9493, + "step": 43250 + }, + { + "epoch": 
0.042657408793928264, + "grad_norm": 2.837441921234131, + "learning_rate": 9.99821247750024e-06, + "loss": 3.9152, + "step": 43300 + }, + { + "epoch": 0.042706666771750355, + "grad_norm": 2.699232339859009, + "learning_rate": 9.998208337473512e-06, + "loss": 3.9054, + "step": 43350 + }, + { + "epoch": 0.04275592474957244, + "grad_norm": 3.163109064102173, + "learning_rate": 9.998204192658894e-06, + "loss": 4.0368, + "step": 43400 + }, + { + "epoch": 0.04280518272739453, + "grad_norm": 2.7115020751953125, + "learning_rate": 9.998200043056388e-06, + "loss": 3.9917, + "step": 43450 + }, + { + "epoch": 0.042854440705216615, + "grad_norm": 2.9153828620910645, + "learning_rate": 9.998195888665995e-06, + "loss": 4.0745, + "step": 43500 + }, + { + "epoch": 0.042903698683038706, + "grad_norm": 2.712707281112671, + "learning_rate": 9.998191729487724e-06, + "loss": 3.9769, + "step": 43550 + }, + { + "epoch": 0.04295295666086079, + "grad_norm": 2.638631820678711, + "learning_rate": 9.998187565521576e-06, + "loss": 3.9965, + "step": 43600 + }, + { + "epoch": 0.04300221463868288, + "grad_norm": 2.7745962142944336, + "learning_rate": 9.998183396767556e-06, + "loss": 4.023, + "step": 43650 + }, + { + "epoch": 0.04305147261650497, + "grad_norm": 2.624850273132324, + "learning_rate": 9.998179223225668e-06, + "loss": 4.0351, + "step": 43700 + }, + { + "epoch": 0.04310073059432706, + "grad_norm": 2.922191858291626, + "learning_rate": 9.998175044895916e-06, + "loss": 4.0039, + "step": 43750 + }, + { + "epoch": 0.04314998857214915, + "grad_norm": 2.808725357055664, + "learning_rate": 9.998170861778302e-06, + "loss": 3.9714, + "step": 43800 + }, + { + "epoch": 0.04319924654997123, + "grad_norm": 2.8285109996795654, + "learning_rate": 9.998166673872834e-06, + "loss": 3.9834, + "step": 43850 + }, + { + "epoch": 0.04324850452779332, + "grad_norm": 2.8687593936920166, + "learning_rate": 9.998162481179512e-06, + "loss": 3.9885, + "step": 43900 + }, + { + "epoch": 0.04329776250561541, + 
"grad_norm": 3.2354822158813477, + "learning_rate": 9.998158283698343e-06, + "loss": 3.9727, + "step": 43950 + }, + { + "epoch": 0.0433470204834375, + "grad_norm": 3.126126289367676, + "learning_rate": 9.998154081429327e-06, + "loss": 3.9721, + "step": 44000 + }, + { + "epoch": 0.04339627846125958, + "grad_norm": 2.744157075881958, + "learning_rate": 9.998149874372473e-06, + "loss": 3.9549, + "step": 44050 + }, + { + "epoch": 0.043445536439081674, + "grad_norm": 2.799764394760132, + "learning_rate": 9.998145662527784e-06, + "loss": 3.9611, + "step": 44100 + }, + { + "epoch": 0.043494794416903765, + "grad_norm": 2.676302433013916, + "learning_rate": 9.998141445895259e-06, + "loss": 3.9342, + "step": 44150 + }, + { + "epoch": 0.04354405239472585, + "grad_norm": 2.9476280212402344, + "learning_rate": 9.99813722447491e-06, + "loss": 4.0128, + "step": 44200 + }, + { + "epoch": 0.04359331037254794, + "grad_norm": 2.8259193897247314, + "learning_rate": 9.998132998266734e-06, + "loss": 4.0099, + "step": 44250 + }, + { + "epoch": 0.043642568350370024, + "grad_norm": 2.9133212566375732, + "learning_rate": 9.998128767270741e-06, + "loss": 3.945, + "step": 44300 + }, + { + "epoch": 0.043691826328192115, + "grad_norm": 2.631702423095703, + "learning_rate": 9.998124531486929e-06, + "loss": 3.9934, + "step": 44350 + }, + { + "epoch": 0.0437410843060142, + "grad_norm": 3.502730369567871, + "learning_rate": 9.998120290915307e-06, + "loss": 3.9934, + "step": 44400 + }, + { + "epoch": 0.04379034228383629, + "grad_norm": 2.8433287143707275, + "learning_rate": 9.998116045555875e-06, + "loss": 4.0292, + "step": 44450 + }, + { + "epoch": 0.043839600261658375, + "grad_norm": 2.8420639038085938, + "learning_rate": 9.998111795408642e-06, + "loss": 3.9876, + "step": 44500 + }, + { + "epoch": 0.043888858239480466, + "grad_norm": 5.383781433105469, + "learning_rate": 9.998107540473606e-06, + "loss": 3.9193, + "step": 44550 + }, + { + "epoch": 0.04393811621730256, + "grad_norm": 
3.1629178524017334, + "learning_rate": 9.998103280750778e-06, + "loss": 3.9782, + "step": 44600 + }, + { + "epoch": 0.04398737419512464, + "grad_norm": 2.86757755279541, + "learning_rate": 9.998099016240157e-06, + "loss": 3.9504, + "step": 44650 + }, + { + "epoch": 0.04403663217294673, + "grad_norm": 3.108323812484741, + "learning_rate": 9.998094746941748e-06, + "loss": 4.0086, + "step": 44700 + }, + { + "epoch": 0.04408589015076882, + "grad_norm": 3.0131120681762695, + "learning_rate": 9.998090472855554e-06, + "loss": 4.002, + "step": 44750 + }, + { + "epoch": 0.04413514812859091, + "grad_norm": 2.864246368408203, + "learning_rate": 9.998086193981584e-06, + "loss": 3.9502, + "step": 44800 + }, + { + "epoch": 0.04418440610641299, + "grad_norm": 3.1510934829711914, + "learning_rate": 9.998081910319837e-06, + "loss": 3.9966, + "step": 44850 + }, + { + "epoch": 0.04423366408423508, + "grad_norm": 3.1508381366729736, + "learning_rate": 9.99807762187032e-06, + "loss": 3.9853, + "step": 44900 + }, + { + "epoch": 0.044282922062057174, + "grad_norm": 2.776519775390625, + "learning_rate": 9.998073328633035e-06, + "loss": 3.9902, + "step": 44950 + }, + { + "epoch": 0.04433218003987926, + "grad_norm": 2.839163303375244, + "learning_rate": 9.998069030607988e-06, + "loss": 3.8926, + "step": 45000 + }, + { + "epoch": 0.04438143801770135, + "grad_norm": 2.8651914596557617, + "learning_rate": 9.99806472779518e-06, + "loss": 3.8854, + "step": 45050 + }, + { + "epoch": 0.044430695995523434, + "grad_norm": 2.934861183166504, + "learning_rate": 9.99806042019462e-06, + "loss": 3.9618, + "step": 45100 + }, + { + "epoch": 0.044479953973345525, + "grad_norm": 2.8418431282043457, + "learning_rate": 9.99805610780631e-06, + "loss": 3.9133, + "step": 45150 + }, + { + "epoch": 0.04452921195116761, + "grad_norm": 2.765542984008789, + "learning_rate": 9.998051790630249e-06, + "loss": 3.9646, + "step": 45200 + }, + { + "epoch": 0.0445784699289897, + "grad_norm": 2.6040894985198975, + 
"learning_rate": 9.99804746866645e-06, + "loss": 3.9463, + "step": 45250 + }, + { + "epoch": 0.044627727906811784, + "grad_norm": 3.2018420696258545, + "learning_rate": 9.998043141914913e-06, + "loss": 3.9415, + "step": 45300 + }, + { + "epoch": 0.044676985884633875, + "grad_norm": 2.7809054851531982, + "learning_rate": 9.99803881037564e-06, + "loss": 3.9434, + "step": 45350 + }, + { + "epoch": 0.04472624386245597, + "grad_norm": 2.8362269401550293, + "learning_rate": 9.99803447404864e-06, + "loss": 3.9273, + "step": 45400 + }, + { + "epoch": 0.04477550184027805, + "grad_norm": 2.647585153579712, + "learning_rate": 9.998030132933912e-06, + "loss": 3.9475, + "step": 45450 + }, + { + "epoch": 0.04482475981810014, + "grad_norm": 2.858994722366333, + "learning_rate": 9.998025787031462e-06, + "loss": 3.9677, + "step": 45500 + }, + { + "epoch": 0.044874017795922226, + "grad_norm": 2.8087899684906006, + "learning_rate": 9.998021436341295e-06, + "loss": 3.9695, + "step": 45550 + }, + { + "epoch": 0.04492327577374432, + "grad_norm": 2.6973493099212646, + "learning_rate": 9.998017080863416e-06, + "loss": 3.8895, + "step": 45600 + }, + { + "epoch": 0.0449725337515664, + "grad_norm": 3.176442861557007, + "learning_rate": 9.998012720597828e-06, + "loss": 3.9548, + "step": 45650 + }, + { + "epoch": 0.04502179172938849, + "grad_norm": 3.080256223678589, + "learning_rate": 9.998008355544534e-06, + "loss": 3.9139, + "step": 45700 + }, + { + "epoch": 0.04507104970721058, + "grad_norm": 2.806225061416626, + "learning_rate": 9.99800398570354e-06, + "loss": 3.9353, + "step": 45750 + }, + { + "epoch": 0.04512030768503267, + "grad_norm": 2.7489073276519775, + "learning_rate": 9.99799961107485e-06, + "loss": 3.9306, + "step": 45800 + }, + { + "epoch": 0.04516956566285476, + "grad_norm": 3.1545615196228027, + "learning_rate": 9.997995231658467e-06, + "loss": 3.9437, + "step": 45850 + }, + { + "epoch": 0.04521882364067684, + "grad_norm": 3.233289957046509, + "learning_rate": 
9.997990847454396e-06, + "loss": 3.9478, + "step": 45900 + }, + { + "epoch": 0.045268081618498934, + "grad_norm": 2.925522804260254, + "learning_rate": 9.997986458462642e-06, + "loss": 3.9102, + "step": 45950 + }, + { + "epoch": 0.04531733959632102, + "grad_norm": 2.9344165325164795, + "learning_rate": 9.997982064683207e-06, + "loss": 3.9114, + "step": 46000 + }, + { + "epoch": 0.04536659757414311, + "grad_norm": 2.7420713901519775, + "learning_rate": 9.997977666116098e-06, + "loss": 3.9843, + "step": 46050 + }, + { + "epoch": 0.045415855551965194, + "grad_norm": 2.9668960571289062, + "learning_rate": 9.997973262761319e-06, + "loss": 3.9546, + "step": 46100 + }, + { + "epoch": 0.045465113529787285, + "grad_norm": 3.048109531402588, + "learning_rate": 9.99796885461887e-06, + "loss": 3.9249, + "step": 46150 + }, + { + "epoch": 0.04551437150760937, + "grad_norm": 2.8706977367401123, + "learning_rate": 9.997964441688759e-06, + "loss": 3.9465, + "step": 46200 + }, + { + "epoch": 0.04556362948543146, + "grad_norm": 2.6603498458862305, + "learning_rate": 9.997960023970989e-06, + "loss": 3.9442, + "step": 46250 + }, + { + "epoch": 0.04561288746325355, + "grad_norm": 3.519437789916992, + "learning_rate": 9.997955601465565e-06, + "loss": 3.8905, + "step": 46300 + }, + { + "epoch": 0.045662145441075636, + "grad_norm": 2.6617069244384766, + "learning_rate": 9.997951174172492e-06, + "loss": 3.9277, + "step": 46350 + }, + { + "epoch": 0.04571140341889773, + "grad_norm": 2.7618274688720703, + "learning_rate": 9.997946742091773e-06, + "loss": 3.9431, + "step": 46400 + }, + { + "epoch": 0.04576066139671981, + "grad_norm": 2.935378313064575, + "learning_rate": 9.997942305223411e-06, + "loss": 3.9216, + "step": 46450 + }, + { + "epoch": 0.0458099193745419, + "grad_norm": 2.6798624992370605, + "learning_rate": 9.997937863567412e-06, + "loss": 3.9104, + "step": 46500 + }, + { + "epoch": 0.045859177352363986, + "grad_norm": 2.7616279125213623, + "learning_rate": 9.997933417123782e-06, + 
"loss": 3.8805, + "step": 46550 + }, + { + "epoch": 0.04590843533018608, + "grad_norm": 3.3764054775238037, + "learning_rate": 9.99792896589252e-06, + "loss": 3.9348, + "step": 46600 + }, + { + "epoch": 0.04595769330800817, + "grad_norm": 2.72837233543396, + "learning_rate": 9.997924509873636e-06, + "loss": 3.9121, + "step": 46650 + }, + { + "epoch": 0.04600695128583025, + "grad_norm": 2.9798216819763184, + "learning_rate": 9.99792004906713e-06, + "loss": 3.9433, + "step": 46700 + }, + { + "epoch": 0.046056209263652344, + "grad_norm": 2.8937313556671143, + "learning_rate": 9.99791558347301e-06, + "loss": 3.9411, + "step": 46750 + }, + { + "epoch": 0.04610546724147443, + "grad_norm": 2.4357857704162598, + "learning_rate": 9.997911113091277e-06, + "loss": 3.9535, + "step": 46800 + }, + { + "epoch": 0.04615472521929652, + "grad_norm": 2.5775749683380127, + "learning_rate": 9.997906637921935e-06, + "loss": 3.9757, + "step": 46850 + }, + { + "epoch": 0.0462039831971186, + "grad_norm": 3.0139052867889404, + "learning_rate": 9.997902157964992e-06, + "loss": 3.8996, + "step": 46900 + }, + { + "epoch": 0.046253241174940694, + "grad_norm": 2.7500407695770264, + "learning_rate": 9.99789767322045e-06, + "loss": 3.8929, + "step": 46950 + }, + { + "epoch": 0.04630249915276278, + "grad_norm": 3.091683864593506, + "learning_rate": 9.997893183688312e-06, + "loss": 3.8896, + "step": 47000 + }, + { + "epoch": 0.04635175713058487, + "grad_norm": 3.2092885971069336, + "learning_rate": 9.997888689368584e-06, + "loss": 3.9843, + "step": 47050 + }, + { + "epoch": 0.04640101510840696, + "grad_norm": 6.170140266418457, + "learning_rate": 9.997884190261272e-06, + "loss": 3.8261, + "step": 47100 + }, + { + "epoch": 0.046450273086229045, + "grad_norm": 2.982158899307251, + "learning_rate": 9.997879686366376e-06, + "loss": 3.982, + "step": 47150 + }, + { + "epoch": 0.046499531064051136, + "grad_norm": 2.87933349609375, + "learning_rate": 9.997875177683902e-06, + "loss": 3.9332, + "step": 47200 
+ }, + { + "epoch": 0.04654878904187322, + "grad_norm": 3.083808183670044, + "learning_rate": 9.997870664213858e-06, + "loss": 3.9288, + "step": 47250 + }, + { + "epoch": 0.04659804701969531, + "grad_norm": 2.714901924133301, + "learning_rate": 9.997866145956243e-06, + "loss": 3.9272, + "step": 47300 + }, + { + "epoch": 0.046647304997517396, + "grad_norm": 2.779989242553711, + "learning_rate": 9.997861622911063e-06, + "loss": 3.9487, + "step": 47350 + }, + { + "epoch": 0.04669656297533949, + "grad_norm": 2.8960483074188232, + "learning_rate": 9.997857095078325e-06, + "loss": 3.9274, + "step": 47400 + }, + { + "epoch": 0.04674582095316157, + "grad_norm": 3.3968732357025146, + "learning_rate": 9.99785256245803e-06, + "loss": 3.844, + "step": 47450 + }, + { + "epoch": 0.04679507893098366, + "grad_norm": 2.9894254207611084, + "learning_rate": 9.997848025050184e-06, + "loss": 3.9598, + "step": 47500 + }, + { + "epoch": 0.04684433690880575, + "grad_norm": 2.85400128364563, + "learning_rate": 9.997843482854793e-06, + "loss": 3.8948, + "step": 47550 + }, + { + "epoch": 0.04689359488662784, + "grad_norm": 3.113671064376831, + "learning_rate": 9.997838935871855e-06, + "loss": 3.9034, + "step": 47600 + }, + { + "epoch": 0.04694285286444993, + "grad_norm": 2.5373196601867676, + "learning_rate": 9.997834384101382e-06, + "loss": 3.9098, + "step": 47650 + }, + { + "epoch": 0.04699211084227201, + "grad_norm": 2.9561591148376465, + "learning_rate": 9.997829827543373e-06, + "loss": 3.8714, + "step": 47700 + }, + { + "epoch": 0.047041368820094104, + "grad_norm": 3.1219136714935303, + "learning_rate": 9.997825266197837e-06, + "loss": 3.8969, + "step": 47750 + }, + { + "epoch": 0.04709062679791619, + "grad_norm": 2.982424020767212, + "learning_rate": 9.997820700064773e-06, + "loss": 3.936, + "step": 47800 + }, + { + "epoch": 0.04713988477573828, + "grad_norm": 2.849760055541992, + "learning_rate": 9.99781612914419e-06, + "loss": 3.9427, + "step": 47850 + }, + { + "epoch": 
0.04718914275356036, + "grad_norm": 2.9320459365844727, + "learning_rate": 9.997811553436089e-06, + "loss": 3.9398, + "step": 47900 + }, + { + "epoch": 0.047238400731382454, + "grad_norm": 2.6908745765686035, + "learning_rate": 9.997806972940476e-06, + "loss": 3.8829, + "step": 47950 + }, + { + "epoch": 0.047287658709204546, + "grad_norm": 2.7936155796051025, + "learning_rate": 9.997802387657356e-06, + "loss": 3.8968, + "step": 48000 + }, + { + "epoch": 0.04733691668702663, + "grad_norm": 2.6581978797912598, + "learning_rate": 9.997797797586733e-06, + "loss": 3.8878, + "step": 48050 + }, + { + "epoch": 0.04738617466484872, + "grad_norm": 2.5780277252197266, + "learning_rate": 9.99779320272861e-06, + "loss": 3.9156, + "step": 48100 + }, + { + "epoch": 0.047435432642670805, + "grad_norm": 3.2265820503234863, + "learning_rate": 9.997788603082994e-06, + "loss": 3.9044, + "step": 48150 + }, + { + "epoch": 0.047484690620492896, + "grad_norm": 2.7583863735198975, + "learning_rate": 9.997783998649886e-06, + "loss": 3.9311, + "step": 48200 + }, + { + "epoch": 0.04753394859831498, + "grad_norm": 2.8111581802368164, + "learning_rate": 9.997779389429295e-06, + "loss": 3.9372, + "step": 48250 + }, + { + "epoch": 0.04758320657613707, + "grad_norm": 2.8359758853912354, + "learning_rate": 9.997774775421221e-06, + "loss": 3.9307, + "step": 48300 + }, + { + "epoch": 0.04763246455395916, + "grad_norm": 2.854013204574585, + "learning_rate": 9.99777015662567e-06, + "loss": 3.8741, + "step": 48350 + }, + { + "epoch": 0.04768172253178125, + "grad_norm": 2.9451024532318115, + "learning_rate": 9.997765533042645e-06, + "loss": 3.8618, + "step": 48400 + }, + { + "epoch": 0.04773098050960334, + "grad_norm": 2.8137495517730713, + "learning_rate": 9.997760904672154e-06, + "loss": 4.0054, + "step": 48450 + }, + { + "epoch": 0.04778023848742542, + "grad_norm": 3.0297200679779053, + "learning_rate": 9.997756271514198e-06, + "loss": 3.9282, + "step": 48500 + }, + { + "epoch": 0.04782949646524751, + 
"grad_norm": 3.200852632522583, + "learning_rate": 9.997751633568785e-06, + "loss": 3.8727, + "step": 48550 + }, + { + "epoch": 0.0478787544430696, + "grad_norm": 2.7026126384735107, + "learning_rate": 9.997746990835918e-06, + "loss": 3.958, + "step": 48600 + }, + { + "epoch": 0.04792801242089169, + "grad_norm": 2.683682441711426, + "learning_rate": 9.997742343315597e-06, + "loss": 3.9041, + "step": 48650 + }, + { + "epoch": 0.04797727039871377, + "grad_norm": 2.732779026031494, + "learning_rate": 9.997737691007832e-06, + "loss": 3.9321, + "step": 48700 + }, + { + "epoch": 0.048026528376535864, + "grad_norm": 2.772275686264038, + "learning_rate": 9.997733033912627e-06, + "loss": 3.9673, + "step": 48750 + }, + { + "epoch": 0.048075786354357955, + "grad_norm": 3.05098557472229, + "learning_rate": 9.997728372029982e-06, + "loss": 3.8978, + "step": 48800 + }, + { + "epoch": 0.04812504433218004, + "grad_norm": 2.4665896892547607, + "learning_rate": 9.997723705359908e-06, + "loss": 3.9772, + "step": 48850 + }, + { + "epoch": 0.04817430231000213, + "grad_norm": 2.8390681743621826, + "learning_rate": 9.997719033902405e-06, + "loss": 3.9259, + "step": 48900 + }, + { + "epoch": 0.048223560287824214, + "grad_norm": 2.7310116291046143, + "learning_rate": 9.997714357657477e-06, + "loss": 3.8332, + "step": 48950 + }, + { + "epoch": 0.048272818265646306, + "grad_norm": 2.7073545455932617, + "learning_rate": 9.997709676625131e-06, + "loss": 3.8932, + "step": 49000 + }, + { + "epoch": 0.04832207624346839, + "grad_norm": 2.6311309337615967, + "learning_rate": 9.99770499080537e-06, + "loss": 3.866, + "step": 49050 + }, + { + "epoch": 0.04837133422129048, + "grad_norm": 3.015855312347412, + "learning_rate": 9.9977003001982e-06, + "loss": 3.8292, + "step": 49100 + }, + { + "epoch": 0.048420592199112565, + "grad_norm": 2.935037136077881, + "learning_rate": 9.997695604803624e-06, + "loss": 3.8886, + "step": 49150 + }, + { + "epoch": 0.048469850176934656, + "grad_norm": 3.531795024871826, 
+ "learning_rate": 9.997690904621647e-06, + "loss": 3.9088, + "step": 49200 + }, + { + "epoch": 0.04851910815475675, + "grad_norm": 2.874725580215454, + "learning_rate": 9.997686199652273e-06, + "loss": 3.8848, + "step": 49250 + }, + { + "epoch": 0.04856836613257883, + "grad_norm": 2.7801880836486816, + "learning_rate": 9.997681489895507e-06, + "loss": 3.9368, + "step": 49300 + }, + { + "epoch": 0.04861762411040092, + "grad_norm": 2.7321619987487793, + "learning_rate": 9.997676775351354e-06, + "loss": 3.8599, + "step": 49350 + }, + { + "epoch": 0.04866688208822301, + "grad_norm": 2.704500436782837, + "learning_rate": 9.997672056019817e-06, + "loss": 3.8923, + "step": 49400 + }, + { + "epoch": 0.0487161400660451, + "grad_norm": 2.747087001800537, + "learning_rate": 9.997667331900903e-06, + "loss": 3.9252, + "step": 49450 + }, + { + "epoch": 0.04876539804386718, + "grad_norm": 2.991956949234009, + "learning_rate": 9.997662602994615e-06, + "loss": 3.8412, + "step": 49500 + }, + { + "epoch": 0.04881465602168927, + "grad_norm": 2.8603739738464355, + "learning_rate": 9.997657869300957e-06, + "loss": 3.8522, + "step": 49550 + }, + { + "epoch": 0.04886391399951136, + "grad_norm": 2.782435417175293, + "learning_rate": 9.997653130819933e-06, + "loss": 3.907, + "step": 49600 + }, + { + "epoch": 0.04891317197733345, + "grad_norm": 3.0265748500823975, + "learning_rate": 9.99764838755155e-06, + "loss": 3.9365, + "step": 49650 + }, + { + "epoch": 0.04896242995515554, + "grad_norm": 3.169058322906494, + "learning_rate": 9.997643639495811e-06, + "loss": 3.9001, + "step": 49700 + }, + { + "epoch": 0.049011687932977624, + "grad_norm": 2.8705403804779053, + "learning_rate": 9.997638886652722e-06, + "loss": 3.8899, + "step": 49750 + }, + { + "epoch": 0.049060945910799715, + "grad_norm": 2.956397771835327, + "learning_rate": 9.997634129022285e-06, + "loss": 3.7742, + "step": 49800 + }, + { + "epoch": 0.0491102038886218, + "grad_norm": 3.173853635787964, + "learning_rate": 
9.997629366604504e-06, + "loss": 3.8971, + "step": 49850 + }, + { + "epoch": 0.04915946186644389, + "grad_norm": 3.153721570968628, + "learning_rate": 9.997624599399389e-06, + "loss": 3.8229, + "step": 49900 + }, + { + "epoch": 0.049208719844265975, + "grad_norm": 2.917570114135742, + "learning_rate": 9.997619827406937e-06, + "loss": 3.9339, + "step": 49950 + }, + { + "epoch": 0.049257977822088066, + "grad_norm": 2.6499991416931152, + "learning_rate": 9.997615050627161e-06, + "loss": 3.861, + "step": 50000 + }, + { + "epoch": 0.04930723579991016, + "grad_norm": 2.5623226165771484, + "learning_rate": 9.997610269060058e-06, + "loss": 3.8489, + "step": 50050 + }, + { + "epoch": 0.04935649377773224, + "grad_norm": 2.7566964626312256, + "learning_rate": 9.997605482705636e-06, + "loss": 3.9035, + "step": 50100 + }, + { + "epoch": 0.04940575175555433, + "grad_norm": 2.7969563007354736, + "learning_rate": 9.9976006915639e-06, + "loss": 3.9001, + "step": 50150 + }, + { + "epoch": 0.049455009733376416, + "grad_norm": 2.8503568172454834, + "learning_rate": 9.997595895634854e-06, + "loss": 3.9836, + "step": 50200 + }, + { + "epoch": 0.04950426771119851, + "grad_norm": 2.833604335784912, + "learning_rate": 9.997591094918501e-06, + "loss": 3.8187, + "step": 50250 + }, + { + "epoch": 0.04955352568902059, + "grad_norm": 2.880197763442993, + "learning_rate": 9.997586289414848e-06, + "loss": 3.8982, + "step": 50300 + }, + { + "epoch": 0.04960278366684268, + "grad_norm": 3.1043663024902344, + "learning_rate": 9.9975814791239e-06, + "loss": 3.8918, + "step": 50350 + }, + { + "epoch": 0.04965204164466477, + "grad_norm": 2.8828938007354736, + "learning_rate": 9.99757666404566e-06, + "loss": 3.8425, + "step": 50400 + }, + { + "epoch": 0.04970129962248686, + "grad_norm": 2.9447174072265625, + "learning_rate": 9.997571844180131e-06, + "loss": 3.9095, + "step": 50450 + }, + { + "epoch": 0.04975055760030895, + "grad_norm": 2.6926653385162354, + "learning_rate": 9.997567019527322e-06, + 
"loss": 3.8409, + "step": 50500 + }, + { + "epoch": 0.04979981557813103, + "grad_norm": 2.8311376571655273, + "learning_rate": 9.997562190087233e-06, + "loss": 3.8578, + "step": 50550 + }, + { + "epoch": 0.049849073555953125, + "grad_norm": 2.9068715572357178, + "learning_rate": 9.997557355859872e-06, + "loss": 3.8501, + "step": 50600 + }, + { + "epoch": 0.04989833153377521, + "grad_norm": 2.7963788509368896, + "learning_rate": 9.997552516845241e-06, + "loss": 3.873, + "step": 50650 + }, + { + "epoch": 0.0499475895115973, + "grad_norm": 2.8737502098083496, + "learning_rate": 9.997547673043348e-06, + "loss": 3.8366, + "step": 50700 + }, + { + "epoch": 0.049996847489419384, + "grad_norm": 2.575472116470337, + "learning_rate": 9.997542824454195e-06, + "loss": 3.8675, + "step": 50750 + }, + { + "epoch": 0.050046105467241475, + "grad_norm": 3.202331781387329, + "learning_rate": 9.997537971077787e-06, + "loss": 3.9172, + "step": 50800 + }, + { + "epoch": 0.05009536344506356, + "grad_norm": 3.1182198524475098, + "learning_rate": 9.99753311291413e-06, + "loss": 3.815, + "step": 50850 + }, + { + "epoch": 0.05014462142288565, + "grad_norm": 2.7427101135253906, + "learning_rate": 9.997528249963225e-06, + "loss": 3.864, + "step": 50900 + }, + { + "epoch": 0.05019387940070774, + "grad_norm": 2.807387113571167, + "learning_rate": 9.997523382225081e-06, + "loss": 3.8465, + "step": 50950 + }, + { + "epoch": 0.050243137378529826, + "grad_norm": 2.960515022277832, + "learning_rate": 9.997518509699702e-06, + "loss": 3.8985, + "step": 51000 + }, + { + "epoch": 0.05029239535635192, + "grad_norm": 3.088902235031128, + "learning_rate": 9.99751363238709e-06, + "loss": 3.8436, + "step": 51050 + }, + { + "epoch": 0.050341653334174, + "grad_norm": 2.700357675552368, + "learning_rate": 9.997508750287252e-06, + "loss": 3.8459, + "step": 51100 + }, + { + "epoch": 0.05039091131199609, + "grad_norm": 2.8286375999450684, + "learning_rate": 9.99750386340019e-06, + "loss": 3.8428, + "step": 51150 + 
}, + { + "epoch": 0.050440169289818176, + "grad_norm": 2.547328472137451, + "learning_rate": 9.997498971725912e-06, + "loss": 3.9117, + "step": 51200 + }, + { + "epoch": 0.05048942726764027, + "grad_norm": 2.795450210571289, + "learning_rate": 9.997494075264423e-06, + "loss": 3.8858, + "step": 51250 + }, + { + "epoch": 0.05053868524546235, + "grad_norm": 3.2690069675445557, + "learning_rate": 9.997489174015725e-06, + "loss": 3.8855, + "step": 51300 + }, + { + "epoch": 0.05058794322328444, + "grad_norm": 2.7202744483947754, + "learning_rate": 9.997484267979822e-06, + "loss": 3.8259, + "step": 51350 + }, + { + "epoch": 0.050637201201106534, + "grad_norm": 2.6972599029541016, + "learning_rate": 9.997479357156722e-06, + "loss": 3.841, + "step": 51400 + }, + { + "epoch": 0.05068645917892862, + "grad_norm": 3.017165422439575, + "learning_rate": 9.997474441546428e-06, + "loss": 3.7995, + "step": 51450 + }, + { + "epoch": 0.05073571715675071, + "grad_norm": 2.9616281986236572, + "learning_rate": 9.997469521148944e-06, + "loss": 3.8514, + "step": 51500 + }, + { + "epoch": 0.05078497513457279, + "grad_norm": 2.6211206912994385, + "learning_rate": 9.997464595964278e-06, + "loss": 3.8974, + "step": 51550 + }, + { + "epoch": 0.050834233112394885, + "grad_norm": 2.8088982105255127, + "learning_rate": 9.99745966599243e-06, + "loss": 3.8445, + "step": 51600 + }, + { + "epoch": 0.05088349109021697, + "grad_norm": 2.513404369354248, + "learning_rate": 9.997454731233407e-06, + "loss": 3.8675, + "step": 51650 + }, + { + "epoch": 0.05093274906803906, + "grad_norm": 2.5563173294067383, + "learning_rate": 9.997449791687214e-06, + "loss": 3.8445, + "step": 51700 + }, + { + "epoch": 0.05098200704586115, + "grad_norm": 2.7279183864593506, + "learning_rate": 9.997444847353855e-06, + "loss": 3.8824, + "step": 51750 + }, + { + "epoch": 0.051031265023683235, + "grad_norm": 2.6972129344940186, + "learning_rate": 9.997439898233336e-06, + "loss": 3.9101, + "step": 51800 + }, + { + "epoch": 
0.051080523001505326, + "grad_norm": 2.407561779022217, + "learning_rate": 9.997434944325662e-06, + "loss": 3.861, + "step": 51850 + }, + { + "epoch": 0.05112978097932741, + "grad_norm": 2.736022472381592, + "learning_rate": 9.997429985630834e-06, + "loss": 3.8725, + "step": 51900 + }, + { + "epoch": 0.0511790389571495, + "grad_norm": 2.6911637783050537, + "learning_rate": 9.997425022148862e-06, + "loss": 3.8678, + "step": 51950 + }, + { + "epoch": 0.051228296934971586, + "grad_norm": 2.6286942958831787, + "learning_rate": 9.997420053879745e-06, + "loss": 3.8596, + "step": 52000 + }, + { + "epoch": 0.05127755491279368, + "grad_norm": 2.5649330615997314, + "learning_rate": 9.997415080823494e-06, + "loss": 3.9091, + "step": 52050 + }, + { + "epoch": 0.05132681289061576, + "grad_norm": 2.964287519454956, + "learning_rate": 9.99741010298011e-06, + "loss": 3.8209, + "step": 52100 + }, + { + "epoch": 0.05137607086843785, + "grad_norm": 2.688326358795166, + "learning_rate": 9.997405120349598e-06, + "loss": 3.8308, + "step": 52150 + }, + { + "epoch": 0.05142532884625994, + "grad_norm": 3.0511207580566406, + "learning_rate": 9.997400132931962e-06, + "loss": 3.8628, + "step": 52200 + }, + { + "epoch": 0.05147458682408203, + "grad_norm": 2.576521158218384, + "learning_rate": 9.997395140727209e-06, + "loss": 3.8031, + "step": 52250 + }, + { + "epoch": 0.05152384480190412, + "grad_norm": 2.754725694656372, + "learning_rate": 9.997390143735345e-06, + "loss": 3.8878, + "step": 52300 + }, + { + "epoch": 0.0515731027797262, + "grad_norm": 2.7631216049194336, + "learning_rate": 9.997385141956368e-06, + "loss": 3.7406, + "step": 52350 + }, + { + "epoch": 0.051622360757548294, + "grad_norm": 2.752963066101074, + "learning_rate": 9.99738013539029e-06, + "loss": 3.7946, + "step": 52400 + }, + { + "epoch": 0.05167161873537038, + "grad_norm": 2.733215093612671, + "learning_rate": 9.997375124037113e-06, + "loss": 3.8472, + "step": 52450 + }, + { + "epoch": 0.05172087671319247, + 
"grad_norm": 3.5411031246185303, + "learning_rate": 9.997370107896842e-06, + "loss": 3.7764, + "step": 52500 + }, + { + "epoch": 0.051770134691014554, + "grad_norm": 2.967059373855591, + "learning_rate": 9.997365086969482e-06, + "loss": 3.8453, + "step": 52550 + }, + { + "epoch": 0.051819392668836645, + "grad_norm": 2.7348504066467285, + "learning_rate": 9.997360061255037e-06, + "loss": 3.849, + "step": 52600 + }, + { + "epoch": 0.051868650646658736, + "grad_norm": 2.618776559829712, + "learning_rate": 9.997355030753512e-06, + "loss": 3.8766, + "step": 52650 + }, + { + "epoch": 0.05191790862448082, + "grad_norm": 2.9408116340637207, + "learning_rate": 9.997349995464913e-06, + "loss": 3.8434, + "step": 52700 + }, + { + "epoch": 0.05196716660230291, + "grad_norm": 2.8255062103271484, + "learning_rate": 9.997344955389246e-06, + "loss": 3.7812, + "step": 52750 + }, + { + "epoch": 0.052016424580124995, + "grad_norm": 2.517199754714966, + "learning_rate": 9.997339910526512e-06, + "loss": 3.8116, + "step": 52800 + }, + { + "epoch": 0.052065682557947086, + "grad_norm": 3.239884853363037, + "learning_rate": 9.997334860876716e-06, + "loss": 3.8507, + "step": 52850 + }, + { + "epoch": 0.05211494053576917, + "grad_norm": 3.4149253368377686, + "learning_rate": 9.997329806439867e-06, + "loss": 3.8211, + "step": 52900 + }, + { + "epoch": 0.05216419851359126, + "grad_norm": 2.7143008708953857, + "learning_rate": 9.997324747215965e-06, + "loss": 3.8318, + "step": 52950 + }, + { + "epoch": 0.052213456491413346, + "grad_norm": 3.0660479068756104, + "learning_rate": 9.997319683205019e-06, + "loss": 3.8197, + "step": 53000 + }, + { + "epoch": 0.05226271446923544, + "grad_norm": 2.659954786300659, + "learning_rate": 9.997314614407032e-06, + "loss": 3.8348, + "step": 53050 + }, + { + "epoch": 0.05231197244705753, + "grad_norm": 2.7625796794891357, + "learning_rate": 9.997309540822008e-06, + "loss": 3.8442, + "step": 53100 + }, + { + "epoch": 0.05236123042487961, + "grad_norm": 
2.7676100730895996, + "learning_rate": 9.997304462449954e-06, + "loss": 3.8337, + "step": 53150 + }, + { + "epoch": 0.0524104884027017, + "grad_norm": 2.8207457065582275, + "learning_rate": 9.997299379290873e-06, + "loss": 3.7716, + "step": 53200 + }, + { + "epoch": 0.05245974638052379, + "grad_norm": 3.1832785606384277, + "learning_rate": 9.99729429134477e-06, + "loss": 3.8644, + "step": 53250 + }, + { + "epoch": 0.05250900435834588, + "grad_norm": 2.90838360786438, + "learning_rate": 9.99728919861165e-06, + "loss": 3.8319, + "step": 53300 + }, + { + "epoch": 0.05255826233616796, + "grad_norm": 3.9564828872680664, + "learning_rate": 9.99728410109152e-06, + "loss": 3.8446, + "step": 53350 + }, + { + "epoch": 0.052607520313990054, + "grad_norm": 2.7111916542053223, + "learning_rate": 9.997278998784383e-06, + "loss": 3.836, + "step": 53400 + }, + { + "epoch": 0.052656778291812145, + "grad_norm": 2.811368465423584, + "learning_rate": 9.997273891690243e-06, + "loss": 3.8567, + "step": 53450 + }, + { + "epoch": 0.05270603626963423, + "grad_norm": 2.774480104446411, + "learning_rate": 9.997268779809107e-06, + "loss": 3.8137, + "step": 53500 + }, + { + "epoch": 0.05275529424745632, + "grad_norm": 2.6644179821014404, + "learning_rate": 9.997263663140977e-06, + "loss": 3.8439, + "step": 53550 + }, + { + "epoch": 0.052804552225278405, + "grad_norm": 2.5807442665100098, + "learning_rate": 9.99725854168586e-06, + "loss": 3.8733, + "step": 53600 + }, + { + "epoch": 0.052853810203100496, + "grad_norm": 2.9254322052001953, + "learning_rate": 9.997253415443763e-06, + "loss": 3.7994, + "step": 53650 + }, + { + "epoch": 0.05290306818092258, + "grad_norm": 2.7968738079071045, + "learning_rate": 9.997248284414686e-06, + "loss": 3.8004, + "step": 53700 + }, + { + "epoch": 0.05295232615874467, + "grad_norm": 2.5174827575683594, + "learning_rate": 9.997243148598638e-06, + "loss": 3.8081, + "step": 53750 + }, + { + "epoch": 0.053001584136566755, + "grad_norm": 3.047924280166626, + 
"learning_rate": 9.997238007995622e-06, + "loss": 3.7518, + "step": 53800 + }, + { + "epoch": 0.053050842114388846, + "grad_norm": 2.904470682144165, + "learning_rate": 9.997232862605644e-06, + "loss": 3.7937, + "step": 53850 + }, + { + "epoch": 0.05310010009221094, + "grad_norm": 2.952720880508423, + "learning_rate": 9.997227712428706e-06, + "loss": 3.8555, + "step": 53900 + }, + { + "epoch": 0.05314935807003302, + "grad_norm": 2.68854022026062, + "learning_rate": 9.997222557464817e-06, + "loss": 3.8202, + "step": 53950 + }, + { + "epoch": 0.05319861604785511, + "grad_norm": 2.819537878036499, + "learning_rate": 9.99721739771398e-06, + "loss": 3.8291, + "step": 54000 + }, + { + "epoch": 0.0532478740256772, + "grad_norm": 2.603846788406372, + "learning_rate": 9.9972122331762e-06, + "loss": 3.8093, + "step": 54050 + }, + { + "epoch": 0.05329713200349929, + "grad_norm": 2.9189209938049316, + "learning_rate": 9.997207063851482e-06, + "loss": 3.7898, + "step": 54100 + }, + { + "epoch": 0.05334638998132137, + "grad_norm": 2.524442672729492, + "learning_rate": 9.99720188973983e-06, + "loss": 3.7895, + "step": 54150 + }, + { + "epoch": 0.053395647959143464, + "grad_norm": 2.7834534645080566, + "learning_rate": 9.997196710841251e-06, + "loss": 3.7529, + "step": 54200 + }, + { + "epoch": 0.05344490593696555, + "grad_norm": 2.882136821746826, + "learning_rate": 9.997191527155747e-06, + "loss": 3.839, + "step": 54250 + }, + { + "epoch": 0.05349416391478764, + "grad_norm": 2.5844919681549072, + "learning_rate": 9.997186338683327e-06, + "loss": 3.7814, + "step": 54300 + }, + { + "epoch": 0.05354342189260973, + "grad_norm": 2.851780891418457, + "learning_rate": 9.997181145423993e-06, + "loss": 3.8179, + "step": 54350 + }, + { + "epoch": 0.053592679870431814, + "grad_norm": 2.8690907955169678, + "learning_rate": 9.99717594737775e-06, + "loss": 3.7966, + "step": 54400 + }, + { + "epoch": 0.053641937848253905, + "grad_norm": 2.7862234115600586, + "learning_rate": 
9.997170744544606e-06, + "loss": 3.8176, + "step": 54450 + }, + { + "epoch": 0.05369119582607599, + "grad_norm": 2.701183319091797, + "learning_rate": 9.997165536924562e-06, + "loss": 3.7946, + "step": 54500 + }, + { + "epoch": 0.05374045380389808, + "grad_norm": 2.8167624473571777, + "learning_rate": 9.997160324517625e-06, + "loss": 3.7749, + "step": 54550 + }, + { + "epoch": 0.053789711781720165, + "grad_norm": 2.747720718383789, + "learning_rate": 9.9971551073238e-06, + "loss": 3.7775, + "step": 54600 + }, + { + "epoch": 0.053838969759542256, + "grad_norm": 2.782846212387085, + "learning_rate": 9.99714988534309e-06, + "loss": 3.8121, + "step": 54650 + }, + { + "epoch": 0.05388822773736434, + "grad_norm": 2.709993839263916, + "learning_rate": 9.997144658575504e-06, + "loss": 3.789, + "step": 54700 + }, + { + "epoch": 0.05393748571518643, + "grad_norm": 2.776766538619995, + "learning_rate": 9.997139427021045e-06, + "loss": 3.7715, + "step": 54750 + }, + { + "epoch": 0.05398674369300852, + "grad_norm": 2.906484842300415, + "learning_rate": 9.997134190679716e-06, + "loss": 3.8935, + "step": 54800 + }, + { + "epoch": 0.054036001670830607, + "grad_norm": 2.7916693687438965, + "learning_rate": 9.997128949551525e-06, + "loss": 3.7786, + "step": 54850 + }, + { + "epoch": 0.0540852596486527, + "grad_norm": 2.716115951538086, + "learning_rate": 9.997123703636475e-06, + "loss": 3.8842, + "step": 54900 + }, + { + "epoch": 0.05413451762647478, + "grad_norm": 3.2908172607421875, + "learning_rate": 9.997118452934572e-06, + "loss": 3.7798, + "step": 54950 + }, + { + "epoch": 0.05418377560429687, + "grad_norm": 2.5399866104125977, + "learning_rate": 9.99711319744582e-06, + "loss": 3.8102, + "step": 55000 + }, + { + "epoch": 0.05423303358211896, + "grad_norm": 2.7346315383911133, + "learning_rate": 9.997107937170228e-06, + "loss": 3.8097, + "step": 55050 + }, + { + "epoch": 0.05428229155994105, + "grad_norm": 2.875230312347412, + "learning_rate": 9.997102672107794e-06, + "loss": 
3.8519, + "step": 55100 + }, + { + "epoch": 0.05433154953776314, + "grad_norm": 2.673006296157837, + "learning_rate": 9.997097402258528e-06, + "loss": 3.7879, + "step": 55150 + }, + { + "epoch": 0.054380807515585224, + "grad_norm": 2.6999597549438477, + "learning_rate": 9.997092127622435e-06, + "loss": 3.8414, + "step": 55200 + }, + { + "epoch": 0.054430065493407315, + "grad_norm": 2.8893816471099854, + "learning_rate": 9.997086848199518e-06, + "loss": 3.7944, + "step": 55250 + }, + { + "epoch": 0.0544793234712294, + "grad_norm": 2.879107713699341, + "learning_rate": 9.997081563989784e-06, + "loss": 3.792, + "step": 55300 + }, + { + "epoch": 0.05452858144905149, + "grad_norm": 2.9256958961486816, + "learning_rate": 9.997076274993238e-06, + "loss": 3.8202, + "step": 55350 + }, + { + "epoch": 0.054577839426873574, + "grad_norm": 2.9391939640045166, + "learning_rate": 9.997070981209882e-06, + "loss": 3.8124, + "step": 55400 + }, + { + "epoch": 0.054627097404695665, + "grad_norm": 3.245795488357544, + "learning_rate": 9.997065682639724e-06, + "loss": 3.8092, + "step": 55450 + }, + { + "epoch": 0.05467635538251775, + "grad_norm": 2.8257791996002197, + "learning_rate": 9.99706037928277e-06, + "loss": 3.8236, + "step": 55500 + }, + { + "epoch": 0.05472561336033984, + "grad_norm": 2.634209156036377, + "learning_rate": 9.99705507113902e-06, + "loss": 3.7884, + "step": 55550 + }, + { + "epoch": 0.05477487133816193, + "grad_norm": 2.9316630363464355, + "learning_rate": 9.997049758208487e-06, + "loss": 3.8354, + "step": 55600 + }, + { + "epoch": 0.054824129315984016, + "grad_norm": 2.789832830429077, + "learning_rate": 9.99704444049117e-06, + "loss": 3.8054, + "step": 55650 + }, + { + "epoch": 0.05487338729380611, + "grad_norm": 2.7652952671051025, + "learning_rate": 9.997039117987074e-06, + "loss": 3.8282, + "step": 55700 + }, + { + "epoch": 0.05492264527162819, + "grad_norm": 3.1302547454833984, + "learning_rate": 9.997033790696206e-06, + "loss": 3.801, + "step": 55750 + }, 
+ { + "epoch": 0.05497190324945028, + "grad_norm": 2.7514939308166504, + "learning_rate": 9.997028458618574e-06, + "loss": 3.765, + "step": 55800 + }, + { + "epoch": 0.05502116122727237, + "grad_norm": 2.4372856616973877, + "learning_rate": 9.997023121754177e-06, + "loss": 3.789, + "step": 55850 + }, + { + "epoch": 0.05507041920509446, + "grad_norm": 2.518200159072876, + "learning_rate": 9.997017780103024e-06, + "loss": 3.7629, + "step": 55900 + }, + { + "epoch": 0.05511967718291654, + "grad_norm": 2.8256750106811523, + "learning_rate": 9.99701243366512e-06, + "loss": 3.8043, + "step": 55950 + }, + { + "epoch": 0.05516893516073863, + "grad_norm": 3.1529719829559326, + "learning_rate": 9.997007082440468e-06, + "loss": 3.8362, + "step": 56000 + }, + { + "epoch": 0.055218193138560724, + "grad_norm": 2.9714698791503906, + "learning_rate": 9.997001726429075e-06, + "loss": 3.8361, + "step": 56050 + }, + { + "epoch": 0.05526745111638281, + "grad_norm": 2.7263238430023193, + "learning_rate": 9.996996365630947e-06, + "loss": 3.7947, + "step": 56100 + }, + { + "epoch": 0.0553167090942049, + "grad_norm": 2.9313087463378906, + "learning_rate": 9.996991000046085e-06, + "loss": 3.7969, + "step": 56150 + }, + { + "epoch": 0.055365967072026984, + "grad_norm": 3.372980833053589, + "learning_rate": 9.996985629674501e-06, + "loss": 3.8201, + "step": 56200 + }, + { + "epoch": 0.055415225049849075, + "grad_norm": 2.9852516651153564, + "learning_rate": 9.996980254516194e-06, + "loss": 3.8138, + "step": 56250 + }, + { + "epoch": 0.05546448302767116, + "grad_norm": 2.5766215324401855, + "learning_rate": 9.996974874571169e-06, + "loss": 3.7994, + "step": 56300 + }, + { + "epoch": 0.05551374100549325, + "grad_norm": 2.7387025356292725, + "learning_rate": 9.996969489839436e-06, + "loss": 3.7767, + "step": 56350 + }, + { + "epoch": 0.05556299898331534, + "grad_norm": 2.615220785140991, + "learning_rate": 9.996964100320996e-06, + "loss": 3.8271, + "step": 56400 + }, + { + "epoch": 
0.055612256961137425, + "grad_norm": 2.8956141471862793, + "learning_rate": 9.996958706015857e-06, + "loss": 3.8913, + "step": 56450 + }, + { + "epoch": 0.055661514938959517, + "grad_norm": 2.5934765338897705, + "learning_rate": 9.996953306924022e-06, + "loss": 3.8133, + "step": 56500 + }, + { + "epoch": 0.0557107729167816, + "grad_norm": 2.5480055809020996, + "learning_rate": 9.996947903045497e-06, + "loss": 3.76, + "step": 56550 + }, + { + "epoch": 0.05576003089460369, + "grad_norm": 2.7125661373138428, + "learning_rate": 9.996942494380289e-06, + "loss": 3.8328, + "step": 56600 + }, + { + "epoch": 0.055809288872425776, + "grad_norm": 2.738781452178955, + "learning_rate": 9.9969370809284e-06, + "loss": 3.8444, + "step": 56650 + }, + { + "epoch": 0.05585854685024787, + "grad_norm": 2.7964601516723633, + "learning_rate": 9.996931662689835e-06, + "loss": 3.8459, + "step": 56700 + }, + { + "epoch": 0.05590780482806995, + "grad_norm": 2.6525187492370605, + "learning_rate": 9.996926239664603e-06, + "loss": 3.8073, + "step": 56750 + }, + { + "epoch": 0.05595706280589204, + "grad_norm": 2.7038049697875977, + "learning_rate": 9.996920811852707e-06, + "loss": 3.7618, + "step": 56800 + }, + { + "epoch": 0.056006320783714134, + "grad_norm": 3.1239099502563477, + "learning_rate": 9.99691537925415e-06, + "loss": 3.8331, + "step": 56850 + }, + { + "epoch": 0.05605557876153622, + "grad_norm": 2.738351821899414, + "learning_rate": 9.99690994186894e-06, + "loss": 3.8287, + "step": 56900 + }, + { + "epoch": 0.05610483673935831, + "grad_norm": 2.715238094329834, + "learning_rate": 9.996904499697084e-06, + "loss": 3.7652, + "step": 56950 + }, + { + "epoch": 0.05615409471718039, + "grad_norm": 2.8009490966796875, + "learning_rate": 9.996899052738582e-06, + "loss": 3.8043, + "step": 57000 + }, + { + "epoch": 0.056203352695002484, + "grad_norm": 2.6868886947631836, + "learning_rate": 9.996893600993442e-06, + "loss": 3.7626, + "step": 57050 + }, + { + "epoch": 0.05625261067282457, + 
"grad_norm": 2.589235305786133, + "learning_rate": 9.996888144461672e-06, + "loss": 3.7993, + "step": 57100 + }, + { + "epoch": 0.05630186865064666, + "grad_norm": 3.101914882659912, + "learning_rate": 9.996882683143271e-06, + "loss": 3.7456, + "step": 57150 + }, + { + "epoch": 0.056351126628468744, + "grad_norm": 2.593919515609741, + "learning_rate": 9.99687721703825e-06, + "loss": 3.8419, + "step": 57200 + }, + { + "epoch": 0.056400384606290835, + "grad_norm": 2.862473487854004, + "learning_rate": 9.996871746146612e-06, + "loss": 3.8377, + "step": 57250 + }, + { + "epoch": 0.056449642584112926, + "grad_norm": 2.76829195022583, + "learning_rate": 9.996866270468362e-06, + "loss": 3.7794, + "step": 57300 + }, + { + "epoch": 0.05649890056193501, + "grad_norm": 2.933870792388916, + "learning_rate": 9.996860790003504e-06, + "loss": 3.7912, + "step": 57350 + }, + { + "epoch": 0.0565481585397571, + "grad_norm": 2.7253201007843018, + "learning_rate": 9.996855304752045e-06, + "loss": 3.8249, + "step": 57400 + }, + { + "epoch": 0.056597416517579185, + "grad_norm": 2.575054883956909, + "learning_rate": 9.996849814713991e-06, + "loss": 3.7387, + "step": 57450 + }, + { + "epoch": 0.05664667449540128, + "grad_norm": 2.791195869445801, + "learning_rate": 9.996844319889346e-06, + "loss": 3.7993, + "step": 57500 + }, + { + "epoch": 0.05669593247322336, + "grad_norm": 2.7499754428863525, + "learning_rate": 9.996838820278113e-06, + "loss": 3.7296, + "step": 57550 + }, + { + "epoch": 0.05674519045104545, + "grad_norm": 2.7251698970794678, + "learning_rate": 9.996833315880303e-06, + "loss": 3.8121, + "step": 57600 + }, + { + "epoch": 0.056794448428867536, + "grad_norm": 3.0628533363342285, + "learning_rate": 9.996827806695916e-06, + "loss": 3.8075, + "step": 57650 + }, + { + "epoch": 0.05684370640668963, + "grad_norm": 3.40836238861084, + "learning_rate": 9.996822292724962e-06, + "loss": 3.7851, + "step": 57700 + }, + { + "epoch": 0.05689296438451172, + "grad_norm": 
2.9646127223968506, + "learning_rate": 9.99681677396744e-06, + "loss": 3.7653, + "step": 57750 + }, + { + "epoch": 0.0569422223623338, + "grad_norm": 2.5266225337982178, + "learning_rate": 9.996811250423362e-06, + "loss": 3.7511, + "step": 57800 + }, + { + "epoch": 0.056991480340155894, + "grad_norm": 2.5867815017700195, + "learning_rate": 9.996805722092727e-06, + "loss": 3.8135, + "step": 57850 + }, + { + "epoch": 0.05704073831797798, + "grad_norm": 2.687371253967285, + "learning_rate": 9.996800188975545e-06, + "loss": 3.7901, + "step": 57900 + }, + { + "epoch": 0.05708999629580007, + "grad_norm": 2.585845947265625, + "learning_rate": 9.99679465107182e-06, + "loss": 3.7911, + "step": 57950 + }, + { + "epoch": 0.05713925427362215, + "grad_norm": 2.6444971561431885, + "learning_rate": 9.996789108381556e-06, + "loss": 3.7977, + "step": 58000 + }, + { + "epoch": 0.057188512251444244, + "grad_norm": 3.2465310096740723, + "learning_rate": 9.99678356090476e-06, + "loss": 3.7994, + "step": 58050 + }, + { + "epoch": 0.057237770229266335, + "grad_norm": 2.729257822036743, + "learning_rate": 9.996778008641437e-06, + "loss": 3.767, + "step": 58100 + }, + { + "epoch": 0.05728702820708842, + "grad_norm": 2.985215425491333, + "learning_rate": 9.99677245159159e-06, + "loss": 3.7526, + "step": 58150 + }, + { + "epoch": 0.05733628618491051, + "grad_norm": 2.9066357612609863, + "learning_rate": 9.996766889755228e-06, + "loss": 3.7897, + "step": 58200 + }, + { + "epoch": 0.057385544162732595, + "grad_norm": 2.7879366874694824, + "learning_rate": 9.996761323132355e-06, + "loss": 3.7648, + "step": 58250 + }, + { + "epoch": 0.057434802140554686, + "grad_norm": 2.8415310382843018, + "learning_rate": 9.996755751722975e-06, + "loss": 3.8156, + "step": 58300 + }, + { + "epoch": 0.05748406011837677, + "grad_norm": 2.735386371612549, + "learning_rate": 9.996750175527094e-06, + "loss": 3.7303, + "step": 58350 + }, + { + "epoch": 0.05753331809619886, + "grad_norm": 2.484065055847168, + 
"learning_rate": 9.996744594544719e-06, + "loss": 3.7211, + "step": 58400 + }, + { + "epoch": 0.057582576074020946, + "grad_norm": 2.826388120651245, + "learning_rate": 9.996739008775851e-06, + "loss": 3.7656, + "step": 58450 + }, + { + "epoch": 0.05763183405184304, + "grad_norm": 2.840162754058838, + "learning_rate": 9.9967334182205e-06, + "loss": 3.7816, + "step": 58500 + }, + { + "epoch": 0.05768109202966513, + "grad_norm": 3.03169322013855, + "learning_rate": 9.996727822878672e-06, + "loss": 3.7509, + "step": 58550 + }, + { + "epoch": 0.05773035000748721, + "grad_norm": 2.7932119369506836, + "learning_rate": 9.996722222750369e-06, + "loss": 3.7482, + "step": 58600 + }, + { + "epoch": 0.0577796079853093, + "grad_norm": 2.6114325523376465, + "learning_rate": 9.996716617835595e-06, + "loss": 3.826, + "step": 58650 + }, + { + "epoch": 0.05782886596313139, + "grad_norm": 2.7125561237335205, + "learning_rate": 9.99671100813436e-06, + "loss": 3.7955, + "step": 58700 + }, + { + "epoch": 0.05787812394095348, + "grad_norm": 2.8667004108428955, + "learning_rate": 9.996705393646666e-06, + "loss": 3.6536, + "step": 58750 + }, + { + "epoch": 0.05792738191877556, + "grad_norm": 2.8250913619995117, + "learning_rate": 9.996699774372519e-06, + "loss": 3.8288, + "step": 58800 + }, + { + "epoch": 0.057976639896597654, + "grad_norm": 2.67215633392334, + "learning_rate": 9.996694150311925e-06, + "loss": 3.7684, + "step": 58850 + }, + { + "epoch": 0.05802589787441974, + "grad_norm": 2.6591062545776367, + "learning_rate": 9.99668852146489e-06, + "loss": 3.8017, + "step": 58900 + }, + { + "epoch": 0.05807515585224183, + "grad_norm": 2.645397424697876, + "learning_rate": 9.996682887831419e-06, + "loss": 3.8416, + "step": 58950 + }, + { + "epoch": 0.05812441383006392, + "grad_norm": 3.108062982559204, + "learning_rate": 9.996677249411517e-06, + "loss": 3.7391, + "step": 59000 + }, + { + "epoch": 0.058173671807886004, + "grad_norm": 3.0808019638061523, + "learning_rate": 
9.99667160620519e-06, + "loss": 3.7458, + "step": 59050 + }, + { + "epoch": 0.058222929785708095, + "grad_norm": 2.9304213523864746, + "learning_rate": 9.996665958212442e-06, + "loss": 3.7824, + "step": 59100 + }, + { + "epoch": 0.05827218776353018, + "grad_norm": 2.5487077236175537, + "learning_rate": 9.996660305433278e-06, + "loss": 3.8018, + "step": 59150 + }, + { + "epoch": 0.05832144574135227, + "grad_norm": 2.507504940032959, + "learning_rate": 9.996654647867708e-06, + "loss": 3.7701, + "step": 59200 + }, + { + "epoch": 0.058370703719174355, + "grad_norm": 2.793212413787842, + "learning_rate": 9.99664898551573e-06, + "loss": 3.7715, + "step": 59250 + }, + { + "epoch": 0.058419961696996446, + "grad_norm": 2.6383514404296875, + "learning_rate": 9.996643318377357e-06, + "loss": 3.829, + "step": 59300 + }, + { + "epoch": 0.05846921967481853, + "grad_norm": 3.466677665710449, + "learning_rate": 9.996637646452589e-06, + "loss": 3.7341, + "step": 59350 + }, + { + "epoch": 0.05851847765264062, + "grad_norm": 2.646310567855835, + "learning_rate": 9.996631969741435e-06, + "loss": 3.7494, + "step": 59400 + }, + { + "epoch": 0.05856773563046271, + "grad_norm": 2.6264052391052246, + "learning_rate": 9.996626288243897e-06, + "loss": 3.7765, + "step": 59450 + }, + { + "epoch": 0.0586169936082848, + "grad_norm": 2.5519556999206543, + "learning_rate": 9.996620601959983e-06, + "loss": 3.8204, + "step": 59500 + }, + { + "epoch": 0.05866625158610689, + "grad_norm": 2.620962619781494, + "learning_rate": 9.996614910889697e-06, + "loss": 3.8232, + "step": 59550 + }, + { + "epoch": 0.05871550956392897, + "grad_norm": 2.7627553939819336, + "learning_rate": 9.996609215033047e-06, + "loss": 3.79, + "step": 59600 + }, + { + "epoch": 0.05876476754175106, + "grad_norm": 2.370274066925049, + "learning_rate": 9.996603514390036e-06, + "loss": 3.7167, + "step": 59650 + }, + { + "epoch": 0.05881402551957315, + "grad_norm": 2.7622532844543457, + "learning_rate": 9.99659780896067e-06, + "loss": 
3.7203, + "step": 59700 + }, + { + "epoch": 0.05886328349739524, + "grad_norm": 2.633330821990967, + "learning_rate": 9.996592098744955e-06, + "loss": 3.7304, + "step": 59750 + }, + { + "epoch": 0.05891254147521733, + "grad_norm": 3.5819284915924072, + "learning_rate": 9.996586383742895e-06, + "loss": 3.7869, + "step": 59800 + }, + { + "epoch": 0.058961799453039414, + "grad_norm": 3.0921247005462646, + "learning_rate": 9.996580663954496e-06, + "loss": 3.7221, + "step": 59850 + }, + { + "epoch": 0.059011057430861505, + "grad_norm": 2.8721766471862793, + "learning_rate": 9.996574939379766e-06, + "loss": 3.7516, + "step": 59900 + }, + { + "epoch": 0.05906031540868359, + "grad_norm": 2.776221990585327, + "learning_rate": 9.996569210018708e-06, + "loss": 3.7547, + "step": 59950 + }, + { + "epoch": 0.05910957338650568, + "grad_norm": 2.741987466812134, + "learning_rate": 9.996563475871327e-06, + "loss": 3.7508, + "step": 60000 + }, + { + "epoch": 0.059158831364327764, + "grad_norm": 2.4582152366638184, + "learning_rate": 9.996557736937628e-06, + "loss": 3.7539, + "step": 60050 + }, + { + "epoch": 0.059208089342149856, + "grad_norm": 2.9588818550109863, + "learning_rate": 9.99655199321762e-06, + "loss": 3.8448, + "step": 60100 + }, + { + "epoch": 0.05925734731997194, + "grad_norm": 2.7777903079986572, + "learning_rate": 9.996546244711307e-06, + "loss": 3.7646, + "step": 60150 + }, + { + "epoch": 0.05930660529779403, + "grad_norm": 2.8564724922180176, + "learning_rate": 9.996540491418693e-06, + "loss": 3.7748, + "step": 60200 + }, + { + "epoch": 0.05935586327561612, + "grad_norm": 2.779534101486206, + "learning_rate": 9.996534733339785e-06, + "loss": 3.79, + "step": 60250 + }, + { + "epoch": 0.059405121253438206, + "grad_norm": 2.674382448196411, + "learning_rate": 9.996528970474589e-06, + "loss": 3.7342, + "step": 60300 + }, + { + "epoch": 0.0594543792312603, + "grad_norm": 2.8330423831939697, + "learning_rate": 9.996523202823108e-06, + "loss": 3.7874, + "step": 60350 + 
}, + { + "epoch": 0.05950363720908238, + "grad_norm": 2.768350601196289, + "learning_rate": 9.996517430385348e-06, + "loss": 3.7221, + "step": 60400 + }, + { + "epoch": 0.05955289518690447, + "grad_norm": 2.5900590419769287, + "learning_rate": 9.996511653161315e-06, + "loss": 3.7502, + "step": 60450 + }, + { + "epoch": 0.05960215316472656, + "grad_norm": 2.826695680618286, + "learning_rate": 9.996505871151018e-06, + "loss": 3.6925, + "step": 60500 + }, + { + "epoch": 0.05965141114254865, + "grad_norm": 2.7383453845977783, + "learning_rate": 9.99650008435446e-06, + "loss": 3.7626, + "step": 60550 + }, + { + "epoch": 0.05970066912037073, + "grad_norm": 3.161367177963257, + "learning_rate": 9.996494292771643e-06, + "loss": 3.7084, + "step": 60600 + }, + { + "epoch": 0.05974992709819282, + "grad_norm": 3.196807384490967, + "learning_rate": 9.996488496402578e-06, + "loss": 3.7395, + "step": 60650 + }, + { + "epoch": 0.059799185076014914, + "grad_norm": 2.693335771560669, + "learning_rate": 9.996482695247268e-06, + "loss": 3.7552, + "step": 60700 + }, + { + "epoch": 0.059848443053837, + "grad_norm": 2.7385921478271484, + "learning_rate": 9.996476889305718e-06, + "loss": 3.7376, + "step": 60750 + }, + { + "epoch": 0.05989770103165909, + "grad_norm": 2.898216962814331, + "learning_rate": 9.996471078577936e-06, + "loss": 3.7204, + "step": 60800 + }, + { + "epoch": 0.059946959009481174, + "grad_norm": 3.002398729324341, + "learning_rate": 9.996465263063923e-06, + "loss": 3.7399, + "step": 60850 + }, + { + "epoch": 0.059996216987303265, + "grad_norm": 2.7590394020080566, + "learning_rate": 9.99645944276369e-06, + "loss": 3.7711, + "step": 60900 + }, + { + "epoch": 0.06004547496512535, + "grad_norm": 2.7998249530792236, + "learning_rate": 9.99645361767724e-06, + "loss": 3.7286, + "step": 60950 + }, + { + "epoch": 0.06009473294294744, + "grad_norm": 2.8708784580230713, + "learning_rate": 9.996447787804577e-06, + "loss": 3.751, + "step": 61000 + }, + { + "epoch": 
0.060143990920769524, + "grad_norm": 2.7529208660125732, + "learning_rate": 9.99644195314571e-06, + "loss": 3.6778, + "step": 61050 + }, + { + "epoch": 0.060193248898591616, + "grad_norm": 2.7081685066223145, + "learning_rate": 9.996436113700643e-06, + "loss": 3.7249, + "step": 61100 + }, + { + "epoch": 0.06024250687641371, + "grad_norm": 3.028101921081543, + "learning_rate": 9.99643026946938e-06, + "loss": 3.6785, + "step": 61150 + }, + { + "epoch": 0.06029176485423579, + "grad_norm": 2.640751838684082, + "learning_rate": 9.996424420451929e-06, + "loss": 3.7471, + "step": 61200 + }, + { + "epoch": 0.06034102283205788, + "grad_norm": 2.8480353355407715, + "learning_rate": 9.996418566648295e-06, + "loss": 3.711, + "step": 61250 + }, + { + "epoch": 0.060390280809879966, + "grad_norm": 2.6508543491363525, + "learning_rate": 9.996412708058482e-06, + "loss": 3.6734, + "step": 61300 + }, + { + "epoch": 0.06043953878770206, + "grad_norm": 2.7539050579071045, + "learning_rate": 9.996406844682498e-06, + "loss": 3.7094, + "step": 61350 + }, + { + "epoch": 0.06048879676552414, + "grad_norm": 2.7134745121002197, + "learning_rate": 9.996400976520347e-06, + "loss": 3.713, + "step": 61400 + }, + { + "epoch": 0.06053805474334623, + "grad_norm": 2.97367262840271, + "learning_rate": 9.996395103572035e-06, + "loss": 3.7753, + "step": 61450 + }, + { + "epoch": 0.060587312721168324, + "grad_norm": 2.785409688949585, + "learning_rate": 9.996389225837569e-06, + "loss": 3.7404, + "step": 61500 + }, + { + "epoch": 0.06063657069899041, + "grad_norm": 2.693065881729126, + "learning_rate": 9.996383343316951e-06, + "loss": 3.7307, + "step": 61550 + }, + { + "epoch": 0.0606858286768125, + "grad_norm": 2.6083180904388428, + "learning_rate": 9.99637745601019e-06, + "loss": 3.7253, + "step": 61600 + }, + { + "epoch": 0.06073508665463458, + "grad_norm": 2.5235724449157715, + "learning_rate": 9.996371563917293e-06, + "loss": 3.7284, + "step": 61650 + }, + { + "epoch": 0.060784344632456674, + 
"grad_norm": 2.820072889328003, + "learning_rate": 9.99636566703826e-06, + "loss": 3.7328, + "step": 61700 + }, + { + "epoch": 0.06083360261027876, + "grad_norm": 2.5571258068084717, + "learning_rate": 9.996359765373102e-06, + "loss": 3.6691, + "step": 61750 + }, + { + "epoch": 0.06088286058810085, + "grad_norm": 2.911647081375122, + "learning_rate": 9.996353858921822e-06, + "loss": 3.6961, + "step": 61800 + }, + { + "epoch": 0.060932118565922934, + "grad_norm": 2.669754981994629, + "learning_rate": 9.996347947684425e-06, + "loss": 3.6748, + "step": 61850 + }, + { + "epoch": 0.060981376543745025, + "grad_norm": 2.7714967727661133, + "learning_rate": 9.996342031660919e-06, + "loss": 3.7938, + "step": 61900 + }, + { + "epoch": 0.061030634521567116, + "grad_norm": 2.675990104675293, + "learning_rate": 9.99633611085131e-06, + "loss": 3.7755, + "step": 61950 + }, + { + "epoch": 0.0610798924993892, + "grad_norm": 2.9145185947418213, + "learning_rate": 9.9963301852556e-06, + "loss": 3.7465, + "step": 62000 + }, + { + "epoch": 0.06112915047721129, + "grad_norm": 2.7616021633148193, + "learning_rate": 9.996324254873798e-06, + "loss": 3.7126, + "step": 62050 + }, + { + "epoch": 0.061178408455033376, + "grad_norm": 2.7643511295318604, + "learning_rate": 9.996318319705908e-06, + "loss": 3.7685, + "step": 62100 + }, + { + "epoch": 0.06122766643285547, + "grad_norm": 3.007451295852661, + "learning_rate": 9.996312379751937e-06, + "loss": 3.7003, + "step": 62150 + }, + { + "epoch": 0.06127692441067755, + "grad_norm": 2.5740201473236084, + "learning_rate": 9.996306435011889e-06, + "loss": 3.7065, + "step": 62200 + }, + { + "epoch": 0.06132618238849964, + "grad_norm": 2.876936912536621, + "learning_rate": 9.996300485485773e-06, + "loss": 3.7407, + "step": 62250 + }, + { + "epoch": 0.061375440366321726, + "grad_norm": 2.756286859512329, + "learning_rate": 9.996294531173588e-06, + "loss": 3.696, + "step": 62300 + }, + { + "epoch": 0.06142469834414382, + "grad_norm": 2.903721570968628, 
+ "learning_rate": 9.996288572075348e-06, + "loss": 3.678, + "step": 62350 + }, + { + "epoch": 0.06147395632196591, + "grad_norm": 2.936685085296631, + "learning_rate": 9.996282608191054e-06, + "loss": 3.7227, + "step": 62400 + }, + { + "epoch": 0.06152321429978799, + "grad_norm": 2.588590145111084, + "learning_rate": 9.996276639520713e-06, + "loss": 3.7181, + "step": 62450 + }, + { + "epoch": 0.061572472277610084, + "grad_norm": 2.789505958557129, + "learning_rate": 9.996270666064328e-06, + "loss": 3.6941, + "step": 62500 + }, + { + "epoch": 0.06162173025543217, + "grad_norm": 2.6898155212402344, + "learning_rate": 9.996264687821908e-06, + "loss": 3.7163, + "step": 62550 + }, + { + "epoch": 0.06167098823325426, + "grad_norm": 2.727890968322754, + "learning_rate": 9.996258704793459e-06, + "loss": 3.6795, + "step": 62600 + }, + { + "epoch": 0.06172024621107634, + "grad_norm": 2.646885633468628, + "learning_rate": 9.996252716978983e-06, + "loss": 3.6687, + "step": 62650 + }, + { + "epoch": 0.061769504188898434, + "grad_norm": 2.4528701305389404, + "learning_rate": 9.99624672437849e-06, + "loss": 3.7218, + "step": 62700 + }, + { + "epoch": 0.06181876216672052, + "grad_norm": 2.754321813583374, + "learning_rate": 9.996240726991984e-06, + "loss": 3.7014, + "step": 62750 + }, + { + "epoch": 0.06186802014454261, + "grad_norm": 2.4906885623931885, + "learning_rate": 9.99623472481947e-06, + "loss": 3.7453, + "step": 62800 + }, + { + "epoch": 0.0619172781223647, + "grad_norm": 2.6868820190429688, + "learning_rate": 9.996228717860954e-06, + "loss": 3.7176, + "step": 62850 + }, + { + "epoch": 0.061966536100186785, + "grad_norm": 2.894132614135742, + "learning_rate": 9.996222706116443e-06, + "loss": 3.732, + "step": 62900 + }, + { + "epoch": 0.062015794078008876, + "grad_norm": 2.8179545402526855, + "learning_rate": 9.996216689585941e-06, + "loss": 3.7555, + "step": 62950 + }, + { + "epoch": 0.06206505205583096, + "grad_norm": 3.151822805404663, + "learning_rate": 
9.996210668269454e-06, + "loss": 3.6918, + "step": 63000 + }, + { + "epoch": 0.06211431003365305, + "grad_norm": 2.6101737022399902, + "learning_rate": 9.99620464216699e-06, + "loss": 3.6697, + "step": 63050 + }, + { + "epoch": 0.062163568011475136, + "grad_norm": 2.518585443496704, + "learning_rate": 9.996198611278552e-06, + "loss": 3.7089, + "step": 63100 + }, + { + "epoch": 0.06221282598929723, + "grad_norm": 2.7241904735565186, + "learning_rate": 9.996192575604147e-06, + "loss": 3.6672, + "step": 63150 + }, + { + "epoch": 0.06226208396711932, + "grad_norm": 3.6932260990142822, + "learning_rate": 9.996186535143782e-06, + "loss": 3.7307, + "step": 63200 + }, + { + "epoch": 0.0623113419449414, + "grad_norm": 2.692938804626465, + "learning_rate": 9.996180489897461e-06, + "loss": 3.7466, + "step": 63250 + }, + { + "epoch": 0.06236059992276349, + "grad_norm": 2.5920846462249756, + "learning_rate": 9.99617443986519e-06, + "loss": 3.7592, + "step": 63300 + }, + { + "epoch": 0.06240985790058558, + "grad_norm": 3.367356777191162, + "learning_rate": 9.996168385046977e-06, + "loss": 3.7785, + "step": 63350 + }, + { + "epoch": 0.06245911587840767, + "grad_norm": 2.6570591926574707, + "learning_rate": 9.996162325442823e-06, + "loss": 3.7467, + "step": 63400 + }, + { + "epoch": 0.06250837385622976, + "grad_norm": 2.741964101791382, + "learning_rate": 9.996156261052738e-06, + "loss": 3.7198, + "step": 63450 + }, + { + "epoch": 0.06255763183405184, + "grad_norm": 2.8563601970672607, + "learning_rate": 9.996150191876726e-06, + "loss": 3.6756, + "step": 63500 + }, + { + "epoch": 0.06260688981187393, + "grad_norm": 2.790907621383667, + "learning_rate": 9.996144117914794e-06, + "loss": 3.7218, + "step": 63550 + }, + { + "epoch": 0.06265614778969601, + "grad_norm": 2.941011428833008, + "learning_rate": 9.996138039166947e-06, + "loss": 3.7259, + "step": 63600 + }, + { + "epoch": 0.06270540576751811, + "grad_norm": 2.8726089000701904, + "learning_rate": 9.99613195563319e-06, + "loss": 
3.6857, + "step": 63650 + }, + { + "epoch": 0.0627546637453402, + "grad_norm": 2.582637310028076, + "learning_rate": 9.99612586731353e-06, + "loss": 3.7596, + "step": 63700 + }, + { + "epoch": 0.06280392172316228, + "grad_norm": 2.6552093029022217, + "learning_rate": 9.996119774207974e-06, + "loss": 3.6819, + "step": 63750 + }, + { + "epoch": 0.06285317970098438, + "grad_norm": 2.7356131076812744, + "learning_rate": 9.996113676316526e-06, + "loss": 3.7675, + "step": 63800 + }, + { + "epoch": 0.06290243767880646, + "grad_norm": 2.638216257095337, + "learning_rate": 9.996107573639194e-06, + "loss": 3.7673, + "step": 63850 + }, + { + "epoch": 0.06295169565662855, + "grad_norm": 2.760004997253418, + "learning_rate": 9.99610146617598e-06, + "loss": 3.6771, + "step": 63900 + }, + { + "epoch": 0.06300095363445063, + "grad_norm": 4.588542938232422, + "learning_rate": 9.996095353926892e-06, + "loss": 3.7097, + "step": 63950 + }, + { + "epoch": 0.06305021161227273, + "grad_norm": 2.6513986587524414, + "learning_rate": 9.996089236891938e-06, + "loss": 3.7472, + "step": 64000 + }, + { + "epoch": 0.06309946959009481, + "grad_norm": 2.725025177001953, + "learning_rate": 9.996083115071117e-06, + "loss": 3.7337, + "step": 64050 + }, + { + "epoch": 0.0631487275679169, + "grad_norm": 2.6722593307495117, + "learning_rate": 9.996076988464444e-06, + "loss": 3.732, + "step": 64100 + }, + { + "epoch": 0.063197985545739, + "grad_norm": 2.9061412811279297, + "learning_rate": 9.99607085707192e-06, + "loss": 3.696, + "step": 64150 + }, + { + "epoch": 0.06324724352356108, + "grad_norm": 2.9670908451080322, + "learning_rate": 9.99606472089355e-06, + "loss": 3.7158, + "step": 64200 + }, + { + "epoch": 0.06329650150138316, + "grad_norm": 3.5179617404937744, + "learning_rate": 9.996058579929341e-06, + "loss": 3.6542, + "step": 64250 + }, + { + "epoch": 0.06334575947920525, + "grad_norm": 2.726682186126709, + "learning_rate": 9.9960524341793e-06, + "loss": 3.7415, + "step": 64300 + }, + { + 
"epoch": 0.06339501745702734, + "grad_norm": 2.5933315753936768, + "learning_rate": 9.996046283643432e-06, + "loss": 3.6843, + "step": 64350 + }, + { + "epoch": 0.06344427543484943, + "grad_norm": 2.6134650707244873, + "learning_rate": 9.996040128321742e-06, + "loss": 3.7027, + "step": 64400 + }, + { + "epoch": 0.06349353341267151, + "grad_norm": 2.868830442428589, + "learning_rate": 9.996033968214238e-06, + "loss": 3.7166, + "step": 64450 + }, + { + "epoch": 0.06354279139049361, + "grad_norm": 3.2262396812438965, + "learning_rate": 9.996027803320925e-06, + "loss": 3.7137, + "step": 64500 + }, + { + "epoch": 0.0635920493683157, + "grad_norm": 2.581850528717041, + "learning_rate": 9.996021633641807e-06, + "loss": 3.7132, + "step": 64550 + }, + { + "epoch": 0.06364130734613778, + "grad_norm": 2.637319564819336, + "learning_rate": 9.996015459176893e-06, + "loss": 3.6475, + "step": 64600 + }, + { + "epoch": 0.06369056532395986, + "grad_norm": 2.994279384613037, + "learning_rate": 9.996009279926185e-06, + "loss": 3.7911, + "step": 64650 + }, + { + "epoch": 0.06373982330178196, + "grad_norm": 3.0654690265655518, + "learning_rate": 9.996003095889693e-06, + "loss": 3.6912, + "step": 64700 + }, + { + "epoch": 0.06378908127960405, + "grad_norm": 2.7070562839508057, + "learning_rate": 9.995996907067422e-06, + "loss": 3.7103, + "step": 64750 + }, + { + "epoch": 0.06383833925742613, + "grad_norm": 2.4365599155426025, + "learning_rate": 9.995990713459377e-06, + "loss": 3.7167, + "step": 64800 + }, + { + "epoch": 0.06388759723524821, + "grad_norm": 3.452657461166382, + "learning_rate": 9.995984515065562e-06, + "loss": 3.6756, + "step": 64850 + }, + { + "epoch": 0.06393685521307031, + "grad_norm": 2.942868709564209, + "learning_rate": 9.995978311885988e-06, + "loss": 3.6684, + "step": 64900 + }, + { + "epoch": 0.0639861131908924, + "grad_norm": 2.9266514778137207, + "learning_rate": 9.995972103920655e-06, + "loss": 3.7284, + "step": 64950 + }, + { + "epoch": 0.06403537116871448, + 
"grad_norm": 2.5801539421081543, + "learning_rate": 9.995965891169574e-06, + "loss": 3.7141, + "step": 65000 + }, + { + "epoch": 0.06408462914653658, + "grad_norm": 2.8556201457977295, + "learning_rate": 9.99595967363275e-06, + "loss": 3.6621, + "step": 65050 + }, + { + "epoch": 0.06413388712435866, + "grad_norm": 2.5468080043792725, + "learning_rate": 9.995953451310185e-06, + "loss": 3.6969, + "step": 65100 + }, + { + "epoch": 0.06418314510218075, + "grad_norm": 2.6413984298706055, + "learning_rate": 9.99594722420189e-06, + "loss": 3.7128, + "step": 65150 + }, + { + "epoch": 0.06423240308000283, + "grad_norm": 2.5951850414276123, + "learning_rate": 9.995940992307866e-06, + "loss": 3.7228, + "step": 65200 + }, + { + "epoch": 0.06428166105782493, + "grad_norm": 2.5401663780212402, + "learning_rate": 9.995934755628124e-06, + "loss": 3.7299, + "step": 65250 + }, + { + "epoch": 0.06433091903564701, + "grad_norm": 2.6954691410064697, + "learning_rate": 9.995928514162667e-06, + "loss": 3.7235, + "step": 65300 + }, + { + "epoch": 0.0643801770134691, + "grad_norm": 2.728203773498535, + "learning_rate": 9.995922267911502e-06, + "loss": 3.7012, + "step": 65350 + }, + { + "epoch": 0.0644294349912912, + "grad_norm": 2.9748966693878174, + "learning_rate": 9.995916016874635e-06, + "loss": 3.7108, + "step": 65400 + }, + { + "epoch": 0.06447869296911328, + "grad_norm": 2.5941824913024902, + "learning_rate": 9.995909761052071e-06, + "loss": 3.6936, + "step": 65450 + }, + { + "epoch": 0.06452795094693536, + "grad_norm": 2.7038962841033936, + "learning_rate": 9.995903500443818e-06, + "loss": 3.7402, + "step": 65500 + }, + { + "epoch": 0.06457720892475745, + "grad_norm": 2.6934900283813477, + "learning_rate": 9.995897235049878e-06, + "loss": 3.6949, + "step": 65550 + }, + { + "epoch": 0.06462646690257955, + "grad_norm": 2.6275341510772705, + "learning_rate": 9.995890964870263e-06, + "loss": 3.6344, + "step": 65600 + }, + { + "epoch": 0.06467572488040163, + "grad_norm": 
3.036489248275757, + "learning_rate": 9.995884689904973e-06, + "loss": 3.7363, + "step": 65650 + }, + { + "epoch": 0.06472498285822371, + "grad_norm": 2.6166820526123047, + "learning_rate": 9.995878410154019e-06, + "loss": 3.6889, + "step": 65700 + }, + { + "epoch": 0.06477424083604581, + "grad_norm": 2.964157819747925, + "learning_rate": 9.995872125617402e-06, + "loss": 3.6786, + "step": 65750 + }, + { + "epoch": 0.0648234988138679, + "grad_norm": 2.7862625122070312, + "learning_rate": 9.995865836295132e-06, + "loss": 3.681, + "step": 65800 + }, + { + "epoch": 0.06487275679168998, + "grad_norm": 2.5488617420196533, + "learning_rate": 9.995859542187214e-06, + "loss": 3.7056, + "step": 65850 + }, + { + "epoch": 0.06492201476951207, + "grad_norm": 2.583716630935669, + "learning_rate": 9.995853243293655e-06, + "loss": 3.7904, + "step": 65900 + }, + { + "epoch": 0.06497127274733416, + "grad_norm": 2.800046920776367, + "learning_rate": 9.995846939614457e-06, + "loss": 3.7301, + "step": 65950 + }, + { + "epoch": 0.06502053072515625, + "grad_norm": 2.9489681720733643, + "learning_rate": 9.99584063114963e-06, + "loss": 3.6694, + "step": 66000 + }, + { + "epoch": 0.06506978870297833, + "grad_norm": 2.8252675533294678, + "learning_rate": 9.99583431789918e-06, + "loss": 3.6542, + "step": 66050 + }, + { + "epoch": 0.06511904668080042, + "grad_norm": 2.988348960876465, + "learning_rate": 9.995827999863111e-06, + "loss": 3.6568, + "step": 66100 + }, + { + "epoch": 0.06516830465862251, + "grad_norm": 3.0104172229766846, + "learning_rate": 9.99582167704143e-06, + "loss": 3.6298, + "step": 66150 + }, + { + "epoch": 0.0652175626364446, + "grad_norm": 2.784242630004883, + "learning_rate": 9.995815349434143e-06, + "loss": 3.6332, + "step": 66200 + }, + { + "epoch": 0.06526682061426668, + "grad_norm": 2.846703052520752, + "learning_rate": 9.995809017041256e-06, + "loss": 3.6847, + "step": 66250 + }, + { + "epoch": 0.06531607859208878, + "grad_norm": 2.8544907569885254, + 
"learning_rate": 9.995802679862774e-06, + "loss": 3.6677, + "step": 66300 + }, + { + "epoch": 0.06536533656991086, + "grad_norm": 2.620543956756592, + "learning_rate": 9.995796337898706e-06, + "loss": 3.7168, + "step": 66350 + }, + { + "epoch": 0.06541459454773295, + "grad_norm": 2.8716800212860107, + "learning_rate": 9.995789991149056e-06, + "loss": 3.6452, + "step": 66400 + }, + { + "epoch": 0.06546385252555503, + "grad_norm": 2.7985305786132812, + "learning_rate": 9.99578363961383e-06, + "loss": 3.6687, + "step": 66450 + }, + { + "epoch": 0.06551311050337713, + "grad_norm": 2.7281441688537598, + "learning_rate": 9.995777283293036e-06, + "loss": 3.6909, + "step": 66500 + }, + { + "epoch": 0.06556236848119922, + "grad_norm": 2.6076512336730957, + "learning_rate": 9.995770922186675e-06, + "loss": 3.6988, + "step": 66550 + }, + { + "epoch": 0.0656116264590213, + "grad_norm": 2.85115909576416, + "learning_rate": 9.99576455629476e-06, + "loss": 3.7141, + "step": 66600 + }, + { + "epoch": 0.0656608844368434, + "grad_norm": 2.996894598007202, + "learning_rate": 9.995758185617293e-06, + "loss": 3.6868, + "step": 66650 + }, + { + "epoch": 0.06571014241466548, + "grad_norm": 2.679750919342041, + "learning_rate": 9.99575181015428e-06, + "loss": 3.721, + "step": 66700 + }, + { + "epoch": 0.06575940039248757, + "grad_norm": 2.784324884414673, + "learning_rate": 9.99574542990573e-06, + "loss": 3.6911, + "step": 66750 + }, + { + "epoch": 0.06580865837030965, + "grad_norm": 2.7431397438049316, + "learning_rate": 9.995739044871645e-06, + "loss": 3.6608, + "step": 66800 + }, + { + "epoch": 0.06585791634813175, + "grad_norm": 2.8210859298706055, + "learning_rate": 9.995732655052033e-06, + "loss": 3.6894, + "step": 66850 + }, + { + "epoch": 0.06590717432595383, + "grad_norm": 2.549862861633301, + "learning_rate": 9.995726260446901e-06, + "loss": 3.6452, + "step": 66900 + }, + { + "epoch": 0.06595643230377592, + "grad_norm": 2.721755266189575, + "learning_rate": 
9.995719861056254e-06, + "loss": 3.6709, + "step": 66950 + }, + { + "epoch": 0.06600569028159801, + "grad_norm": 2.8079075813293457, + "learning_rate": 9.9957134568801e-06, + "loss": 3.6714, + "step": 67000 + }, + { + "epoch": 0.0660549482594201, + "grad_norm": 2.9801430702209473, + "learning_rate": 9.995707047918443e-06, + "loss": 3.7196, + "step": 67050 + }, + { + "epoch": 0.06610420623724218, + "grad_norm": 2.6767866611480713, + "learning_rate": 9.99570063417129e-06, + "loss": 3.7199, + "step": 67100 + }, + { + "epoch": 0.06615346421506427, + "grad_norm": 2.5830349922180176, + "learning_rate": 9.995694215638647e-06, + "loss": 3.7052, + "step": 67150 + }, + { + "epoch": 0.06620272219288637, + "grad_norm": 2.797785997390747, + "learning_rate": 9.995687792320519e-06, + "loss": 3.7105, + "step": 67200 + }, + { + "epoch": 0.06625198017070845, + "grad_norm": 2.8043534755706787, + "learning_rate": 9.995681364216913e-06, + "loss": 3.6795, + "step": 67250 + }, + { + "epoch": 0.06630123814853053, + "grad_norm": 2.6308836936950684, + "learning_rate": 9.995674931327839e-06, + "loss": 3.6974, + "step": 67300 + }, + { + "epoch": 0.06635049612635262, + "grad_norm": 2.7820475101470947, + "learning_rate": 9.995668493653296e-06, + "loss": 3.6913, + "step": 67350 + }, + { + "epoch": 0.06639975410417472, + "grad_norm": 2.9104535579681396, + "learning_rate": 9.995662051193295e-06, + "loss": 3.6462, + "step": 67400 + }, + { + "epoch": 0.0664490120819968, + "grad_norm": 2.507720708847046, + "learning_rate": 9.99565560394784e-06, + "loss": 3.6808, + "step": 67450 + }, + { + "epoch": 0.06649827005981888, + "grad_norm": 2.683095932006836, + "learning_rate": 9.99564915191694e-06, + "loss": 3.6738, + "step": 67500 + }, + { + "epoch": 0.06654752803764098, + "grad_norm": 2.4717819690704346, + "learning_rate": 9.995642695100598e-06, + "loss": 3.6557, + "step": 67550 + }, + { + "epoch": 0.06659678601546307, + "grad_norm": 2.506570339202881, + "learning_rate": 9.995636233498823e-06, + "loss": 
3.6892, + "step": 67600 + }, + { + "epoch": 0.06664604399328515, + "grad_norm": 2.6206767559051514, + "learning_rate": 9.995629767111618e-06, + "loss": 3.6387, + "step": 67650 + }, + { + "epoch": 0.06669530197110723, + "grad_norm": 2.8186750411987305, + "learning_rate": 9.995623295938991e-06, + "loss": 3.6422, + "step": 67700 + }, + { + "epoch": 0.06674455994892933, + "grad_norm": 2.849534034729004, + "learning_rate": 9.99561681998095e-06, + "loss": 3.6753, + "step": 67750 + }, + { + "epoch": 0.06679381792675142, + "grad_norm": 2.7502522468566895, + "learning_rate": 9.995610339237495e-06, + "loss": 3.6918, + "step": 67800 + }, + { + "epoch": 0.0668430759045735, + "grad_norm": 2.680155038833618, + "learning_rate": 9.99560385370864e-06, + "loss": 3.6419, + "step": 67850 + }, + { + "epoch": 0.0668923338823956, + "grad_norm": 2.67852783203125, + "learning_rate": 9.995597363394387e-06, + "loss": 3.6259, + "step": 67900 + }, + { + "epoch": 0.06694159186021768, + "grad_norm": 2.801994562149048, + "learning_rate": 9.995590868294742e-06, + "loss": 3.637, + "step": 67950 + }, + { + "epoch": 0.06699084983803977, + "grad_norm": 2.822499990463257, + "learning_rate": 9.995584368409714e-06, + "loss": 3.7181, + "step": 68000 + }, + { + "epoch": 0.06704010781586185, + "grad_norm": 2.5790913105010986, + "learning_rate": 9.995577863739305e-06, + "loss": 3.6916, + "step": 68050 + }, + { + "epoch": 0.06708936579368395, + "grad_norm": 2.928649663925171, + "learning_rate": 9.995571354283526e-06, + "loss": 3.7781, + "step": 68100 + }, + { + "epoch": 0.06713862377150603, + "grad_norm": 2.8456549644470215, + "learning_rate": 9.99556484004238e-06, + "loss": 3.7117, + "step": 68150 + }, + { + "epoch": 0.06718788174932812, + "grad_norm": 2.619476795196533, + "learning_rate": 9.995558321015873e-06, + "loss": 3.6417, + "step": 68200 + }, + { + "epoch": 0.0672371397271502, + "grad_norm": 2.623859405517578, + "learning_rate": 9.995551797204012e-06, + "loss": 3.7017, + "step": 68250 + }, + { + 
"epoch": 0.0672863977049723, + "grad_norm": 2.7679502964019775, + "learning_rate": 9.995545268606805e-06, + "loss": 3.6567, + "step": 68300 + }, + { + "epoch": 0.06733565568279438, + "grad_norm": 2.713867425918579, + "learning_rate": 9.995538735224258e-06, + "loss": 3.667, + "step": 68350 + }, + { + "epoch": 0.06738491366061647, + "grad_norm": 2.7669084072113037, + "learning_rate": 9.995532197056374e-06, + "loss": 3.6816, + "step": 68400 + }, + { + "epoch": 0.06743417163843857, + "grad_norm": 2.79721999168396, + "learning_rate": 9.995525654103161e-06, + "loss": 3.6512, + "step": 68450 + }, + { + "epoch": 0.06748342961626065, + "grad_norm": 2.477349281311035, + "learning_rate": 9.995519106364627e-06, + "loss": 3.7428, + "step": 68500 + }, + { + "epoch": 0.06753268759408274, + "grad_norm": 2.895503282546997, + "learning_rate": 9.995512553840775e-06, + "loss": 3.6677, + "step": 68550 + }, + { + "epoch": 0.06758194557190482, + "grad_norm": 2.718409299850464, + "learning_rate": 9.995505996531616e-06, + "loss": 3.6753, + "step": 68600 + }, + { + "epoch": 0.06763120354972692, + "grad_norm": 2.7651071548461914, + "learning_rate": 9.995499434437153e-06, + "loss": 3.7204, + "step": 68650 + }, + { + "epoch": 0.067680461527549, + "grad_norm": 2.6033096313476562, + "learning_rate": 9.99549286755739e-06, + "loss": 3.6773, + "step": 68700 + }, + { + "epoch": 0.06772971950537109, + "grad_norm": 2.903674840927124, + "learning_rate": 9.995486295892338e-06, + "loss": 3.6904, + "step": 68750 + }, + { + "epoch": 0.06777897748319318, + "grad_norm": 2.7972679138183594, + "learning_rate": 9.995479719442001e-06, + "loss": 3.6332, + "step": 68800 + }, + { + "epoch": 0.06782823546101527, + "grad_norm": 2.6775546073913574, + "learning_rate": 9.995473138206386e-06, + "loss": 3.6784, + "step": 68850 + }, + { + "epoch": 0.06787749343883735, + "grad_norm": 3.021813154220581, + "learning_rate": 9.9954665521855e-06, + "loss": 3.6889, + "step": 68900 + }, + { + "epoch": 0.06792675141665944, + 
"grad_norm": 2.641707420349121, + "learning_rate": 9.995459961379346e-06, + "loss": 3.6664, + "step": 68950 + }, + { + "epoch": 0.06797600939448153, + "grad_norm": 2.8372721672058105, + "learning_rate": 9.995453365787933e-06, + "loss": 3.7011, + "step": 69000 + }, + { + "epoch": 0.06802526737230362, + "grad_norm": 3.4274492263793945, + "learning_rate": 9.995446765411267e-06, + "loss": 3.6451, + "step": 69050 + }, + { + "epoch": 0.0680745253501257, + "grad_norm": 2.832369089126587, + "learning_rate": 9.995440160249354e-06, + "loss": 3.6733, + "step": 69100 + }, + { + "epoch": 0.0681237833279478, + "grad_norm": 2.603482246398926, + "learning_rate": 9.995433550302201e-06, + "loss": 3.6992, + "step": 69150 + }, + { + "epoch": 0.06817304130576989, + "grad_norm": 2.6224849224090576, + "learning_rate": 9.995426935569813e-06, + "loss": 3.7153, + "step": 69200 + }, + { + "epoch": 0.06822229928359197, + "grad_norm": 2.580613136291504, + "learning_rate": 9.995420316052198e-06, + "loss": 3.7002, + "step": 69250 + }, + { + "epoch": 0.06827155726141405, + "grad_norm": 2.3886590003967285, + "learning_rate": 9.995413691749361e-06, + "loss": 3.6898, + "step": 69300 + }, + { + "epoch": 0.06832081523923615, + "grad_norm": 2.630833625793457, + "learning_rate": 9.99540706266131e-06, + "loss": 3.7163, + "step": 69350 + }, + { + "epoch": 0.06837007321705824, + "grad_norm": 3.051788091659546, + "learning_rate": 9.995400428788048e-06, + "loss": 3.701, + "step": 69400 + }, + { + "epoch": 0.06841933119488032, + "grad_norm": 2.7844479084014893, + "learning_rate": 9.995393790129586e-06, + "loss": 3.6846, + "step": 69450 + }, + { + "epoch": 0.0684685891727024, + "grad_norm": 2.484066963195801, + "learning_rate": 9.995387146685927e-06, + "loss": 3.6316, + "step": 69500 + }, + { + "epoch": 0.0685178471505245, + "grad_norm": 2.7856152057647705, + "learning_rate": 9.99538049845708e-06, + "loss": 3.6809, + "step": 69550 + }, + { + "epoch": 0.06856710512834659, + "grad_norm": 2.687910318374634, + 
"learning_rate": 9.995373845443046e-06, + "loss": 3.6079, + "step": 69600 + }, + { + "epoch": 0.06861636310616867, + "grad_norm": 2.677640199661255, + "learning_rate": 9.995367187643836e-06, + "loss": 3.687, + "step": 69650 + }, + { + "epoch": 0.06866562108399077, + "grad_norm": 2.5716683864593506, + "learning_rate": 9.995360525059457e-06, + "loss": 3.6982, + "step": 69700 + }, + { + "epoch": 0.06871487906181285, + "grad_norm": 2.392432451248169, + "learning_rate": 9.995353857689913e-06, + "loss": 3.7175, + "step": 69750 + }, + { + "epoch": 0.06876413703963494, + "grad_norm": 2.8337388038635254, + "learning_rate": 9.99534718553521e-06, + "loss": 3.6757, + "step": 69800 + }, + { + "epoch": 0.06881339501745702, + "grad_norm": 2.436741352081299, + "learning_rate": 9.995340508595358e-06, + "loss": 3.6196, + "step": 69850 + }, + { + "epoch": 0.06886265299527912, + "grad_norm": 2.593412160873413, + "learning_rate": 9.995333826870359e-06, + "loss": 3.6514, + "step": 69900 + }, + { + "epoch": 0.0689119109731012, + "grad_norm": 2.501225471496582, + "learning_rate": 9.995327140360223e-06, + "loss": 3.6725, + "step": 69950 + }, + { + "epoch": 0.06896116895092329, + "grad_norm": 2.9814841747283936, + "learning_rate": 9.995320449064953e-06, + "loss": 3.6984, + "step": 70000 + }, + { + "epoch": 0.06901042692874539, + "grad_norm": 2.6169559955596924, + "learning_rate": 9.99531375298456e-06, + "loss": 3.6585, + "step": 70050 + }, + { + "epoch": 0.06905968490656747, + "grad_norm": 2.7606382369995117, + "learning_rate": 9.995307052119045e-06, + "loss": 3.6963, + "step": 70100 + }, + { + "epoch": 0.06910894288438955, + "grad_norm": 2.7260844707489014, + "learning_rate": 9.995300346468418e-06, + "loss": 3.664, + "step": 70150 + }, + { + "epoch": 0.06915820086221164, + "grad_norm": 2.8161635398864746, + "learning_rate": 9.995293636032683e-06, + "loss": 3.6813, + "step": 70200 + }, + { + "epoch": 0.06920745884003374, + "grad_norm": 2.6676254272460938, + "learning_rate": 
9.995286920811849e-06, + "loss": 3.6607, + "step": 70250 + }, + { + "epoch": 0.06925671681785582, + "grad_norm": 2.661923885345459, + "learning_rate": 9.99528020080592e-06, + "loss": 3.6198, + "step": 70300 + }, + { + "epoch": 0.0693059747956779, + "grad_norm": 2.5987586975097656, + "learning_rate": 9.995273476014907e-06, + "loss": 3.7121, + "step": 70350 + }, + { + "epoch": 0.0693552327735, + "grad_norm": 2.387489080429077, + "learning_rate": 9.99526674643881e-06, + "loss": 3.6561, + "step": 70400 + }, + { + "epoch": 0.06940449075132209, + "grad_norm": 2.600698709487915, + "learning_rate": 9.99526001207764e-06, + "loss": 3.6802, + "step": 70450 + }, + { + "epoch": 0.06945374872914417, + "grad_norm": 2.67354416847229, + "learning_rate": 9.995253272931402e-06, + "loss": 3.6887, + "step": 70500 + }, + { + "epoch": 0.06950300670696626, + "grad_norm": 2.8217203617095947, + "learning_rate": 9.995246529000102e-06, + "loss": 3.6617, + "step": 70550 + }, + { + "epoch": 0.06955226468478835, + "grad_norm": 3.004943370819092, + "learning_rate": 9.995239780283748e-06, + "loss": 3.6442, + "step": 70600 + }, + { + "epoch": 0.06960152266261044, + "grad_norm": 2.8911502361297607, + "learning_rate": 9.995233026782346e-06, + "loss": 3.7146, + "step": 70650 + }, + { + "epoch": 0.06965078064043252, + "grad_norm": 2.5290520191192627, + "learning_rate": 9.995226268495898e-06, + "loss": 3.6641, + "step": 70700 + }, + { + "epoch": 0.0697000386182546, + "grad_norm": 3.099642038345337, + "learning_rate": 9.995219505424418e-06, + "loss": 3.6278, + "step": 70750 + }, + { + "epoch": 0.0697492965960767, + "grad_norm": 2.4764974117279053, + "learning_rate": 9.995212737567908e-06, + "loss": 3.6048, + "step": 70800 + }, + { + "epoch": 0.06979855457389879, + "grad_norm": 2.7321791648864746, + "learning_rate": 9.995205964926375e-06, + "loss": 3.6714, + "step": 70850 + }, + { + "epoch": 0.06984781255172087, + "grad_norm": 2.5434348583221436, + "learning_rate": 9.995199187499826e-06, + "loss": 3.7312, 
+ "step": 70900 + }, + { + "epoch": 0.06989707052954297, + "grad_norm": 2.8627142906188965, + "learning_rate": 9.995192405288267e-06, + "loss": 3.6505, + "step": 70950 + }, + { + "epoch": 0.06994632850736505, + "grad_norm": 2.8490633964538574, + "learning_rate": 9.995185618291706e-06, + "loss": 3.6901, + "step": 71000 + }, + { + "epoch": 0.06999558648518714, + "grad_norm": 2.5741195678710938, + "learning_rate": 9.995178826510147e-06, + "loss": 3.6637, + "step": 71050 + }, + { + "epoch": 0.07004484446300922, + "grad_norm": 2.684366226196289, + "learning_rate": 9.9951720299436e-06, + "loss": 3.6121, + "step": 71100 + }, + { + "epoch": 0.07009410244083132, + "grad_norm": 2.7699320316314697, + "learning_rate": 9.995165228592068e-06, + "loss": 3.7427, + "step": 71150 + }, + { + "epoch": 0.0701433604186534, + "grad_norm": 2.8960466384887695, + "learning_rate": 9.995158422455557e-06, + "loss": 3.6214, + "step": 71200 + }, + { + "epoch": 0.07019261839647549, + "grad_norm": 3.0051112174987793, + "learning_rate": 9.995151611534078e-06, + "loss": 3.6948, + "step": 71250 + }, + { + "epoch": 0.07024187637429759, + "grad_norm": 2.827670097351074, + "learning_rate": 9.995144795827634e-06, + "loss": 3.6922, + "step": 71300 + }, + { + "epoch": 0.07029113435211967, + "grad_norm": 2.649594306945801, + "learning_rate": 9.995137975336233e-06, + "loss": 3.6204, + "step": 71350 + }, + { + "epoch": 0.07034039232994176, + "grad_norm": 2.717921018600464, + "learning_rate": 9.995131150059881e-06, + "loss": 3.6187, + "step": 71400 + }, + { + "epoch": 0.07038965030776384, + "grad_norm": 2.6892218589782715, + "learning_rate": 9.995124319998582e-06, + "loss": 3.6327, + "step": 71450 + }, + { + "epoch": 0.07043890828558594, + "grad_norm": 3.0993971824645996, + "learning_rate": 9.995117485152347e-06, + "loss": 3.6796, + "step": 71500 + }, + { + "epoch": 0.07048816626340802, + "grad_norm": 2.848966121673584, + "learning_rate": 9.995110645521181e-06, + "loss": 3.6522, + "step": 71550 + }, + { + 
"epoch": 0.0705374242412301, + "grad_norm": 2.4713757038116455, + "learning_rate": 9.995103801105091e-06, + "loss": 3.6022, + "step": 71600 + }, + { + "epoch": 0.07058668221905219, + "grad_norm": 3.00170636177063, + "learning_rate": 9.99509695190408e-06, + "loss": 3.6149, + "step": 71650 + }, + { + "epoch": 0.07063594019687429, + "grad_norm": 2.485034704208374, + "learning_rate": 9.995090097918159e-06, + "loss": 3.6421, + "step": 71700 + }, + { + "epoch": 0.07068519817469637, + "grad_norm": 2.7099106311798096, + "learning_rate": 9.995083239147332e-06, + "loss": 3.5373, + "step": 71750 + }, + { + "epoch": 0.07073445615251846, + "grad_norm": 2.885221242904663, + "learning_rate": 9.995076375591608e-06, + "loss": 3.6877, + "step": 71800 + }, + { + "epoch": 0.07078371413034056, + "grad_norm": 2.833265542984009, + "learning_rate": 9.99506950725099e-06, + "loss": 3.6161, + "step": 71850 + }, + { + "epoch": 0.07083297210816264, + "grad_norm": 2.7573535442352295, + "learning_rate": 9.995062634125488e-06, + "loss": 3.6813, + "step": 71900 + }, + { + "epoch": 0.07088223008598472, + "grad_norm": 3.0593013763427734, + "learning_rate": 9.995055756215107e-06, + "loss": 3.5762, + "step": 71950 + }, + { + "epoch": 0.07093148806380681, + "grad_norm": 2.7071948051452637, + "learning_rate": 9.995048873519852e-06, + "loss": 3.6267, + "step": 72000 + }, + { + "epoch": 0.0709807460416289, + "grad_norm": 2.6270852088928223, + "learning_rate": 9.995041986039733e-06, + "loss": 3.6888, + "step": 72050 + }, + { + "epoch": 0.07103000401945099, + "grad_norm": 2.7508726119995117, + "learning_rate": 9.995035093774756e-06, + "loss": 3.6479, + "step": 72100 + }, + { + "epoch": 0.07107926199727307, + "grad_norm": 2.5797231197357178, + "learning_rate": 9.995028196724925e-06, + "loss": 3.6842, + "step": 72150 + }, + { + "epoch": 0.07112851997509517, + "grad_norm": 2.882253408432007, + "learning_rate": 9.995021294890247e-06, + "loss": 3.6116, + "step": 72200 + }, + { + "epoch": 0.07117777795291726, + 
"grad_norm": 3.025050401687622, + "learning_rate": 9.995014388270731e-06, + "loss": 3.6421, + "step": 72250 + }, + { + "epoch": 0.07122703593073934, + "grad_norm": 2.678253173828125, + "learning_rate": 9.995007476866382e-06, + "loss": 3.7096, + "step": 72300 + }, + { + "epoch": 0.07127629390856142, + "grad_norm": 2.800248146057129, + "learning_rate": 9.995000560677208e-06, + "loss": 3.6463, + "step": 72350 + }, + { + "epoch": 0.07132555188638352, + "grad_norm": 2.965043544769287, + "learning_rate": 9.994993639703215e-06, + "loss": 3.6308, + "step": 72400 + }, + { + "epoch": 0.07137480986420561, + "grad_norm": 2.6530370712280273, + "learning_rate": 9.994986713944408e-06, + "loss": 3.5791, + "step": 72450 + }, + { + "epoch": 0.07142406784202769, + "grad_norm": 2.798475503921509, + "learning_rate": 9.994979783400794e-06, + "loss": 3.6106, + "step": 72500 + }, + { + "epoch": 0.07147332581984979, + "grad_norm": 2.546464681625366, + "learning_rate": 9.994972848072382e-06, + "loss": 3.656, + "step": 72550 + }, + { + "epoch": 0.07152258379767187, + "grad_norm": 2.5798041820526123, + "learning_rate": 9.994965907959176e-06, + "loss": 3.6222, + "step": 72600 + }, + { + "epoch": 0.07157184177549396, + "grad_norm": 2.6365602016448975, + "learning_rate": 9.994958963061186e-06, + "loss": 3.711, + "step": 72650 + }, + { + "epoch": 0.07162109975331604, + "grad_norm": 2.6040518283843994, + "learning_rate": 9.994952013378413e-06, + "loss": 3.665, + "step": 72700 + }, + { + "epoch": 0.07167035773113814, + "grad_norm": 2.791257381439209, + "learning_rate": 9.99494505891087e-06, + "loss": 3.654, + "step": 72750 + }, + { + "epoch": 0.07171961570896022, + "grad_norm": 2.5012261867523193, + "learning_rate": 9.99493809965856e-06, + "loss": 3.5989, + "step": 72800 + }, + { + "epoch": 0.07176887368678231, + "grad_norm": 2.8017241954803467, + "learning_rate": 9.99493113562149e-06, + "loss": 3.6419, + "step": 72850 + }, + { + "epoch": 0.07181813166460439, + "grad_norm": 2.886157989501953, + 
"learning_rate": 9.994924166799668e-06, + "loss": 3.645, + "step": 72900 + }, + { + "epoch": 0.07186738964242649, + "grad_norm": 2.587864637374878, + "learning_rate": 9.994917193193099e-06, + "loss": 3.6195, + "step": 72950 + }, + { + "epoch": 0.07191664762024857, + "grad_norm": 2.6062097549438477, + "learning_rate": 9.994910214801791e-06, + "loss": 3.6503, + "step": 73000 + }, + { + "epoch": 0.07196590559807066, + "grad_norm": 2.972599506378174, + "learning_rate": 9.99490323162575e-06, + "loss": 3.6452, + "step": 73050 + }, + { + "epoch": 0.07201516357589276, + "grad_norm": 2.6599671840667725, + "learning_rate": 9.994896243664983e-06, + "loss": 3.6479, + "step": 73100 + }, + { + "epoch": 0.07206442155371484, + "grad_norm": 3.5980241298675537, + "learning_rate": 9.994889250919495e-06, + "loss": 3.608, + "step": 73150 + }, + { + "epoch": 0.07211367953153693, + "grad_norm": 3.08015513420105, + "learning_rate": 9.994882253389295e-06, + "loss": 3.7253, + "step": 73200 + }, + { + "epoch": 0.07216293750935901, + "grad_norm": 2.792656183242798, + "learning_rate": 9.994875251074389e-06, + "loss": 3.5673, + "step": 73250 + }, + { + "epoch": 0.07221219548718111, + "grad_norm": 2.592083692550659, + "learning_rate": 9.994868243974785e-06, + "loss": 3.6473, + "step": 73300 + }, + { + "epoch": 0.07226145346500319, + "grad_norm": 2.6621804237365723, + "learning_rate": 9.994861232090487e-06, + "loss": 3.6207, + "step": 73350 + }, + { + "epoch": 0.07231071144282528, + "grad_norm": 2.6156387329101562, + "learning_rate": 9.994854215421504e-06, + "loss": 3.6283, + "step": 73400 + }, + { + "epoch": 0.07235996942064737, + "grad_norm": 2.7689008712768555, + "learning_rate": 9.994847193967841e-06, + "loss": 3.5898, + "step": 73450 + }, + { + "epoch": 0.07240922739846946, + "grad_norm": 2.7425997257232666, + "learning_rate": 9.994840167729506e-06, + "loss": 3.6368, + "step": 73500 + }, + { + "epoch": 0.07245848537629154, + "grad_norm": 2.7740464210510254, + "learning_rate": 
9.994833136706505e-06, + "loss": 3.6445, + "step": 73550 + }, + { + "epoch": 0.07250774335411363, + "grad_norm": 2.5702173709869385, + "learning_rate": 9.994826100898845e-06, + "loss": 3.6603, + "step": 73600 + }, + { + "epoch": 0.07255700133193572, + "grad_norm": 2.607623338699341, + "learning_rate": 9.994819060306532e-06, + "loss": 3.6451, + "step": 73650 + }, + { + "epoch": 0.07260625930975781, + "grad_norm": 2.7097530364990234, + "learning_rate": 9.994812014929573e-06, + "loss": 3.6392, + "step": 73700 + }, + { + "epoch": 0.0726555172875799, + "grad_norm": 2.643967628479004, + "learning_rate": 9.994804964767976e-06, + "loss": 3.6257, + "step": 73750 + }, + { + "epoch": 0.07270477526540199, + "grad_norm": 2.7616708278656006, + "learning_rate": 9.994797909821748e-06, + "loss": 3.6016, + "step": 73800 + }, + { + "epoch": 0.07275403324322408, + "grad_norm": 2.607081174850464, + "learning_rate": 9.994790850090894e-06, + "loss": 3.6168, + "step": 73850 + }, + { + "epoch": 0.07280329122104616, + "grad_norm": 2.5300259590148926, + "learning_rate": 9.994783785575421e-06, + "loss": 3.6344, + "step": 73900 + }, + { + "epoch": 0.07285254919886824, + "grad_norm": 2.6927247047424316, + "learning_rate": 9.994776716275338e-06, + "loss": 3.6252, + "step": 73950 + }, + { + "epoch": 0.07290180717669034, + "grad_norm": 2.677111864089966, + "learning_rate": 9.99476964219065e-06, + "loss": 3.6237, + "step": 74000 + }, + { + "epoch": 0.07295106515451243, + "grad_norm": 2.928058385848999, + "learning_rate": 9.994762563321361e-06, + "loss": 3.5721, + "step": 74050 + }, + { + "epoch": 0.07300032313233451, + "grad_norm": 2.744075059890747, + "learning_rate": 9.994755479667482e-06, + "loss": 3.5986, + "step": 74100 + }, + { + "epoch": 0.0730495811101566, + "grad_norm": 2.5685763359069824, + "learning_rate": 9.994748391229018e-06, + "loss": 3.6138, + "step": 74150 + }, + { + "epoch": 0.07309883908797869, + "grad_norm": 2.582242488861084, + "learning_rate": 9.994741298005977e-06, + "loss": 
3.6385, + "step": 74200 + }, + { + "epoch": 0.07314809706580078, + "grad_norm": 2.5937206745147705, + "learning_rate": 9.994734199998366e-06, + "loss": 3.6381, + "step": 74250 + }, + { + "epoch": 0.07319735504362286, + "grad_norm": 2.5076181888580322, + "learning_rate": 9.994727097206189e-06, + "loss": 3.594, + "step": 74300 + }, + { + "epoch": 0.07324661302144496, + "grad_norm": 2.7037906646728516, + "learning_rate": 9.994719989629455e-06, + "loss": 3.6886, + "step": 74350 + }, + { + "epoch": 0.07329587099926704, + "grad_norm": 2.7960517406463623, + "learning_rate": 9.99471287726817e-06, + "loss": 3.6601, + "step": 74400 + }, + { + "epoch": 0.07334512897708913, + "grad_norm": 2.716439723968506, + "learning_rate": 9.994705760122342e-06, + "loss": 3.6461, + "step": 74450 + }, + { + "epoch": 0.07339438695491121, + "grad_norm": 2.6618385314941406, + "learning_rate": 9.994698638191977e-06, + "loss": 3.5955, + "step": 74500 + }, + { + "epoch": 0.07344364493273331, + "grad_norm": 2.8261427879333496, + "learning_rate": 9.99469151147708e-06, + "loss": 3.602, + "step": 74550 + }, + { + "epoch": 0.0734929029105554, + "grad_norm": 2.7226064205169678, + "learning_rate": 9.994684379977661e-06, + "loss": 3.6213, + "step": 74600 + }, + { + "epoch": 0.07354216088837748, + "grad_norm": 2.9325544834136963, + "learning_rate": 9.994677243693727e-06, + "loss": 3.6381, + "step": 74650 + }, + { + "epoch": 0.07359141886619958, + "grad_norm": 2.766641139984131, + "learning_rate": 9.994670102625281e-06, + "loss": 3.6667, + "step": 74700 + }, + { + "epoch": 0.07364067684402166, + "grad_norm": 2.987947463989258, + "learning_rate": 9.994662956772332e-06, + "loss": 3.6074, + "step": 74750 + }, + { + "epoch": 0.07368993482184374, + "grad_norm": 2.605741500854492, + "learning_rate": 9.99465580613489e-06, + "loss": 3.6531, + "step": 74800 + }, + { + "epoch": 0.07373919279966583, + "grad_norm": 2.561439275741577, + "learning_rate": 9.994648650712955e-06, + "loss": 3.6461, + "step": 74850 + }, + { + 
"epoch": 0.07378845077748793, + "grad_norm": 2.6646921634674072, + "learning_rate": 9.99464149050654e-06, + "loss": 3.6528, + "step": 74900 + }, + { + "epoch": 0.07383770875531001, + "grad_norm": 2.671616554260254, + "learning_rate": 9.994634325515648e-06, + "loss": 3.6359, + "step": 74950 + }, + { + "epoch": 0.0738869667331321, + "grad_norm": 3.020414352416992, + "learning_rate": 9.99462715574029e-06, + "loss": 3.6615, + "step": 75000 + }, + { + "epoch": 0.07393622471095418, + "grad_norm": 2.4770257472991943, + "learning_rate": 9.994619981180468e-06, + "loss": 3.5992, + "step": 75050 + }, + { + "epoch": 0.07398548268877628, + "grad_norm": 2.883547306060791, + "learning_rate": 9.994612801836193e-06, + "loss": 3.6267, + "step": 75100 + }, + { + "epoch": 0.07403474066659836, + "grad_norm": 2.6869547367095947, + "learning_rate": 9.994605617707469e-06, + "loss": 3.6229, + "step": 75150 + }, + { + "epoch": 0.07408399864442045, + "grad_norm": 2.956298589706421, + "learning_rate": 9.994598428794304e-06, + "loss": 3.6463, + "step": 75200 + }, + { + "epoch": 0.07413325662224254, + "grad_norm": 2.4470391273498535, + "learning_rate": 9.994591235096705e-06, + "loss": 3.5897, + "step": 75250 + }, + { + "epoch": 0.07418251460006463, + "grad_norm": 2.615194082260132, + "learning_rate": 9.994584036614678e-06, + "loss": 3.6559, + "step": 75300 + }, + { + "epoch": 0.07423177257788671, + "grad_norm": 2.567371368408203, + "learning_rate": 9.99457683334823e-06, + "loss": 3.593, + "step": 75350 + }, + { + "epoch": 0.0742810305557088, + "grad_norm": 2.517949104309082, + "learning_rate": 9.99456962529737e-06, + "loss": 3.6654, + "step": 75400 + }, + { + "epoch": 0.0743302885335309, + "grad_norm": 2.58486270904541, + "learning_rate": 9.994562412462103e-06, + "loss": 3.574, + "step": 75450 + }, + { + "epoch": 0.07437954651135298, + "grad_norm": 2.7378883361816406, + "learning_rate": 9.994555194842438e-06, + "loss": 3.6217, + "step": 75500 + }, + { + "epoch": 0.07442880448917506, + 
"grad_norm": 2.8276114463806152, + "learning_rate": 9.994547972438378e-06, + "loss": 3.6062, + "step": 75550 + }, + { + "epoch": 0.07447806246699716, + "grad_norm": 2.5560925006866455, + "learning_rate": 9.994540745249933e-06, + "loss": 3.6145, + "step": 75600 + }, + { + "epoch": 0.07452732044481925, + "grad_norm": 2.887159585952759, + "learning_rate": 9.994533513277108e-06, + "loss": 3.638, + "step": 75650 + }, + { + "epoch": 0.07457657842264133, + "grad_norm": 2.5389602184295654, + "learning_rate": 9.994526276519911e-06, + "loss": 3.595, + "step": 75700 + }, + { + "epoch": 0.07462583640046341, + "grad_norm": 2.3895115852355957, + "learning_rate": 9.99451903497835e-06, + "loss": 3.5791, + "step": 75750 + }, + { + "epoch": 0.07467509437828551, + "grad_norm": 2.5909411907196045, + "learning_rate": 9.994511788652432e-06, + "loss": 3.6239, + "step": 75800 + }, + { + "epoch": 0.0747243523561076, + "grad_norm": 2.6543195247650146, + "learning_rate": 9.99450453754216e-06, + "loss": 3.5759, + "step": 75850 + }, + { + "epoch": 0.07477361033392968, + "grad_norm": 2.590404987335205, + "learning_rate": 9.994497281647546e-06, + "loss": 3.6394, + "step": 75900 + }, + { + "epoch": 0.07482286831175178, + "grad_norm": 2.6828038692474365, + "learning_rate": 9.994490020968594e-06, + "loss": 3.6476, + "step": 75950 + }, + { + "epoch": 0.07487212628957386, + "grad_norm": 2.7572526931762695, + "learning_rate": 9.994482755505311e-06, + "loss": 3.6188, + "step": 76000 + }, + { + "epoch": 0.07492138426739595, + "grad_norm": 2.4381093978881836, + "learning_rate": 9.994475485257704e-06, + "loss": 3.6484, + "step": 76050 + }, + { + "epoch": 0.07497064224521803, + "grad_norm": 2.3182830810546875, + "learning_rate": 9.994468210225783e-06, + "loss": 3.5418, + "step": 76100 + }, + { + "epoch": 0.07501990022304013, + "grad_norm": 2.761262893676758, + "learning_rate": 9.99446093040955e-06, + "loss": 3.6533, + "step": 76150 + }, + { + "epoch": 0.07506915820086221, + "grad_norm": 2.8115320205688477, 
+ "learning_rate": 9.994453645809017e-06, + "loss": 3.6134, + "step": 76200 + }, + { + "epoch": 0.0751184161786843, + "grad_norm": 2.7130353450775146, + "learning_rate": 9.994446356424188e-06, + "loss": 3.6109, + "step": 76250 + }, + { + "epoch": 0.07516767415650638, + "grad_norm": 2.639683961868286, + "learning_rate": 9.994439062255069e-06, + "loss": 3.6099, + "step": 76300 + }, + { + "epoch": 0.07521693213432848, + "grad_norm": 2.6402246952056885, + "learning_rate": 9.994431763301669e-06, + "loss": 3.6275, + "step": 76350 + }, + { + "epoch": 0.07526619011215056, + "grad_norm": 2.9030849933624268, + "learning_rate": 9.994424459563995e-06, + "loss": 3.6109, + "step": 76400 + }, + { + "epoch": 0.07531544808997265, + "grad_norm": 3.5715205669403076, + "learning_rate": 9.994417151042053e-06, + "loss": 3.6429, + "step": 76450 + }, + { + "epoch": 0.07536470606779475, + "grad_norm": 2.611229658126831, + "learning_rate": 9.99440983773585e-06, + "loss": 3.6049, + "step": 76500 + }, + { + "epoch": 0.07541396404561683, + "grad_norm": 2.777139902114868, + "learning_rate": 9.994402519645395e-06, + "loss": 3.6235, + "step": 76550 + }, + { + "epoch": 0.07546322202343891, + "grad_norm": 2.6420998573303223, + "learning_rate": 9.994395196770692e-06, + "loss": 3.6286, + "step": 76600 + }, + { + "epoch": 0.075512480001261, + "grad_norm": 2.5565552711486816, + "learning_rate": 9.99438786911175e-06, + "loss": 3.6462, + "step": 76650 + }, + { + "epoch": 0.0755617379790831, + "grad_norm": 2.4049456119537354, + "learning_rate": 9.994380536668575e-06, + "loss": 3.654, + "step": 76700 + }, + { + "epoch": 0.07561099595690518, + "grad_norm": 2.695481538772583, + "learning_rate": 9.994373199441177e-06, + "loss": 3.5298, + "step": 76750 + }, + { + "epoch": 0.07566025393472726, + "grad_norm": 2.5945494174957275, + "learning_rate": 9.994365857429558e-06, + "loss": 3.6266, + "step": 76800 + }, + { + "epoch": 0.07570951191254936, + "grad_norm": 2.878767728805542, + "learning_rate": 
9.994358510633728e-06, + "loss": 3.5033, + "step": 76850 + }, + { + "epoch": 0.07575876989037145, + "grad_norm": 2.37325382232666, + "learning_rate": 9.994351159053695e-06, + "loss": 3.6621, + "step": 76900 + }, + { + "epoch": 0.07580802786819353, + "grad_norm": 3.0814969539642334, + "learning_rate": 9.994343802689464e-06, + "loss": 3.6279, + "step": 76950 + }, + { + "epoch": 0.07585728584601562, + "grad_norm": 2.934889316558838, + "learning_rate": 9.994336441541042e-06, + "loss": 3.5978, + "step": 77000 + }, + { + "epoch": 0.07590654382383771, + "grad_norm": 2.6793806552886963, + "learning_rate": 9.994329075608437e-06, + "loss": 3.564, + "step": 77050 + }, + { + "epoch": 0.0759558018016598, + "grad_norm": 2.661285877227783, + "learning_rate": 9.994321704891655e-06, + "loss": 3.6109, + "step": 77100 + }, + { + "epoch": 0.07600505977948188, + "grad_norm": 2.576117515563965, + "learning_rate": 9.994314329390705e-06, + "loss": 3.6132, + "step": 77150 + }, + { + "epoch": 0.07605431775730398, + "grad_norm": 2.7961719036102295, + "learning_rate": 9.994306949105593e-06, + "loss": 3.6825, + "step": 77200 + }, + { + "epoch": 0.07610357573512606, + "grad_norm": 2.4003701210021973, + "learning_rate": 9.994299564036327e-06, + "loss": 3.6218, + "step": 77250 + }, + { + "epoch": 0.07615283371294815, + "grad_norm": 2.6332144737243652, + "learning_rate": 9.994292174182912e-06, + "loss": 3.6142, + "step": 77300 + }, + { + "epoch": 0.07620209169077023, + "grad_norm": 2.495497941970825, + "learning_rate": 9.994284779545356e-06, + "loss": 3.5891, + "step": 77350 + }, + { + "epoch": 0.07625134966859233, + "grad_norm": 2.729457378387451, + "learning_rate": 9.994277380123664e-06, + "loss": 3.636, + "step": 77400 + }, + { + "epoch": 0.07630060764641441, + "grad_norm": 2.39570951461792, + "learning_rate": 9.99426997591785e-06, + "loss": 3.5865, + "step": 77450 + }, + { + "epoch": 0.0763498656242365, + "grad_norm": 2.624575614929199, + "learning_rate": 9.994262566927913e-06, + "loss": 
3.5783, + "step": 77500 + }, + { + "epoch": 0.07639912360205858, + "grad_norm": 2.4480931758880615, + "learning_rate": 9.994255153153864e-06, + "loss": 3.5392, + "step": 77550 + }, + { + "epoch": 0.07644838157988068, + "grad_norm": 2.610196590423584, + "learning_rate": 9.994247734595712e-06, + "loss": 3.61, + "step": 77600 + }, + { + "epoch": 0.07649763955770277, + "grad_norm": 2.655616044998169, + "learning_rate": 9.99424031125346e-06, + "loss": 3.5909, + "step": 77650 + }, + { + "epoch": 0.07654689753552485, + "grad_norm": 2.848410129547119, + "learning_rate": 9.994232883127118e-06, + "loss": 3.5885, + "step": 77700 + }, + { + "epoch": 0.07659615551334695, + "grad_norm": 2.6791510581970215, + "learning_rate": 9.99422545021669e-06, + "loss": 3.5365, + "step": 77750 + }, + { + "epoch": 0.07664541349116903, + "grad_norm": 2.8551318645477295, + "learning_rate": 9.994218012522185e-06, + "loss": 3.6117, + "step": 77800 + }, + { + "epoch": 0.07669467146899112, + "grad_norm": 2.731682062149048, + "learning_rate": 9.994210570043613e-06, + "loss": 3.6046, + "step": 77850 + }, + { + "epoch": 0.0767439294468132, + "grad_norm": 2.5910708904266357, + "learning_rate": 9.994203122780976e-06, + "loss": 3.6615, + "step": 77900 + }, + { + "epoch": 0.0767931874246353, + "grad_norm": 2.7816951274871826, + "learning_rate": 9.994195670734284e-06, + "loss": 3.629, + "step": 77950 + }, + { + "epoch": 0.07684244540245738, + "grad_norm": 2.4194846153259277, + "learning_rate": 9.994188213903545e-06, + "loss": 3.5913, + "step": 78000 + }, + { + "epoch": 0.07689170338027947, + "grad_norm": 2.4510204792022705, + "learning_rate": 9.994180752288762e-06, + "loss": 3.6189, + "step": 78050 + }, + { + "epoch": 0.07694096135810156, + "grad_norm": 3.543703079223633, + "learning_rate": 9.994173285889948e-06, + "loss": 3.5794, + "step": 78100 + }, + { + "epoch": 0.07699021933592365, + "grad_norm": 2.7752790451049805, + "learning_rate": 9.994165814707105e-06, + "loss": 3.5912, + "step": 78150 + }, + { + 
"epoch": 0.07703947731374573, + "grad_norm": 2.6488571166992188, + "learning_rate": 9.994158338740243e-06, + "loss": 3.5987, + "step": 78200 + }, + { + "epoch": 0.07708873529156782, + "grad_norm": 2.561976909637451, + "learning_rate": 9.994150857989367e-06, + "loss": 3.6018, + "step": 78250 + }, + { + "epoch": 0.07713799326938992, + "grad_norm": 3.1859700679779053, + "learning_rate": 9.994143372454487e-06, + "loss": 3.5352, + "step": 78300 + }, + { + "epoch": 0.077187251247212, + "grad_norm": 2.742384433746338, + "learning_rate": 9.99413588213561e-06, + "loss": 3.5665, + "step": 78350 + }, + { + "epoch": 0.07723650922503408, + "grad_norm": 2.7384369373321533, + "learning_rate": 9.994128387032738e-06, + "loss": 3.6001, + "step": 78400 + }, + { + "epoch": 0.07728576720285618, + "grad_norm": 2.6327786445617676, + "learning_rate": 9.994120887145886e-06, + "loss": 3.6087, + "step": 78450 + }, + { + "epoch": 0.07733502518067827, + "grad_norm": 2.589012384414673, + "learning_rate": 9.994113382475054e-06, + "loss": 3.6411, + "step": 78500 + }, + { + "epoch": 0.07738428315850035, + "grad_norm": 2.4711496829986572, + "learning_rate": 9.994105873020255e-06, + "loss": 3.6762, + "step": 78550 + }, + { + "epoch": 0.07743354113632243, + "grad_norm": 2.9146006107330322, + "learning_rate": 9.994098358781494e-06, + "loss": 3.5652, + "step": 78600 + }, + { + "epoch": 0.07748279911414453, + "grad_norm": 2.840862512588501, + "learning_rate": 9.994090839758775e-06, + "loss": 3.5447, + "step": 78650 + }, + { + "epoch": 0.07753205709196662, + "grad_norm": 2.6190738677978516, + "learning_rate": 9.99408331595211e-06, + "loss": 3.5823, + "step": 78700 + }, + { + "epoch": 0.0775813150697887, + "grad_norm": 2.683576822280884, + "learning_rate": 9.994075787361504e-06, + "loss": 3.5888, + "step": 78750 + }, + { + "epoch": 0.07763057304761078, + "grad_norm": 2.739565372467041, + "learning_rate": 9.994068253986963e-06, + "loss": 3.5937, + "step": 78800 + }, + { + "epoch": 0.07767983102543288, + 
"grad_norm": 2.480062484741211, + "learning_rate": 9.994060715828498e-06, + "loss": 3.6413, + "step": 78850 + }, + { + "epoch": 0.07772908900325497, + "grad_norm": 2.461531639099121, + "learning_rate": 9.994053172886114e-06, + "loss": 3.638, + "step": 78900 + }, + { + "epoch": 0.07777834698107705, + "grad_norm": 2.5659523010253906, + "learning_rate": 9.994045625159817e-06, + "loss": 3.5782, + "step": 78950 + }, + { + "epoch": 0.07782760495889915, + "grad_norm": 2.706355094909668, + "learning_rate": 9.994038072649615e-06, + "loss": 3.6286, + "step": 79000 + }, + { + "epoch": 0.07787686293672123, + "grad_norm": 2.6770951747894287, + "learning_rate": 9.994030515355516e-06, + "loss": 3.5217, + "step": 79050 + }, + { + "epoch": 0.07792612091454332, + "grad_norm": 2.8909950256347656, + "learning_rate": 9.994022953277526e-06, + "loss": 3.6623, + "step": 79100 + }, + { + "epoch": 0.0779753788923654, + "grad_norm": 2.6451656818389893, + "learning_rate": 9.994015386415655e-06, + "loss": 3.6235, + "step": 79150 + }, + { + "epoch": 0.0780246368701875, + "grad_norm": 2.821711301803589, + "learning_rate": 9.994007814769906e-06, + "loss": 3.65, + "step": 79200 + }, + { + "epoch": 0.07807389484800958, + "grad_norm": 2.878124952316284, + "learning_rate": 9.994000238340292e-06, + "loss": 3.6128, + "step": 79250 + }, + { + "epoch": 0.07812315282583167, + "grad_norm": 2.720065116882324, + "learning_rate": 9.993992657126815e-06, + "loss": 3.5102, + "step": 79300 + }, + { + "epoch": 0.07817241080365377, + "grad_norm": 2.8402578830718994, + "learning_rate": 9.993985071129483e-06, + "loss": 3.6065, + "step": 79350 + }, + { + "epoch": 0.07822166878147585, + "grad_norm": 2.5894715785980225, + "learning_rate": 9.993977480348305e-06, + "loss": 3.5863, + "step": 79400 + }, + { + "epoch": 0.07827092675929793, + "grad_norm": 2.5315945148468018, + "learning_rate": 9.993969884783288e-06, + "loss": 3.5701, + "step": 79450 + }, + { + "epoch": 0.07832018473712002, + "grad_norm": 2.865124464035034, + 
"learning_rate": 9.993962284434437e-06, + "loss": 3.613, + "step": 79500 + }, + { + "epoch": 0.07836944271494212, + "grad_norm": 2.835350275039673, + "learning_rate": 9.993954679301766e-06, + "loss": 3.5383, + "step": 79550 + }, + { + "epoch": 0.0784187006927642, + "grad_norm": 2.6091625690460205, + "learning_rate": 9.993947069385273e-06, + "loss": 3.6222, + "step": 79600 + }, + { + "epoch": 0.07846795867058629, + "grad_norm": 2.510575532913208, + "learning_rate": 9.993939454684971e-06, + "loss": 3.5605, + "step": 79650 + }, + { + "epoch": 0.07851721664840837, + "grad_norm": 2.857231378555298, + "learning_rate": 9.993931835200865e-06, + "loss": 3.5389, + "step": 79700 + }, + { + "epoch": 0.07856647462623047, + "grad_norm": 2.524425506591797, + "learning_rate": 9.993924210932966e-06, + "loss": 3.5836, + "step": 79750 + }, + { + "epoch": 0.07861573260405255, + "grad_norm": 2.5206527709960938, + "learning_rate": 9.993916581881277e-06, + "loss": 3.5662, + "step": 79800 + }, + { + "epoch": 0.07866499058187464, + "grad_norm": 2.827385187149048, + "learning_rate": 9.993908948045807e-06, + "loss": 3.5492, + "step": 79850 + }, + { + "epoch": 0.07871424855969673, + "grad_norm": 2.6258528232574463, + "learning_rate": 9.993901309426564e-06, + "loss": 3.5732, + "step": 79900 + }, + { + "epoch": 0.07876350653751882, + "grad_norm": 2.4983890056610107, + "learning_rate": 9.993893666023553e-06, + "loss": 3.577, + "step": 79950 + }, + { + "epoch": 0.0788127645153409, + "grad_norm": 2.4640541076660156, + "learning_rate": 9.993886017836785e-06, + "loss": 3.6314, + "step": 80000 + }, + { + "epoch": 0.07886202249316299, + "grad_norm": 2.6135013103485107, + "learning_rate": 9.993878364866263e-06, + "loss": 3.6251, + "step": 80050 + }, + { + "epoch": 0.07891128047098508, + "grad_norm": 2.940502882003784, + "learning_rate": 9.993870707111999e-06, + "loss": 3.5992, + "step": 80100 + }, + { + "epoch": 0.07896053844880717, + "grad_norm": 2.6008734703063965, + "learning_rate": 
9.993863044573996e-06, + "loss": 3.6376, + "step": 80150 + }, + { + "epoch": 0.07900979642662925, + "grad_norm": 2.803135395050049, + "learning_rate": 9.993855377252264e-06, + "loss": 3.5755, + "step": 80200 + }, + { + "epoch": 0.07905905440445135, + "grad_norm": 2.534965753555298, + "learning_rate": 9.99384770514681e-06, + "loss": 3.5858, + "step": 80250 + }, + { + "epoch": 0.07910831238227344, + "grad_norm": 2.4554474353790283, + "learning_rate": 9.99384002825764e-06, + "loss": 3.5692, + "step": 80300 + }, + { + "epoch": 0.07915757036009552, + "grad_norm": 2.5986316204071045, + "learning_rate": 9.993832346584764e-06, + "loss": 3.5769, + "step": 80350 + }, + { + "epoch": 0.0792068283379176, + "grad_norm": 2.474804162979126, + "learning_rate": 9.993824660128187e-06, + "loss": 3.6002, + "step": 80400 + }, + { + "epoch": 0.0792560863157397, + "grad_norm": 2.91159987449646, + "learning_rate": 9.993816968887915e-06, + "loss": 3.4805, + "step": 80450 + }, + { + "epoch": 0.07930534429356179, + "grad_norm": 2.571033000946045, + "learning_rate": 9.99380927286396e-06, + "loss": 3.6005, + "step": 80500 + }, + { + "epoch": 0.07935460227138387, + "grad_norm": 2.9510879516601562, + "learning_rate": 9.993801572056325e-06, + "loss": 3.63, + "step": 80550 + }, + { + "epoch": 0.07940386024920597, + "grad_norm": 2.8745508193969727, + "learning_rate": 9.99379386646502e-06, + "loss": 3.5616, + "step": 80600 + }, + { + "epoch": 0.07945311822702805, + "grad_norm": 2.6009600162506104, + "learning_rate": 9.99378615609005e-06, + "loss": 3.6428, + "step": 80650 + }, + { + "epoch": 0.07950237620485014, + "grad_norm": 2.789090633392334, + "learning_rate": 9.993778440931426e-06, + "loss": 3.577, + "step": 80700 + }, + { + "epoch": 0.07955163418267222, + "grad_norm": 2.7155349254608154, + "learning_rate": 9.993770720989153e-06, + "loss": 3.6402, + "step": 80750 + }, + { + "epoch": 0.07960089216049432, + "grad_norm": 2.6621360778808594, + "learning_rate": 9.99376299626324e-06, + "loss": 3.6139, 
+ "step": 80800 + }, + { + "epoch": 0.0796501501383164, + "grad_norm": 2.6630618572235107, + "learning_rate": 9.993755266753688e-06, + "loss": 3.5622, + "step": 80850 + }, + { + "epoch": 0.07969940811613849, + "grad_norm": 2.847508668899536, + "learning_rate": 9.993747532460514e-06, + "loss": 3.5647, + "step": 80900 + }, + { + "epoch": 0.07974866609396057, + "grad_norm": 3.3109116554260254, + "learning_rate": 9.99373979338372e-06, + "loss": 3.6132, + "step": 80950 + }, + { + "epoch": 0.07979792407178267, + "grad_norm": 2.7042760848999023, + "learning_rate": 9.993732049523314e-06, + "loss": 3.5918, + "step": 81000 + }, + { + "epoch": 0.07984718204960475, + "grad_norm": 2.6336474418640137, + "learning_rate": 9.993724300879304e-06, + "loss": 3.592, + "step": 81050 + }, + { + "epoch": 0.07989644002742684, + "grad_norm": 2.385345458984375, + "learning_rate": 9.993716547451698e-06, + "loss": 3.6123, + "step": 81100 + }, + { + "epoch": 0.07994569800524894, + "grad_norm": 2.7782771587371826, + "learning_rate": 9.9937087892405e-06, + "loss": 3.5801, + "step": 81150 + }, + { + "epoch": 0.07999495598307102, + "grad_norm": 2.6983256340026855, + "learning_rate": 9.99370102624572e-06, + "loss": 3.6101, + "step": 81200 + }, + { + "epoch": 0.0800442139608931, + "grad_norm": 3.1907289028167725, + "learning_rate": 9.99369325846737e-06, + "loss": 3.609, + "step": 81250 + }, + { + "epoch": 0.08009347193871519, + "grad_norm": 2.7355217933654785, + "learning_rate": 9.993685485905449e-06, + "loss": 3.6224, + "step": 81300 + }, + { + "epoch": 0.08014272991653729, + "grad_norm": 2.4982829093933105, + "learning_rate": 9.99367770855997e-06, + "loss": 3.6367, + "step": 81350 + }, + { + "epoch": 0.08019198789435937, + "grad_norm": 2.634140968322754, + "learning_rate": 9.993669926430938e-06, + "loss": 3.4926, + "step": 81400 + }, + { + "epoch": 0.08024124587218145, + "grad_norm": 2.3907389640808105, + "learning_rate": 9.993662139518361e-06, + "loss": 3.4938, + "step": 81450 + }, + { + "epoch": 
0.08029050385000355, + "grad_norm": 2.639362096786499, + "learning_rate": 9.993654347822246e-06, + "loss": 3.539, + "step": 81500 + }, + { + "epoch": 0.08033976182782564, + "grad_norm": 2.5869786739349365, + "learning_rate": 9.993646551342603e-06, + "loss": 3.5478, + "step": 81550 + }, + { + "epoch": 0.08038901980564772, + "grad_norm": 2.3697030544281006, + "learning_rate": 9.993638750079438e-06, + "loss": 3.6165, + "step": 81600 + }, + { + "epoch": 0.0804382777834698, + "grad_norm": 2.643968105316162, + "learning_rate": 9.993630944032756e-06, + "loss": 3.6269, + "step": 81650 + }, + { + "epoch": 0.0804875357612919, + "grad_norm": 2.7823052406311035, + "learning_rate": 9.993623133202568e-06, + "loss": 3.5842, + "step": 81700 + }, + { + "epoch": 0.08053679373911399, + "grad_norm": 2.7021989822387695, + "learning_rate": 9.99361531758888e-06, + "loss": 3.5726, + "step": 81750 + }, + { + "epoch": 0.08058605171693607, + "grad_norm": 2.6700222492218018, + "learning_rate": 9.993607497191697e-06, + "loss": 3.6108, + "step": 81800 + }, + { + "epoch": 0.08063530969475817, + "grad_norm": 2.8914437294006348, + "learning_rate": 9.993599672011032e-06, + "loss": 3.5854, + "step": 81850 + }, + { + "epoch": 0.08068456767258025, + "grad_norm": 2.826688528060913, + "learning_rate": 9.993591842046889e-06, + "loss": 3.5801, + "step": 81900 + }, + { + "epoch": 0.08073382565040234, + "grad_norm": 2.4949636459350586, + "learning_rate": 9.993584007299276e-06, + "loss": 3.5815, + "step": 81950 + }, + { + "epoch": 0.08078308362822442, + "grad_norm": 2.6836111545562744, + "learning_rate": 9.9935761677682e-06, + "loss": 3.5297, + "step": 82000 + }, + { + "epoch": 0.08083234160604652, + "grad_norm": 2.6047656536102295, + "learning_rate": 9.993568323453669e-06, + "loss": 3.5641, + "step": 82050 + }, + { + "epoch": 0.0808815995838686, + "grad_norm": 2.683647632598877, + "learning_rate": 9.993560474355692e-06, + "loss": 3.5003, + "step": 82100 + }, + { + "epoch": 0.08093085756169069, + 
"grad_norm": 2.6664469242095947, + "learning_rate": 9.993552620474274e-06, + "loss": 3.5817, + "step": 82150 + }, + { + "epoch": 0.08098011553951277, + "grad_norm": 2.6779887676239014, + "learning_rate": 9.993544761809424e-06, + "loss": 3.6213, + "step": 82200 + }, + { + "epoch": 0.08102937351733487, + "grad_norm": 2.6139307022094727, + "learning_rate": 9.993536898361149e-06, + "loss": 3.5381, + "step": 82250 + }, + { + "epoch": 0.08107863149515696, + "grad_norm": 2.6804659366607666, + "learning_rate": 9.993529030129456e-06, + "loss": 3.6028, + "step": 82300 + }, + { + "epoch": 0.08112788947297904, + "grad_norm": 2.5710816383361816, + "learning_rate": 9.993521157114355e-06, + "loss": 3.6792, + "step": 82350 + }, + { + "epoch": 0.08117714745080114, + "grad_norm": 2.461784601211548, + "learning_rate": 9.993513279315849e-06, + "loss": 3.6058, + "step": 82400 + }, + { + "epoch": 0.08122640542862322, + "grad_norm": 2.76576828956604, + "learning_rate": 9.99350539673395e-06, + "loss": 3.5626, + "step": 82450 + }, + { + "epoch": 0.0812756634064453, + "grad_norm": 2.6112589836120605, + "learning_rate": 9.993497509368664e-06, + "loss": 3.5211, + "step": 82500 + }, + { + "epoch": 0.08132492138426739, + "grad_norm": 2.596060037612915, + "learning_rate": 9.993489617219998e-06, + "loss": 3.553, + "step": 82550 + }, + { + "epoch": 0.08137417936208949, + "grad_norm": 2.709398031234741, + "learning_rate": 9.99348172028796e-06, + "loss": 3.6273, + "step": 82600 + }, + { + "epoch": 0.08142343733991157, + "grad_norm": 2.744860887527466, + "learning_rate": 9.993473818572557e-06, + "loss": 3.5767, + "step": 82650 + }, + { + "epoch": 0.08147269531773366, + "grad_norm": 2.5852866172790527, + "learning_rate": 9.993465912073798e-06, + "loss": 3.5873, + "step": 82700 + }, + { + "epoch": 0.08152195329555575, + "grad_norm": 2.9701457023620605, + "learning_rate": 9.99345800079169e-06, + "loss": 3.5377, + "step": 82750 + }, + { + "epoch": 0.08157121127337784, + "grad_norm": 2.768817901611328, + 
"learning_rate": 9.99345008472624e-06, + "loss": 3.5916, + "step": 82800 + }, + { + "epoch": 0.08162046925119992, + "grad_norm": 2.5297558307647705, + "learning_rate": 9.993442163877453e-06, + "loss": 3.4865, + "step": 82850 + }, + { + "epoch": 0.08166972722902201, + "grad_norm": 2.386624336242676, + "learning_rate": 9.993434238245343e-06, + "loss": 3.5865, + "step": 82900 + }, + { + "epoch": 0.0817189852068441, + "grad_norm": 2.538627862930298, + "learning_rate": 9.993426307829912e-06, + "loss": 3.5824, + "step": 82950 + }, + { + "epoch": 0.08176824318466619, + "grad_norm": 2.5463125705718994, + "learning_rate": 9.993418372631172e-06, + "loss": 3.523, + "step": 83000 + }, + { + "epoch": 0.08181750116248827, + "grad_norm": 2.7368357181549072, + "learning_rate": 9.993410432649126e-06, + "loss": 3.561, + "step": 83050 + }, + { + "epoch": 0.08186675914031036, + "grad_norm": 2.91473126411438, + "learning_rate": 9.993402487883785e-06, + "loss": 3.5516, + "step": 83100 + }, + { + "epoch": 0.08191601711813246, + "grad_norm": 2.842916250228882, + "learning_rate": 9.993394538335154e-06, + "loss": 3.5482, + "step": 83150 + }, + { + "epoch": 0.08196527509595454, + "grad_norm": 2.469609260559082, + "learning_rate": 9.993386584003244e-06, + "loss": 3.5721, + "step": 83200 + }, + { + "epoch": 0.08201453307377662, + "grad_norm": 2.6840031147003174, + "learning_rate": 9.99337862488806e-06, + "loss": 3.5425, + "step": 83250 + }, + { + "epoch": 0.08206379105159872, + "grad_norm": 2.7899723052978516, + "learning_rate": 9.99337066098961e-06, + "loss": 3.5184, + "step": 83300 + }, + { + "epoch": 0.0821130490294208, + "grad_norm": 2.701415538787842, + "learning_rate": 9.993362692307901e-06, + "loss": 3.5715, + "step": 83350 + }, + { + "epoch": 0.08216230700724289, + "grad_norm": 2.996854305267334, + "learning_rate": 9.993354718842943e-06, + "loss": 3.6097, + "step": 83400 + }, + { + "epoch": 0.08221156498506497, + "grad_norm": 2.7720508575439453, + "learning_rate": 
9.993346740594742e-06, + "loss": 3.5882, + "step": 83450 + }, + { + "epoch": 0.08226082296288707, + "grad_norm": 2.5792629718780518, + "learning_rate": 9.993338757563305e-06, + "loss": 3.5467, + "step": 83500 + }, + { + "epoch": 0.08231008094070916, + "grad_norm": 2.9115114212036133, + "learning_rate": 9.993330769748641e-06, + "loss": 3.5538, + "step": 83550 + }, + { + "epoch": 0.08235933891853124, + "grad_norm": 2.5848231315612793, + "learning_rate": 9.993322777150759e-06, + "loss": 3.5436, + "step": 83600 + }, + { + "epoch": 0.08240859689635334, + "grad_norm": 3.1945884227752686, + "learning_rate": 9.99331477976966e-06, + "loss": 3.5299, + "step": 83650 + }, + { + "epoch": 0.08245785487417542, + "grad_norm": 2.5118203163146973, + "learning_rate": 9.993306777605361e-06, + "loss": 3.5181, + "step": 83700 + }, + { + "epoch": 0.08250711285199751, + "grad_norm": 2.635374069213867, + "learning_rate": 9.993298770657863e-06, + "loss": 3.5096, + "step": 83750 + }, + { + "epoch": 0.08255637082981959, + "grad_norm": 2.7606754302978516, + "learning_rate": 9.993290758927176e-06, + "loss": 3.5188, + "step": 83800 + }, + { + "epoch": 0.08260562880764169, + "grad_norm": 2.7675282955169678, + "learning_rate": 9.993282742413308e-06, + "loss": 3.557, + "step": 83850 + }, + { + "epoch": 0.08265488678546377, + "grad_norm": 2.632143497467041, + "learning_rate": 9.993274721116267e-06, + "loss": 3.6154, + "step": 83900 + }, + { + "epoch": 0.08270414476328586, + "grad_norm": 2.952312469482422, + "learning_rate": 9.993266695036056e-06, + "loss": 3.5026, + "step": 83950 + }, + { + "epoch": 0.08275340274110796, + "grad_norm": 2.521624803543091, + "learning_rate": 9.993258664172691e-06, + "loss": 3.5469, + "step": 84000 + }, + { + "epoch": 0.08280266071893004, + "grad_norm": 2.532771348953247, + "learning_rate": 9.993250628526172e-06, + "loss": 3.5172, + "step": 84050 + }, + { + "epoch": 0.08285191869675212, + "grad_norm": 2.7896690368652344, + "learning_rate": 9.993242588096513e-06, + 
"loss": 3.5624, + "step": 84100 + }, + { + "epoch": 0.08290117667457421, + "grad_norm": 2.5722405910491943, + "learning_rate": 9.993234542883717e-06, + "loss": 3.5807, + "step": 84150 + }, + { + "epoch": 0.08295043465239631, + "grad_norm": 2.8355026245117188, + "learning_rate": 9.993226492887793e-06, + "loss": 3.556, + "step": 84200 + }, + { + "epoch": 0.08299969263021839, + "grad_norm": 2.637105703353882, + "learning_rate": 9.993218438108748e-06, + "loss": 3.6392, + "step": 84250 + }, + { + "epoch": 0.08304895060804048, + "grad_norm": 2.335650682449341, + "learning_rate": 9.993210378546592e-06, + "loss": 3.6058, + "step": 84300 + }, + { + "epoch": 0.08309820858586256, + "grad_norm": 3.0109500885009766, + "learning_rate": 9.99320231420133e-06, + "loss": 3.5696, + "step": 84350 + }, + { + "epoch": 0.08314746656368466, + "grad_norm": 2.690244436264038, + "learning_rate": 9.993194245072974e-06, + "loss": 3.5141, + "step": 84400 + }, + { + "epoch": 0.08319672454150674, + "grad_norm": 2.534149169921875, + "learning_rate": 9.99318617116153e-06, + "loss": 3.5352, + "step": 84450 + }, + { + "epoch": 0.08324598251932883, + "grad_norm": 2.8025267124176025, + "learning_rate": 9.993178092467e-06, + "loss": 3.5739, + "step": 84500 + }, + { + "epoch": 0.08329524049715092, + "grad_norm": 2.7458815574645996, + "learning_rate": 9.993170008989399e-06, + "loss": 3.5933, + "step": 84550 + }, + { + "epoch": 0.08334449847497301, + "grad_norm": 2.5686168670654297, + "learning_rate": 9.993161920728731e-06, + "loss": 3.5118, + "step": 84600 + }, + { + "epoch": 0.08339375645279509, + "grad_norm": 2.4805991649627686, + "learning_rate": 9.993153827685006e-06, + "loss": 3.5379, + "step": 84650 + }, + { + "epoch": 0.08344301443061718, + "grad_norm": 2.742490768432617, + "learning_rate": 9.99314572985823e-06, + "loss": 3.5668, + "step": 84700 + }, + { + "epoch": 0.08349227240843927, + "grad_norm": 2.7217047214508057, + "learning_rate": 9.993137627248411e-06, + "loss": 3.5622, + "step": 84750 + 
}, + { + "epoch": 0.08354153038626136, + "grad_norm": 2.5703630447387695, + "learning_rate": 9.993129519855558e-06, + "loss": 3.5837, + "step": 84800 + }, + { + "epoch": 0.08359078836408344, + "grad_norm": 2.553809881210327, + "learning_rate": 9.993121407679678e-06, + "loss": 3.495, + "step": 84850 + }, + { + "epoch": 0.08364004634190554, + "grad_norm": 2.5125691890716553, + "learning_rate": 9.993113290720779e-06, + "loss": 3.5538, + "step": 84900 + }, + { + "epoch": 0.08368930431972763, + "grad_norm": 3.1222012042999268, + "learning_rate": 9.993105168978868e-06, + "loss": 3.5866, + "step": 84950 + }, + { + "epoch": 0.08373856229754971, + "grad_norm": 2.4522104263305664, + "learning_rate": 9.993097042453951e-06, + "loss": 3.5397, + "step": 85000 + }, + { + "epoch": 0.0837878202753718, + "grad_norm": 2.7148947715759277, + "learning_rate": 9.993088911146041e-06, + "loss": 3.4826, + "step": 85050 + }, + { + "epoch": 0.08383707825319389, + "grad_norm": 2.6041154861450195, + "learning_rate": 9.993080775055141e-06, + "loss": 3.5449, + "step": 85100 + }, + { + "epoch": 0.08388633623101598, + "grad_norm": 3.046959400177002, + "learning_rate": 9.993072634181263e-06, + "loss": 3.5281, + "step": 85150 + }, + { + "epoch": 0.08393559420883806, + "grad_norm": 2.5197510719299316, + "learning_rate": 9.993064488524412e-06, + "loss": 3.5353, + "step": 85200 + }, + { + "epoch": 0.08398485218666016, + "grad_norm": 2.660175323486328, + "learning_rate": 9.993056338084594e-06, + "loss": 3.5334, + "step": 85250 + }, + { + "epoch": 0.08403411016448224, + "grad_norm": 2.4798688888549805, + "learning_rate": 9.99304818286182e-06, + "loss": 3.5673, + "step": 85300 + }, + { + "epoch": 0.08408336814230433, + "grad_norm": 2.895433187484741, + "learning_rate": 9.993040022856098e-06, + "loss": 3.5246, + "step": 85350 + }, + { + "epoch": 0.08413262612012641, + "grad_norm": 2.505220413208008, + "learning_rate": 9.993031858067432e-06, + "loss": 3.6022, + "step": 85400 + }, + { + "epoch": 
0.08418188409794851, + "grad_norm": 2.4558920860290527, + "learning_rate": 9.993023688495834e-06, + "loss": 3.5304, + "step": 85450 + }, + { + "epoch": 0.0842311420757706, + "grad_norm": 2.706031084060669, + "learning_rate": 9.993015514141311e-06, + "loss": 3.5175, + "step": 85500 + }, + { + "epoch": 0.08428040005359268, + "grad_norm": 2.73598575592041, + "learning_rate": 9.99300733500387e-06, + "loss": 3.5644, + "step": 85550 + }, + { + "epoch": 0.08432965803141476, + "grad_norm": 3.419055938720703, + "learning_rate": 9.992999151083518e-06, + "loss": 3.5961, + "step": 85600 + }, + { + "epoch": 0.08437891600923686, + "grad_norm": 2.535879373550415, + "learning_rate": 9.992990962380264e-06, + "loss": 3.546, + "step": 85650 + }, + { + "epoch": 0.08442817398705894, + "grad_norm": 2.622371196746826, + "learning_rate": 9.992982768894115e-06, + "loss": 3.5623, + "step": 85700 + }, + { + "epoch": 0.08447743196488103, + "grad_norm": 3.0832736492156982, + "learning_rate": 9.992974570625081e-06, + "loss": 3.5773, + "step": 85750 + }, + { + "epoch": 0.08452668994270313, + "grad_norm": 2.691267490386963, + "learning_rate": 9.992966367573167e-06, + "loss": 3.5602, + "step": 85800 + }, + { + "epoch": 0.08457594792052521, + "grad_norm": 2.6377806663513184, + "learning_rate": 9.992958159738383e-06, + "loss": 3.5294, + "step": 85850 + }, + { + "epoch": 0.0846252058983473, + "grad_norm": 2.8323581218719482, + "learning_rate": 9.992949947120737e-06, + "loss": 3.5228, + "step": 85900 + }, + { + "epoch": 0.08467446387616938, + "grad_norm": 7.656682014465332, + "learning_rate": 9.992941729720235e-06, + "loss": 3.513, + "step": 85950 + }, + { + "epoch": 0.08472372185399148, + "grad_norm": 2.609022855758667, + "learning_rate": 9.992933507536886e-06, + "loss": 3.5711, + "step": 86000 + }, + { + "epoch": 0.08477297983181356, + "grad_norm": 2.923875331878662, + "learning_rate": 9.992925280570696e-06, + "loss": 3.5956, + "step": 86050 + }, + { + "epoch": 0.08482223780963564, + "grad_norm": 
2.524531364440918, + "learning_rate": 9.992917048821677e-06, + "loss": 3.6039, + "step": 86100 + }, + { + "epoch": 0.08487149578745774, + "grad_norm": 2.6002161502838135, + "learning_rate": 9.992908812289833e-06, + "loss": 3.4705, + "step": 86150 + }, + { + "epoch": 0.08492075376527983, + "grad_norm": 2.611948251724243, + "learning_rate": 9.992900570975174e-06, + "loss": 3.6147, + "step": 86200 + }, + { + "epoch": 0.08497001174310191, + "grad_norm": 2.7273263931274414, + "learning_rate": 9.992892324877707e-06, + "loss": 3.4891, + "step": 86250 + }, + { + "epoch": 0.085019269720924, + "grad_norm": 2.6298270225524902, + "learning_rate": 9.99288407399744e-06, + "loss": 3.5598, + "step": 86300 + }, + { + "epoch": 0.0850685276987461, + "grad_norm": 2.372058629989624, + "learning_rate": 9.99287581833438e-06, + "loss": 3.5594, + "step": 86350 + }, + { + "epoch": 0.08511778567656818, + "grad_norm": 3.4173614978790283, + "learning_rate": 9.992867557888539e-06, + "loss": 3.5739, + "step": 86400 + }, + { + "epoch": 0.08516704365439026, + "grad_norm": 2.603400707244873, + "learning_rate": 9.992859292659919e-06, + "loss": 3.5375, + "step": 86450 + }, + { + "epoch": 0.08521630163221235, + "grad_norm": 2.4780337810516357, + "learning_rate": 9.99285102264853e-06, + "loss": 3.6018, + "step": 86500 + }, + { + "epoch": 0.08526555961003444, + "grad_norm": 2.813688039779663, + "learning_rate": 9.992842747854383e-06, + "loss": 3.5412, + "step": 86550 + }, + { + "epoch": 0.08531481758785653, + "grad_norm": 2.7290074825286865, + "learning_rate": 9.992834468277484e-06, + "loss": 3.5557, + "step": 86600 + }, + { + "epoch": 0.08536407556567861, + "grad_norm": 2.385192394256592, + "learning_rate": 9.992826183917839e-06, + "loss": 3.5122, + "step": 86650 + }, + { + "epoch": 0.08541333354350071, + "grad_norm": 2.5369277000427246, + "learning_rate": 9.992817894775458e-06, + "loss": 3.5678, + "step": 86700 + }, + { + "epoch": 0.0854625915213228, + "grad_norm": 2.7546024322509766, + 
"learning_rate": 9.992809600850348e-06, + "loss": 3.4976, + "step": 86750 + }, + { + "epoch": 0.08551184949914488, + "grad_norm": 2.501375913619995, + "learning_rate": 9.992801302142519e-06, + "loss": 3.5853, + "step": 86800 + }, + { + "epoch": 0.08556110747696696, + "grad_norm": 2.810171365737915, + "learning_rate": 9.992792998651977e-06, + "loss": 3.5156, + "step": 86850 + }, + { + "epoch": 0.08561036545478906, + "grad_norm": 2.856325387954712, + "learning_rate": 9.992784690378728e-06, + "loss": 3.5626, + "step": 86900 + }, + { + "epoch": 0.08565962343261115, + "grad_norm": 2.920994520187378, + "learning_rate": 9.992776377322784e-06, + "loss": 3.486, + "step": 86950 + }, + { + "epoch": 0.08570888141043323, + "grad_norm": 2.8368146419525146, + "learning_rate": 9.99276805948415e-06, + "loss": 3.5304, + "step": 87000 + }, + { + "epoch": 0.08575813938825533, + "grad_norm": 2.583176851272583, + "learning_rate": 9.992759736862838e-06, + "loss": 3.6053, + "step": 87050 + }, + { + "epoch": 0.08580739736607741, + "grad_norm": 2.7900843620300293, + "learning_rate": 9.99275140945885e-06, + "loss": 3.5985, + "step": 87100 + }, + { + "epoch": 0.0858566553438995, + "grad_norm": 3.079324960708618, + "learning_rate": 9.992743077272198e-06, + "loss": 3.6397, + "step": 87150 + }, + { + "epoch": 0.08590591332172158, + "grad_norm": 2.7468321323394775, + "learning_rate": 9.992734740302889e-06, + "loss": 3.5446, + "step": 87200 + }, + { + "epoch": 0.08595517129954368, + "grad_norm": 2.8026750087738037, + "learning_rate": 9.992726398550931e-06, + "loss": 3.5369, + "step": 87250 + }, + { + "epoch": 0.08600442927736576, + "grad_norm": 2.53709077835083, + "learning_rate": 9.992718052016333e-06, + "loss": 3.5228, + "step": 87300 + }, + { + "epoch": 0.08605368725518785, + "grad_norm": 2.520190715789795, + "learning_rate": 9.992709700699103e-06, + "loss": 3.6005, + "step": 87350 + }, + { + "epoch": 0.08610294523300994, + "grad_norm": 2.77109956741333, + "learning_rate": 
9.992701344599245e-06, + "loss": 3.5916, + "step": 87400 + }, + { + "epoch": 0.08615220321083203, + "grad_norm": 2.5478734970092773, + "learning_rate": 9.992692983716773e-06, + "loss": 3.5448, + "step": 87450 + }, + { + "epoch": 0.08620146118865411, + "grad_norm": 2.9430477619171143, + "learning_rate": 9.992684618051691e-06, + "loss": 3.5113, + "step": 87500 + }, + { + "epoch": 0.0862507191664762, + "grad_norm": 2.5092997550964355, + "learning_rate": 9.992676247604008e-06, + "loss": 3.552, + "step": 87550 + }, + { + "epoch": 0.0862999771442983, + "grad_norm": 2.726875066757202, + "learning_rate": 9.992667872373732e-06, + "loss": 3.5148, + "step": 87600 + }, + { + "epoch": 0.08634923512212038, + "grad_norm": 2.6471102237701416, + "learning_rate": 9.992659492360872e-06, + "loss": 3.5582, + "step": 87650 + }, + { + "epoch": 0.08639849309994246, + "grad_norm": 2.5109517574310303, + "learning_rate": 9.992651107565433e-06, + "loss": 3.5547, + "step": 87700 + }, + { + "epoch": 0.08644775107776455, + "grad_norm": 2.7187154293060303, + "learning_rate": 9.992642717987427e-06, + "loss": 3.4924, + "step": 87750 + }, + { + "epoch": 0.08649700905558665, + "grad_norm": 2.7590205669403076, + "learning_rate": 9.99263432362686e-06, + "loss": 3.5396, + "step": 87800 + }, + { + "epoch": 0.08654626703340873, + "grad_norm": 2.751372814178467, + "learning_rate": 9.992625924483739e-06, + "loss": 3.5425, + "step": 87850 + }, + { + "epoch": 0.08659552501123081, + "grad_norm": 2.77827525138855, + "learning_rate": 9.992617520558075e-06, + "loss": 3.5343, + "step": 87900 + }, + { + "epoch": 0.08664478298905291, + "grad_norm": 2.4039108753204346, + "learning_rate": 9.992609111849873e-06, + "loss": 3.486, + "step": 87950 + }, + { + "epoch": 0.086694040966875, + "grad_norm": 3.9515085220336914, + "learning_rate": 9.992600698359144e-06, + "loss": 3.5407, + "step": 88000 + }, + { + "epoch": 0.08674329894469708, + "grad_norm": 2.4379289150238037, + "learning_rate": 9.992592280085894e-06, + "loss": 
3.4459, + "step": 88050 + }, + { + "epoch": 0.08679255692251917, + "grad_norm": 2.4810924530029297, + "learning_rate": 9.992583857030132e-06, + "loss": 3.5778, + "step": 88100 + }, + { + "epoch": 0.08684181490034126, + "grad_norm": 2.5288853645324707, + "learning_rate": 9.992575429191864e-06, + "loss": 3.5012, + "step": 88150 + }, + { + "epoch": 0.08689107287816335, + "grad_norm": 2.4745914936065674, + "learning_rate": 9.9925669965711e-06, + "loss": 3.512, + "step": 88200 + }, + { + "epoch": 0.08694033085598543, + "grad_norm": 2.5554139614105225, + "learning_rate": 9.992558559167848e-06, + "loss": 3.5573, + "step": 88250 + }, + { + "epoch": 0.08698958883380753, + "grad_norm": 2.7023372650146484, + "learning_rate": 9.992550116982116e-06, + "loss": 3.5133, + "step": 88300 + }, + { + "epoch": 0.08703884681162961, + "grad_norm": 2.445462703704834, + "learning_rate": 9.992541670013912e-06, + "loss": 3.6232, + "step": 88350 + }, + { + "epoch": 0.0870881047894517, + "grad_norm": 2.461852788925171, + "learning_rate": 9.992533218263245e-06, + "loss": 3.5407, + "step": 88400 + }, + { + "epoch": 0.08713736276727378, + "grad_norm": 2.715186595916748, + "learning_rate": 9.99252476173012e-06, + "loss": 3.5156, + "step": 88450 + }, + { + "epoch": 0.08718662074509588, + "grad_norm": 2.658315658569336, + "learning_rate": 9.99251630041455e-06, + "loss": 3.5975, + "step": 88500 + }, + { + "epoch": 0.08723587872291796, + "grad_norm": 2.541517734527588, + "learning_rate": 9.992507834316539e-06, + "loss": 3.5416, + "step": 88550 + }, + { + "epoch": 0.08728513670074005, + "grad_norm": 2.6636147499084473, + "learning_rate": 9.992499363436095e-06, + "loss": 3.499, + "step": 88600 + }, + { + "epoch": 0.08733439467856215, + "grad_norm": 2.708833694458008, + "learning_rate": 9.99249088777323e-06, + "loss": 3.5042, + "step": 88650 + }, + { + "epoch": 0.08738365265638423, + "grad_norm": 2.585986375808716, + "learning_rate": 9.992482407327948e-06, + "loss": 3.5843, + "step": 88700 + }, + { + 
"epoch": 0.08743291063420632, + "grad_norm": 2.4351749420166016, + "learning_rate": 9.992473922100259e-06, + "loss": 3.5687, + "step": 88750 + }, + { + "epoch": 0.0874821686120284, + "grad_norm": 2.382880449295044, + "learning_rate": 9.99246543209017e-06, + "loss": 3.5761, + "step": 88800 + }, + { + "epoch": 0.0875314265898505, + "grad_norm": 2.5152156352996826, + "learning_rate": 9.992456937297692e-06, + "loss": 3.5486, + "step": 88850 + }, + { + "epoch": 0.08758068456767258, + "grad_norm": 2.8831043243408203, + "learning_rate": 9.99244843772283e-06, + "loss": 3.5019, + "step": 88900 + }, + { + "epoch": 0.08762994254549467, + "grad_norm": 2.4755122661590576, + "learning_rate": 9.992439933365594e-06, + "loss": 3.5072, + "step": 88950 + }, + { + "epoch": 0.08767920052331675, + "grad_norm": 2.9149246215820312, + "learning_rate": 9.992431424225992e-06, + "loss": 3.5254, + "step": 89000 + }, + { + "epoch": 0.08772845850113885, + "grad_norm": 2.470930814743042, + "learning_rate": 9.992422910304031e-06, + "loss": 3.5119, + "step": 89050 + }, + { + "epoch": 0.08777771647896093, + "grad_norm": 2.594900608062744, + "learning_rate": 9.99241439159972e-06, + "loss": 3.5471, + "step": 89100 + }, + { + "epoch": 0.08782697445678302, + "grad_norm": 2.8226711750030518, + "learning_rate": 9.992405868113066e-06, + "loss": 3.5427, + "step": 89150 + }, + { + "epoch": 0.08787623243460511, + "grad_norm": 2.391951322555542, + "learning_rate": 9.992397339844079e-06, + "loss": 3.477, + "step": 89200 + }, + { + "epoch": 0.0879254904124272, + "grad_norm": 2.280829429626465, + "learning_rate": 9.992388806792765e-06, + "loss": 3.5308, + "step": 89250 + }, + { + "epoch": 0.08797474839024928, + "grad_norm": 2.977295160293579, + "learning_rate": 9.992380268959134e-06, + "loss": 3.5034, + "step": 89300 + }, + { + "epoch": 0.08802400636807137, + "grad_norm": 2.482581615447998, + "learning_rate": 9.992371726343195e-06, + "loss": 3.53, + "step": 89350 + }, + { + "epoch": 0.08807326434589346, + 
"grad_norm": 2.6620121002197266, + "learning_rate": 9.992363178944954e-06, + "loss": 3.4888, + "step": 89400 + }, + { + "epoch": 0.08812252232371555, + "grad_norm": 2.6313095092773438, + "learning_rate": 9.99235462676442e-06, + "loss": 3.5438, + "step": 89450 + }, + { + "epoch": 0.08817178030153763, + "grad_norm": 2.8939120769500732, + "learning_rate": 9.992346069801602e-06, + "loss": 3.5419, + "step": 89500 + }, + { + "epoch": 0.08822103827935973, + "grad_norm": 2.3940343856811523, + "learning_rate": 9.992337508056505e-06, + "loss": 3.5407, + "step": 89550 + }, + { + "epoch": 0.08827029625718182, + "grad_norm": 2.6430909633636475, + "learning_rate": 9.992328941529141e-06, + "loss": 3.5123, + "step": 89600 + }, + { + "epoch": 0.0883195542350039, + "grad_norm": 2.820066452026367, + "learning_rate": 9.992320370219516e-06, + "loss": 3.5074, + "step": 89650 + }, + { + "epoch": 0.08836881221282598, + "grad_norm": 2.598034143447876, + "learning_rate": 9.992311794127641e-06, + "loss": 3.5068, + "step": 89700 + }, + { + "epoch": 0.08841807019064808, + "grad_norm": 2.3448190689086914, + "learning_rate": 9.992303213253522e-06, + "loss": 3.5129, + "step": 89750 + }, + { + "epoch": 0.08846732816847017, + "grad_norm": 2.763636350631714, + "learning_rate": 9.992294627597166e-06, + "loss": 3.4955, + "step": 89800 + }, + { + "epoch": 0.08851658614629225, + "grad_norm": 2.390789270401001, + "learning_rate": 9.992286037158584e-06, + "loss": 3.5049, + "step": 89850 + }, + { + "epoch": 0.08856584412411435, + "grad_norm": 2.3159124851226807, + "learning_rate": 9.992277441937781e-06, + "loss": 3.5574, + "step": 89900 + }, + { + "epoch": 0.08861510210193643, + "grad_norm": 2.6102449893951416, + "learning_rate": 9.99226884193477e-06, + "loss": 3.4645, + "step": 89950 + }, + { + "epoch": 0.08866436007975852, + "grad_norm": 2.8563361167907715, + "learning_rate": 9.992260237149554e-06, + "loss": 3.5104, + "step": 90000 + }, + { + "epoch": 0.0887136180575806, + "grad_norm": 
2.7328379154205322, + "learning_rate": 9.992251627582145e-06, + "loss": 3.5954, + "step": 90050 + }, + { + "epoch": 0.0887628760354027, + "grad_norm": 2.6661548614501953, + "learning_rate": 9.992243013232549e-06, + "loss": 3.544, + "step": 90100 + }, + { + "epoch": 0.08881213401322478, + "grad_norm": 2.617321014404297, + "learning_rate": 9.992234394100775e-06, + "loss": 3.5255, + "step": 90150 + }, + { + "epoch": 0.08886139199104687, + "grad_norm": 2.4210362434387207, + "learning_rate": 9.992225770186833e-06, + "loss": 3.5065, + "step": 90200 + }, + { + "epoch": 0.08891064996886895, + "grad_norm": 2.6725893020629883, + "learning_rate": 9.992217141490727e-06, + "loss": 3.5117, + "step": 90250 + }, + { + "epoch": 0.08895990794669105, + "grad_norm": 2.578807830810547, + "learning_rate": 9.99220850801247e-06, + "loss": 3.5545, + "step": 90300 + }, + { + "epoch": 0.08900916592451313, + "grad_norm": 2.3690056800842285, + "learning_rate": 9.992199869752066e-06, + "loss": 3.503, + "step": 90350 + }, + { + "epoch": 0.08905842390233522, + "grad_norm": 2.781726837158203, + "learning_rate": 9.992191226709528e-06, + "loss": 3.5626, + "step": 90400 + }, + { + "epoch": 0.08910768188015732, + "grad_norm": 2.4719045162200928, + "learning_rate": 9.99218257888486e-06, + "loss": 3.5145, + "step": 90450 + }, + { + "epoch": 0.0891569398579794, + "grad_norm": 2.624699115753174, + "learning_rate": 9.992173926278072e-06, + "loss": 3.4759, + "step": 90500 + }, + { + "epoch": 0.08920619783580148, + "grad_norm": 2.647613286972046, + "learning_rate": 9.992165268889174e-06, + "loss": 3.5719, + "step": 90550 + }, + { + "epoch": 0.08925545581362357, + "grad_norm": 2.4592554569244385, + "learning_rate": 9.99215660671817e-06, + "loss": 3.5596, + "step": 90600 + }, + { + "epoch": 0.08930471379144567, + "grad_norm": 2.569150924682617, + "learning_rate": 9.992147939765073e-06, + "loss": 3.5558, + "step": 90650 + }, + { + "epoch": 0.08935397176926775, + "grad_norm": 2.7038497924804688, + 
"learning_rate": 9.99213926802989e-06, + "loss": 3.4813, + "step": 90700 + }, + { + "epoch": 0.08940322974708984, + "grad_norm": 2.477156162261963, + "learning_rate": 9.992130591512627e-06, + "loss": 3.5581, + "step": 90750 + }, + { + "epoch": 0.08945248772491193, + "grad_norm": 2.522063732147217, + "learning_rate": 9.992121910213294e-06, + "loss": 3.5009, + "step": 90800 + }, + { + "epoch": 0.08950174570273402, + "grad_norm": 2.414885997772217, + "learning_rate": 9.992113224131898e-06, + "loss": 3.4959, + "step": 90850 + }, + { + "epoch": 0.0895510036805561, + "grad_norm": 2.3920798301696777, + "learning_rate": 9.99210453326845e-06, + "loss": 3.4681, + "step": 90900 + }, + { + "epoch": 0.08960026165837819, + "grad_norm": 2.5773704051971436, + "learning_rate": 9.992095837622957e-06, + "loss": 3.5545, + "step": 90950 + }, + { + "epoch": 0.08964951963620028, + "grad_norm": 2.5034501552581787, + "learning_rate": 9.992087137195426e-06, + "loss": 3.5353, + "step": 91000 + }, + { + "epoch": 0.08969877761402237, + "grad_norm": 2.741173028945923, + "learning_rate": 9.992078431985867e-06, + "loss": 3.5645, + "step": 91050 + }, + { + "epoch": 0.08974803559184445, + "grad_norm": 2.4738497734069824, + "learning_rate": 9.992069721994286e-06, + "loss": 3.573, + "step": 91100 + }, + { + "epoch": 0.08979729356966654, + "grad_norm": 2.6511213779449463, + "learning_rate": 9.992061007220695e-06, + "loss": 3.4823, + "step": 91150 + }, + { + "epoch": 0.08984655154748863, + "grad_norm": 2.5052990913391113, + "learning_rate": 9.992052287665102e-06, + "loss": 3.5282, + "step": 91200 + }, + { + "epoch": 0.08989580952531072, + "grad_norm": 2.4966471195220947, + "learning_rate": 9.992043563327511e-06, + "loss": 3.5504, + "step": 91250 + }, + { + "epoch": 0.0899450675031328, + "grad_norm": 2.365420341491699, + "learning_rate": 9.992034834207933e-06, + "loss": 3.5086, + "step": 91300 + }, + { + "epoch": 0.0899943254809549, + "grad_norm": 2.395207166671753, + "learning_rate": 
9.992026100306378e-06, + "loss": 3.5467, + "step": 91350 + }, + { + "epoch": 0.09004358345877699, + "grad_norm": 2.4911882877349854, + "learning_rate": 9.992017361622853e-06, + "loss": 3.5371, + "step": 91400 + }, + { + "epoch": 0.09009284143659907, + "grad_norm": 2.4821033477783203, + "learning_rate": 9.992008618157366e-06, + "loss": 3.5372, + "step": 91450 + }, + { + "epoch": 0.09014209941442115, + "grad_norm": 2.7418088912963867, + "learning_rate": 9.991999869909926e-06, + "loss": 3.5821, + "step": 91500 + }, + { + "epoch": 0.09019135739224325, + "grad_norm": 2.600388526916504, + "learning_rate": 9.991991116880539e-06, + "loss": 3.4726, + "step": 91550 + }, + { + "epoch": 0.09024061537006534, + "grad_norm": 2.75099515914917, + "learning_rate": 9.991982359069219e-06, + "loss": 3.5501, + "step": 91600 + }, + { + "epoch": 0.09028987334788742, + "grad_norm": 2.580803394317627, + "learning_rate": 9.991973596475968e-06, + "loss": 3.4941, + "step": 91650 + }, + { + "epoch": 0.09033913132570952, + "grad_norm": 2.90989089012146, + "learning_rate": 9.991964829100797e-06, + "loss": 3.4952, + "step": 91700 + }, + { + "epoch": 0.0903883893035316, + "grad_norm": 2.6223320960998535, + "learning_rate": 9.991956056943718e-06, + "loss": 3.4964, + "step": 91750 + }, + { + "epoch": 0.09043764728135369, + "grad_norm": 2.605656862258911, + "learning_rate": 9.991947280004732e-06, + "loss": 3.5144, + "step": 91800 + }, + { + "epoch": 0.09048690525917577, + "grad_norm": 2.6077051162719727, + "learning_rate": 9.991938498283852e-06, + "loss": 3.5102, + "step": 91850 + }, + { + "epoch": 0.09053616323699787, + "grad_norm": 2.599778890609741, + "learning_rate": 9.991929711781087e-06, + "loss": 3.4575, + "step": 91900 + }, + { + "epoch": 0.09058542121481995, + "grad_norm": 2.926546812057495, + "learning_rate": 9.991920920496445e-06, + "loss": 3.6151, + "step": 91950 + }, + { + "epoch": 0.09063467919264204, + "grad_norm": 2.5250444412231445, + "learning_rate": 9.991912124429932e-06, + "loss": 
3.5043, + "step": 92000 + }, + { + "epoch": 0.09068393717046414, + "grad_norm": 2.434934616088867, + "learning_rate": 9.99190332358156e-06, + "loss": 3.4979, + "step": 92050 + }, + { + "epoch": 0.09073319514828622, + "grad_norm": 2.545457124710083, + "learning_rate": 9.991894517951334e-06, + "loss": 3.4963, + "step": 92100 + }, + { + "epoch": 0.0907824531261083, + "grad_norm": 2.6983044147491455, + "learning_rate": 9.991885707539264e-06, + "loss": 3.5408, + "step": 92150 + }, + { + "epoch": 0.09083171110393039, + "grad_norm": 2.5740928649902344, + "learning_rate": 9.991876892345359e-06, + "loss": 3.4939, + "step": 92200 + }, + { + "epoch": 0.09088096908175249, + "grad_norm": 2.4838335514068604, + "learning_rate": 9.991868072369627e-06, + "loss": 3.5084, + "step": 92250 + }, + { + "epoch": 0.09093022705957457, + "grad_norm": 2.5277609825134277, + "learning_rate": 9.991859247612075e-06, + "loss": 3.5361, + "step": 92300 + }, + { + "epoch": 0.09097948503739665, + "grad_norm": 2.6219544410705566, + "learning_rate": 9.991850418072713e-06, + "loss": 3.5214, + "step": 92350 + }, + { + "epoch": 0.09102874301521874, + "grad_norm": 2.58064866065979, + "learning_rate": 9.99184158375155e-06, + "loss": 3.4821, + "step": 92400 + }, + { + "epoch": 0.09107800099304084, + "grad_norm": 2.35748028755188, + "learning_rate": 9.991832744648593e-06, + "loss": 3.5567, + "step": 92450 + }, + { + "epoch": 0.09112725897086292, + "grad_norm": 2.5825164318084717, + "learning_rate": 9.991823900763851e-06, + "loss": 3.4979, + "step": 92500 + }, + { + "epoch": 0.091176516948685, + "grad_norm": 2.380385160446167, + "learning_rate": 9.991815052097333e-06, + "loss": 3.5532, + "step": 92550 + }, + { + "epoch": 0.0912257749265071, + "grad_norm": 2.7011330127716064, + "learning_rate": 9.991806198649048e-06, + "loss": 3.4907, + "step": 92600 + }, + { + "epoch": 0.09127503290432919, + "grad_norm": 2.463148593902588, + "learning_rate": 9.991797340419003e-06, + "loss": 3.5428, + "step": 92650 + }, + { + 
"epoch": 0.09132429088215127, + "grad_norm": 2.667295455932617, + "learning_rate": 9.991788477407205e-06, + "loss": 3.5165, + "step": 92700 + }, + { + "epoch": 0.09137354885997336, + "grad_norm": 2.5165274143218994, + "learning_rate": 9.991779609613668e-06, + "loss": 3.5115, + "step": 92750 + }, + { + "epoch": 0.09142280683779545, + "grad_norm": 2.5392086505889893, + "learning_rate": 9.991770737038395e-06, + "loss": 3.4982, + "step": 92800 + }, + { + "epoch": 0.09147206481561754, + "grad_norm": 3.4226720333099365, + "learning_rate": 9.991761859681397e-06, + "loss": 3.4924, + "step": 92850 + }, + { + "epoch": 0.09152132279343962, + "grad_norm": 2.593413829803467, + "learning_rate": 9.991752977542681e-06, + "loss": 3.5206, + "step": 92900 + }, + { + "epoch": 0.09157058077126172, + "grad_norm": 2.3894574642181396, + "learning_rate": 9.991744090622258e-06, + "loss": 3.5297, + "step": 92950 + }, + { + "epoch": 0.0916198387490838, + "grad_norm": 2.5359370708465576, + "learning_rate": 9.991735198920133e-06, + "loss": 3.5035, + "step": 93000 + }, + { + "epoch": 0.09166909672690589, + "grad_norm": 2.5807111263275146, + "learning_rate": 9.991726302436318e-06, + "loss": 3.5291, + "step": 93050 + }, + { + "epoch": 0.09171835470472797, + "grad_norm": 2.6205883026123047, + "learning_rate": 9.99171740117082e-06, + "loss": 3.4919, + "step": 93100 + }, + { + "epoch": 0.09176761268255007, + "grad_norm": 2.713055372238159, + "learning_rate": 9.99170849512365e-06, + "loss": 3.4739, + "step": 93150 + }, + { + "epoch": 0.09181687066037215, + "grad_norm": 2.6073999404907227, + "learning_rate": 9.991699584294811e-06, + "loss": 3.4595, + "step": 93200 + }, + { + "epoch": 0.09186612863819424, + "grad_norm": 2.469595193862915, + "learning_rate": 9.991690668684315e-06, + "loss": 3.4965, + "step": 93250 + }, + { + "epoch": 0.09191538661601634, + "grad_norm": 2.6497719287872314, + "learning_rate": 9.99168174829217e-06, + "loss": 3.5542, + "step": 93300 + }, + { + "epoch": 0.09196464459383842, + 
"grad_norm": 2.485102891921997, + "learning_rate": 9.991672823118386e-06, + "loss": 3.5522, + "step": 93350 + }, + { + "epoch": 0.0920139025716605, + "grad_norm": 2.45676589012146, + "learning_rate": 9.99166389316297e-06, + "loss": 3.4387, + "step": 93400 + }, + { + "epoch": 0.09206316054948259, + "grad_norm": 2.5401172637939453, + "learning_rate": 9.99165495842593e-06, + "loss": 3.4868, + "step": 93450 + }, + { + "epoch": 0.09211241852730469, + "grad_norm": 2.681870698928833, + "learning_rate": 9.991646018907277e-06, + "loss": 3.501, + "step": 93500 + }, + { + "epoch": 0.09216167650512677, + "grad_norm": 2.6447935104370117, + "learning_rate": 9.991637074607017e-06, + "loss": 3.5465, + "step": 93550 + }, + { + "epoch": 0.09221093448294886, + "grad_norm": 2.541200876235962, + "learning_rate": 9.99162812552516e-06, + "loss": 3.5026, + "step": 93600 + }, + { + "epoch": 0.09226019246077094, + "grad_norm": 2.5497496128082275, + "learning_rate": 9.991619171661713e-06, + "loss": 3.5523, + "step": 93650 + }, + { + "epoch": 0.09230945043859304, + "grad_norm": 2.5159003734588623, + "learning_rate": 9.991610213016687e-06, + "loss": 3.5206, + "step": 93700 + }, + { + "epoch": 0.09235870841641512, + "grad_norm": 2.498478412628174, + "learning_rate": 9.991601249590088e-06, + "loss": 3.4579, + "step": 93750 + }, + { + "epoch": 0.0924079663942372, + "grad_norm": 2.622728109359741, + "learning_rate": 9.991592281381928e-06, + "loss": 3.5274, + "step": 93800 + }, + { + "epoch": 0.0924572243720593, + "grad_norm": 2.6374659538269043, + "learning_rate": 9.99158330839221e-06, + "loss": 3.498, + "step": 93850 + }, + { + "epoch": 0.09250648234988139, + "grad_norm": 2.5417380332946777, + "learning_rate": 9.99157433062095e-06, + "loss": 3.4728, + "step": 93900 + }, + { + "epoch": 0.09255574032770347, + "grad_norm": 2.5940327644348145, + "learning_rate": 9.991565348068151e-06, + "loss": 3.4344, + "step": 93950 + }, + { + "epoch": 0.09260499830552556, + "grad_norm": 2.634363889694214, + 
"learning_rate": 9.991556360733823e-06, + "loss": 3.4755, + "step": 94000 + }, + { + "epoch": 0.09265425628334766, + "grad_norm": 2.5082905292510986, + "learning_rate": 9.991547368617974e-06, + "loss": 3.5322, + "step": 94050 + }, + { + "epoch": 0.09270351426116974, + "grad_norm": 2.5004141330718994, + "learning_rate": 9.991538371720615e-06, + "loss": 3.4453, + "step": 94100 + }, + { + "epoch": 0.09275277223899182, + "grad_norm": 2.683391809463501, + "learning_rate": 9.991529370041752e-06, + "loss": 3.497, + "step": 94150 + }, + { + "epoch": 0.09280203021681392, + "grad_norm": 2.576052188873291, + "learning_rate": 9.991520363581395e-06, + "loss": 3.4334, + "step": 94200 + }, + { + "epoch": 0.092851288194636, + "grad_norm": 2.607886552810669, + "learning_rate": 9.991511352339554e-06, + "loss": 3.448, + "step": 94250 + }, + { + "epoch": 0.09290054617245809, + "grad_norm": 2.674027919769287, + "learning_rate": 9.991502336316234e-06, + "loss": 3.4608, + "step": 94300 + }, + { + "epoch": 0.09294980415028017, + "grad_norm": 2.5196359157562256, + "learning_rate": 9.991493315511447e-06, + "loss": 3.4313, + "step": 94350 + }, + { + "epoch": 0.09299906212810227, + "grad_norm": 2.801942825317383, + "learning_rate": 9.991484289925199e-06, + "loss": 3.5405, + "step": 94400 + }, + { + "epoch": 0.09304832010592436, + "grad_norm": 2.6478660106658936, + "learning_rate": 9.991475259557501e-06, + "loss": 3.4951, + "step": 94450 + }, + { + "epoch": 0.09309757808374644, + "grad_norm": 3.284269332885742, + "learning_rate": 9.99146622440836e-06, + "loss": 3.5049, + "step": 94500 + }, + { + "epoch": 0.09314683606156852, + "grad_norm": 2.835618257522583, + "learning_rate": 9.991457184477784e-06, + "loss": 3.5369, + "step": 94550 + }, + { + "epoch": 0.09319609403939062, + "grad_norm": 2.5260472297668457, + "learning_rate": 9.991448139765785e-06, + "loss": 3.4654, + "step": 94600 + }, + { + "epoch": 0.09324535201721271, + "grad_norm": 2.541609287261963, + "learning_rate": 
9.99143909027237e-06, + "loss": 3.5424, + "step": 94650 + }, + { + "epoch": 0.09329460999503479, + "grad_norm": 2.5830399990081787, + "learning_rate": 9.991430035997544e-06, + "loss": 3.5411, + "step": 94700 + }, + { + "epoch": 0.09334386797285689, + "grad_norm": 2.764085531234741, + "learning_rate": 9.991420976941322e-06, + "loss": 3.4757, + "step": 94750 + }, + { + "epoch": 0.09339312595067897, + "grad_norm": 2.7741506099700928, + "learning_rate": 9.991411913103708e-06, + "loss": 3.5134, + "step": 94800 + }, + { + "epoch": 0.09344238392850106, + "grad_norm": 2.8838443756103516, + "learning_rate": 9.991402844484712e-06, + "loss": 3.5217, + "step": 94850 + }, + { + "epoch": 0.09349164190632314, + "grad_norm": 2.6583337783813477, + "learning_rate": 9.991393771084343e-06, + "loss": 3.5017, + "step": 94900 + }, + { + "epoch": 0.09354089988414524, + "grad_norm": 2.9703612327575684, + "learning_rate": 9.99138469290261e-06, + "loss": 3.4961, + "step": 94950 + }, + { + "epoch": 0.09359015786196732, + "grad_norm": 2.417046546936035, + "learning_rate": 9.991375609939524e-06, + "loss": 3.493, + "step": 95000 + }, + { + "epoch": 0.09363941583978941, + "grad_norm": 2.505190372467041, + "learning_rate": 9.991366522195088e-06, + "loss": 3.476, + "step": 95050 + }, + { + "epoch": 0.0936886738176115, + "grad_norm": 2.5905630588531494, + "learning_rate": 9.991357429669314e-06, + "loss": 3.4537, + "step": 95100 + }, + { + "epoch": 0.09373793179543359, + "grad_norm": 2.4609169960021973, + "learning_rate": 9.99134833236221e-06, + "loss": 3.4682, + "step": 95150 + }, + { + "epoch": 0.09378718977325567, + "grad_norm": 2.482228994369507, + "learning_rate": 9.991339230273786e-06, + "loss": 3.5, + "step": 95200 + }, + { + "epoch": 0.09383644775107776, + "grad_norm": 2.9098143577575684, + "learning_rate": 9.991330123404051e-06, + "loss": 3.513, + "step": 95250 + }, + { + "epoch": 0.09388570572889986, + "grad_norm": 2.8260180950164795, + "learning_rate": 9.99132101175301e-06, + "loss": 
3.4946, + "step": 95300 + }, + { + "epoch": 0.09393496370672194, + "grad_norm": 2.4881205558776855, + "learning_rate": 9.991311895320677e-06, + "loss": 3.5306, + "step": 95350 + }, + { + "epoch": 0.09398422168454403, + "grad_norm": 2.8624229431152344, + "learning_rate": 9.991302774107055e-06, + "loss": 3.4836, + "step": 95400 + }, + { + "epoch": 0.09403347966236612, + "grad_norm": 2.796124219894409, + "learning_rate": 9.991293648112158e-06, + "loss": 3.5702, + "step": 95450 + }, + { + "epoch": 0.09408273764018821, + "grad_norm": 2.7107365131378174, + "learning_rate": 9.991284517335993e-06, + "loss": 3.4546, + "step": 95500 + }, + { + "epoch": 0.09413199561801029, + "grad_norm": 2.4883761405944824, + "learning_rate": 9.991275381778566e-06, + "loss": 3.4415, + "step": 95550 + }, + { + "epoch": 0.09418125359583238, + "grad_norm": 2.442410707473755, + "learning_rate": 9.99126624143989e-06, + "loss": 3.5253, + "step": 95600 + }, + { + "epoch": 0.09423051157365447, + "grad_norm": 2.4142189025878906, + "learning_rate": 9.99125709631997e-06, + "loss": 3.5348, + "step": 95650 + }, + { + "epoch": 0.09427976955147656, + "grad_norm": 2.625922679901123, + "learning_rate": 9.991247946418818e-06, + "loss": 3.4561, + "step": 95700 + }, + { + "epoch": 0.09432902752929864, + "grad_norm": 2.813594102859497, + "learning_rate": 9.991238791736442e-06, + "loss": 3.5081, + "step": 95750 + }, + { + "epoch": 0.09437828550712073, + "grad_norm": 2.6372132301330566, + "learning_rate": 9.991229632272849e-06, + "loss": 3.5865, + "step": 95800 + }, + { + "epoch": 0.09442754348494282, + "grad_norm": 2.5123181343078613, + "learning_rate": 9.991220468028048e-06, + "loss": 3.4986, + "step": 95850 + }, + { + "epoch": 0.09447680146276491, + "grad_norm": 2.395050048828125, + "learning_rate": 9.991211299002051e-06, + "loss": 3.4792, + "step": 95900 + }, + { + "epoch": 0.09452605944058699, + "grad_norm": 2.519516944885254, + "learning_rate": 9.991202125194863e-06, + "loss": 3.4771, + "step": 95950 + }, + 
{ + "epoch": 0.09457531741840909, + "grad_norm": 2.4439661502838135, + "learning_rate": 9.991192946606493e-06, + "loss": 3.4459, + "step": 96000 + }, + { + "epoch": 0.09462457539623118, + "grad_norm": 2.567617654800415, + "learning_rate": 9.991183763236954e-06, + "loss": 3.4779, + "step": 96050 + }, + { + "epoch": 0.09467383337405326, + "grad_norm": 2.4986140727996826, + "learning_rate": 9.991174575086249e-06, + "loss": 3.4662, + "step": 96100 + }, + { + "epoch": 0.09472309135187534, + "grad_norm": 2.4114882946014404, + "learning_rate": 9.99116538215439e-06, + "loss": 3.4822, + "step": 96150 + }, + { + "epoch": 0.09477234932969744, + "grad_norm": 2.7959210872650146, + "learning_rate": 9.991156184441387e-06, + "loss": 3.5351, + "step": 96200 + }, + { + "epoch": 0.09482160730751953, + "grad_norm": 3.051692008972168, + "learning_rate": 9.991146981947246e-06, + "loss": 3.4586, + "step": 96250 + }, + { + "epoch": 0.09487086528534161, + "grad_norm": 2.402531862258911, + "learning_rate": 9.991137774671977e-06, + "loss": 3.4468, + "step": 96300 + }, + { + "epoch": 0.09492012326316371, + "grad_norm": 2.6857526302337646, + "learning_rate": 9.991128562615589e-06, + "loss": 3.4635, + "step": 96350 + }, + { + "epoch": 0.09496938124098579, + "grad_norm": 2.657498598098755, + "learning_rate": 9.99111934577809e-06, + "loss": 3.5091, + "step": 96400 + }, + { + "epoch": 0.09501863921880788, + "grad_norm": 2.8503901958465576, + "learning_rate": 9.991110124159491e-06, + "loss": 3.5072, + "step": 96450 + }, + { + "epoch": 0.09506789719662996, + "grad_norm": 2.5324859619140625, + "learning_rate": 9.991100897759799e-06, + "loss": 3.4034, + "step": 96500 + }, + { + "epoch": 0.09511715517445206, + "grad_norm": 2.7217602729797363, + "learning_rate": 9.991091666579022e-06, + "loss": 3.4746, + "step": 96550 + }, + { + "epoch": 0.09516641315227414, + "grad_norm": 2.5247881412506104, + "learning_rate": 9.991082430617172e-06, + "loss": 3.4707, + "step": 96600 + }, + { + "epoch": 
0.09521567113009623, + "grad_norm": 2.921921968460083, + "learning_rate": 9.991073189874254e-06, + "loss": 3.4536, + "step": 96650 + }, + { + "epoch": 0.09526492910791833, + "grad_norm": 2.5481796264648438, + "learning_rate": 9.99106394435028e-06, + "loss": 3.4458, + "step": 96700 + }, + { + "epoch": 0.09531418708574041, + "grad_norm": 2.909856081008911, + "learning_rate": 9.991054694045257e-06, + "loss": 3.5254, + "step": 96750 + }, + { + "epoch": 0.0953634450635625, + "grad_norm": 2.6333370208740234, + "learning_rate": 9.991045438959195e-06, + "loss": 3.4705, + "step": 96800 + }, + { + "epoch": 0.09541270304138458, + "grad_norm": 2.746682643890381, + "learning_rate": 9.991036179092101e-06, + "loss": 3.4788, + "step": 96850 + }, + { + "epoch": 0.09546196101920668, + "grad_norm": 2.63202166557312, + "learning_rate": 9.991026914443986e-06, + "loss": 3.4861, + "step": 96900 + }, + { + "epoch": 0.09551121899702876, + "grad_norm": 2.33595609664917, + "learning_rate": 9.991017645014857e-06, + "loss": 3.4755, + "step": 96950 + }, + { + "epoch": 0.09556047697485084, + "grad_norm": 2.4882144927978516, + "learning_rate": 9.991008370804726e-06, + "loss": 3.455, + "step": 97000 + }, + { + "epoch": 0.09560973495267293, + "grad_norm": 2.552701234817505, + "learning_rate": 9.990999091813598e-06, + "loss": 3.4825, + "step": 97050 + }, + { + "epoch": 0.09565899293049503, + "grad_norm": 2.5472116470336914, + "learning_rate": 9.990989808041484e-06, + "loss": 3.4527, + "step": 97100 + }, + { + "epoch": 0.09570825090831711, + "grad_norm": 2.492727279663086, + "learning_rate": 9.990980519488393e-06, + "loss": 3.5282, + "step": 97150 + }, + { + "epoch": 0.0957575088861392, + "grad_norm": 2.5237650871276855, + "learning_rate": 9.990971226154333e-06, + "loss": 3.4578, + "step": 97200 + }, + { + "epoch": 0.09580676686396129, + "grad_norm": 2.453138828277588, + "learning_rate": 9.990961928039316e-06, + "loss": 3.4018, + "step": 97250 + }, + { + "epoch": 0.09585602484178338, + "grad_norm": 
2.765453338623047, + "learning_rate": 9.990952625143344e-06, + "loss": 3.5062, + "step": 97300 + }, + { + "epoch": 0.09590528281960546, + "grad_norm": 2.573054790496826, + "learning_rate": 9.990943317466433e-06, + "loss": 3.5221, + "step": 97350 + }, + { + "epoch": 0.09595454079742755, + "grad_norm": 2.7383389472961426, + "learning_rate": 9.99093400500859e-06, + "loss": 3.5395, + "step": 97400 + }, + { + "epoch": 0.09600379877524964, + "grad_norm": 2.4440360069274902, + "learning_rate": 9.990924687769821e-06, + "loss": 3.5486, + "step": 97450 + }, + { + "epoch": 0.09605305675307173, + "grad_norm": 2.4890618324279785, + "learning_rate": 9.990915365750137e-06, + "loss": 3.5377, + "step": 97500 + }, + { + "epoch": 0.09610231473089381, + "grad_norm": 2.441108465194702, + "learning_rate": 9.990906038949547e-06, + "loss": 3.5227, + "step": 97550 + }, + { + "epoch": 0.09615157270871591, + "grad_norm": 2.552497625350952, + "learning_rate": 9.99089670736806e-06, + "loss": 3.5359, + "step": 97600 + }, + { + "epoch": 0.096200830686538, + "grad_norm": 2.4606144428253174, + "learning_rate": 9.990887371005685e-06, + "loss": 3.4534, + "step": 97650 + }, + { + "epoch": 0.09625008866436008, + "grad_norm": 2.6555192470550537, + "learning_rate": 9.990878029862432e-06, + "loss": 3.486, + "step": 97700 + }, + { + "epoch": 0.09629934664218216, + "grad_norm": 2.334169864654541, + "learning_rate": 9.990868683938307e-06, + "loss": 3.4815, + "step": 97750 + }, + { + "epoch": 0.09634860462000426, + "grad_norm": 2.764765739440918, + "learning_rate": 9.99085933323332e-06, + "loss": 3.444, + "step": 97800 + }, + { + "epoch": 0.09639786259782634, + "grad_norm": 2.4359214305877686, + "learning_rate": 9.990849977747482e-06, + "loss": 3.4403, + "step": 97850 + }, + { + "epoch": 0.09644712057564843, + "grad_norm": 2.440549612045288, + "learning_rate": 9.9908406174808e-06, + "loss": 3.4835, + "step": 97900 + }, + { + "epoch": 0.09649637855347051, + "grad_norm": 2.7988245487213135, + "learning_rate": 
9.990831252433283e-06, + "loss": 3.4705, + "step": 97950 + }, + { + "epoch": 0.09654563653129261, + "grad_norm": 2.4685730934143066, + "learning_rate": 9.990821882604941e-06, + "loss": 3.3753, + "step": 98000 + }, + { + "epoch": 0.0965948945091147, + "grad_norm": 2.4733502864837646, + "learning_rate": 9.990812507995785e-06, + "loss": 3.4926, + "step": 98050 + }, + { + "epoch": 0.09664415248693678, + "grad_norm": 2.232922315597534, + "learning_rate": 9.990803128605818e-06, + "loss": 3.5051, + "step": 98100 + }, + { + "epoch": 0.09669341046475888, + "grad_norm": 2.513568878173828, + "learning_rate": 9.990793744435052e-06, + "loss": 3.4412, + "step": 98150 + }, + { + "epoch": 0.09674266844258096, + "grad_norm": 2.5386228561401367, + "learning_rate": 9.990784355483498e-06, + "loss": 3.5011, + "step": 98200 + }, + { + "epoch": 0.09679192642040305, + "grad_norm": 2.622732162475586, + "learning_rate": 9.990774961751164e-06, + "loss": 3.4878, + "step": 98250 + }, + { + "epoch": 0.09684118439822513, + "grad_norm": 2.6431615352630615, + "learning_rate": 9.990765563238057e-06, + "loss": 3.519, + "step": 98300 + }, + { + "epoch": 0.09689044237604723, + "grad_norm": 2.5935304164886475, + "learning_rate": 9.990756159944188e-06, + "loss": 3.4962, + "step": 98350 + }, + { + "epoch": 0.09693970035386931, + "grad_norm": 2.5681283473968506, + "learning_rate": 9.990746751869566e-06, + "loss": 3.5362, + "step": 98400 + }, + { + "epoch": 0.0969889583316914, + "grad_norm": 2.5565953254699707, + "learning_rate": 9.990737339014199e-06, + "loss": 3.4671, + "step": 98450 + }, + { + "epoch": 0.0970382163095135, + "grad_norm": 2.694472312927246, + "learning_rate": 9.990727921378096e-06, + "loss": 3.4981, + "step": 98500 + }, + { + "epoch": 0.09708747428733558, + "grad_norm": 2.4938342571258545, + "learning_rate": 9.990718498961268e-06, + "loss": 3.4925, + "step": 98550 + }, + { + "epoch": 0.09713673226515766, + "grad_norm": 2.255632162094116, + "learning_rate": 9.99070907176372e-06, + "loss": 
3.4718, + "step": 98600 + }, + { + "epoch": 0.09718599024297975, + "grad_norm": 2.745765209197998, + "learning_rate": 9.990699639785465e-06, + "loss": 3.4349, + "step": 98650 + }, + { + "epoch": 0.09723524822080185, + "grad_norm": 2.6263084411621094, + "learning_rate": 9.99069020302651e-06, + "loss": 3.4559, + "step": 98700 + }, + { + "epoch": 0.09728450619862393, + "grad_norm": 2.5612058639526367, + "learning_rate": 9.990680761486864e-06, + "loss": 3.5187, + "step": 98750 + }, + { + "epoch": 0.09733376417644601, + "grad_norm": 2.5737497806549072, + "learning_rate": 9.990671315166538e-06, + "loss": 3.4553, + "step": 98800 + }, + { + "epoch": 0.09738302215426811, + "grad_norm": 2.6999330520629883, + "learning_rate": 9.990661864065539e-06, + "loss": 3.4896, + "step": 98850 + }, + { + "epoch": 0.0974322801320902, + "grad_norm": 2.508227825164795, + "learning_rate": 9.990652408183877e-06, + "loss": 3.5152, + "step": 98900 + }, + { + "epoch": 0.09748153810991228, + "grad_norm": 2.356147289276123, + "learning_rate": 9.99064294752156e-06, + "loss": 3.4644, + "step": 98950 + }, + { + "epoch": 0.09753079608773436, + "grad_norm": 2.2947211265563965, + "learning_rate": 9.9906334820786e-06, + "loss": 3.4727, + "step": 99000 + }, + { + "epoch": 0.09758005406555646, + "grad_norm": 2.6120007038116455, + "learning_rate": 9.990624011855003e-06, + "loss": 3.4392, + "step": 99050 + }, + { + "epoch": 0.09762931204337855, + "grad_norm": 2.534365177154541, + "learning_rate": 9.990614536850778e-06, + "loss": 3.4336, + "step": 99100 + }, + { + "epoch": 0.09767857002120063, + "grad_norm": 2.5490798950195312, + "learning_rate": 9.990605057065935e-06, + "loss": 3.449, + "step": 99150 + }, + { + "epoch": 0.09772782799902271, + "grad_norm": 2.6555633544921875, + "learning_rate": 9.990595572500484e-06, + "loss": 3.4595, + "step": 99200 + }, + { + "epoch": 0.09777708597684481, + "grad_norm": 2.5927019119262695, + "learning_rate": 9.990586083154433e-06, + "loss": 3.397, + "step": 99250 + }, + { + 
"epoch": 0.0978263439546669, + "grad_norm": 2.411048412322998, + "learning_rate": 9.990576589027792e-06, + "loss": 3.4678, + "step": 99300 + }, + { + "epoch": 0.09787560193248898, + "grad_norm": 2.452605962753296, + "learning_rate": 9.990567090120569e-06, + "loss": 3.5182, + "step": 99350 + }, + { + "epoch": 0.09792485991031108, + "grad_norm": 2.798715114593506, + "learning_rate": 9.990557586432773e-06, + "loss": 3.4316, + "step": 99400 + }, + { + "epoch": 0.09797411788813316, + "grad_norm": 2.5999791622161865, + "learning_rate": 9.990548077964412e-06, + "loss": 3.5288, + "step": 99450 + }, + { + "epoch": 0.09802337586595525, + "grad_norm": 2.5821330547332764, + "learning_rate": 9.9905385647155e-06, + "loss": 3.4817, + "step": 99500 + }, + { + "epoch": 0.09807263384377733, + "grad_norm": 2.5211875438690186, + "learning_rate": 9.990529046686042e-06, + "loss": 3.4781, + "step": 99550 + }, + { + "epoch": 0.09812189182159943, + "grad_norm": 2.659984588623047, + "learning_rate": 9.990519523876046e-06, + "loss": 3.476, + "step": 99600 + }, + { + "epoch": 0.09817114979942151, + "grad_norm": 2.32822322845459, + "learning_rate": 9.990509996285524e-06, + "loss": 3.4623, + "step": 99650 + }, + { + "epoch": 0.0982204077772436, + "grad_norm": 2.5946106910705566, + "learning_rate": 9.990500463914486e-06, + "loss": 3.4747, + "step": 99700 + }, + { + "epoch": 0.0982696657550657, + "grad_norm": 2.855876922607422, + "learning_rate": 9.990490926762937e-06, + "loss": 3.3972, + "step": 99750 + }, + { + "epoch": 0.09831892373288778, + "grad_norm": 2.671902656555176, + "learning_rate": 9.99048138483089e-06, + "loss": 3.4539, + "step": 99800 + }, + { + "epoch": 0.09836818171070986, + "grad_norm": 2.4329681396484375, + "learning_rate": 9.990471838118352e-06, + "loss": 3.4546, + "step": 99850 + }, + { + "epoch": 0.09841743968853195, + "grad_norm": 2.5714337825775146, + "learning_rate": 9.990462286625332e-06, + "loss": 3.488, + "step": 99900 + }, + { + "epoch": 0.09846669766635405, + 
"grad_norm": 2.980865716934204, + "learning_rate": 9.990452730351842e-06, + "loss": 3.5138, + "step": 99950 + }, + { + "epoch": 0.09851595564417613, + "grad_norm": 2.672783851623535, + "learning_rate": 9.990443169297887e-06, + "loss": 3.4683, + "step": 100000 + }, + { + "epoch": 0.09856521362199822, + "grad_norm": 2.368260383605957, + "learning_rate": 9.99043360346348e-06, + "loss": 3.4939, + "step": 100050 + }, + { + "epoch": 0.09861447159982031, + "grad_norm": 2.5601234436035156, + "learning_rate": 9.990424032848626e-06, + "loss": 3.4729, + "step": 100100 + }, + { + "epoch": 0.0986637295776424, + "grad_norm": 2.4284610748291016, + "learning_rate": 9.990414457453338e-06, + "loss": 3.4955, + "step": 100150 + }, + { + "epoch": 0.09871298755546448, + "grad_norm": 2.730605363845825, + "learning_rate": 9.990404877277625e-06, + "loss": 3.4554, + "step": 100200 + }, + { + "epoch": 0.09876224553328657, + "grad_norm": 2.4088621139526367, + "learning_rate": 9.990395292321493e-06, + "loss": 3.4361, + "step": 100250 + }, + { + "epoch": 0.09881150351110866, + "grad_norm": 2.9302256107330322, + "learning_rate": 9.990385702584955e-06, + "loss": 3.4401, + "step": 100300 + }, + { + "epoch": 0.09886076148893075, + "grad_norm": 2.779808282852173, + "learning_rate": 9.990376108068017e-06, + "loss": 3.4329, + "step": 100350 + }, + { + "epoch": 0.09891001946675283, + "grad_norm": 2.444542646408081, + "learning_rate": 9.990366508770691e-06, + "loss": 3.439, + "step": 100400 + }, + { + "epoch": 0.09895927744457492, + "grad_norm": 2.5757060050964355, + "learning_rate": 9.990356904692982e-06, + "loss": 3.4789, + "step": 100450 + }, + { + "epoch": 0.09900853542239701, + "grad_norm": 2.5243523120880127, + "learning_rate": 9.990347295834903e-06, + "loss": 3.4918, + "step": 100500 + }, + { + "epoch": 0.0990577934002191, + "grad_norm": 2.7300150394439697, + "learning_rate": 9.990337682196463e-06, + "loss": 3.4574, + "step": 100550 + }, + { + "epoch": 0.09910705137804118, + "grad_norm": 
2.6037099361419678, + "learning_rate": 9.990328063777667e-06, + "loss": 3.4234, + "step": 100600 + }, + { + "epoch": 0.09915630935586328, + "grad_norm": 2.6875269412994385, + "learning_rate": 9.990318440578532e-06, + "loss": 3.4544, + "step": 100650 + }, + { + "epoch": 0.09920556733368537, + "grad_norm": 2.674828052520752, + "learning_rate": 9.99030881259906e-06, + "loss": 3.4433, + "step": 100700 + }, + { + "epoch": 0.09925482531150745, + "grad_norm": 2.975830554962158, + "learning_rate": 9.990299179839263e-06, + "loss": 3.5119, + "step": 100750 + }, + { + "epoch": 0.09930408328932953, + "grad_norm": 2.3529393672943115, + "learning_rate": 9.990289542299151e-06, + "loss": 3.4886, + "step": 100800 + }, + { + "epoch": 0.09935334126715163, + "grad_norm": 2.622812271118164, + "learning_rate": 9.990279899978731e-06, + "loss": 3.4206, + "step": 100850 + }, + { + "epoch": 0.09940259924497372, + "grad_norm": 2.516540288925171, + "learning_rate": 9.990270252878017e-06, + "loss": 3.4618, + "step": 100900 + }, + { + "epoch": 0.0994518572227958, + "grad_norm": 2.7003159523010254, + "learning_rate": 9.990260600997012e-06, + "loss": 3.4045, + "step": 100950 + }, + { + "epoch": 0.0995011152006179, + "grad_norm": 2.4062156677246094, + "learning_rate": 9.990250944335728e-06, + "loss": 3.5073, + "step": 101000 + }, + { + "epoch": 0.09955037317843998, + "grad_norm": 2.9946587085723877, + "learning_rate": 9.990241282894175e-06, + "loss": 3.4648, + "step": 101050 + }, + { + "epoch": 0.09959963115626207, + "grad_norm": 2.6976659297943115, + "learning_rate": 9.99023161667236e-06, + "loss": 3.4952, + "step": 101100 + }, + { + "epoch": 0.09964888913408415, + "grad_norm": 2.6516623497009277, + "learning_rate": 9.990221945670297e-06, + "loss": 3.4891, + "step": 101150 + }, + { + "epoch": 0.09969814711190625, + "grad_norm": 2.345975399017334, + "learning_rate": 9.990212269887991e-06, + "loss": 3.5224, + "step": 101200 + }, + { + "epoch": 0.09974740508972833, + "grad_norm": 2.7140519618988037, 
+ "learning_rate": 9.990202589325453e-06, + "loss": 3.5205, + "step": 101250 + }, + { + "epoch": 0.09979666306755042, + "grad_norm": 2.496894359588623, + "learning_rate": 9.99019290398269e-06, + "loss": 3.473, + "step": 101300 + }, + { + "epoch": 0.09984592104537252, + "grad_norm": 2.4504098892211914, + "learning_rate": 9.990183213859714e-06, + "loss": 3.4246, + "step": 101350 + }, + { + "epoch": 0.0998951790231946, + "grad_norm": 2.4256160259246826, + "learning_rate": 9.990173518956533e-06, + "loss": 3.4312, + "step": 101400 + }, + { + "epoch": 0.09994443700101668, + "grad_norm": 2.5933609008789062, + "learning_rate": 9.990163819273157e-06, + "loss": 3.4689, + "step": 101450 + }, + { + "epoch": 0.09999369497883877, + "grad_norm": 2.5092689990997314, + "learning_rate": 9.990154114809594e-06, + "loss": 3.4108, + "step": 101500 + }, + { + "epoch": 0.10004295295666087, + "grad_norm": 2.6835668087005615, + "learning_rate": 9.990144405565853e-06, + "loss": 3.4288, + "step": 101550 + }, + { + "epoch": 0.10009221093448295, + "grad_norm": 2.7285449504852295, + "learning_rate": 9.990134691541948e-06, + "loss": 3.4368, + "step": 101600 + }, + { + "epoch": 0.10014146891230503, + "grad_norm": 2.494976758956909, + "learning_rate": 9.990124972737882e-06, + "loss": 3.4657, + "step": 101650 + }, + { + "epoch": 0.10019072689012712, + "grad_norm": 2.7615764141082764, + "learning_rate": 9.990115249153666e-06, + "loss": 3.4515, + "step": 101700 + }, + { + "epoch": 0.10023998486794922, + "grad_norm": 2.5332746505737305, + "learning_rate": 9.990105520789314e-06, + "loss": 3.467, + "step": 101750 + }, + { + "epoch": 0.1002892428457713, + "grad_norm": 2.5252227783203125, + "learning_rate": 9.990095787644829e-06, + "loss": 3.4811, + "step": 101800 + }, + { + "epoch": 0.10033850082359339, + "grad_norm": 2.4719691276550293, + "learning_rate": 9.990086049720225e-06, + "loss": 3.4801, + "step": 101850 + }, + { + "epoch": 0.10038775880141548, + "grad_norm": 2.521556854248047, + "learning_rate": 
9.990076307015507e-06, + "loss": 3.406, + "step": 101900 + }, + { + "epoch": 0.10043701677923757, + "grad_norm": 2.5171749591827393, + "learning_rate": 9.990066559530688e-06, + "loss": 3.4356, + "step": 101950 + }, + { + "epoch": 0.10048627475705965, + "grad_norm": 2.8130905628204346, + "learning_rate": 9.990056807265776e-06, + "loss": 3.5226, + "step": 102000 + }, + { + "epoch": 0.10053553273488174, + "grad_norm": 2.659457206726074, + "learning_rate": 9.99004705022078e-06, + "loss": 3.4655, + "step": 102050 + }, + { + "epoch": 0.10058479071270383, + "grad_norm": 2.511193037033081, + "learning_rate": 9.990037288395708e-06, + "loss": 3.4708, + "step": 102100 + }, + { + "epoch": 0.10063404869052592, + "grad_norm": 2.5251622200012207, + "learning_rate": 9.990027521790573e-06, + "loss": 3.4235, + "step": 102150 + }, + { + "epoch": 0.100683306668348, + "grad_norm": 2.5951144695281982, + "learning_rate": 9.990017750405383e-06, + "loss": 3.4367, + "step": 102200 + }, + { + "epoch": 0.1007325646461701, + "grad_norm": 2.5631210803985596, + "learning_rate": 9.990007974240145e-06, + "loss": 3.5078, + "step": 102250 + }, + { + "epoch": 0.10078182262399218, + "grad_norm": 2.601407766342163, + "learning_rate": 9.989998193294872e-06, + "loss": 3.4266, + "step": 102300 + }, + { + "epoch": 0.10083108060181427, + "grad_norm": 2.742953062057495, + "learning_rate": 9.98998840756957e-06, + "loss": 3.4437, + "step": 102350 + }, + { + "epoch": 0.10088033857963635, + "grad_norm": 2.7548887729644775, + "learning_rate": 9.98997861706425e-06, + "loss": 3.5472, + "step": 102400 + }, + { + "epoch": 0.10092959655745845, + "grad_norm": 2.5202136039733887, + "learning_rate": 9.989968821778921e-06, + "loss": 3.5267, + "step": 102450 + }, + { + "epoch": 0.10097885453528054, + "grad_norm": 2.8414618968963623, + "learning_rate": 9.989959021713593e-06, + "loss": 3.46, + "step": 102500 + }, + { + "epoch": 0.10102811251310262, + "grad_norm": 2.468543767929077, + "learning_rate": 9.989949216868276e-06, + 
"loss": 3.4299, + "step": 102550 + }, + { + "epoch": 0.1010773704909247, + "grad_norm": 2.5138635635375977, + "learning_rate": 9.989939407242978e-06, + "loss": 3.418, + "step": 102600 + }, + { + "epoch": 0.1011266284687468, + "grad_norm": 2.4124672412872314, + "learning_rate": 9.989929592837708e-06, + "loss": 3.4285, + "step": 102650 + }, + { + "epoch": 0.10117588644656889, + "grad_norm": 2.6053659915924072, + "learning_rate": 9.989919773652477e-06, + "loss": 3.4339, + "step": 102700 + }, + { + "epoch": 0.10122514442439097, + "grad_norm": 2.5849623680114746, + "learning_rate": 9.989909949687293e-06, + "loss": 3.4488, + "step": 102750 + }, + { + "epoch": 0.10127440240221307, + "grad_norm": 2.5343995094299316, + "learning_rate": 9.989900120942165e-06, + "loss": 3.4805, + "step": 102800 + }, + { + "epoch": 0.10132366038003515, + "grad_norm": 2.77571439743042, + "learning_rate": 9.989890287417105e-06, + "loss": 3.4309, + "step": 102850 + }, + { + "epoch": 0.10137291835785724, + "grad_norm": 3.9947614669799805, + "learning_rate": 9.98988044911212e-06, + "loss": 3.3827, + "step": 102900 + }, + { + "epoch": 0.10142217633567932, + "grad_norm": 2.9862940311431885, + "learning_rate": 9.98987060602722e-06, + "loss": 3.3701, + "step": 102950 + }, + { + "epoch": 0.10147143431350142, + "grad_norm": 2.440326690673828, + "learning_rate": 9.989860758162416e-06, + "loss": 3.4573, + "step": 103000 + }, + { + "epoch": 0.1015206922913235, + "grad_norm": 2.4570956230163574, + "learning_rate": 9.989850905517715e-06, + "loss": 3.4363, + "step": 103050 + }, + { + "epoch": 0.10156995026914559, + "grad_norm": 2.504934787750244, + "learning_rate": 9.989841048093129e-06, + "loss": 3.3862, + "step": 103100 + }, + { + "epoch": 0.10161920824696768, + "grad_norm": 2.584400177001953, + "learning_rate": 9.989831185888665e-06, + "loss": 3.4427, + "step": 103150 + }, + { + "epoch": 0.10166846622478977, + "grad_norm": 2.9653422832489014, + "learning_rate": 9.989821318904333e-06, + "loss": 3.4554, + 
"step": 103200 + }, + { + "epoch": 0.10171772420261185, + "grad_norm": 2.6193161010742188, + "learning_rate": 9.989811447140143e-06, + "loss": 3.4106, + "step": 103250 + }, + { + "epoch": 0.10176698218043394, + "grad_norm": 2.246081829071045, + "learning_rate": 9.989801570596106e-06, + "loss": 3.2985, + "step": 103300 + }, + { + "epoch": 0.10181624015825604, + "grad_norm": 2.726445198059082, + "learning_rate": 9.989791689272228e-06, + "loss": 3.4363, + "step": 103350 + }, + { + "epoch": 0.10186549813607812, + "grad_norm": 2.35740065574646, + "learning_rate": 9.989781803168521e-06, + "loss": 3.4767, + "step": 103400 + }, + { + "epoch": 0.1019147561139002, + "grad_norm": 2.625596284866333, + "learning_rate": 9.989771912284994e-06, + "loss": 3.409, + "step": 103450 + }, + { + "epoch": 0.1019640140917223, + "grad_norm": 2.3832321166992188, + "learning_rate": 9.989762016621656e-06, + "loss": 3.4212, + "step": 103500 + }, + { + "epoch": 0.10201327206954439, + "grad_norm": 2.4753546714782715, + "learning_rate": 9.989752116178516e-06, + "loss": 3.5071, + "step": 103550 + }, + { + "epoch": 0.10206253004736647, + "grad_norm": 2.5028698444366455, + "learning_rate": 9.989742210955587e-06, + "loss": 3.4451, + "step": 103600 + }, + { + "epoch": 0.10211178802518855, + "grad_norm": 2.589632272720337, + "learning_rate": 9.989732300952872e-06, + "loss": 3.4173, + "step": 103650 + }, + { + "epoch": 0.10216104600301065, + "grad_norm": 2.3595945835113525, + "learning_rate": 9.989722386170387e-06, + "loss": 3.4366, + "step": 103700 + }, + { + "epoch": 0.10221030398083274, + "grad_norm": 2.5656590461730957, + "learning_rate": 9.989712466608136e-06, + "loss": 3.3858, + "step": 103750 + }, + { + "epoch": 0.10225956195865482, + "grad_norm": 2.4863128662109375, + "learning_rate": 9.989702542266133e-06, + "loss": 3.4264, + "step": 103800 + }, + { + "epoch": 0.1023088199364769, + "grad_norm": 2.644341230392456, + "learning_rate": 9.989692613144386e-06, + "loss": 3.401, + "step": 103850 + }, + 
{ + "epoch": 0.102358077914299, + "grad_norm": 2.296281099319458, + "learning_rate": 9.989682679242903e-06, + "loss": 3.4655, + "step": 103900 + }, + { + "epoch": 0.10240733589212109, + "grad_norm": 2.4843084812164307, + "learning_rate": 9.989672740561696e-06, + "loss": 3.4834, + "step": 103950 + }, + { + "epoch": 0.10245659386994317, + "grad_norm": 2.814067840576172, + "learning_rate": 9.989662797100771e-06, + "loss": 3.4329, + "step": 104000 + }, + { + "epoch": 0.10250585184776527, + "grad_norm": 2.458986759185791, + "learning_rate": 9.989652848860142e-06, + "loss": 3.5131, + "step": 104050 + }, + { + "epoch": 0.10255510982558735, + "grad_norm": 2.8316445350646973, + "learning_rate": 9.989642895839816e-06, + "loss": 3.4472, + "step": 104100 + }, + { + "epoch": 0.10260436780340944, + "grad_norm": 2.4114742279052734, + "learning_rate": 9.989632938039801e-06, + "loss": 3.5042, + "step": 104150 + }, + { + "epoch": 0.10265362578123152, + "grad_norm": 2.50154709815979, + "learning_rate": 9.98962297546011e-06, + "loss": 3.3802, + "step": 104200 + }, + { + "epoch": 0.10270288375905362, + "grad_norm": 2.5768649578094482, + "learning_rate": 9.989613008100751e-06, + "loss": 3.4427, + "step": 104250 + }, + { + "epoch": 0.1027521417368757, + "grad_norm": 2.5603206157684326, + "learning_rate": 9.989603035961732e-06, + "loss": 3.47, + "step": 104300 + }, + { + "epoch": 0.10280139971469779, + "grad_norm": 2.428147315979004, + "learning_rate": 9.989593059043066e-06, + "loss": 3.3894, + "step": 104350 + }, + { + "epoch": 0.10285065769251989, + "grad_norm": 2.6639344692230225, + "learning_rate": 9.98958307734476e-06, + "loss": 3.3962, + "step": 104400 + }, + { + "epoch": 0.10289991567034197, + "grad_norm": 2.7756402492523193, + "learning_rate": 9.989573090866824e-06, + "loss": 3.4937, + "step": 104450 + }, + { + "epoch": 0.10294917364816406, + "grad_norm": 2.6371467113494873, + "learning_rate": 9.989563099609268e-06, + "loss": 3.5071, + "step": 104500 + }, + { + "epoch": 
0.10299843162598614, + "grad_norm": 2.8595943450927734, + "learning_rate": 9.9895531035721e-06, + "loss": 3.4386, + "step": 104550 + }, + { + "epoch": 0.10304768960380824, + "grad_norm": 2.5792438983917236, + "learning_rate": 9.989543102755331e-06, + "loss": 3.4549, + "step": 104600 + }, + { + "epoch": 0.10309694758163032, + "grad_norm": 2.5201122760772705, + "learning_rate": 9.989533097158972e-06, + "loss": 3.3948, + "step": 104650 + }, + { + "epoch": 0.1031462055594524, + "grad_norm": 2.5167293548583984, + "learning_rate": 9.98952308678303e-06, + "loss": 3.4983, + "step": 104700 + }, + { + "epoch": 0.1031954635372745, + "grad_norm": 2.4705235958099365, + "learning_rate": 9.989513071627515e-06, + "loss": 3.4197, + "step": 104750 + }, + { + "epoch": 0.10324472151509659, + "grad_norm": 2.478104829788208, + "learning_rate": 9.989503051692437e-06, + "loss": 3.4536, + "step": 104800 + }, + { + "epoch": 0.10329397949291867, + "grad_norm": 2.5004122257232666, + "learning_rate": 9.989493026977809e-06, + "loss": 3.3995, + "step": 104850 + }, + { + "epoch": 0.10334323747074076, + "grad_norm": 2.452639102935791, + "learning_rate": 9.989482997483633e-06, + "loss": 3.4266, + "step": 104900 + }, + { + "epoch": 0.10339249544856285, + "grad_norm": 2.6803834438323975, + "learning_rate": 9.989472963209925e-06, + "loss": 3.4234, + "step": 104950 + }, + { + "epoch": 0.10344175342638494, + "grad_norm": 2.54144024848938, + "learning_rate": 9.989462924156694e-06, + "loss": 3.4369, + "step": 105000 + }, + { + "epoch": 0.10349101140420702, + "grad_norm": 2.48311185836792, + "learning_rate": 9.989452880323946e-06, + "loss": 3.4788, + "step": 105050 + }, + { + "epoch": 0.10354026938202911, + "grad_norm": 2.560347318649292, + "learning_rate": 9.989442831711693e-06, + "loss": 3.451, + "step": 105100 + }, + { + "epoch": 0.1035895273598512, + "grad_norm": 2.6288838386535645, + "learning_rate": 9.989432778319944e-06, + "loss": 3.3832, + "step": 105150 + }, + { + "epoch": 0.10363878533767329, + 
"grad_norm": 2.608074426651001, + "learning_rate": 9.989422720148711e-06, + "loss": 3.4797, + "step": 105200 + }, + { + "epoch": 0.10368804331549537, + "grad_norm": 2.577819585800171, + "learning_rate": 9.989412657198e-06, + "loss": 3.3888, + "step": 105250 + }, + { + "epoch": 0.10373730129331747, + "grad_norm": 2.646688461303711, + "learning_rate": 9.989402589467824e-06, + "loss": 3.397, + "step": 105300 + }, + { + "epoch": 0.10378655927113956, + "grad_norm": 2.7764625549316406, + "learning_rate": 9.989392516958192e-06, + "loss": 3.4673, + "step": 105350 + }, + { + "epoch": 0.10383581724896164, + "grad_norm": 2.627352476119995, + "learning_rate": 9.989382439669111e-06, + "loss": 3.506, + "step": 105400 + }, + { + "epoch": 0.10388507522678372, + "grad_norm": 2.774658203125, + "learning_rate": 9.989372357600591e-06, + "loss": 3.4066, + "step": 105450 + }, + { + "epoch": 0.10393433320460582, + "grad_norm": 2.4145090579986572, + "learning_rate": 9.989362270752646e-06, + "loss": 3.3803, + "step": 105500 + }, + { + "epoch": 0.1039835911824279, + "grad_norm": 2.5385334491729736, + "learning_rate": 9.98935217912528e-06, + "loss": 3.4549, + "step": 105550 + }, + { + "epoch": 0.10403284916024999, + "grad_norm": 2.7633135318756104, + "learning_rate": 9.989342082718508e-06, + "loss": 3.5103, + "step": 105600 + }, + { + "epoch": 0.10408210713807209, + "grad_norm": 2.586259603500366, + "learning_rate": 9.989331981532334e-06, + "loss": 3.4195, + "step": 105650 + }, + { + "epoch": 0.10413136511589417, + "grad_norm": 2.5349018573760986, + "learning_rate": 9.989321875566772e-06, + "loss": 3.4737, + "step": 105700 + }, + { + "epoch": 0.10418062309371626, + "grad_norm": 2.62892484664917, + "learning_rate": 9.98931176482183e-06, + "loss": 3.4073, + "step": 105750 + }, + { + "epoch": 0.10422988107153834, + "grad_norm": 2.5253756046295166, + "learning_rate": 9.989301649297518e-06, + "loss": 3.4352, + "step": 105800 + }, + { + "epoch": 0.10427913904936044, + "grad_norm": 
2.4391913414001465, + "learning_rate": 9.989291528993846e-06, + "loss": 3.4147, + "step": 105850 + }, + { + "epoch": 0.10432839702718252, + "grad_norm": 2.642686605453491, + "learning_rate": 9.989281403910824e-06, + "loss": 3.5669, + "step": 105900 + }, + { + "epoch": 0.10437765500500461, + "grad_norm": 2.4868695735931396, + "learning_rate": 9.98927127404846e-06, + "loss": 3.4567, + "step": 105950 + }, + { + "epoch": 0.10442691298282669, + "grad_norm": 2.4317967891693115, + "learning_rate": 9.989261139406766e-06, + "loss": 3.3978, + "step": 106000 + }, + { + "epoch": 0.10447617096064879, + "grad_norm": 2.5884909629821777, + "learning_rate": 9.989250999985748e-06, + "loss": 3.4413, + "step": 106050 + }, + { + "epoch": 0.10452542893847087, + "grad_norm": 2.6073176860809326, + "learning_rate": 9.989240855785422e-06, + "loss": 3.3917, + "step": 106100 + }, + { + "epoch": 0.10457468691629296, + "grad_norm": 2.508674144744873, + "learning_rate": 9.989230706805792e-06, + "loss": 3.4787, + "step": 106150 + }, + { + "epoch": 0.10462394489411506, + "grad_norm": 2.3065237998962402, + "learning_rate": 9.98922055304687e-06, + "loss": 3.4365, + "step": 106200 + }, + { + "epoch": 0.10467320287193714, + "grad_norm": 2.5615105628967285, + "learning_rate": 9.989210394508665e-06, + "loss": 3.4132, + "step": 106250 + }, + { + "epoch": 0.10472246084975922, + "grad_norm": 2.578866481781006, + "learning_rate": 9.989200231191188e-06, + "loss": 3.3706, + "step": 106300 + }, + { + "epoch": 0.10477171882758131, + "grad_norm": 2.652883529663086, + "learning_rate": 9.989190063094446e-06, + "loss": 3.4375, + "step": 106350 + }, + { + "epoch": 0.1048209768054034, + "grad_norm": 2.5515949726104736, + "learning_rate": 9.989179890218453e-06, + "loss": 3.3565, + "step": 106400 + }, + { + "epoch": 0.10487023478322549, + "grad_norm": 2.346762180328369, + "learning_rate": 9.989169712563214e-06, + "loss": 3.5482, + "step": 106450 + }, + { + "epoch": 0.10491949276104758, + "grad_norm": 
2.4448182582855225, + "learning_rate": 9.989159530128742e-06, + "loss": 3.4846, + "step": 106500 + }, + { + "epoch": 0.10496875073886967, + "grad_norm": 2.5624773502349854, + "learning_rate": 9.989149342915046e-06, + "loss": 3.4437, + "step": 106550 + }, + { + "epoch": 0.10501800871669176, + "grad_norm": 2.554595470428467, + "learning_rate": 9.989139150922136e-06, + "loss": 3.505, + "step": 106600 + }, + { + "epoch": 0.10506726669451384, + "grad_norm": 2.5186970233917236, + "learning_rate": 9.98912895415002e-06, + "loss": 3.4999, + "step": 106650 + }, + { + "epoch": 0.10511652467233593, + "grad_norm": 2.8461217880249023, + "learning_rate": 9.989118752598711e-06, + "loss": 3.4892, + "step": 106700 + }, + { + "epoch": 0.10516578265015802, + "grad_norm": 2.5799806118011475, + "learning_rate": 9.989108546268216e-06, + "loss": 3.413, + "step": 106750 + }, + { + "epoch": 0.10521504062798011, + "grad_norm": 2.627345085144043, + "learning_rate": 9.989098335158547e-06, + "loss": 3.4796, + "step": 106800 + }, + { + "epoch": 0.10526429860580219, + "grad_norm": 2.4923176765441895, + "learning_rate": 9.98908811926971e-06, + "loss": 3.4554, + "step": 106850 + }, + { + "epoch": 0.10531355658362429, + "grad_norm": 2.5942320823669434, + "learning_rate": 9.989077898601719e-06, + "loss": 3.3898, + "step": 106900 + }, + { + "epoch": 0.10536281456144637, + "grad_norm": 2.6207265853881836, + "learning_rate": 9.989067673154582e-06, + "loss": 3.4644, + "step": 106950 + }, + { + "epoch": 0.10541207253926846, + "grad_norm": 2.5532491207122803, + "learning_rate": 9.989057442928307e-06, + "loss": 3.4471, + "step": 107000 + }, + { + "epoch": 0.10546133051709054, + "grad_norm": 2.5012567043304443, + "learning_rate": 9.989047207922908e-06, + "loss": 3.4172, + "step": 107050 + }, + { + "epoch": 0.10551058849491264, + "grad_norm": 2.380570888519287, + "learning_rate": 9.989036968138391e-06, + "loss": 3.4249, + "step": 107100 + }, + { + "epoch": 0.10555984647273473, + "grad_norm": 
2.500776529312134, + "learning_rate": 9.989026723574768e-06, + "loss": 3.427, + "step": 107150 + }, + { + "epoch": 0.10560910445055681, + "grad_norm": 2.5395073890686035, + "learning_rate": 9.989016474232049e-06, + "loss": 3.3926, + "step": 107200 + }, + { + "epoch": 0.1056583624283789, + "grad_norm": 2.7072510719299316, + "learning_rate": 9.98900622011024e-06, + "loss": 3.4548, + "step": 107250 + }, + { + "epoch": 0.10570762040620099, + "grad_norm": 2.9147465229034424, + "learning_rate": 9.988995961209355e-06, + "loss": 3.4481, + "step": 107300 + }, + { + "epoch": 0.10575687838402308, + "grad_norm": 2.5399296283721924, + "learning_rate": 9.988985697529404e-06, + "loss": 3.4287, + "step": 107350 + }, + { + "epoch": 0.10580613636184516, + "grad_norm": 2.587733268737793, + "learning_rate": 9.988975429070394e-06, + "loss": 3.4861, + "step": 107400 + }, + { + "epoch": 0.10585539433966726, + "grad_norm": 2.714301347732544, + "learning_rate": 9.988965155832337e-06, + "loss": 3.4571, + "step": 107450 + }, + { + "epoch": 0.10590465231748934, + "grad_norm": 2.3904097080230713, + "learning_rate": 9.988954877815242e-06, + "loss": 3.4008, + "step": 107500 + }, + { + "epoch": 0.10595391029531143, + "grad_norm": 2.5300099849700928, + "learning_rate": 9.98894459501912e-06, + "loss": 3.4124, + "step": 107550 + }, + { + "epoch": 0.10600316827313351, + "grad_norm": 3.1428849697113037, + "learning_rate": 9.988934307443976e-06, + "loss": 3.3973, + "step": 107600 + }, + { + "epoch": 0.10605242625095561, + "grad_norm": 2.380772590637207, + "learning_rate": 9.988924015089827e-06, + "loss": 3.4239, + "step": 107650 + }, + { + "epoch": 0.10610168422877769, + "grad_norm": 2.4956820011138916, + "learning_rate": 9.988913717956677e-06, + "loss": 3.3983, + "step": 107700 + }, + { + "epoch": 0.10615094220659978, + "grad_norm": 2.362903118133545, + "learning_rate": 9.98890341604454e-06, + "loss": 3.3915, + "step": 107750 + }, + { + "epoch": 0.10620020018442188, + "grad_norm": 2.6356241703033447, 
+ "learning_rate": 9.988893109353423e-06, + "loss": 3.4059, + "step": 107800 + }, + { + "epoch": 0.10624945816224396, + "grad_norm": 2.397761106491089, + "learning_rate": 9.98888279788334e-06, + "loss": 3.4271, + "step": 107850 + }, + { + "epoch": 0.10629871614006604, + "grad_norm": 2.544606924057007, + "learning_rate": 9.988872481634294e-06, + "loss": 3.4545, + "step": 107900 + }, + { + "epoch": 0.10634797411788813, + "grad_norm": 2.4447109699249268, + "learning_rate": 9.988862160606301e-06, + "loss": 3.4591, + "step": 107950 + }, + { + "epoch": 0.10639723209571023, + "grad_norm": 2.4964346885681152, + "learning_rate": 9.988851834799368e-06, + "loss": 3.4385, + "step": 108000 + }, + { + "epoch": 0.10644649007353231, + "grad_norm": 2.578468084335327, + "learning_rate": 9.988841504213506e-06, + "loss": 3.4315, + "step": 108050 + }, + { + "epoch": 0.1064957480513544, + "grad_norm": 2.128343343734741, + "learning_rate": 9.988831168848725e-06, + "loss": 3.4199, + "step": 108100 + }, + { + "epoch": 0.10654500602917649, + "grad_norm": 2.490886926651001, + "learning_rate": 9.988820828705034e-06, + "loss": 3.4563, + "step": 108150 + }, + { + "epoch": 0.10659426400699858, + "grad_norm": 2.655769109725952, + "learning_rate": 9.988810483782443e-06, + "loss": 3.4231, + "step": 108200 + }, + { + "epoch": 0.10664352198482066, + "grad_norm": 2.489685297012329, + "learning_rate": 9.988800134080962e-06, + "loss": 3.3999, + "step": 108250 + }, + { + "epoch": 0.10669277996264274, + "grad_norm": 2.65108060836792, + "learning_rate": 9.988789779600603e-06, + "loss": 3.481, + "step": 108300 + }, + { + "epoch": 0.10674203794046484, + "grad_norm": 2.439152956008911, + "learning_rate": 9.988779420341373e-06, + "loss": 3.4463, + "step": 108350 + }, + { + "epoch": 0.10679129591828693, + "grad_norm": 2.5587260723114014, + "learning_rate": 9.988769056303281e-06, + "loss": 3.494, + "step": 108400 + }, + { + "epoch": 0.10684055389610901, + "grad_norm": 2.501852035522461, + "learning_rate": 
9.988758687486341e-06, + "loss": 3.4207, + "step": 108450 + }, + { + "epoch": 0.1068898118739311, + "grad_norm": 2.6238811016082764, + "learning_rate": 9.98874831389056e-06, + "loss": 3.3929, + "step": 108500 + }, + { + "epoch": 0.1069390698517532, + "grad_norm": 2.5396251678466797, + "learning_rate": 9.988737935515948e-06, + "loss": 3.4537, + "step": 108550 + }, + { + "epoch": 0.10698832782957528, + "grad_norm": 2.4617257118225098, + "learning_rate": 9.988727552362518e-06, + "loss": 3.436, + "step": 108600 + }, + { + "epoch": 0.10703758580739736, + "grad_norm": 2.3989791870117188, + "learning_rate": 9.988717164430277e-06, + "loss": 3.3739, + "step": 108650 + }, + { + "epoch": 0.10708684378521946, + "grad_norm": 2.312450885772705, + "learning_rate": 9.988706771719234e-06, + "loss": 3.4191, + "step": 108700 + }, + { + "epoch": 0.10713610176304154, + "grad_norm": 2.483731985092163, + "learning_rate": 9.988696374229402e-06, + "loss": 3.4227, + "step": 108750 + }, + { + "epoch": 0.10718535974086363, + "grad_norm": 2.3549368381500244, + "learning_rate": 9.98868597196079e-06, + "loss": 3.443, + "step": 108800 + }, + { + "epoch": 0.10723461771868571, + "grad_norm": 2.426194906234741, + "learning_rate": 9.988675564913406e-06, + "loss": 3.4576, + "step": 108850 + }, + { + "epoch": 0.10728387569650781, + "grad_norm": 2.4857242107391357, + "learning_rate": 9.988665153087262e-06, + "loss": 3.4134, + "step": 108900 + }, + { + "epoch": 0.1073331336743299, + "grad_norm": 2.5802876949310303, + "learning_rate": 9.988654736482367e-06, + "loss": 3.4335, + "step": 108950 + }, + { + "epoch": 0.10738239165215198, + "grad_norm": 2.865737199783325, + "learning_rate": 9.988644315098733e-06, + "loss": 3.3975, + "step": 109000 + }, + { + "epoch": 0.10743164962997408, + "grad_norm": 2.97513747215271, + "learning_rate": 9.988633888936365e-06, + "loss": 3.4605, + "step": 109050 + }, + { + "epoch": 0.10748090760779616, + "grad_norm": 2.5182788372039795, + "learning_rate": 9.98862345799528e-06, + 
"loss": 3.3922, + "step": 109100 + }, + { + "epoch": 0.10753016558561825, + "grad_norm": 2.4117255210876465, + "learning_rate": 9.988613022275483e-06, + "loss": 3.4451, + "step": 109150 + }, + { + "epoch": 0.10757942356344033, + "grad_norm": 2.4379544258117676, + "learning_rate": 9.988602581776985e-06, + "loss": 3.3883, + "step": 109200 + }, + { + "epoch": 0.10762868154126243, + "grad_norm": 2.53374981880188, + "learning_rate": 9.988592136499798e-06, + "loss": 3.4512, + "step": 109250 + }, + { + "epoch": 0.10767793951908451, + "grad_norm": 2.4503746032714844, + "learning_rate": 9.988581686443928e-06, + "loss": 3.4373, + "step": 109300 + }, + { + "epoch": 0.1077271974969066, + "grad_norm": 2.627257823944092, + "learning_rate": 9.98857123160939e-06, + "loss": 3.3885, + "step": 109350 + }, + { + "epoch": 0.10777645547472868, + "grad_norm": 2.5857720375061035, + "learning_rate": 9.98856077199619e-06, + "loss": 3.3983, + "step": 109400 + }, + { + "epoch": 0.10782571345255078, + "grad_norm": 2.38537859916687, + "learning_rate": 9.988550307604337e-06, + "loss": 3.4365, + "step": 109450 + }, + { + "epoch": 0.10787497143037286, + "grad_norm": 2.4997787475585938, + "learning_rate": 9.988539838433846e-06, + "loss": 3.3819, + "step": 109500 + }, + { + "epoch": 0.10792422940819495, + "grad_norm": 2.4812381267547607, + "learning_rate": 9.988529364484725e-06, + "loss": 3.4365, + "step": 109550 + }, + { + "epoch": 0.10797348738601704, + "grad_norm": 2.4586451053619385, + "learning_rate": 9.988518885756983e-06, + "loss": 3.3783, + "step": 109600 + }, + { + "epoch": 0.10802274536383913, + "grad_norm": 2.6401255130767822, + "learning_rate": 9.98850840225063e-06, + "loss": 3.4164, + "step": 109650 + }, + { + "epoch": 0.10807200334166121, + "grad_norm": 2.408278465270996, + "learning_rate": 9.988497913965677e-06, + "loss": 3.4213, + "step": 109700 + }, + { + "epoch": 0.1081212613194833, + "grad_norm": 2.5933334827423096, + "learning_rate": 9.988487420902134e-06, + "loss": 3.4468, + 
"step": 109750 + }, + { + "epoch": 0.1081705192973054, + "grad_norm": 2.545337438583374, + "learning_rate": 9.98847692306001e-06, + "loss": 3.4492, + "step": 109800 + }, + { + "epoch": 0.10821977727512748, + "grad_norm": 2.454753875732422, + "learning_rate": 9.988466420439316e-06, + "loss": 3.4457, + "step": 109850 + }, + { + "epoch": 0.10826903525294956, + "grad_norm": 2.561382532119751, + "learning_rate": 9.988455913040061e-06, + "loss": 3.4308, + "step": 109900 + }, + { + "epoch": 0.10831829323077166, + "grad_norm": 2.6643896102905273, + "learning_rate": 9.988445400862256e-06, + "loss": 3.4418, + "step": 109950 + }, + { + "epoch": 0.10836755120859375, + "grad_norm": 2.618042469024658, + "learning_rate": 9.988434883905911e-06, + "loss": 3.4745, + "step": 110000 + }, + { + "epoch": 0.10841680918641583, + "grad_norm": 2.850022315979004, + "learning_rate": 9.988424362171037e-06, + "loss": 3.3956, + "step": 110050 + }, + { + "epoch": 0.10846606716423791, + "grad_norm": 2.4870285987854004, + "learning_rate": 9.988413835657642e-06, + "loss": 3.3957, + "step": 110100 + }, + { + "epoch": 0.10851532514206001, + "grad_norm": 2.5803768634796143, + "learning_rate": 9.988403304365739e-06, + "loss": 3.4583, + "step": 110150 + }, + { + "epoch": 0.1085645831198821, + "grad_norm": 2.4523186683654785, + "learning_rate": 9.988392768295335e-06, + "loss": 3.4915, + "step": 110200 + }, + { + "epoch": 0.10861384109770418, + "grad_norm": 2.5048675537109375, + "learning_rate": 9.98838222744644e-06, + "loss": 3.3927, + "step": 110250 + }, + { + "epoch": 0.10866309907552628, + "grad_norm": 2.492319107055664, + "learning_rate": 9.988371681819067e-06, + "loss": 3.3645, + "step": 110300 + }, + { + "epoch": 0.10871235705334836, + "grad_norm": 3.1961677074432373, + "learning_rate": 9.988361131413224e-06, + "loss": 3.4328, + "step": 110350 + }, + { + "epoch": 0.10876161503117045, + "grad_norm": 2.7520387172698975, + "learning_rate": 9.988350576228922e-06, + "loss": 3.4286, + "step": 110400 + }, 
+ { + "epoch": 0.10881087300899253, + "grad_norm": 2.4118900299072266, + "learning_rate": 9.988340016266172e-06, + "loss": 3.4115, + "step": 110450 + }, + { + "epoch": 0.10886013098681463, + "grad_norm": 2.679257869720459, + "learning_rate": 9.98832945152498e-06, + "loss": 3.4613, + "step": 110500 + }, + { + "epoch": 0.10890938896463671, + "grad_norm": 2.663564443588257, + "learning_rate": 9.988318882005362e-06, + "loss": 3.4672, + "step": 110550 + }, + { + "epoch": 0.1089586469424588, + "grad_norm": 2.775219440460205, + "learning_rate": 9.988308307707325e-06, + "loss": 3.4399, + "step": 110600 + }, + { + "epoch": 0.10900790492028088, + "grad_norm": 2.414954423904419, + "learning_rate": 9.988297728630878e-06, + "loss": 3.4513, + "step": 110650 + }, + { + "epoch": 0.10905716289810298, + "grad_norm": 2.408740758895874, + "learning_rate": 9.988287144776032e-06, + "loss": 3.4014, + "step": 110700 + }, + { + "epoch": 0.10910642087592506, + "grad_norm": 2.64711594581604, + "learning_rate": 9.988276556142797e-06, + "loss": 3.4552, + "step": 110750 + }, + { + "epoch": 0.10915567885374715, + "grad_norm": 2.481914520263672, + "learning_rate": 9.988265962731185e-06, + "loss": 3.4262, + "step": 110800 + }, + { + "epoch": 0.10920493683156925, + "grad_norm": 2.362440824508667, + "learning_rate": 9.988255364541204e-06, + "loss": 3.3931, + "step": 110850 + }, + { + "epoch": 0.10925419480939133, + "grad_norm": 2.5215508937835693, + "learning_rate": 9.988244761572867e-06, + "loss": 3.4026, + "step": 110900 + }, + { + "epoch": 0.10930345278721341, + "grad_norm": 2.737044334411621, + "learning_rate": 9.988234153826179e-06, + "loss": 3.4186, + "step": 110950 + }, + { + "epoch": 0.1093527107650355, + "grad_norm": 2.4314589500427246, + "learning_rate": 9.988223541301157e-06, + "loss": 3.4509, + "step": 111000 + }, + { + "epoch": 0.1094019687428576, + "grad_norm": 2.458906412124634, + "learning_rate": 9.988212923997805e-06, + "loss": 3.3762, + "step": 111050 + }, + { + "epoch": 
0.10945122672067968, + "grad_norm": 2.286505699157715, + "learning_rate": 9.988202301916135e-06, + "loss": 3.3965, + "step": 111100 + }, + { + "epoch": 0.10950048469850177, + "grad_norm": 2.4821290969848633, + "learning_rate": 9.988191675056157e-06, + "loss": 3.4179, + "step": 111150 + }, + { + "epoch": 0.10954974267632386, + "grad_norm": 2.560870409011841, + "learning_rate": 9.988181043417886e-06, + "loss": 3.3824, + "step": 111200 + }, + { + "epoch": 0.10959900065414595, + "grad_norm": 2.3149266242980957, + "learning_rate": 9.988170407001325e-06, + "loss": 3.4375, + "step": 111250 + }, + { + "epoch": 0.10964825863196803, + "grad_norm": 2.9345152378082275, + "learning_rate": 9.988159765806489e-06, + "loss": 3.4518, + "step": 111300 + }, + { + "epoch": 0.10969751660979012, + "grad_norm": 2.558701753616333, + "learning_rate": 9.988149119833386e-06, + "loss": 3.4397, + "step": 111350 + }, + { + "epoch": 0.10974677458761221, + "grad_norm": 2.439044952392578, + "learning_rate": 9.988138469082026e-06, + "loss": 3.4335, + "step": 111400 + }, + { + "epoch": 0.1097960325654343, + "grad_norm": 2.5487821102142334, + "learning_rate": 9.98812781355242e-06, + "loss": 3.3254, + "step": 111450 + }, + { + "epoch": 0.10984529054325638, + "grad_norm": 2.534282684326172, + "learning_rate": 9.988117153244579e-06, + "loss": 3.4294, + "step": 111500 + }, + { + "epoch": 0.10989454852107848, + "grad_norm": 2.481159210205078, + "learning_rate": 9.988106488158513e-06, + "loss": 3.4523, + "step": 111550 + }, + { + "epoch": 0.10994380649890056, + "grad_norm": 2.996933937072754, + "learning_rate": 9.98809581829423e-06, + "loss": 3.4765, + "step": 111600 + }, + { + "epoch": 0.10999306447672265, + "grad_norm": 2.6198647022247314, + "learning_rate": 9.988085143651742e-06, + "loss": 3.4115, + "step": 111650 + }, + { + "epoch": 0.11004232245454473, + "grad_norm": 2.666602611541748, + "learning_rate": 9.98807446423106e-06, + "loss": 3.4559, + "step": 111700 + }, + { + "epoch": 0.11009158043236683, + 
"grad_norm": 2.5945160388946533, + "learning_rate": 9.988063780032193e-06, + "loss": 3.4468, + "step": 111750 + }, + { + "epoch": 0.11014083841018892, + "grad_norm": 2.5586225986480713, + "learning_rate": 9.98805309105515e-06, + "loss": 3.3748, + "step": 111800 + }, + { + "epoch": 0.110190096388011, + "grad_norm": 2.5541446208953857, + "learning_rate": 9.988042397299945e-06, + "loss": 3.4333, + "step": 111850 + }, + { + "epoch": 0.11023935436583308, + "grad_norm": 2.7662041187286377, + "learning_rate": 9.988031698766585e-06, + "loss": 3.4169, + "step": 111900 + }, + { + "epoch": 0.11028861234365518, + "grad_norm": 2.494795083999634, + "learning_rate": 9.988020995455081e-06, + "loss": 3.401, + "step": 111950 + }, + { + "epoch": 0.11033787032147727, + "grad_norm": 2.4045021533966064, + "learning_rate": 9.988010287365444e-06, + "loss": 3.3917, + "step": 112000 + }, + { + "epoch": 0.11038712829929935, + "grad_norm": 2.6514217853546143, + "learning_rate": 9.987999574497683e-06, + "loss": 3.3701, + "step": 112050 + }, + { + "epoch": 0.11043638627712145, + "grad_norm": 2.3198745250701904, + "learning_rate": 9.98798885685181e-06, + "loss": 3.3977, + "step": 112100 + }, + { + "epoch": 0.11048564425494353, + "grad_norm": 2.224750518798828, + "learning_rate": 9.987978134427833e-06, + "loss": 3.416, + "step": 112150 + }, + { + "epoch": 0.11053490223276562, + "grad_norm": 2.7536113262176514, + "learning_rate": 9.987967407225765e-06, + "loss": 3.4961, + "step": 112200 + }, + { + "epoch": 0.1105841602105877, + "grad_norm": 2.658810615539551, + "learning_rate": 9.987956675245614e-06, + "loss": 3.374, + "step": 112250 + }, + { + "epoch": 0.1106334181884098, + "grad_norm": 2.635411024093628, + "learning_rate": 9.98794593848739e-06, + "loss": 3.3899, + "step": 112300 + }, + { + "epoch": 0.11068267616623188, + "grad_norm": 2.497048854827881, + "learning_rate": 9.987935196951107e-06, + "loss": 3.4449, + "step": 112350 + }, + { + "epoch": 0.11073193414405397, + "grad_norm": 
2.940375328063965, + "learning_rate": 9.98792445063677e-06, + "loss": 3.4267, + "step": 112400 + }, + { + "epoch": 0.11078119212187607, + "grad_norm": 2.435006856918335, + "learning_rate": 9.987913699544394e-06, + "loss": 3.3526, + "step": 112450 + }, + { + "epoch": 0.11083045009969815, + "grad_norm": 2.365358352661133, + "learning_rate": 9.987902943673986e-06, + "loss": 3.4065, + "step": 112500 + }, + { + "epoch": 0.11087970807752023, + "grad_norm": 2.5144810676574707, + "learning_rate": 9.987892183025557e-06, + "loss": 3.3353, + "step": 112550 + }, + { + "epoch": 0.11092896605534232, + "grad_norm": 2.7100422382354736, + "learning_rate": 9.98788141759912e-06, + "loss": 3.3739, + "step": 112600 + }, + { + "epoch": 0.11097822403316442, + "grad_norm": 2.4877023696899414, + "learning_rate": 9.987870647394682e-06, + "loss": 3.4418, + "step": 112650 + }, + { + "epoch": 0.1110274820109865, + "grad_norm": 2.6677637100219727, + "learning_rate": 9.987859872412252e-06, + "loss": 3.3764, + "step": 112700 + }, + { + "epoch": 0.11107673998880858, + "grad_norm": 2.953568935394287, + "learning_rate": 9.987849092651847e-06, + "loss": 3.4827, + "step": 112750 + }, + { + "epoch": 0.11112599796663068, + "grad_norm": 2.5357887744903564, + "learning_rate": 9.987838308113471e-06, + "loss": 3.4318, + "step": 112800 + }, + { + "epoch": 0.11117525594445277, + "grad_norm": 2.476473808288574, + "learning_rate": 9.987827518797138e-06, + "loss": 3.4021, + "step": 112850 + }, + { + "epoch": 0.11122451392227485, + "grad_norm": 2.4701685905456543, + "learning_rate": 9.987816724702857e-06, + "loss": 3.432, + "step": 112900 + }, + { + "epoch": 0.11127377190009693, + "grad_norm": 2.5180435180664062, + "learning_rate": 9.987805925830637e-06, + "loss": 3.4738, + "step": 112950 + }, + { + "epoch": 0.11132302987791903, + "grad_norm": 2.4799001216888428, + "learning_rate": 9.987795122180489e-06, + "loss": 3.3909, + "step": 113000 + }, + { + "epoch": 0.11137228785574112, + "grad_norm": 2.8029589653015137, 
+ "learning_rate": 9.987784313752424e-06, + "loss": 3.36, + "step": 113050 + }, + { + "epoch": 0.1114215458335632, + "grad_norm": 2.607605218887329, + "learning_rate": 9.987773500546452e-06, + "loss": 3.4136, + "step": 113100 + }, + { + "epoch": 0.11147080381138529, + "grad_norm": 2.538440227508545, + "learning_rate": 9.987762682562583e-06, + "loss": 3.4064, + "step": 113150 + }, + { + "epoch": 0.11152006178920738, + "grad_norm": 2.293917417526245, + "learning_rate": 9.98775185980083e-06, + "loss": 3.4403, + "step": 113200 + }, + { + "epoch": 0.11156931976702947, + "grad_norm": 2.5304901599884033, + "learning_rate": 9.9877410322612e-06, + "loss": 3.4569, + "step": 113250 + }, + { + "epoch": 0.11161857774485155, + "grad_norm": 2.6614298820495605, + "learning_rate": 9.987730199943706e-06, + "loss": 3.491, + "step": 113300 + }, + { + "epoch": 0.11166783572267365, + "grad_norm": 2.291957139968872, + "learning_rate": 9.987719362848357e-06, + "loss": 3.4138, + "step": 113350 + }, + { + "epoch": 0.11171709370049573, + "grad_norm": 2.4876787662506104, + "learning_rate": 9.98770852097516e-06, + "loss": 3.4173, + "step": 113400 + }, + { + "epoch": 0.11176635167831782, + "grad_norm": 2.5979630947113037, + "learning_rate": 9.987697674324131e-06, + "loss": 3.3969, + "step": 113450 + }, + { + "epoch": 0.1118156096561399, + "grad_norm": 2.491980791091919, + "learning_rate": 9.987686822895279e-06, + "loss": 3.3456, + "step": 113500 + }, + { + "epoch": 0.111864867633962, + "grad_norm": 3.744938850402832, + "learning_rate": 9.987675966688612e-06, + "loss": 3.4717, + "step": 113550 + }, + { + "epoch": 0.11191412561178408, + "grad_norm": 2.775392532348633, + "learning_rate": 9.987665105704144e-06, + "loss": 3.3637, + "step": 113600 + }, + { + "epoch": 0.11196338358960617, + "grad_norm": 2.565307855606079, + "learning_rate": 9.987654239941883e-06, + "loss": 3.4053, + "step": 113650 + }, + { + "epoch": 0.11201264156742827, + "grad_norm": 2.610602617263794, + "learning_rate": 
9.987643369401837e-06, + "loss": 3.3738, + "step": 113700 + }, + { + "epoch": 0.11206189954525035, + "grad_norm": 4.281893730163574, + "learning_rate": 9.987632494084023e-06, + "loss": 3.3887, + "step": 113750 + }, + { + "epoch": 0.11211115752307244, + "grad_norm": 2.573922872543335, + "learning_rate": 9.987621613988447e-06, + "loss": 3.4215, + "step": 113800 + }, + { + "epoch": 0.11216041550089452, + "grad_norm": 2.975816488265991, + "learning_rate": 9.987610729115119e-06, + "loss": 3.37, + "step": 113850 + }, + { + "epoch": 0.11220967347871662, + "grad_norm": 2.7732412815093994, + "learning_rate": 9.98759983946405e-06, + "loss": 3.3931, + "step": 113900 + }, + { + "epoch": 0.1122589314565387, + "grad_norm": 2.857531785964966, + "learning_rate": 9.987588945035252e-06, + "loss": 3.4307, + "step": 113950 + }, + { + "epoch": 0.11230818943436079, + "grad_norm": 2.2794342041015625, + "learning_rate": 9.987578045828733e-06, + "loss": 3.4288, + "step": 114000 + }, + { + "epoch": 0.11235744741218287, + "grad_norm": 2.3085877895355225, + "learning_rate": 9.987567141844507e-06, + "loss": 3.3807, + "step": 114050 + }, + { + "epoch": 0.11240670539000497, + "grad_norm": 2.9306790828704834, + "learning_rate": 9.98755623308258e-06, + "loss": 3.3656, + "step": 114100 + }, + { + "epoch": 0.11245596336782705, + "grad_norm": 2.5620062351226807, + "learning_rate": 9.987545319542966e-06, + "loss": 3.4017, + "step": 114150 + }, + { + "epoch": 0.11250522134564914, + "grad_norm": 2.433534860610962, + "learning_rate": 9.987534401225674e-06, + "loss": 3.4368, + "step": 114200 + }, + { + "epoch": 0.11255447932347123, + "grad_norm": 2.8827786445617676, + "learning_rate": 9.987523478130714e-06, + "loss": 3.4199, + "step": 114250 + }, + { + "epoch": 0.11260373730129332, + "grad_norm": 2.3539559841156006, + "learning_rate": 9.987512550258097e-06, + "loss": 3.4186, + "step": 114300 + }, + { + "epoch": 0.1126529952791154, + "grad_norm": 2.4843831062316895, + "learning_rate": 
9.987501617607832e-06, + "loss": 3.3831, + "step": 114350 + }, + { + "epoch": 0.11270225325693749, + "grad_norm": 2.338747024536133, + "learning_rate": 9.987490680179935e-06, + "loss": 3.4218, + "step": 114400 + }, + { + "epoch": 0.11275151123475959, + "grad_norm": 2.682001829147339, + "learning_rate": 9.98747973797441e-06, + "loss": 3.4335, + "step": 114450 + }, + { + "epoch": 0.11280076921258167, + "grad_norm": 3.1694600582122803, + "learning_rate": 9.98746879099127e-06, + "loss": 3.3911, + "step": 114500 + }, + { + "epoch": 0.11285002719040375, + "grad_norm": 2.632359266281128, + "learning_rate": 9.987457839230527e-06, + "loss": 3.3858, + "step": 114550 + }, + { + "epoch": 0.11289928516822585, + "grad_norm": 2.5634548664093018, + "learning_rate": 9.987446882692188e-06, + "loss": 3.3981, + "step": 114600 + }, + { + "epoch": 0.11294854314604794, + "grad_norm": 2.483888626098633, + "learning_rate": 9.987435921376266e-06, + "loss": 3.3978, + "step": 114650 + }, + { + "epoch": 0.11299780112387002, + "grad_norm": 2.4412615299224854, + "learning_rate": 9.987424955282771e-06, + "loss": 3.41, + "step": 114700 + }, + { + "epoch": 0.1130470591016921, + "grad_norm": 2.408216953277588, + "learning_rate": 9.987413984411714e-06, + "loss": 3.3307, + "step": 114750 + }, + { + "epoch": 0.1130963170795142, + "grad_norm": 2.5929269790649414, + "learning_rate": 9.987403008763105e-06, + "loss": 3.3723, + "step": 114800 + }, + { + "epoch": 0.11314557505733629, + "grad_norm": 2.459618330001831, + "learning_rate": 9.987392028336954e-06, + "loss": 3.3847, + "step": 114850 + }, + { + "epoch": 0.11319483303515837, + "grad_norm": 2.676130533218384, + "learning_rate": 9.987381043133272e-06, + "loss": 3.4079, + "step": 114900 + }, + { + "epoch": 0.11324409101298047, + "grad_norm": 2.4563794136047363, + "learning_rate": 9.98737005315207e-06, + "loss": 3.419, + "step": 114950 + }, + { + "epoch": 0.11329334899080255, + "grad_norm": 2.5015296936035156, + "learning_rate": 9.987359058393357e-06, + 
"loss": 3.3676, + "step": 115000 + }, + { + "epoch": 0.11334260696862464, + "grad_norm": 2.5444447994232178, + "learning_rate": 9.987348058857145e-06, + "loss": 3.3635, + "step": 115050 + }, + { + "epoch": 0.11339186494644672, + "grad_norm": 2.5149121284484863, + "learning_rate": 9.987337054543445e-06, + "loss": 3.49, + "step": 115100 + }, + { + "epoch": 0.11344112292426882, + "grad_norm": 2.5545473098754883, + "learning_rate": 9.987326045452265e-06, + "loss": 3.3597, + "step": 115150 + }, + { + "epoch": 0.1134903809020909, + "grad_norm": 2.407400608062744, + "learning_rate": 9.987315031583617e-06, + "loss": 3.4364, + "step": 115200 + }, + { + "epoch": 0.11353963887991299, + "grad_norm": 2.4975545406341553, + "learning_rate": 9.987304012937513e-06, + "loss": 3.3888, + "step": 115250 + }, + { + "epoch": 0.11358889685773507, + "grad_norm": 2.6385886669158936, + "learning_rate": 9.987292989513962e-06, + "loss": 3.3589, + "step": 115300 + }, + { + "epoch": 0.11363815483555717, + "grad_norm": 2.5099990367889404, + "learning_rate": 9.987281961312975e-06, + "loss": 3.3985, + "step": 115350 + }, + { + "epoch": 0.11368741281337925, + "grad_norm": 2.9271302223205566, + "learning_rate": 9.987270928334562e-06, + "loss": 3.4282, + "step": 115400 + }, + { + "epoch": 0.11373667079120134, + "grad_norm": 2.9757797718048096, + "learning_rate": 9.987259890578736e-06, + "loss": 3.4637, + "step": 115450 + }, + { + "epoch": 0.11378592876902344, + "grad_norm": 2.576037883758545, + "learning_rate": 9.987248848045503e-06, + "loss": 3.3522, + "step": 115500 + }, + { + "epoch": 0.11383518674684552, + "grad_norm": 3.0438504219055176, + "learning_rate": 9.987237800734876e-06, + "loss": 3.4641, + "step": 115550 + }, + { + "epoch": 0.1138844447246676, + "grad_norm": 2.401563882827759, + "learning_rate": 9.987226748646868e-06, + "loss": 3.3784, + "step": 115600 + }, + { + "epoch": 0.11393370270248969, + "grad_norm": 2.865088939666748, + "learning_rate": 9.987215691781485e-06, + "loss": 3.3725, + 
"step": 115650 + }, + { + "epoch": 0.11398296068031179, + "grad_norm": 2.649177074432373, + "learning_rate": 9.987204630138742e-06, + "loss": 3.401, + "step": 115700 + }, + { + "epoch": 0.11403221865813387, + "grad_norm": 2.3352818489074707, + "learning_rate": 9.987193563718645e-06, + "loss": 3.3696, + "step": 115750 + }, + { + "epoch": 0.11408147663595596, + "grad_norm": 2.5297980308532715, + "learning_rate": 9.987182492521211e-06, + "loss": 3.3961, + "step": 115800 + }, + { + "epoch": 0.11413073461377805, + "grad_norm": 2.8760290145874023, + "learning_rate": 9.987171416546443e-06, + "loss": 3.3294, + "step": 115850 + }, + { + "epoch": 0.11417999259160014, + "grad_norm": 2.4567251205444336, + "learning_rate": 9.987160335794357e-06, + "loss": 3.3969, + "step": 115900 + }, + { + "epoch": 0.11422925056942222, + "grad_norm": 2.4522342681884766, + "learning_rate": 9.98714925026496e-06, + "loss": 3.3311, + "step": 115950 + }, + { + "epoch": 0.1142785085472443, + "grad_norm": 2.6093015670776367, + "learning_rate": 9.987138159958268e-06, + "loss": 3.3944, + "step": 116000 + }, + { + "epoch": 0.1143277665250664, + "grad_norm": 2.5351269245147705, + "learning_rate": 9.987127064874284e-06, + "loss": 3.4455, + "step": 116050 + }, + { + "epoch": 0.11437702450288849, + "grad_norm": 2.4875807762145996, + "learning_rate": 9.987115965013026e-06, + "loss": 3.4511, + "step": 116100 + }, + { + "epoch": 0.11442628248071057, + "grad_norm": 2.4287095069885254, + "learning_rate": 9.9871048603745e-06, + "loss": 3.3922, + "step": 116150 + }, + { + "epoch": 0.11447554045853267, + "grad_norm": 2.602886438369751, + "learning_rate": 9.987093750958719e-06, + "loss": 3.3649, + "step": 116200 + }, + { + "epoch": 0.11452479843635475, + "grad_norm": 2.7385787963867188, + "learning_rate": 9.987082636765692e-06, + "loss": 3.3948, + "step": 116250 + }, + { + "epoch": 0.11457405641417684, + "grad_norm": 2.4290661811828613, + "learning_rate": 9.98707151779543e-06, + "loss": 3.3955, + "step": 116300 + }, 
+ { + "epoch": 0.11462331439199892, + "grad_norm": 2.3842856884002686, + "learning_rate": 9.987060394047945e-06, + "loss": 3.3444, + "step": 116350 + }, + { + "epoch": 0.11467257236982102, + "grad_norm": 2.7101097106933594, + "learning_rate": 9.987049265523245e-06, + "loss": 3.3841, + "step": 116400 + }, + { + "epoch": 0.1147218303476431, + "grad_norm": 2.384512424468994, + "learning_rate": 9.987038132221345e-06, + "loss": 3.4431, + "step": 116450 + }, + { + "epoch": 0.11477108832546519, + "grad_norm": 2.711897373199463, + "learning_rate": 9.98702699414225e-06, + "loss": 3.4281, + "step": 116500 + }, + { + "epoch": 0.11482034630328727, + "grad_norm": 3.0361146926879883, + "learning_rate": 9.987015851285976e-06, + "loss": 3.3605, + "step": 116550 + }, + { + "epoch": 0.11486960428110937, + "grad_norm": 2.562206506729126, + "learning_rate": 9.987004703652531e-06, + "loss": 3.4053, + "step": 116600 + }, + { + "epoch": 0.11491886225893146, + "grad_norm": 2.438856601715088, + "learning_rate": 9.986993551241926e-06, + "loss": 3.4045, + "step": 116650 + }, + { + "epoch": 0.11496812023675354, + "grad_norm": 2.4061732292175293, + "learning_rate": 9.98698239405417e-06, + "loss": 3.4771, + "step": 116700 + }, + { + "epoch": 0.11501737821457564, + "grad_norm": 2.3539230823516846, + "learning_rate": 9.986971232089277e-06, + "loss": 3.4231, + "step": 116750 + }, + { + "epoch": 0.11506663619239772, + "grad_norm": 2.4087886810302734, + "learning_rate": 9.986960065347255e-06, + "loss": 3.4331, + "step": 116800 + }, + { + "epoch": 0.1151158941702198, + "grad_norm": 2.420719861984253, + "learning_rate": 9.986948893828116e-06, + "loss": 3.351, + "step": 116850 + }, + { + "epoch": 0.11516515214804189, + "grad_norm": 2.4583969116210938, + "learning_rate": 9.986937717531873e-06, + "loss": 3.3334, + "step": 116900 + }, + { + "epoch": 0.11521441012586399, + "grad_norm": 2.504702568054199, + "learning_rate": 9.986926536458531e-06, + "loss": 3.3842, + "step": 116950 + }, + { + "epoch": 
0.11526366810368607, + "grad_norm": 2.506558895111084, + "learning_rate": 9.986915350608104e-06, + "loss": 3.3571, + "step": 117000 + }, + { + "epoch": 0.11531292608150816, + "grad_norm": 2.288512706756592, + "learning_rate": 9.986904159980604e-06, + "loss": 3.4083, + "step": 117050 + }, + { + "epoch": 0.11536218405933026, + "grad_norm": 2.3676340579986572, + "learning_rate": 9.98689296457604e-06, + "loss": 3.3842, + "step": 117100 + }, + { + "epoch": 0.11541144203715234, + "grad_norm": 2.7400388717651367, + "learning_rate": 9.986881764394423e-06, + "loss": 3.3377, + "step": 117150 + }, + { + "epoch": 0.11546070001497442, + "grad_norm": 2.4120335578918457, + "learning_rate": 9.986870559435763e-06, + "loss": 3.4129, + "step": 117200 + }, + { + "epoch": 0.11550995799279651, + "grad_norm": 2.4424989223480225, + "learning_rate": 9.986859349700073e-06, + "loss": 3.3205, + "step": 117250 + }, + { + "epoch": 0.1155592159706186, + "grad_norm": 2.194718360900879, + "learning_rate": 9.986848135187362e-06, + "loss": 3.3958, + "step": 117300 + }, + { + "epoch": 0.11560847394844069, + "grad_norm": 2.450803518295288, + "learning_rate": 9.98683691589764e-06, + "loss": 3.3719, + "step": 117350 + }, + { + "epoch": 0.11565773192626277, + "grad_norm": 2.6813697814941406, + "learning_rate": 9.98682569183092e-06, + "loss": 3.3923, + "step": 117400 + }, + { + "epoch": 0.11570698990408486, + "grad_norm": 2.2688100337982178, + "learning_rate": 9.98681446298721e-06, + "loss": 3.4133, + "step": 117450 + }, + { + "epoch": 0.11575624788190696, + "grad_norm": 2.422471284866333, + "learning_rate": 9.986803229366523e-06, + "loss": 3.4232, + "step": 117500 + }, + { + "epoch": 0.11580550585972904, + "grad_norm": 2.672807216644287, + "learning_rate": 9.986791990968869e-06, + "loss": 3.3791, + "step": 117550 + }, + { + "epoch": 0.11585476383755113, + "grad_norm": 2.564429521560669, + "learning_rate": 9.986780747794258e-06, + "loss": 3.3959, + "step": 117600 + }, + { + "epoch": 0.11590402181537322, + 
"grad_norm": 2.615173816680908, + "learning_rate": 9.986769499842703e-06, + "loss": 3.4127, + "step": 117650 + }, + { + "epoch": 0.11595327979319531, + "grad_norm": 2.2499983310699463, + "learning_rate": 9.986758247114213e-06, + "loss": 3.4047, + "step": 117700 + }, + { + "epoch": 0.11600253777101739, + "grad_norm": 2.393465042114258, + "learning_rate": 9.986746989608797e-06, + "loss": 3.4065, + "step": 117750 + }, + { + "epoch": 0.11605179574883948, + "grad_norm": 2.6967051029205322, + "learning_rate": 9.986735727326471e-06, + "loss": 3.3891, + "step": 117800 + }, + { + "epoch": 0.11610105372666157, + "grad_norm": 2.344642162322998, + "learning_rate": 9.98672446026724e-06, + "loss": 3.3676, + "step": 117850 + }, + { + "epoch": 0.11615031170448366, + "grad_norm": 2.5381789207458496, + "learning_rate": 9.986713188431118e-06, + "loss": 3.4245, + "step": 117900 + }, + { + "epoch": 0.11619956968230574, + "grad_norm": 2.7226550579071045, + "learning_rate": 9.986701911818118e-06, + "loss": 3.3739, + "step": 117950 + }, + { + "epoch": 0.11624882766012784, + "grad_norm": 2.496734142303467, + "learning_rate": 9.986690630428245e-06, + "loss": 3.4016, + "step": 118000 + }, + { + "epoch": 0.11629808563794992, + "grad_norm": 2.3350565433502197, + "learning_rate": 9.986679344261515e-06, + "loss": 3.3929, + "step": 118050 + }, + { + "epoch": 0.11634734361577201, + "grad_norm": 2.6802561283111572, + "learning_rate": 9.986668053317933e-06, + "loss": 3.3455, + "step": 118100 + }, + { + "epoch": 0.11639660159359409, + "grad_norm": 2.4967687129974365, + "learning_rate": 9.986656757597517e-06, + "loss": 3.3472, + "step": 118150 + }, + { + "epoch": 0.11644585957141619, + "grad_norm": 2.9904658794403076, + "learning_rate": 9.986645457100273e-06, + "loss": 3.3658, + "step": 118200 + }, + { + "epoch": 0.11649511754923828, + "grad_norm": 2.658781051635742, + "learning_rate": 9.986634151826214e-06, + "loss": 3.4207, + "step": 118250 + }, + { + "epoch": 0.11654437552706036, + "grad_norm": 
2.4548611640930176, + "learning_rate": 9.98662284177535e-06, + "loss": 3.3874, + "step": 118300 + }, + { + "epoch": 0.11659363350488246, + "grad_norm": 2.3201584815979004, + "learning_rate": 9.986611526947691e-06, + "loss": 3.3915, + "step": 118350 + }, + { + "epoch": 0.11664289148270454, + "grad_norm": 2.2402336597442627, + "learning_rate": 9.986600207343249e-06, + "loss": 3.3725, + "step": 118400 + }, + { + "epoch": 0.11669214946052663, + "grad_norm": 2.5013039112091064, + "learning_rate": 9.986588882962034e-06, + "loss": 3.3552, + "step": 118450 + }, + { + "epoch": 0.11674140743834871, + "grad_norm": 2.4952564239501953, + "learning_rate": 9.986577553804058e-06, + "loss": 3.406, + "step": 118500 + }, + { + "epoch": 0.11679066541617081, + "grad_norm": 2.4043290615081787, + "learning_rate": 9.98656621986933e-06, + "loss": 3.3593, + "step": 118550 + }, + { + "epoch": 0.11683992339399289, + "grad_norm": 2.5888142585754395, + "learning_rate": 9.986554881157864e-06, + "loss": 3.3795, + "step": 118600 + }, + { + "epoch": 0.11688918137181498, + "grad_norm": 2.7791879177093506, + "learning_rate": 9.986543537669667e-06, + "loss": 3.4292, + "step": 118650 + }, + { + "epoch": 0.11693843934963706, + "grad_norm": 2.4515879154205322, + "learning_rate": 9.986532189404754e-06, + "loss": 3.3872, + "step": 118700 + }, + { + "epoch": 0.11698769732745916, + "grad_norm": 2.5687601566314697, + "learning_rate": 9.986520836363132e-06, + "loss": 3.4178, + "step": 118750 + }, + { + "epoch": 0.11703695530528124, + "grad_norm": 2.5656723976135254, + "learning_rate": 9.986509478544812e-06, + "loss": 3.4869, + "step": 118800 + }, + { + "epoch": 0.11708621328310333, + "grad_norm": 2.6637625694274902, + "learning_rate": 9.98649811594981e-06, + "loss": 3.3672, + "step": 118850 + }, + { + "epoch": 0.11713547126092543, + "grad_norm": 2.489666223526001, + "learning_rate": 9.98648674857813e-06, + "loss": 3.4695, + "step": 118900 + }, + { + "epoch": 0.11718472923874751, + "grad_norm": 
2.4020326137542725, + "learning_rate": 9.986475376429787e-06, + "loss": 3.4194, + "step": 118950 + }, + { + "epoch": 0.1172339872165696, + "grad_norm": 2.356001615524292, + "learning_rate": 9.986463999504792e-06, + "loss": 3.4037, + "step": 119000 + }, + { + "epoch": 0.11728324519439168, + "grad_norm": 2.5342743396759033, + "learning_rate": 9.986452617803155e-06, + "loss": 3.3623, + "step": 119050 + }, + { + "epoch": 0.11733250317221378, + "grad_norm": 2.199453353881836, + "learning_rate": 9.986441231324886e-06, + "loss": 3.3526, + "step": 119100 + }, + { + "epoch": 0.11738176115003586, + "grad_norm": 2.533907651901245, + "learning_rate": 9.986429840069998e-06, + "loss": 3.3416, + "step": 119150 + }, + { + "epoch": 0.11743101912785794, + "grad_norm": 2.598482608795166, + "learning_rate": 9.986418444038499e-06, + "loss": 3.3478, + "step": 119200 + }, + { + "epoch": 0.11748027710568004, + "grad_norm": 2.2924203872680664, + "learning_rate": 9.986407043230404e-06, + "loss": 3.3704, + "step": 119250 + }, + { + "epoch": 0.11752953508350213, + "grad_norm": 3.296250104904175, + "learning_rate": 9.98639563764572e-06, + "loss": 3.3679, + "step": 119300 + }, + { + "epoch": 0.11757879306132421, + "grad_norm": 2.2870922088623047, + "learning_rate": 9.986384227284459e-06, + "loss": 3.3773, + "step": 119350 + }, + { + "epoch": 0.1176280510391463, + "grad_norm": 2.4280543327331543, + "learning_rate": 9.986372812146633e-06, + "loss": 3.4006, + "step": 119400 + }, + { + "epoch": 0.11767730901696839, + "grad_norm": 2.4473037719726562, + "learning_rate": 9.986361392232252e-06, + "loss": 3.3982, + "step": 119450 + }, + { + "epoch": 0.11772656699479048, + "grad_norm": 2.367959976196289, + "learning_rate": 9.986349967541326e-06, + "loss": 3.3366, + "step": 119500 + }, + { + "epoch": 0.11777582497261256, + "grad_norm": 2.499955177307129, + "learning_rate": 9.98633853807387e-06, + "loss": 3.3938, + "step": 119550 + }, + { + "epoch": 0.11782508295043466, + "grad_norm": 2.601670980453491, + 
"learning_rate": 9.986327103829892e-06, + "loss": 3.4265, + "step": 119600 + }, + { + "epoch": 0.11787434092825674, + "grad_norm": 2.3009002208709717, + "learning_rate": 9.986315664809401e-06, + "loss": 3.363, + "step": 119650 + }, + { + "epoch": 0.11792359890607883, + "grad_norm": 2.57027268409729, + "learning_rate": 9.986304221012411e-06, + "loss": 3.3786, + "step": 119700 + }, + { + "epoch": 0.11797285688390091, + "grad_norm": 2.846162796020508, + "learning_rate": 9.98629277243893e-06, + "loss": 3.3378, + "step": 119750 + }, + { + "epoch": 0.11802211486172301, + "grad_norm": 2.499687671661377, + "learning_rate": 9.986281319088976e-06, + "loss": 3.3746, + "step": 119800 + }, + { + "epoch": 0.1180713728395451, + "grad_norm": 2.413245439529419, + "learning_rate": 9.98626986096255e-06, + "loss": 3.3939, + "step": 119850 + }, + { + "epoch": 0.11812063081736718, + "grad_norm": 2.565244197845459, + "learning_rate": 9.986258398059669e-06, + "loss": 3.3749, + "step": 119900 + }, + { + "epoch": 0.11816988879518926, + "grad_norm": 2.9420268535614014, + "learning_rate": 9.986246930380346e-06, + "loss": 3.3252, + "step": 119950 + }, + { + "epoch": 0.11821914677301136, + "grad_norm": 2.5591540336608887, + "learning_rate": 9.986235457924585e-06, + "loss": 3.3613, + "step": 120000 + }, + { + "epoch": 0.11826840475083344, + "grad_norm": 2.420945644378662, + "learning_rate": 9.986223980692404e-06, + "loss": 3.4282, + "step": 120050 + }, + { + "epoch": 0.11831766272865553, + "grad_norm": 2.4914259910583496, + "learning_rate": 9.986212498683808e-06, + "loss": 3.333, + "step": 120100 + }, + { + "epoch": 0.11836692070647763, + "grad_norm": 2.4967002868652344, + "learning_rate": 9.986201011898813e-06, + "loss": 3.411, + "step": 120150 + }, + { + "epoch": 0.11841617868429971, + "grad_norm": 2.440558433532715, + "learning_rate": 9.986189520337428e-06, + "loss": 3.358, + "step": 120200 + }, + { + "epoch": 0.1184654366621218, + "grad_norm": 2.6565065383911133, + "learning_rate": 
9.986178023999661e-06, + "loss": 3.3219, + "step": 120250 + }, + { + "epoch": 0.11851469463994388, + "grad_norm": 2.4349400997161865, + "learning_rate": 9.986166522885529e-06, + "loss": 3.412, + "step": 120300 + }, + { + "epoch": 0.11856395261776598, + "grad_norm": 2.342496395111084, + "learning_rate": 9.986155016995038e-06, + "loss": 3.3634, + "step": 120350 + }, + { + "epoch": 0.11861321059558806, + "grad_norm": 2.357039213180542, + "learning_rate": 9.986143506328203e-06, + "loss": 3.3296, + "step": 120400 + }, + { + "epoch": 0.11866246857341015, + "grad_norm": 2.4615566730499268, + "learning_rate": 9.986131990885031e-06, + "loss": 3.4237, + "step": 120450 + }, + { + "epoch": 0.11871172655123224, + "grad_norm": 2.640629768371582, + "learning_rate": 9.986120470665537e-06, + "loss": 3.3853, + "step": 120500 + }, + { + "epoch": 0.11876098452905433, + "grad_norm": 3.8036375045776367, + "learning_rate": 9.986108945669728e-06, + "loss": 3.4622, + "step": 120550 + }, + { + "epoch": 0.11881024250687641, + "grad_norm": 2.3227899074554443, + "learning_rate": 9.986097415897618e-06, + "loss": 3.3347, + "step": 120600 + }, + { + "epoch": 0.1188595004846985, + "grad_norm": 2.4758241176605225, + "learning_rate": 9.986085881349216e-06, + "loss": 3.3862, + "step": 120650 + }, + { + "epoch": 0.1189087584625206, + "grad_norm": 2.5483996868133545, + "learning_rate": 9.986074342024536e-06, + "loss": 3.3601, + "step": 120700 + }, + { + "epoch": 0.11895801644034268, + "grad_norm": 2.3937344551086426, + "learning_rate": 9.986062797923587e-06, + "loss": 3.3481, + "step": 120750 + }, + { + "epoch": 0.11900727441816476, + "grad_norm": 2.416670560836792, + "learning_rate": 9.986051249046379e-06, + "loss": 3.3648, + "step": 120800 + }, + { + "epoch": 0.11905653239598685, + "grad_norm": 2.3983354568481445, + "learning_rate": 9.986039695392925e-06, + "loss": 3.3242, + "step": 120850 + }, + { + "epoch": 0.11910579037380895, + "grad_norm": 2.3442978858947754, + "learning_rate": 
9.986028136963234e-06, + "loss": 3.3946, + "step": 120900 + }, + { + "epoch": 0.11915504835163103, + "grad_norm": 2.420311689376831, + "learning_rate": 9.986016573757322e-06, + "loss": 3.3143, + "step": 120950 + }, + { + "epoch": 0.11920430632945311, + "grad_norm": 2.454395055770874, + "learning_rate": 9.986005005775195e-06, + "loss": 3.3566, + "step": 121000 + }, + { + "epoch": 0.11925356430727521, + "grad_norm": 2.85882306098938, + "learning_rate": 9.985993433016864e-06, + "loss": 3.3512, + "step": 121050 + }, + { + "epoch": 0.1193028222850973, + "grad_norm": 2.697298049926758, + "learning_rate": 9.985981855482343e-06, + "loss": 3.3747, + "step": 121100 + }, + { + "epoch": 0.11935208026291938, + "grad_norm": 3.051520347595215, + "learning_rate": 9.985970273171641e-06, + "loss": 3.3738, + "step": 121150 + }, + { + "epoch": 0.11940133824074146, + "grad_norm": Infinity, + "learning_rate": 9.985958686084772e-06, + "loss": 3.424, + "step": 121200 + }, + { + "epoch": 0.11945059621856356, + "grad_norm": 2.6462533473968506, + "learning_rate": 9.985947094221743e-06, + "loss": 3.3941, + "step": 121250 + }, + { + "epoch": 0.11949985419638565, + "grad_norm": 2.2422988414764404, + "learning_rate": 9.985935497582567e-06, + "loss": 3.3592, + "step": 121300 + }, + { + "epoch": 0.11954911217420773, + "grad_norm": 2.5982553958892822, + "learning_rate": 9.985923896167257e-06, + "loss": 3.3838, + "step": 121350 + }, + { + "epoch": 0.11959837015202983, + "grad_norm": 2.4534506797790527, + "learning_rate": 9.98591228997582e-06, + "loss": 3.3209, + "step": 121400 + }, + { + "epoch": 0.11964762812985191, + "grad_norm": 2.539276361465454, + "learning_rate": 9.985900679008272e-06, + "loss": 3.3532, + "step": 121450 + }, + { + "epoch": 0.119696886107674, + "grad_norm": 2.7114434242248535, + "learning_rate": 9.98588906326462e-06, + "loss": 3.3845, + "step": 121500 + }, + { + "epoch": 0.11974614408549608, + "grad_norm": 2.4282960891723633, + "learning_rate": 9.985877442744876e-06, + "loss": 
3.387, + "step": 121550 + }, + { + "epoch": 0.11979540206331818, + "grad_norm": 2.381016492843628, + "learning_rate": 9.985865817449054e-06, + "loss": 3.3797, + "step": 121600 + }, + { + "epoch": 0.11984466004114026, + "grad_norm": 2.7338695526123047, + "learning_rate": 9.985854187377162e-06, + "loss": 3.4115, + "step": 121650 + }, + { + "epoch": 0.11989391801896235, + "grad_norm": 2.306929111480713, + "learning_rate": 9.98584255252921e-06, + "loss": 3.308, + "step": 121700 + }, + { + "epoch": 0.11994317599678445, + "grad_norm": 2.5156071186065674, + "learning_rate": 9.985830912905214e-06, + "loss": 3.3536, + "step": 121750 + }, + { + "epoch": 0.11999243397460653, + "grad_norm": 2.9159653186798096, + "learning_rate": 9.98581926850518e-06, + "loss": 3.3338, + "step": 121800 + }, + { + "epoch": 0.12004169195242861, + "grad_norm": 2.4436471462249756, + "learning_rate": 9.985807619329125e-06, + "loss": 3.3451, + "step": 121850 + }, + { + "epoch": 0.1200909499302507, + "grad_norm": 2.936408281326294, + "learning_rate": 9.985795965377055e-06, + "loss": 3.409, + "step": 121900 + }, + { + "epoch": 0.1201402079080728, + "grad_norm": 2.6326308250427246, + "learning_rate": 9.985784306648983e-06, + "loss": 3.3498, + "step": 121950 + }, + { + "epoch": 0.12018946588589488, + "grad_norm": 2.7693350315093994, + "learning_rate": 9.985772643144919e-06, + "loss": 3.4038, + "step": 122000 + }, + { + "epoch": 0.12023872386371696, + "grad_norm": 2.5912423133850098, + "learning_rate": 9.985760974864877e-06, + "loss": 3.3555, + "step": 122050 + }, + { + "epoch": 0.12028798184153905, + "grad_norm": 2.664357900619507, + "learning_rate": 9.985749301808865e-06, + "loss": 3.4534, + "step": 122100 + }, + { + "epoch": 0.12033723981936115, + "grad_norm": 2.665661573410034, + "learning_rate": 9.985737623976896e-06, + "loss": 3.4263, + "step": 122150 + }, + { + "epoch": 0.12038649779718323, + "grad_norm": 2.4949114322662354, + "learning_rate": 9.985725941368982e-06, + "loss": 3.3545, + "step": 
122200 + }, + { + "epoch": 0.12043575577500532, + "grad_norm": 2.5493500232696533, + "learning_rate": 9.98571425398513e-06, + "loss": 3.3484, + "step": 122250 + }, + { + "epoch": 0.12048501375282741, + "grad_norm": 2.5368669033050537, + "learning_rate": 9.985702561825357e-06, + "loss": 3.3268, + "step": 122300 + }, + { + "epoch": 0.1205342717306495, + "grad_norm": 2.389108180999756, + "learning_rate": 9.98569086488967e-06, + "loss": 3.3515, + "step": 122350 + }, + { + "epoch": 0.12058352970847158, + "grad_norm": 2.8298611640930176, + "learning_rate": 9.985679163178081e-06, + "loss": 3.3818, + "step": 122400 + }, + { + "epoch": 0.12063278768629367, + "grad_norm": 2.499889373779297, + "learning_rate": 9.985667456690603e-06, + "loss": 3.3564, + "step": 122450 + }, + { + "epoch": 0.12068204566411576, + "grad_norm": 2.4032227993011475, + "learning_rate": 9.985655745427246e-06, + "loss": 3.3711, + "step": 122500 + }, + { + "epoch": 0.12073130364193785, + "grad_norm": 2.599724292755127, + "learning_rate": 9.985644029388022e-06, + "loss": 3.2988, + "step": 122550 + }, + { + "epoch": 0.12078056161975993, + "grad_norm": 2.9838271141052246, + "learning_rate": 9.98563230857294e-06, + "loss": 3.3695, + "step": 122600 + }, + { + "epoch": 0.12082981959758203, + "grad_norm": 2.3377504348754883, + "learning_rate": 9.985620582982011e-06, + "loss": 3.3895, + "step": 122650 + }, + { + "epoch": 0.12087907757540411, + "grad_norm": 2.69498610496521, + "learning_rate": 9.985608852615252e-06, + "loss": 3.3382, + "step": 122700 + }, + { + "epoch": 0.1209283355532262, + "grad_norm": 3.016826868057251, + "learning_rate": 9.985597117472667e-06, + "loss": 3.3315, + "step": 122750 + }, + { + "epoch": 0.12097759353104828, + "grad_norm": 2.6515443325042725, + "learning_rate": 9.985585377554272e-06, + "loss": 3.4221, + "step": 122800 + }, + { + "epoch": 0.12102685150887038, + "grad_norm": 2.4090754985809326, + "learning_rate": 9.985573632860075e-06, + "loss": 3.3556, + "step": 122850 + }, + { + 
"epoch": 0.12107610948669247, + "grad_norm": 2.5005359649658203, + "learning_rate": 9.98556188339009e-06, + "loss": 3.3568, + "step": 122900 + }, + { + "epoch": 0.12112536746451455, + "grad_norm": 2.7486608028411865, + "learning_rate": 9.985550129144326e-06, + "loss": 3.3633, + "step": 122950 + }, + { + "epoch": 0.12117462544233665, + "grad_norm": 2.646340847015381, + "learning_rate": 9.985538370122796e-06, + "loss": 3.4326, + "step": 123000 + }, + { + "epoch": 0.12122388342015873, + "grad_norm": 2.5295801162719727, + "learning_rate": 9.985526606325512e-06, + "loss": 3.4044, + "step": 123050 + }, + { + "epoch": 0.12127314139798082, + "grad_norm": 2.5088963508605957, + "learning_rate": 9.985514837752483e-06, + "loss": 3.3727, + "step": 123100 + }, + { + "epoch": 0.1213223993758029, + "grad_norm": 2.481874465942383, + "learning_rate": 9.985503064403721e-06, + "loss": 3.3439, + "step": 123150 + }, + { + "epoch": 0.121371657353625, + "grad_norm": 2.664064645767212, + "learning_rate": 9.985491286279238e-06, + "loss": 3.3924, + "step": 123200 + }, + { + "epoch": 0.12142091533144708, + "grad_norm": 2.445732593536377, + "learning_rate": 9.985479503379043e-06, + "loss": 3.3778, + "step": 123250 + }, + { + "epoch": 0.12147017330926917, + "grad_norm": 2.5974678993225098, + "learning_rate": 9.985467715703151e-06, + "loss": 3.37, + "step": 123300 + }, + { + "epoch": 0.12151943128709125, + "grad_norm": 2.5868520736694336, + "learning_rate": 9.98545592325157e-06, + "loss": 3.3612, + "step": 123350 + }, + { + "epoch": 0.12156868926491335, + "grad_norm": 2.523359537124634, + "learning_rate": 9.985444126024313e-06, + "loss": 3.356, + "step": 123400 + }, + { + "epoch": 0.12161794724273543, + "grad_norm": 2.445063829421997, + "learning_rate": 9.985432324021392e-06, + "loss": 3.3267, + "step": 123450 + }, + { + "epoch": 0.12166720522055752, + "grad_norm": 2.591714859008789, + "learning_rate": 9.985420517242816e-06, + "loss": 3.4095, + "step": 123500 + }, + { + "epoch": 
0.12171646319837962, + "grad_norm": 2.659104347229004, + "learning_rate": 9.985408705688598e-06, + "loss": 3.4179, + "step": 123550 + }, + { + "epoch": 0.1217657211762017, + "grad_norm": 2.5036659240722656, + "learning_rate": 9.985396889358748e-06, + "loss": 3.3967, + "step": 123600 + }, + { + "epoch": 0.12181497915402378, + "grad_norm": 2.3203115463256836, + "learning_rate": 9.98538506825328e-06, + "loss": 3.4213, + "step": 123650 + }, + { + "epoch": 0.12186423713184587, + "grad_norm": 2.3608579635620117, + "learning_rate": 9.985373242372202e-06, + "loss": 3.3931, + "step": 123700 + }, + { + "epoch": 0.12191349510966797, + "grad_norm": 2.6284995079040527, + "learning_rate": 9.985361411715527e-06, + "loss": 3.3248, + "step": 123750 + }, + { + "epoch": 0.12196275308749005, + "grad_norm": 2.335951089859009, + "learning_rate": 9.985349576283267e-06, + "loss": 3.4039, + "step": 123800 + }, + { + "epoch": 0.12201201106531213, + "grad_norm": 2.4036426544189453, + "learning_rate": 9.985337736075431e-06, + "loss": 3.3789, + "step": 123850 + }, + { + "epoch": 0.12206126904313423, + "grad_norm": 2.565584897994995, + "learning_rate": 9.985325891092033e-06, + "loss": 3.4236, + "step": 123900 + }, + { + "epoch": 0.12211052702095632, + "grad_norm": 2.6063807010650635, + "learning_rate": 9.985314041333083e-06, + "loss": 3.4226, + "step": 123950 + }, + { + "epoch": 0.1221597849987784, + "grad_norm": 2.611232280731201, + "learning_rate": 9.985302186798593e-06, + "loss": 3.3117, + "step": 124000 + }, + { + "epoch": 0.12220904297660048, + "grad_norm": 2.4905476570129395, + "learning_rate": 9.985290327488572e-06, + "loss": 3.365, + "step": 124050 + }, + { + "epoch": 0.12225830095442258, + "grad_norm": 2.5467653274536133, + "learning_rate": 9.985278463403034e-06, + "loss": 3.3794, + "step": 124100 + }, + { + "epoch": 0.12230755893224467, + "grad_norm": 2.6284213066101074, + "learning_rate": 9.985266594541991e-06, + "loss": 3.3888, + "step": 124150 + }, + { + "epoch": 
0.12235681691006675, + "grad_norm": 2.4711623191833496, + "learning_rate": 9.985254720905454e-06, + "loss": 3.3722, + "step": 124200 + }, + { + "epoch": 0.12240607488788885, + "grad_norm": 3.0786995887756348, + "learning_rate": 9.98524284249343e-06, + "loss": 3.4041, + "step": 124250 + }, + { + "epoch": 0.12245533286571093, + "grad_norm": 2.5038070678710938, + "learning_rate": 9.985230959305935e-06, + "loss": 3.3604, + "step": 124300 + }, + { + "epoch": 0.12250459084353302, + "grad_norm": 2.5674710273742676, + "learning_rate": 9.98521907134298e-06, + "loss": 3.4048, + "step": 124350 + }, + { + "epoch": 0.1225538488213551, + "grad_norm": 2.59773325920105, + "learning_rate": 9.985207178604574e-06, + "loss": 3.4055, + "step": 124400 + }, + { + "epoch": 0.1226031067991772, + "grad_norm": 2.373234272003174, + "learning_rate": 9.98519528109073e-06, + "loss": 3.3459, + "step": 124450 + }, + { + "epoch": 0.12265236477699928, + "grad_norm": 2.448775291442871, + "learning_rate": 9.985183378801461e-06, + "loss": 3.3651, + "step": 124500 + }, + { + "epoch": 0.12270162275482137, + "grad_norm": 2.51942777633667, + "learning_rate": 9.985171471736775e-06, + "loss": 3.4245, + "step": 124550 + }, + { + "epoch": 0.12275088073264345, + "grad_norm": 2.5790319442749023, + "learning_rate": 9.985159559896686e-06, + "loss": 3.4078, + "step": 124600 + }, + { + "epoch": 0.12280013871046555, + "grad_norm": 2.4631943702697754, + "learning_rate": 9.985147643281205e-06, + "loss": 3.3464, + "step": 124650 + }, + { + "epoch": 0.12284939668828763, + "grad_norm": 2.5700247287750244, + "learning_rate": 9.985135721890342e-06, + "loss": 3.3293, + "step": 124700 + }, + { + "epoch": 0.12289865466610972, + "grad_norm": 2.6977152824401855, + "learning_rate": 9.98512379572411e-06, + "loss": 3.286, + "step": 124750 + }, + { + "epoch": 0.12294791264393182, + "grad_norm": 2.402130365371704, + "learning_rate": 9.98511186478252e-06, + "loss": 3.4421, + "step": 124800 + }, + { + "epoch": 0.1229971706217539, + 
"grad_norm": 2.6723873615264893, + "learning_rate": 9.985099929065582e-06, + "loss": 3.3605, + "step": 124850 + }, + { + "epoch": 0.12304642859957599, + "grad_norm": 2.8979437351226807, + "learning_rate": 9.98508798857331e-06, + "loss": 3.3561, + "step": 124900 + }, + { + "epoch": 0.12309568657739807, + "grad_norm": 2.2580835819244385, + "learning_rate": 9.985076043305713e-06, + "loss": 3.3198, + "step": 124950 + }, + { + "epoch": 0.12314494455522017, + "grad_norm": 2.860356330871582, + "learning_rate": 9.985064093262803e-06, + "loss": 3.325, + "step": 125000 + }, + { + "epoch": 0.12319420253304225, + "grad_norm": 2.4417760372161865, + "learning_rate": 9.985052138444593e-06, + "loss": 3.3606, + "step": 125050 + }, + { + "epoch": 0.12324346051086434, + "grad_norm": 2.25171160697937, + "learning_rate": 9.985040178851093e-06, + "loss": 3.3449, + "step": 125100 + }, + { + "epoch": 0.12329271848868643, + "grad_norm": 2.885326623916626, + "learning_rate": 9.985028214482315e-06, + "loss": 3.4114, + "step": 125150 + }, + { + "epoch": 0.12334197646650852, + "grad_norm": 2.3991408348083496, + "learning_rate": 9.98501624533827e-06, + "loss": 3.3933, + "step": 125200 + }, + { + "epoch": 0.1233912344443306, + "grad_norm": 2.696899890899658, + "learning_rate": 9.98500427141897e-06, + "loss": 3.348, + "step": 125250 + }, + { + "epoch": 0.12344049242215269, + "grad_norm": 2.531999111175537, + "learning_rate": 9.984992292724427e-06, + "loss": 3.4239, + "step": 125300 + }, + { + "epoch": 0.12348975039997478, + "grad_norm": 2.6619131565093994, + "learning_rate": 9.98498030925465e-06, + "loss": 3.3471, + "step": 125350 + }, + { + "epoch": 0.12353900837779687, + "grad_norm": 2.84726881980896, + "learning_rate": 9.984968321009654e-06, + "loss": 3.3269, + "step": 125400 + }, + { + "epoch": 0.12358826635561895, + "grad_norm": 2.5031566619873047, + "learning_rate": 9.984956327989448e-06, + "loss": 3.3903, + "step": 125450 + }, + { + "epoch": 0.12363752433344104, + "grad_norm": 
2.4966518878936768, + "learning_rate": 9.984944330194045e-06, + "loss": 3.345, + "step": 125500 + }, + { + "epoch": 0.12368678231126314, + "grad_norm": 2.308844804763794, + "learning_rate": 9.984932327623454e-06, + "loss": 3.4259, + "step": 125550 + }, + { + "epoch": 0.12373604028908522, + "grad_norm": 2.545069694519043, + "learning_rate": 9.984920320277689e-06, + "loss": 3.4161, + "step": 125600 + }, + { + "epoch": 0.1237852982669073, + "grad_norm": 2.506284713745117, + "learning_rate": 9.984908308156763e-06, + "loss": 3.3083, + "step": 125650 + }, + { + "epoch": 0.1238345562447294, + "grad_norm": 2.846367835998535, + "learning_rate": 9.984896291260682e-06, + "loss": 3.4033, + "step": 125700 + }, + { + "epoch": 0.12388381422255149, + "grad_norm": 2.473738670349121, + "learning_rate": 9.984884269589462e-06, + "loss": 3.3572, + "step": 125750 + }, + { + "epoch": 0.12393307220037357, + "grad_norm": 2.783252716064453, + "learning_rate": 9.984872243143111e-06, + "loss": 3.3345, + "step": 125800 + }, + { + "epoch": 0.12398233017819565, + "grad_norm": 2.513139486312866, + "learning_rate": 9.984860211921645e-06, + "loss": 3.3149, + "step": 125850 + }, + { + "epoch": 0.12403158815601775, + "grad_norm": 2.4643876552581787, + "learning_rate": 9.984848175925073e-06, + "loss": 3.3681, + "step": 125900 + }, + { + "epoch": 0.12408084613383984, + "grad_norm": 2.3363003730773926, + "learning_rate": 9.984836135153409e-06, + "loss": 3.414, + "step": 125950 + }, + { + "epoch": 0.12413010411166192, + "grad_norm": 2.3982656002044678, + "learning_rate": 9.984824089606659e-06, + "loss": 3.311, + "step": 126000 + }, + { + "epoch": 0.12417936208948402, + "grad_norm": 2.3449435234069824, + "learning_rate": 9.984812039284838e-06, + "loss": 3.3731, + "step": 126050 + }, + { + "epoch": 0.1242286200673061, + "grad_norm": 2.5280776023864746, + "learning_rate": 9.984799984187958e-06, + "loss": 3.3904, + "step": 126100 + }, + { + "epoch": 0.12427787804512819, + "grad_norm": 2.6861987113952637, + 
"learning_rate": 9.984787924316033e-06, + "loss": 3.3425, + "step": 126150 + }, + { + "epoch": 0.12432713602295027, + "grad_norm": 2.347813367843628, + "learning_rate": 9.984775859669067e-06, + "loss": 3.4089, + "step": 126200 + }, + { + "epoch": 0.12437639400077237, + "grad_norm": 2.3460347652435303, + "learning_rate": 9.984763790247078e-06, + "loss": 3.3517, + "step": 126250 + }, + { + "epoch": 0.12442565197859445, + "grad_norm": 2.53999400138855, + "learning_rate": 9.984751716050077e-06, + "loss": 3.2911, + "step": 126300 + }, + { + "epoch": 0.12447490995641654, + "grad_norm": 2.590735673904419, + "learning_rate": 9.984739637078072e-06, + "loss": 3.3965, + "step": 126350 + }, + { + "epoch": 0.12452416793423864, + "grad_norm": 2.4318368434906006, + "learning_rate": 9.984727553331078e-06, + "loss": 3.3027, + "step": 126400 + }, + { + "epoch": 0.12457342591206072, + "grad_norm": 2.2616639137268066, + "learning_rate": 9.984715464809103e-06, + "loss": 3.396, + "step": 126450 + }, + { + "epoch": 0.1246226838898828, + "grad_norm": 2.5016982555389404, + "learning_rate": 9.984703371512163e-06, + "loss": 3.3572, + "step": 126500 + }, + { + "epoch": 0.12467194186770489, + "grad_norm": 2.3482229709625244, + "learning_rate": 9.984691273440269e-06, + "loss": 3.3767, + "step": 126550 + }, + { + "epoch": 0.12472119984552699, + "grad_norm": 2.5039920806884766, + "learning_rate": 9.984679170593427e-06, + "loss": 3.3509, + "step": 126600 + }, + { + "epoch": 0.12477045782334907, + "grad_norm": 2.7355797290802, + "learning_rate": 9.984667062971656e-06, + "loss": 3.3252, + "step": 126650 + }, + { + "epoch": 0.12481971580117115, + "grad_norm": 2.572892904281616, + "learning_rate": 9.984654950574963e-06, + "loss": 3.3435, + "step": 126700 + }, + { + "epoch": 0.12486897377899324, + "grad_norm": 2.4872069358825684, + "learning_rate": 9.984642833403361e-06, + "loss": 3.354, + "step": 126750 + }, + { + "epoch": 0.12491823175681534, + "grad_norm": 2.4385619163513184, + "learning_rate": 
9.984630711456862e-06, + "loss": 3.372, + "step": 126800 + }, + { + "epoch": 0.12496748973463742, + "grad_norm": 2.4161319732666016, + "learning_rate": 9.984618584735477e-06, + "loss": 3.4139, + "step": 126850 + }, + { + "epoch": 0.12501674771245952, + "grad_norm": 2.577685594558716, + "learning_rate": 9.984606453239218e-06, + "loss": 3.375, + "step": 126900 + }, + { + "epoch": 0.1250660056902816, + "grad_norm": 2.304260730743408, + "learning_rate": 9.984594316968095e-06, + "loss": 3.3388, + "step": 126950 + }, + { + "epoch": 0.1251152636681037, + "grad_norm": 2.5747199058532715, + "learning_rate": 9.984582175922121e-06, + "loss": 3.3446, + "step": 127000 + }, + { + "epoch": 0.12516452164592579, + "grad_norm": 2.859485149383545, + "learning_rate": 9.98457003010131e-06, + "loss": 3.3462, + "step": 127050 + }, + { + "epoch": 0.12521377962374786, + "grad_norm": 2.40120267868042, + "learning_rate": 9.98455787950567e-06, + "loss": 3.3476, + "step": 127100 + }, + { + "epoch": 0.12526303760156995, + "grad_norm": 2.2962496280670166, + "learning_rate": 9.984545724135213e-06, + "loss": 3.3969, + "step": 127150 + }, + { + "epoch": 0.12531229557939202, + "grad_norm": 3.085458517074585, + "learning_rate": 9.984533563989952e-06, + "loss": 3.3507, + "step": 127200 + }, + { + "epoch": 0.12536155355721412, + "grad_norm": 2.4457104206085205, + "learning_rate": 9.984521399069898e-06, + "loss": 3.3291, + "step": 127250 + }, + { + "epoch": 0.12541081153503622, + "grad_norm": 2.6180813312530518, + "learning_rate": 9.984509229375064e-06, + "loss": 3.404, + "step": 127300 + }, + { + "epoch": 0.1254600695128583, + "grad_norm": 2.5495731830596924, + "learning_rate": 9.984497054905459e-06, + "loss": 3.3685, + "step": 127350 + }, + { + "epoch": 0.1255093274906804, + "grad_norm": 2.4389400482177734, + "learning_rate": 9.984484875661098e-06, + "loss": 3.3128, + "step": 127400 + }, + { + "epoch": 0.1255585854685025, + "grad_norm": 2.565800666809082, + "learning_rate": 9.984472691641989e-06, + 
"loss": 3.2781, + "step": 127450 + }, + { + "epoch": 0.12560784344632456, + "grad_norm": 2.2861902713775635, + "learning_rate": 9.984460502848146e-06, + "loss": 3.3634, + "step": 127500 + }, + { + "epoch": 0.12565710142414666, + "grad_norm": 2.599116086959839, + "learning_rate": 9.98444830927958e-06, + "loss": 3.333, + "step": 127550 + }, + { + "epoch": 0.12570635940196875, + "grad_norm": 2.475360155105591, + "learning_rate": 9.984436110936302e-06, + "loss": 3.4194, + "step": 127600 + }, + { + "epoch": 0.12575561737979082, + "grad_norm": 2.8991916179656982, + "learning_rate": 9.984423907818325e-06, + "loss": 3.3118, + "step": 127650 + }, + { + "epoch": 0.12580487535761292, + "grad_norm": 2.756822347640991, + "learning_rate": 9.984411699925663e-06, + "loss": 3.4449, + "step": 127700 + }, + { + "epoch": 0.12585413333543502, + "grad_norm": 2.6148695945739746, + "learning_rate": 9.984399487258321e-06, + "loss": 3.3703, + "step": 127750 + }, + { + "epoch": 0.1259033913132571, + "grad_norm": 2.5003671646118164, + "learning_rate": 9.984387269816316e-06, + "loss": 3.3664, + "step": 127800 + }, + { + "epoch": 0.1259526492910792, + "grad_norm": 2.6351938247680664, + "learning_rate": 9.984375047599659e-06, + "loss": 3.2768, + "step": 127850 + }, + { + "epoch": 0.12600190726890126, + "grad_norm": 2.361804246902466, + "learning_rate": 9.98436282060836e-06, + "loss": 3.3174, + "step": 127900 + }, + { + "epoch": 0.12605116524672336, + "grad_norm": 2.313678026199341, + "learning_rate": 9.984350588842433e-06, + "loss": 3.3497, + "step": 127950 + }, + { + "epoch": 0.12610042322454545, + "grad_norm": 2.446336030960083, + "learning_rate": 9.984338352301888e-06, + "loss": 3.3676, + "step": 128000 + }, + { + "epoch": 0.12614968120236753, + "grad_norm": 2.4840900897979736, + "learning_rate": 9.984326110986737e-06, + "loss": 3.352, + "step": 128050 + }, + { + "epoch": 0.12619893918018962, + "grad_norm": 2.4800477027893066, + "learning_rate": 9.98431386489699e-06, + "loss": 3.3736, + 
"step": 128100 + }, + { + "epoch": 0.12624819715801172, + "grad_norm": 2.34621262550354, + "learning_rate": 9.984301614032664e-06, + "loss": 3.3201, + "step": 128150 + }, + { + "epoch": 0.1262974551358338, + "grad_norm": 2.4865853786468506, + "learning_rate": 9.984289358393765e-06, + "loss": 3.394, + "step": 128200 + }, + { + "epoch": 0.1263467131136559, + "grad_norm": 2.546816110610962, + "learning_rate": 9.984277097980308e-06, + "loss": 3.3659, + "step": 128250 + }, + { + "epoch": 0.126395971091478, + "grad_norm": 2.4290060997009277, + "learning_rate": 9.984264832792303e-06, + "loss": 3.2442, + "step": 128300 + }, + { + "epoch": 0.12644522906930006, + "grad_norm": 2.3757212162017822, + "learning_rate": 9.984252562829764e-06, + "loss": 3.4223, + "step": 128350 + }, + { + "epoch": 0.12649448704712216, + "grad_norm": 2.5061349868774414, + "learning_rate": 9.9842402880927e-06, + "loss": 3.3722, + "step": 128400 + }, + { + "epoch": 0.12654374502494423, + "grad_norm": 2.4515717029571533, + "learning_rate": 9.984228008581125e-06, + "loss": 3.3824, + "step": 128450 + }, + { + "epoch": 0.12659300300276632, + "grad_norm": 2.509746551513672, + "learning_rate": 9.98421572429505e-06, + "loss": 3.4066, + "step": 128500 + }, + { + "epoch": 0.12664226098058842, + "grad_norm": 2.514237403869629, + "learning_rate": 9.984203435234485e-06, + "loss": 3.3711, + "step": 128550 + }, + { + "epoch": 0.1266915189584105, + "grad_norm": 2.3045802116394043, + "learning_rate": 9.984191141399447e-06, + "loss": 3.3603, + "step": 128600 + }, + { + "epoch": 0.1267407769362326, + "grad_norm": 2.6559059619903564, + "learning_rate": 9.98417884278994e-06, + "loss": 3.3484, + "step": 128650 + }, + { + "epoch": 0.1267900349140547, + "grad_norm": 2.2892990112304688, + "learning_rate": 9.984166539405982e-06, + "loss": 3.3772, + "step": 128700 + }, + { + "epoch": 0.12683929289187676, + "grad_norm": 2.599705934524536, + "learning_rate": 9.984154231247583e-06, + "loss": 3.3043, + "step": 128750 + }, + { + 
"epoch": 0.12688855086969886, + "grad_norm": 2.4101521968841553, + "learning_rate": 9.984141918314753e-06, + "loss": 3.3553, + "step": 128800 + }, + { + "epoch": 0.12693780884752096, + "grad_norm": 2.4606688022613525, + "learning_rate": 9.984129600607508e-06, + "loss": 3.3417, + "step": 128850 + }, + { + "epoch": 0.12698706682534303, + "grad_norm": 2.3904287815093994, + "learning_rate": 9.984117278125855e-06, + "loss": 3.3756, + "step": 128900 + }, + { + "epoch": 0.12703632480316512, + "grad_norm": 2.428767204284668, + "learning_rate": 9.984104950869809e-06, + "loss": 3.3664, + "step": 128950 + }, + { + "epoch": 0.12708558278098722, + "grad_norm": 2.3415589332580566, + "learning_rate": 9.984092618839379e-06, + "loss": 3.3881, + "step": 129000 + }, + { + "epoch": 0.1271348407588093, + "grad_norm": 2.3242931365966797, + "learning_rate": 9.98408028203458e-06, + "loss": 3.4072, + "step": 129050 + }, + { + "epoch": 0.1271840987366314, + "grad_norm": 2.567004442214966, + "learning_rate": 9.984067940455423e-06, + "loss": 3.3348, + "step": 129100 + }, + { + "epoch": 0.12723335671445346, + "grad_norm": 2.415694236755371, + "learning_rate": 9.984055594101918e-06, + "loss": 3.4026, + "step": 129150 + }, + { + "epoch": 0.12728261469227556, + "grad_norm": 2.474930763244629, + "learning_rate": 9.984043242974078e-06, + "loss": 3.3146, + "step": 129200 + }, + { + "epoch": 0.12733187267009766, + "grad_norm": 2.440046787261963, + "learning_rate": 9.984030887071915e-06, + "loss": 3.3101, + "step": 129250 + }, + { + "epoch": 0.12738113064791973, + "grad_norm": 2.488495111465454, + "learning_rate": 9.98401852639544e-06, + "loss": 3.3929, + "step": 129300 + }, + { + "epoch": 0.12743038862574183, + "grad_norm": 2.4777398109436035, + "learning_rate": 9.984006160944667e-06, + "loss": 3.4003, + "step": 129350 + }, + { + "epoch": 0.12747964660356392, + "grad_norm": 2.858377695083618, + "learning_rate": 9.983993790719605e-06, + "loss": 3.4004, + "step": 129400 + }, + { + "epoch": 
0.127528904581386, + "grad_norm": 2.243535280227661, + "learning_rate": 9.98398141572027e-06, + "loss": 3.3571, + "step": 129450 + }, + { + "epoch": 0.1275781625592081, + "grad_norm": 2.4967029094696045, + "learning_rate": 9.983969035946669e-06, + "loss": 3.4008, + "step": 129500 + }, + { + "epoch": 0.1276274205370302, + "grad_norm": 2.518839120864868, + "learning_rate": 9.983956651398815e-06, + "loss": 3.4043, + "step": 129550 + }, + { + "epoch": 0.12767667851485226, + "grad_norm": 2.799847364425659, + "learning_rate": 9.983944262076722e-06, + "loss": 3.3867, + "step": 129600 + }, + { + "epoch": 0.12772593649267436, + "grad_norm": 2.2840347290039062, + "learning_rate": 9.983931867980402e-06, + "loss": 3.3585, + "step": 129650 + }, + { + "epoch": 0.12777519447049643, + "grad_norm": 2.584991216659546, + "learning_rate": 9.983919469109863e-06, + "loss": 3.3536, + "step": 129700 + }, + { + "epoch": 0.12782445244831853, + "grad_norm": 2.4160144329071045, + "learning_rate": 9.98390706546512e-06, + "loss": 3.4074, + "step": 129750 + }, + { + "epoch": 0.12787371042614062, + "grad_norm": 2.4027867317199707, + "learning_rate": 9.983894657046184e-06, + "loss": 3.3337, + "step": 129800 + }, + { + "epoch": 0.1279229684039627, + "grad_norm": 2.308525800704956, + "learning_rate": 9.98388224385307e-06, + "loss": 3.3701, + "step": 129850 + }, + { + "epoch": 0.1279722263817848, + "grad_norm": 2.4812049865722656, + "learning_rate": 9.983869825885783e-06, + "loss": 3.3853, + "step": 129900 + }, + { + "epoch": 0.1280214843596069, + "grad_norm": 2.2166528701782227, + "learning_rate": 9.983857403144342e-06, + "loss": 3.2372, + "step": 129950 + }, + { + "epoch": 0.12807074233742896, + "grad_norm": 2.6954221725463867, + "learning_rate": 9.983844975628755e-06, + "loss": 3.2424, + "step": 130000 + }, + { + "epoch": 0.12812000031525106, + "grad_norm": 2.6420326232910156, + "learning_rate": 9.983832543339035e-06, + "loss": 3.3877, + "step": 130050 + }, + { + "epoch": 0.12816925829307316, + 
"grad_norm": 2.455517292022705, + "learning_rate": 9.983820106275193e-06, + "loss": 3.3032, + "step": 130100 + }, + { + "epoch": 0.12821851627089523, + "grad_norm": 2.4757537841796875, + "learning_rate": 9.983807664437242e-06, + "loss": 3.3668, + "step": 130150 + }, + { + "epoch": 0.12826777424871733, + "grad_norm": 2.6717588901519775, + "learning_rate": 9.983795217825194e-06, + "loss": 3.3114, + "step": 130200 + }, + { + "epoch": 0.12831703222653942, + "grad_norm": 2.5731923580169678, + "learning_rate": 9.98378276643906e-06, + "loss": 3.3721, + "step": 130250 + }, + { + "epoch": 0.1283662902043615, + "grad_norm": 2.5594942569732666, + "learning_rate": 9.983770310278852e-06, + "loss": 3.3748, + "step": 130300 + }, + { + "epoch": 0.1284155481821836, + "grad_norm": 2.729557752609253, + "learning_rate": 9.983757849344581e-06, + "loss": 3.3027, + "step": 130350 + }, + { + "epoch": 0.12846480616000566, + "grad_norm": 2.6334259510040283, + "learning_rate": 9.983745383636264e-06, + "loss": 3.3768, + "step": 130400 + }, + { + "epoch": 0.12851406413782776, + "grad_norm": 2.390185832977295, + "learning_rate": 9.983732913153908e-06, + "loss": 3.3419, + "step": 130450 + }, + { + "epoch": 0.12856332211564986, + "grad_norm": 2.517690420150757, + "learning_rate": 9.983720437897523e-06, + "loss": 3.3187, + "step": 130500 + }, + { + "epoch": 0.12861258009347193, + "grad_norm": 2.3586184978485107, + "learning_rate": 9.983707957867127e-06, + "loss": 3.3218, + "step": 130550 + }, + { + "epoch": 0.12866183807129403, + "grad_norm": 2.5505306720733643, + "learning_rate": 9.98369547306273e-06, + "loss": 3.3987, + "step": 130600 + }, + { + "epoch": 0.12871109604911612, + "grad_norm": 2.8113210201263428, + "learning_rate": 9.983682983484339e-06, + "loss": 3.3257, + "step": 130650 + }, + { + "epoch": 0.1287603540269382, + "grad_norm": 2.822786331176758, + "learning_rate": 9.983670489131974e-06, + "loss": 3.3675, + "step": 130700 + }, + { + "epoch": 0.1288096120047603, + "grad_norm": 
2.6194067001342773, + "learning_rate": 9.983657990005639e-06, + "loss": 3.3179, + "step": 130750 + }, + { + "epoch": 0.1288588699825824, + "grad_norm": 2.5005898475646973, + "learning_rate": 9.983645486105353e-06, + "loss": 3.3272, + "step": 130800 + }, + { + "epoch": 0.12890812796040446, + "grad_norm": 2.4400925636291504, + "learning_rate": 9.983632977431123e-06, + "loss": 3.3259, + "step": 130850 + }, + { + "epoch": 0.12895738593822656, + "grad_norm": 2.3297295570373535, + "learning_rate": 9.983620463982965e-06, + "loss": 3.3491, + "step": 130900 + }, + { + "epoch": 0.12900664391604863, + "grad_norm": 2.541872262954712, + "learning_rate": 9.983607945760886e-06, + "loss": 3.3252, + "step": 130950 + }, + { + "epoch": 0.12905590189387073, + "grad_norm": 2.4682912826538086, + "learning_rate": 9.983595422764905e-06, + "loss": 3.3592, + "step": 131000 + }, + { + "epoch": 0.12910515987169283, + "grad_norm": 2.2377772331237793, + "learning_rate": 9.983582894995027e-06, + "loss": 3.3833, + "step": 131050 + }, + { + "epoch": 0.1291544178495149, + "grad_norm": 2.6657533645629883, + "learning_rate": 9.983570362451266e-06, + "loss": 3.3513, + "step": 131100 + }, + { + "epoch": 0.129203675827337, + "grad_norm": 2.37954044342041, + "learning_rate": 9.983557825133637e-06, + "loss": 3.3383, + "step": 131150 + }, + { + "epoch": 0.1292529338051591, + "grad_norm": 2.6916258335113525, + "learning_rate": 9.98354528304215e-06, + "loss": 3.3272, + "step": 131200 + }, + { + "epoch": 0.12930219178298116, + "grad_norm": 2.580704927444458, + "learning_rate": 9.983532736176813e-06, + "loss": 3.365, + "step": 131250 + }, + { + "epoch": 0.12935144976080326, + "grad_norm": 2.2672581672668457, + "learning_rate": 9.983520184537645e-06, + "loss": 3.344, + "step": 131300 + }, + { + "epoch": 0.12940070773862536, + "grad_norm": 2.870878219604492, + "learning_rate": 9.983507628124656e-06, + "loss": 3.3709, + "step": 131350 + }, + { + "epoch": 0.12944996571644743, + "grad_norm": 2.403296709060669, + 
"learning_rate": 9.983495066937854e-06, + "loss": 3.2934, + "step": 131400 + }, + { + "epoch": 0.12949922369426953, + "grad_norm": 2.4617629051208496, + "learning_rate": 9.983482500977256e-06, + "loss": 3.3159, + "step": 131450 + }, + { + "epoch": 0.12954848167209163, + "grad_norm": 2.588036298751831, + "learning_rate": 9.983469930242872e-06, + "loss": 3.3195, + "step": 131500 + }, + { + "epoch": 0.1295977396499137, + "grad_norm": 2.6028099060058594, + "learning_rate": 9.983457354734714e-06, + "loss": 3.3944, + "step": 131550 + }, + { + "epoch": 0.1296469976277358, + "grad_norm": 3.217900037765503, + "learning_rate": 9.983444774452793e-06, + "loss": 3.3862, + "step": 131600 + }, + { + "epoch": 0.12969625560555786, + "grad_norm": 2.891491651535034, + "learning_rate": 9.983432189397122e-06, + "loss": 3.3248, + "step": 131650 + }, + { + "epoch": 0.12974551358337996, + "grad_norm": 2.4643173217773438, + "learning_rate": 9.983419599567713e-06, + "loss": 3.3493, + "step": 131700 + }, + { + "epoch": 0.12979477156120206, + "grad_norm": 2.7450125217437744, + "learning_rate": 9.983407004964581e-06, + "loss": 3.3022, + "step": 131750 + }, + { + "epoch": 0.12984402953902413, + "grad_norm": 2.49796199798584, + "learning_rate": 9.983394405587732e-06, + "loss": 3.4453, + "step": 131800 + }, + { + "epoch": 0.12989328751684623, + "grad_norm": 2.4183664321899414, + "learning_rate": 9.983381801437184e-06, + "loss": 3.2929, + "step": 131850 + }, + { + "epoch": 0.12994254549466833, + "grad_norm": 2.501638889312744, + "learning_rate": 9.983369192512945e-06, + "loss": 3.4204, + "step": 131900 + }, + { + "epoch": 0.1299918034724904, + "grad_norm": 2.937954902648926, + "learning_rate": 9.98335657881503e-06, + "loss": 3.3738, + "step": 131950 + }, + { + "epoch": 0.1300410614503125, + "grad_norm": 2.5155978202819824, + "learning_rate": 9.983343960343447e-06, + "loss": 3.3179, + "step": 132000 + }, + { + "epoch": 0.1300903194281346, + "grad_norm": 2.299220323562622, + "learning_rate": 
9.983331337098213e-06, + "loss": 3.3936, + "step": 132050 + }, + { + "epoch": 0.13013957740595666, + "grad_norm": 2.628406286239624, + "learning_rate": 9.983318709079336e-06, + "loss": 3.3773, + "step": 132100 + }, + { + "epoch": 0.13018883538377876, + "grad_norm": 2.6008517742156982, + "learning_rate": 9.983306076286831e-06, + "loss": 3.3366, + "step": 132150 + }, + { + "epoch": 0.13023809336160083, + "grad_norm": 2.501002311706543, + "learning_rate": 9.983293438720709e-06, + "loss": 3.3073, + "step": 132200 + }, + { + "epoch": 0.13028735133942293, + "grad_norm": 2.3524646759033203, + "learning_rate": 9.983280796380984e-06, + "loss": 3.2445, + "step": 132250 + }, + { + "epoch": 0.13033660931724503, + "grad_norm": 2.5214614868164062, + "learning_rate": 9.983268149267663e-06, + "loss": 3.3661, + "step": 132300 + }, + { + "epoch": 0.1303858672950671, + "grad_norm": 2.677401542663574, + "learning_rate": 9.983255497380762e-06, + "loss": 3.3722, + "step": 132350 + }, + { + "epoch": 0.1304351252728892, + "grad_norm": 2.203618049621582, + "learning_rate": 9.983242840720293e-06, + "loss": 3.3688, + "step": 132400 + }, + { + "epoch": 0.1304843832507113, + "grad_norm": 2.5540359020233154, + "learning_rate": 9.983230179286268e-06, + "loss": 3.3171, + "step": 132450 + }, + { + "epoch": 0.13053364122853336, + "grad_norm": 2.3545377254486084, + "learning_rate": 9.983217513078698e-06, + "loss": 3.2852, + "step": 132500 + }, + { + "epoch": 0.13058289920635546, + "grad_norm": 2.4534616470336914, + "learning_rate": 9.983204842097597e-06, + "loss": 3.3513, + "step": 132550 + }, + { + "epoch": 0.13063215718417756, + "grad_norm": 2.4522480964660645, + "learning_rate": 9.983192166342974e-06, + "loss": 3.2542, + "step": 132600 + }, + { + "epoch": 0.13068141516199963, + "grad_norm": 2.426788568496704, + "learning_rate": 9.983179485814844e-06, + "loss": 3.3133, + "step": 132650 + }, + { + "epoch": 0.13073067313982173, + "grad_norm": 2.192673683166504, + "learning_rate": 
9.98316680051322e-06, + "loss": 3.358, + "step": 132700 + }, + { + "epoch": 0.13077993111764383, + "grad_norm": 2.5046803951263428, + "learning_rate": 9.983154110438111e-06, + "loss": 3.4099, + "step": 132750 + }, + { + "epoch": 0.1308291890954659, + "grad_norm": 2.310718536376953, + "learning_rate": 9.983141415589531e-06, + "loss": 3.383, + "step": 132800 + }, + { + "epoch": 0.130878447073288, + "grad_norm": 2.536156415939331, + "learning_rate": 9.98312871596749e-06, + "loss": 3.298, + "step": 132850 + }, + { + "epoch": 0.13092770505111007, + "grad_norm": 2.384089946746826, + "learning_rate": 9.983116011572005e-06, + "loss": 3.331, + "step": 132900 + }, + { + "epoch": 0.13097696302893216, + "grad_norm": 2.5198779106140137, + "learning_rate": 9.983103302403082e-06, + "loss": 3.3225, + "step": 132950 + }, + { + "epoch": 0.13102622100675426, + "grad_norm": 2.49294376373291, + "learning_rate": 9.983090588460738e-06, + "loss": 3.2792, + "step": 133000 + }, + { + "epoch": 0.13107547898457633, + "grad_norm": 2.8805341720581055, + "learning_rate": 9.983077869744982e-06, + "loss": 3.3651, + "step": 133050 + }, + { + "epoch": 0.13112473696239843, + "grad_norm": 2.5350656509399414, + "learning_rate": 9.983065146255829e-06, + "loss": 3.311, + "step": 133100 + }, + { + "epoch": 0.13117399494022053, + "grad_norm": 2.5297765731811523, + "learning_rate": 9.983052417993288e-06, + "loss": 3.3358, + "step": 133150 + }, + { + "epoch": 0.1312232529180426, + "grad_norm": 2.3900156021118164, + "learning_rate": 9.983039684957376e-06, + "loss": 3.3739, + "step": 133200 + }, + { + "epoch": 0.1312725108958647, + "grad_norm": 2.492724895477295, + "learning_rate": 9.9830269471481e-06, + "loss": 3.3252, + "step": 133250 + }, + { + "epoch": 0.1313217688736868, + "grad_norm": 2.492194890975952, + "learning_rate": 9.983014204565476e-06, + "loss": 3.3908, + "step": 133300 + }, + { + "epoch": 0.13137102685150887, + "grad_norm": 2.5742130279541016, + "learning_rate": 9.983001457209513e-06, + "loss": 
3.2843, + "step": 133350 + }, + { + "epoch": 0.13142028482933096, + "grad_norm": 2.5196924209594727, + "learning_rate": 9.982988705080225e-06, + "loss": 3.3609, + "step": 133400 + }, + { + "epoch": 0.13146954280715303, + "grad_norm": 2.7247695922851562, + "learning_rate": 9.982975948177625e-06, + "loss": 3.3113, + "step": 133450 + }, + { + "epoch": 0.13151880078497513, + "grad_norm": 2.998905658721924, + "learning_rate": 9.982963186501723e-06, + "loss": 3.3593, + "step": 133500 + }, + { + "epoch": 0.13156805876279723, + "grad_norm": 2.5062928199768066, + "learning_rate": 9.982950420052533e-06, + "loss": 3.432, + "step": 133550 + }, + { + "epoch": 0.1316173167406193, + "grad_norm": 2.3104939460754395, + "learning_rate": 9.982937648830067e-06, + "loss": 3.3176, + "step": 133600 + }, + { + "epoch": 0.1316665747184414, + "grad_norm": 2.659116744995117, + "learning_rate": 9.982924872834336e-06, + "loss": 3.2904, + "step": 133650 + }, + { + "epoch": 0.1317158326962635, + "grad_norm": 2.500839948654175, + "learning_rate": 9.982912092065353e-06, + "loss": 3.3102, + "step": 133700 + }, + { + "epoch": 0.13176509067408557, + "grad_norm": 2.4487648010253906, + "learning_rate": 9.982899306523132e-06, + "loss": 3.2999, + "step": 133750 + }, + { + "epoch": 0.13181434865190766, + "grad_norm": 2.499160051345825, + "learning_rate": 9.982886516207684e-06, + "loss": 3.32, + "step": 133800 + }, + { + "epoch": 0.13186360662972976, + "grad_norm": 2.2931950092315674, + "learning_rate": 9.98287372111902e-06, + "loss": 3.3765, + "step": 133850 + }, + { + "epoch": 0.13191286460755183, + "grad_norm": 2.695383310317993, + "learning_rate": 9.982860921257151e-06, + "loss": 3.3323, + "step": 133900 + }, + { + "epoch": 0.13196212258537393, + "grad_norm": 2.518108606338501, + "learning_rate": 9.982848116622095e-06, + "loss": 3.2461, + "step": 133950 + }, + { + "epoch": 0.13201138056319603, + "grad_norm": 2.49371075630188, + "learning_rate": 9.982835307213859e-06, + "loss": 3.3342, + "step": 134000 
+ }, + { + "epoch": 0.1320606385410181, + "grad_norm": 2.5274276733398438, + "learning_rate": 9.982822493032456e-06, + "loss": 3.3699, + "step": 134050 + }, + { + "epoch": 0.1321098965188402, + "grad_norm": 2.755802869796753, + "learning_rate": 9.9828096740779e-06, + "loss": 3.307, + "step": 134100 + }, + { + "epoch": 0.13215915449666227, + "grad_norm": 2.741234302520752, + "learning_rate": 9.982796850350202e-06, + "loss": 3.3371, + "step": 134150 + }, + { + "epoch": 0.13220841247448437, + "grad_norm": 2.401212215423584, + "learning_rate": 9.982784021849376e-06, + "loss": 3.3105, + "step": 134200 + }, + { + "epoch": 0.13225767045230646, + "grad_norm": 2.590445041656494, + "learning_rate": 9.982771188575432e-06, + "loss": 3.354, + "step": 134250 + }, + { + "epoch": 0.13230692843012853, + "grad_norm": 2.3939108848571777, + "learning_rate": 9.982758350528385e-06, + "loss": 3.3233, + "step": 134300 + }, + { + "epoch": 0.13235618640795063, + "grad_norm": 2.8216092586517334, + "learning_rate": 9.982745507708245e-06, + "loss": 3.3169, + "step": 134350 + }, + { + "epoch": 0.13240544438577273, + "grad_norm": 2.461487293243408, + "learning_rate": 9.982732660115025e-06, + "loss": 3.3017, + "step": 134400 + }, + { + "epoch": 0.1324547023635948, + "grad_norm": 2.415942668914795, + "learning_rate": 9.982719807748737e-06, + "loss": 3.3261, + "step": 134450 + }, + { + "epoch": 0.1325039603414169, + "grad_norm": 2.4388933181762695, + "learning_rate": 9.982706950609395e-06, + "loss": 3.3266, + "step": 134500 + }, + { + "epoch": 0.132553218319239, + "grad_norm": 2.363949775695801, + "learning_rate": 9.982694088697006e-06, + "loss": 3.2988, + "step": 134550 + }, + { + "epoch": 0.13260247629706107, + "grad_norm": 2.2934038639068604, + "learning_rate": 9.982681222011589e-06, + "loss": 3.3184, + "step": 134600 + }, + { + "epoch": 0.13265173427488317, + "grad_norm": 2.3930673599243164, + "learning_rate": 9.982668350553155e-06, + "loss": 3.3232, + "step": 134650 + }, + { + "epoch": 
0.13270099225270524, + "grad_norm": 2.5029680728912354, + "learning_rate": 9.982655474321713e-06, + "loss": 3.4058, + "step": 134700 + }, + { + "epoch": 0.13275025023052733, + "grad_norm": 2.4833905696868896, + "learning_rate": 9.982642593317277e-06, + "loss": 3.3072, + "step": 134750 + }, + { + "epoch": 0.13279950820834943, + "grad_norm": 2.5127246379852295, + "learning_rate": 9.982629707539861e-06, + "loss": 3.3105, + "step": 134800 + }, + { + "epoch": 0.1328487661861715, + "grad_norm": 2.600982427597046, + "learning_rate": 9.982616816989475e-06, + "loss": 3.3695, + "step": 134850 + }, + { + "epoch": 0.1328980241639936, + "grad_norm": 2.790300130844116, + "learning_rate": 9.982603921666131e-06, + "loss": 3.2848, + "step": 134900 + }, + { + "epoch": 0.1329472821418157, + "grad_norm": 2.61690354347229, + "learning_rate": 9.982591021569845e-06, + "loss": 3.3811, + "step": 134950 + }, + { + "epoch": 0.13299654011963777, + "grad_norm": 2.4919846057891846, + "learning_rate": 9.982578116700625e-06, + "loss": 3.3484, + "step": 135000 + }, + { + "epoch": 0.13304579809745987, + "grad_norm": 3.043800115585327, + "learning_rate": 9.982565207058487e-06, + "loss": 3.2603, + "step": 135050 + }, + { + "epoch": 0.13309505607528196, + "grad_norm": 2.5260419845581055, + "learning_rate": 9.982552292643441e-06, + "loss": 3.3228, + "step": 135100 + }, + { + "epoch": 0.13314431405310403, + "grad_norm": 2.420222520828247, + "learning_rate": 9.9825393734555e-06, + "loss": 3.3619, + "step": 135150 + }, + { + "epoch": 0.13319357203092613, + "grad_norm": 2.51693058013916, + "learning_rate": 9.982526449494677e-06, + "loss": 3.3347, + "step": 135200 + }, + { + "epoch": 0.1332428300087482, + "grad_norm": 2.514651298522949, + "learning_rate": 9.982513520760983e-06, + "loss": 3.3382, + "step": 135250 + }, + { + "epoch": 0.1332920879865703, + "grad_norm": 2.1774184703826904, + "learning_rate": 9.98250058725443e-06, + "loss": 3.3766, + "step": 135300 + }, + { + "epoch": 0.1333413459643924, + 
"grad_norm": 2.3952198028564453, + "learning_rate": 9.982487648975034e-06, + "loss": 3.2743, + "step": 135350 + }, + { + "epoch": 0.13339060394221447, + "grad_norm": 2.4841527938842773, + "learning_rate": 9.982474705922805e-06, + "loss": 3.331, + "step": 135400 + }, + { + "epoch": 0.13343986192003657, + "grad_norm": 2.3666741847991943, + "learning_rate": 9.982461758097752e-06, + "loss": 3.3645, + "step": 135450 + }, + { + "epoch": 0.13348911989785867, + "grad_norm": 2.5795412063598633, + "learning_rate": 9.982448805499894e-06, + "loss": 3.324, + "step": 135500 + }, + { + "epoch": 0.13353837787568074, + "grad_norm": 2.449688673019409, + "learning_rate": 9.98243584812924e-06, + "loss": 3.2925, + "step": 135550 + }, + { + "epoch": 0.13358763585350283, + "grad_norm": 2.411428213119507, + "learning_rate": 9.982422885985802e-06, + "loss": 3.2568, + "step": 135600 + }, + { + "epoch": 0.13363689383132493, + "grad_norm": 2.6189448833465576, + "learning_rate": 9.982409919069594e-06, + "loss": 3.3406, + "step": 135650 + }, + { + "epoch": 0.133686151809147, + "grad_norm": 2.38584303855896, + "learning_rate": 9.982396947380626e-06, + "loss": 3.3161, + "step": 135700 + }, + { + "epoch": 0.1337354097869691, + "grad_norm": 2.3299152851104736, + "learning_rate": 9.982383970918912e-06, + "loss": 3.3126, + "step": 135750 + }, + { + "epoch": 0.1337846677647912, + "grad_norm": 2.528596878051758, + "learning_rate": 9.982370989684465e-06, + "loss": 3.3046, + "step": 135800 + }, + { + "epoch": 0.13383392574261327, + "grad_norm": 2.4794840812683105, + "learning_rate": 9.982358003677297e-06, + "loss": 3.3107, + "step": 135850 + }, + { + "epoch": 0.13388318372043537, + "grad_norm": 2.376394271850586, + "learning_rate": 9.982345012897419e-06, + "loss": 3.349, + "step": 135900 + }, + { + "epoch": 0.13393244169825744, + "grad_norm": 2.5136525630950928, + "learning_rate": 9.982332017344845e-06, + "loss": 3.3232, + "step": 135950 + }, + { + "epoch": 0.13398169967607954, + "grad_norm": 
2.6303584575653076, + "learning_rate": 9.982319017019587e-06, + "loss": 3.3295, + "step": 136000 + }, + { + "epoch": 0.13403095765390163, + "grad_norm": 2.5867836475372314, + "learning_rate": 9.982306011921659e-06, + "loss": 3.3331, + "step": 136050 + }, + { + "epoch": 0.1340802156317237, + "grad_norm": 2.4861154556274414, + "learning_rate": 9.982293002051068e-06, + "loss": 3.3347, + "step": 136100 + }, + { + "epoch": 0.1341294736095458, + "grad_norm": 2.449524402618408, + "learning_rate": 9.982279987407835e-06, + "loss": 3.2921, + "step": 136150 + }, + { + "epoch": 0.1341787315873679, + "grad_norm": 2.6459333896636963, + "learning_rate": 9.982266967991966e-06, + "loss": 3.3242, + "step": 136200 + }, + { + "epoch": 0.13422798956518997, + "grad_norm": 2.4930968284606934, + "learning_rate": 9.982253943803476e-06, + "loss": 3.3387, + "step": 136250 + }, + { + "epoch": 0.13427724754301207, + "grad_norm": 2.3944690227508545, + "learning_rate": 9.982240914842377e-06, + "loss": 3.3327, + "step": 136300 + }, + { + "epoch": 0.13432650552083417, + "grad_norm": 2.52659273147583, + "learning_rate": 9.982227881108679e-06, + "loss": 3.3072, + "step": 136350 + }, + { + "epoch": 0.13437576349865624, + "grad_norm": 2.3399899005889893, + "learning_rate": 9.9822148426024e-06, + "loss": 3.3135, + "step": 136400 + }, + { + "epoch": 0.13442502147647833, + "grad_norm": 2.50480055809021, + "learning_rate": 9.982201799323547e-06, + "loss": 3.3475, + "step": 136450 + }, + { + "epoch": 0.1344742794543004, + "grad_norm": 2.542318105697632, + "learning_rate": 9.982188751272137e-06, + "loss": 3.307, + "step": 136500 + }, + { + "epoch": 0.1345235374321225, + "grad_norm": 2.481058359146118, + "learning_rate": 9.982175698448178e-06, + "loss": 3.3891, + "step": 136550 + }, + { + "epoch": 0.1345727954099446, + "grad_norm": 2.457432508468628, + "learning_rate": 9.982162640851687e-06, + "loss": 3.2939, + "step": 136600 + }, + { + "epoch": 0.13462205338776667, + "grad_norm": 2.557265520095825, + 
"learning_rate": 9.982149578482672e-06, + "loss": 3.3961, + "step": 136650 + }, + { + "epoch": 0.13467131136558877, + "grad_norm": 2.4533510208129883, + "learning_rate": 9.98213651134115e-06, + "loss": 3.3834, + "step": 136700 + }, + { + "epoch": 0.13472056934341087, + "grad_norm": 2.3988099098205566, + "learning_rate": 9.98212343942713e-06, + "loss": 3.3672, + "step": 136750 + }, + { + "epoch": 0.13476982732123294, + "grad_norm": 2.5094661712646484, + "learning_rate": 9.982110362740628e-06, + "loss": 3.4218, + "step": 136800 + }, + { + "epoch": 0.13481908529905504, + "grad_norm": 2.4124655723571777, + "learning_rate": 9.982097281281651e-06, + "loss": 3.2796, + "step": 136850 + }, + { + "epoch": 0.13486834327687713, + "grad_norm": 2.3302738666534424, + "learning_rate": 9.982084195050217e-06, + "loss": 3.3458, + "step": 136900 + }, + { + "epoch": 0.1349176012546992, + "grad_norm": 2.357494354248047, + "learning_rate": 9.982071104046335e-06, + "loss": 3.305, + "step": 136950 + }, + { + "epoch": 0.1349668592325213, + "grad_norm": 2.5255463123321533, + "learning_rate": 9.982058008270022e-06, + "loss": 3.3301, + "step": 137000 + }, + { + "epoch": 0.1350161172103434, + "grad_norm": 2.3327341079711914, + "learning_rate": 9.982044907721285e-06, + "loss": 3.3674, + "step": 137050 + }, + { + "epoch": 0.13506537518816547, + "grad_norm": 2.4407548904418945, + "learning_rate": 9.982031802400139e-06, + "loss": 3.3199, + "step": 137100 + }, + { + "epoch": 0.13511463316598757, + "grad_norm": 2.3638646602630615, + "learning_rate": 9.982018692306599e-06, + "loss": 3.4032, + "step": 137150 + }, + { + "epoch": 0.13516389114380964, + "grad_norm": 2.269465684890747, + "learning_rate": 9.982005577440672e-06, + "loss": 3.3223, + "step": 137200 + }, + { + "epoch": 0.13521314912163174, + "grad_norm": 2.482313632965088, + "learning_rate": 9.981992457802377e-06, + "loss": 3.2915, + "step": 137250 + }, + { + "epoch": 0.13526240709945384, + "grad_norm": 2.482332944869995, + "learning_rate": 
9.98197933339172e-06, + "loss": 3.2595, + "step": 137300 + }, + { + "epoch": 0.1353116650772759, + "grad_norm": 2.4373786449432373, + "learning_rate": 9.98196620420872e-06, + "loss": 3.3754, + "step": 137350 + }, + { + "epoch": 0.135360923055098, + "grad_norm": 2.510951280593872, + "learning_rate": 9.981953070253385e-06, + "loss": 3.3194, + "step": 137400 + }, + { + "epoch": 0.1354101810329201, + "grad_norm": 2.49735164642334, + "learning_rate": 9.981939931525729e-06, + "loss": 3.352, + "step": 137450 + }, + { + "epoch": 0.13545943901074217, + "grad_norm": 2.3950843811035156, + "learning_rate": 9.981926788025765e-06, + "loss": 3.279, + "step": 137500 + }, + { + "epoch": 0.13550869698856427, + "grad_norm": 2.4150116443634033, + "learning_rate": 9.981913639753506e-06, + "loss": 3.3117, + "step": 137550 + }, + { + "epoch": 0.13555795496638637, + "grad_norm": 2.2711334228515625, + "learning_rate": 9.981900486708962e-06, + "loss": 3.2994, + "step": 137600 + }, + { + "epoch": 0.13560721294420844, + "grad_norm": 2.7237069606781006, + "learning_rate": 9.98188732889215e-06, + "loss": 3.3311, + "step": 137650 + }, + { + "epoch": 0.13565647092203054, + "grad_norm": 2.4380433559417725, + "learning_rate": 9.981874166303078e-06, + "loss": 3.3437, + "step": 137700 + }, + { + "epoch": 0.1357057288998526, + "grad_norm": 2.4385933876037598, + "learning_rate": 9.981860998941761e-06, + "loss": 3.3122, + "step": 137750 + }, + { + "epoch": 0.1357549868776747, + "grad_norm": 2.4435009956359863, + "learning_rate": 9.981847826808211e-06, + "loss": 3.3869, + "step": 137800 + }, + { + "epoch": 0.1358042448554968, + "grad_norm": 2.4125919342041016, + "learning_rate": 9.981834649902442e-06, + "loss": 3.3299, + "step": 137850 + }, + { + "epoch": 0.13585350283331887, + "grad_norm": 2.591780662536621, + "learning_rate": 9.981821468224464e-06, + "loss": 3.3554, + "step": 137900 + }, + { + "epoch": 0.13590276081114097, + "grad_norm": 2.4262301921844482, + "learning_rate": 9.981808281774292e-06, + 
"loss": 3.2731, + "step": 137950 + }, + { + "epoch": 0.13595201878896307, + "grad_norm": 2.5718352794647217, + "learning_rate": 9.981795090551938e-06, + "loss": 3.3008, + "step": 138000 + }, + { + "epoch": 0.13600127676678514, + "grad_norm": 2.5085978507995605, + "learning_rate": 9.981781894557414e-06, + "loss": 3.2593, + "step": 138050 + }, + { + "epoch": 0.13605053474460724, + "grad_norm": 2.4398982524871826, + "learning_rate": 9.981768693790734e-06, + "loss": 3.3507, + "step": 138100 + }, + { + "epoch": 0.13609979272242934, + "grad_norm": 2.4069535732269287, + "learning_rate": 9.98175548825191e-06, + "loss": 3.347, + "step": 138150 + }, + { + "epoch": 0.1361490507002514, + "grad_norm": 3.339010238647461, + "learning_rate": 9.981742277940952e-06, + "loss": 3.3073, + "step": 138200 + }, + { + "epoch": 0.1361983086780735, + "grad_norm": 2.544299602508545, + "learning_rate": 9.981729062857877e-06, + "loss": 3.2752, + "step": 138250 + }, + { + "epoch": 0.1362475666558956, + "grad_norm": 2.288745641708374, + "learning_rate": 9.981715843002696e-06, + "loss": 3.3364, + "step": 138300 + }, + { + "epoch": 0.13629682463371767, + "grad_norm": 2.608415365219116, + "learning_rate": 9.981702618375419e-06, + "loss": 3.328, + "step": 138350 + }, + { + "epoch": 0.13634608261153977, + "grad_norm": 2.364572763442993, + "learning_rate": 9.981689388976062e-06, + "loss": 3.3116, + "step": 138400 + }, + { + "epoch": 0.13639534058936184, + "grad_norm": 2.633402109146118, + "learning_rate": 9.981676154804637e-06, + "loss": 3.3377, + "step": 138450 + }, + { + "epoch": 0.13644459856718394, + "grad_norm": 2.5236153602600098, + "learning_rate": 9.981662915861156e-06, + "loss": 3.3366, + "step": 138500 + }, + { + "epoch": 0.13649385654500604, + "grad_norm": 2.477294921875, + "learning_rate": 9.981649672145633e-06, + "loss": 3.3233, + "step": 138550 + }, + { + "epoch": 0.1365431145228281, + "grad_norm": 2.413985013961792, + "learning_rate": 9.981636423658078e-06, + "loss": 3.3572, + "step": 
138600 + }, + { + "epoch": 0.1365923725006502, + "grad_norm": 2.4874064922332764, + "learning_rate": 9.981623170398506e-06, + "loss": 3.2786, + "step": 138650 + }, + { + "epoch": 0.1366416304784723, + "grad_norm": 2.2424540519714355, + "learning_rate": 9.98160991236693e-06, + "loss": 3.3565, + "step": 138700 + }, + { + "epoch": 0.13669088845629437, + "grad_norm": 2.5076889991760254, + "learning_rate": 9.981596649563362e-06, + "loss": 3.3086, + "step": 138750 + }, + { + "epoch": 0.13674014643411647, + "grad_norm": 2.5143632888793945, + "learning_rate": 9.981583381987813e-06, + "loss": 3.2996, + "step": 138800 + }, + { + "epoch": 0.13678940441193857, + "grad_norm": 2.3328802585601807, + "learning_rate": 9.9815701096403e-06, + "loss": 3.3422, + "step": 138850 + }, + { + "epoch": 0.13683866238976064, + "grad_norm": 2.7330501079559326, + "learning_rate": 9.98155683252083e-06, + "loss": 3.2874, + "step": 138900 + }, + { + "epoch": 0.13688792036758274, + "grad_norm": 2.7586309909820557, + "learning_rate": 9.98154355062942e-06, + "loss": 3.311, + "step": 138950 + }, + { + "epoch": 0.1369371783454048, + "grad_norm": 2.735335350036621, + "learning_rate": 9.981530263966081e-06, + "loss": 3.3583, + "step": 139000 + }, + { + "epoch": 0.1369864363232269, + "grad_norm": 2.2423620223999023, + "learning_rate": 9.981516972530826e-06, + "loss": 3.3061, + "step": 139050 + }, + { + "epoch": 0.137035694301049, + "grad_norm": 2.4437153339385986, + "learning_rate": 9.981503676323668e-06, + "loss": 3.3654, + "step": 139100 + }, + { + "epoch": 0.13708495227887108, + "grad_norm": 2.4011268615722656, + "learning_rate": 9.981490375344619e-06, + "loss": 3.3963, + "step": 139150 + }, + { + "epoch": 0.13713421025669317, + "grad_norm": 2.522320508956909, + "learning_rate": 9.981477069593692e-06, + "loss": 3.3289, + "step": 139200 + }, + { + "epoch": 0.13718346823451527, + "grad_norm": 2.4019522666931152, + "learning_rate": 9.981463759070901e-06, + "loss": 3.3304, + "step": 139250 + }, + { + 
"epoch": 0.13723272621233734, + "grad_norm": 2.5387160778045654, + "learning_rate": 9.981450443776259e-06, + "loss": 3.2708, + "step": 139300 + }, + { + "epoch": 0.13728198419015944, + "grad_norm": 2.7194929122924805, + "learning_rate": 9.981437123709775e-06, + "loss": 3.4019, + "step": 139350 + }, + { + "epoch": 0.13733124216798154, + "grad_norm": 2.411836624145508, + "learning_rate": 9.981423798871466e-06, + "loss": 3.3217, + "step": 139400 + }, + { + "epoch": 0.1373805001458036, + "grad_norm": 2.504457950592041, + "learning_rate": 9.981410469261343e-06, + "loss": 3.3431, + "step": 139450 + }, + { + "epoch": 0.1374297581236257, + "grad_norm": 2.446812629699707, + "learning_rate": 9.981397134879418e-06, + "loss": 3.3536, + "step": 139500 + }, + { + "epoch": 0.1374790161014478, + "grad_norm": 2.2976324558258057, + "learning_rate": 9.981383795725706e-06, + "loss": 3.2978, + "step": 139550 + }, + { + "epoch": 0.13752827407926987, + "grad_norm": 2.5022976398468018, + "learning_rate": 9.981370451800217e-06, + "loss": 3.2924, + "step": 139600 + }, + { + "epoch": 0.13757753205709197, + "grad_norm": 2.5271923542022705, + "learning_rate": 9.981357103102966e-06, + "loss": 3.3034, + "step": 139650 + }, + { + "epoch": 0.13762679003491404, + "grad_norm": 2.452100992202759, + "learning_rate": 9.981343749633963e-06, + "loss": 3.3335, + "step": 139700 + }, + { + "epoch": 0.13767604801273614, + "grad_norm": 2.3178234100341797, + "learning_rate": 9.981330391393224e-06, + "loss": 3.402, + "step": 139750 + }, + { + "epoch": 0.13772530599055824, + "grad_norm": 2.3183038234710693, + "learning_rate": 9.98131702838076e-06, + "loss": 3.3484, + "step": 139800 + }, + { + "epoch": 0.1377745639683803, + "grad_norm": 2.401075839996338, + "learning_rate": 9.981303660596588e-06, + "loss": 3.3119, + "step": 139850 + }, + { + "epoch": 0.1378238219462024, + "grad_norm": 2.5254030227661133, + "learning_rate": 9.981290288040714e-06, + "loss": 3.2726, + "step": 139900 + }, + { + "epoch": 
0.1378730799240245, + "grad_norm": 2.428209066390991, + "learning_rate": 9.981276910713153e-06, + "loss": 3.3335, + "step": 139950 + }, + { + "epoch": 0.13792233790184658, + "grad_norm": 2.383347749710083, + "learning_rate": 9.981263528613922e-06, + "loss": 3.3597, + "step": 140000 + }, + { + "epoch": 0.13797159587966867, + "grad_norm": 2.399376392364502, + "learning_rate": 9.981250141743029e-06, + "loss": 3.3078, + "step": 140050 + }, + { + "epoch": 0.13802085385749077, + "grad_norm": 2.3909976482391357, + "learning_rate": 9.981236750100488e-06, + "loss": 3.2902, + "step": 140100 + }, + { + "epoch": 0.13807011183531284, + "grad_norm": 3.0182032585144043, + "learning_rate": 9.981223353686312e-06, + "loss": 3.2781, + "step": 140150 + }, + { + "epoch": 0.13811936981313494, + "grad_norm": 2.3577322959899902, + "learning_rate": 9.981209952500514e-06, + "loss": 3.3139, + "step": 140200 + }, + { + "epoch": 0.138168627790957, + "grad_norm": 2.508078098297119, + "learning_rate": 9.981196546543108e-06, + "loss": 3.3224, + "step": 140250 + }, + { + "epoch": 0.1382178857687791, + "grad_norm": 2.4417014122009277, + "learning_rate": 9.981183135814105e-06, + "loss": 3.3467, + "step": 140300 + }, + { + "epoch": 0.1382671437466012, + "grad_norm": 2.4924166202545166, + "learning_rate": 9.981169720313519e-06, + "loss": 3.3427, + "step": 140350 + }, + { + "epoch": 0.13831640172442328, + "grad_norm": 2.6018028259277344, + "learning_rate": 9.981156300041362e-06, + "loss": 3.3043, + "step": 140400 + }, + { + "epoch": 0.13836565970224537, + "grad_norm": 2.520836114883423, + "learning_rate": 9.981142874997648e-06, + "loss": 3.2591, + "step": 140450 + }, + { + "epoch": 0.13841491768006747, + "grad_norm": 2.571235418319702, + "learning_rate": 9.98112944518239e-06, + "loss": 3.3571, + "step": 140500 + }, + { + "epoch": 0.13846417565788954, + "grad_norm": 2.361311674118042, + "learning_rate": 9.981116010595597e-06, + "loss": 3.3328, + "step": 140550 + }, + { + "epoch": 0.13851343363571164, + 
"grad_norm": 2.5676746368408203, + "learning_rate": 9.981102571237287e-06, + "loss": 3.3541, + "step": 140600 + }, + { + "epoch": 0.13856269161353374, + "grad_norm": 3.442856550216675, + "learning_rate": 9.98108912710747e-06, + "loss": 3.2382, + "step": 140650 + }, + { + "epoch": 0.1386119495913558, + "grad_norm": 2.6859328746795654, + "learning_rate": 9.981075678206162e-06, + "loss": 3.261, + "step": 140700 + }, + { + "epoch": 0.1386612075691779, + "grad_norm": 2.3391921520233154, + "learning_rate": 9.981062224533372e-06, + "loss": 3.3058, + "step": 140750 + }, + { + "epoch": 0.138710465547, + "grad_norm": 2.2615370750427246, + "learning_rate": 9.981048766089115e-06, + "loss": 3.3154, + "step": 140800 + }, + { + "epoch": 0.13875972352482208, + "grad_norm": 2.5623269081115723, + "learning_rate": 9.9810353028734e-06, + "loss": 3.2701, + "step": 140850 + }, + { + "epoch": 0.13880898150264417, + "grad_norm": 2.3537538051605225, + "learning_rate": 9.981021834886246e-06, + "loss": 3.2571, + "step": 140900 + }, + { + "epoch": 0.13885823948046624, + "grad_norm": 2.3050129413604736, + "learning_rate": 9.981008362127664e-06, + "loss": 3.4048, + "step": 140950 + }, + { + "epoch": 0.13890749745828834, + "grad_norm": 2.6156623363494873, + "learning_rate": 9.980994884597666e-06, + "loss": 3.2993, + "step": 141000 + }, + { + "epoch": 0.13895675543611044, + "grad_norm": 2.460026741027832, + "learning_rate": 9.980981402296263e-06, + "loss": 3.3134, + "step": 141050 + }, + { + "epoch": 0.1390060134139325, + "grad_norm": 2.410754680633545, + "learning_rate": 9.980967915223471e-06, + "loss": 3.3067, + "step": 141100 + }, + { + "epoch": 0.1390552713917546, + "grad_norm": 2.6898746490478516, + "learning_rate": 9.9809544233793e-06, + "loss": 3.3313, + "step": 141150 + }, + { + "epoch": 0.1391045293695767, + "grad_norm": 2.347651958465576, + "learning_rate": 9.980940926763766e-06, + "loss": 3.3558, + "step": 141200 + }, + { + "epoch": 0.13915378734739878, + "grad_norm": 
2.3405380249023438, + "learning_rate": 9.980927425376883e-06, + "loss": 3.3223, + "step": 141250 + }, + { + "epoch": 0.13920304532522088, + "grad_norm": 2.6139001846313477, + "learning_rate": 9.980913919218659e-06, + "loss": 3.3155, + "step": 141300 + }, + { + "epoch": 0.13925230330304297, + "grad_norm": 2.2898507118225098, + "learning_rate": 9.98090040828911e-06, + "loss": 3.2795, + "step": 141350 + }, + { + "epoch": 0.13930156128086504, + "grad_norm": 2.4258646965026855, + "learning_rate": 9.980886892588249e-06, + "loss": 3.3546, + "step": 141400 + }, + { + "epoch": 0.13935081925868714, + "grad_norm": 2.526224374771118, + "learning_rate": 9.980873372116086e-06, + "loss": 3.3914, + "step": 141450 + }, + { + "epoch": 0.1394000772365092, + "grad_norm": 2.792062282562256, + "learning_rate": 9.980859846872639e-06, + "loss": 3.2804, + "step": 141500 + }, + { + "epoch": 0.1394493352143313, + "grad_norm": 2.583101272583008, + "learning_rate": 9.980846316857917e-06, + "loss": 3.3249, + "step": 141550 + }, + { + "epoch": 0.1394985931921534, + "grad_norm": 2.4814393520355225, + "learning_rate": 9.980832782071936e-06, + "loss": 3.3686, + "step": 141600 + }, + { + "epoch": 0.13954785116997548, + "grad_norm": 2.41717791557312, + "learning_rate": 9.980819242514706e-06, + "loss": 3.3066, + "step": 141650 + }, + { + "epoch": 0.13959710914779758, + "grad_norm": 2.666429281234741, + "learning_rate": 9.98080569818624e-06, + "loss": 3.3273, + "step": 141700 + }, + { + "epoch": 0.13964636712561967, + "grad_norm": 2.4969065189361572, + "learning_rate": 9.980792149086553e-06, + "loss": 3.2931, + "step": 141750 + }, + { + "epoch": 0.13969562510344175, + "grad_norm": 2.339334726333618, + "learning_rate": 9.980778595215659e-06, + "loss": 3.2778, + "step": 141800 + }, + { + "epoch": 0.13974488308126384, + "grad_norm": 2.5912668704986572, + "learning_rate": 9.980765036573568e-06, + "loss": 3.2935, + "step": 141850 + }, + { + "epoch": 0.13979414105908594, + "grad_norm": 2.5058205127716064, + 
"learning_rate": 9.980751473160292e-06, + "loss": 3.2877, + "step": 141900 + }, + { + "epoch": 0.139843399036908, + "grad_norm": 2.9673266410827637, + "learning_rate": 9.980737904975847e-06, + "loss": 3.3279, + "step": 141950 + }, + { + "epoch": 0.1398926570147301, + "grad_norm": 2.7983357906341553, + "learning_rate": 9.980724332020247e-06, + "loss": 3.2946, + "step": 142000 + }, + { + "epoch": 0.13994191499255218, + "grad_norm": 2.388613700866699, + "learning_rate": 9.980710754293502e-06, + "loss": 3.3299, + "step": 142050 + }, + { + "epoch": 0.13999117297037428, + "grad_norm": 2.5358829498291016, + "learning_rate": 9.980697171795627e-06, + "loss": 3.2597, + "step": 142100 + }, + { + "epoch": 0.14004043094819638, + "grad_norm": 2.5365257263183594, + "learning_rate": 9.980683584526633e-06, + "loss": 3.3348, + "step": 142150 + }, + { + "epoch": 0.14008968892601845, + "grad_norm": 2.57611346244812, + "learning_rate": 9.980669992486534e-06, + "loss": 3.31, + "step": 142200 + }, + { + "epoch": 0.14013894690384054, + "grad_norm": 2.3117446899414062, + "learning_rate": 9.980656395675343e-06, + "loss": 3.251, + "step": 142250 + }, + { + "epoch": 0.14018820488166264, + "grad_norm": 2.5046188831329346, + "learning_rate": 9.980642794093075e-06, + "loss": 3.3143, + "step": 142300 + }, + { + "epoch": 0.1402374628594847, + "grad_norm": 2.4090585708618164, + "learning_rate": 9.98062918773974e-06, + "loss": 3.3309, + "step": 142350 + }, + { + "epoch": 0.1402867208373068, + "grad_norm": 2.466533899307251, + "learning_rate": 9.980615576615353e-06, + "loss": 3.2988, + "step": 142400 + }, + { + "epoch": 0.1403359788151289, + "grad_norm": 2.5477700233459473, + "learning_rate": 9.980601960719924e-06, + "loss": 3.299, + "step": 142450 + }, + { + "epoch": 0.14038523679295098, + "grad_norm": 2.5420734882354736, + "learning_rate": 9.98058834005347e-06, + "loss": 3.3358, + "step": 142500 + }, + { + "epoch": 0.14043449477077308, + "grad_norm": 2.5244507789611816, + "learning_rate": 
9.980574714616003e-06, + "loss": 3.3163, + "step": 142550 + }, + { + "epoch": 0.14048375274859518, + "grad_norm": 2.596418619155884, + "learning_rate": 9.980561084407533e-06, + "loss": 3.3001, + "step": 142600 + }, + { + "epoch": 0.14053301072641725, + "grad_norm": 2.7721688747406006, + "learning_rate": 9.98054744942808e-06, + "loss": 3.2385, + "step": 142650 + }, + { + "epoch": 0.14058226870423934, + "grad_norm": 2.2778868675231934, + "learning_rate": 9.980533809677648e-06, + "loss": 3.2198, + "step": 142700 + }, + { + "epoch": 0.14063152668206141, + "grad_norm": 2.211996078491211, + "learning_rate": 9.980520165156258e-06, + "loss": 3.2215, + "step": 142750 + }, + { + "epoch": 0.1406807846598835, + "grad_norm": 2.358234405517578, + "learning_rate": 9.980506515863917e-06, + "loss": 3.3586, + "step": 142800 + }, + { + "epoch": 0.1407300426377056, + "grad_norm": 3.623809576034546, + "learning_rate": 9.980492861800642e-06, + "loss": 3.3484, + "step": 142850 + }, + { + "epoch": 0.14077930061552768, + "grad_norm": 2.1086010932922363, + "learning_rate": 9.980479202966444e-06, + "loss": 3.2874, + "step": 142900 + }, + { + "epoch": 0.14082855859334978, + "grad_norm": 2.375112533569336, + "learning_rate": 9.980465539361338e-06, + "loss": 3.2871, + "step": 142950 + }, + { + "epoch": 0.14087781657117188, + "grad_norm": 2.394282817840576, + "learning_rate": 9.980451870985336e-06, + "loss": 3.3133, + "step": 143000 + }, + { + "epoch": 0.14092707454899395, + "grad_norm": 2.338268756866455, + "learning_rate": 9.98043819783845e-06, + "loss": 3.28, + "step": 143050 + }, + { + "epoch": 0.14097633252681604, + "grad_norm": 2.3822615146636963, + "learning_rate": 9.980424519920696e-06, + "loss": 3.3147, + "step": 143100 + }, + { + "epoch": 0.14102559050463814, + "grad_norm": 2.5267529487609863, + "learning_rate": 9.980410837232084e-06, + "loss": 3.3271, + "step": 143150 + }, + { + "epoch": 0.1410748484824602, + "grad_norm": 2.6227684020996094, + "learning_rate": 9.980397149772629e-06, + 
"loss": 3.3035, + "step": 143200 + }, + { + "epoch": 0.1411241064602823, + "grad_norm": 2.37418794631958, + "learning_rate": 9.980383457542341e-06, + "loss": 3.3356, + "step": 143250 + }, + { + "epoch": 0.14117336443810438, + "grad_norm": 2.325625419616699, + "learning_rate": 9.98036976054124e-06, + "loss": 3.3008, + "step": 143300 + }, + { + "epoch": 0.14122262241592648, + "grad_norm": 2.4494218826293945, + "learning_rate": 9.980356058769331e-06, + "loss": 3.2898, + "step": 143350 + }, + { + "epoch": 0.14127188039374858, + "grad_norm": 2.671747922897339, + "learning_rate": 9.980342352226633e-06, + "loss": 3.2971, + "step": 143400 + }, + { + "epoch": 0.14132113837157065, + "grad_norm": 2.265096426010132, + "learning_rate": 9.980328640913158e-06, + "loss": 3.3082, + "step": 143450 + }, + { + "epoch": 0.14137039634939275, + "grad_norm": 2.483980894088745, + "learning_rate": 9.980314924828915e-06, + "loss": 3.296, + "step": 143500 + }, + { + "epoch": 0.14141965432721484, + "grad_norm": 2.399890899658203, + "learning_rate": 9.980301203973924e-06, + "loss": 3.3342, + "step": 143550 + }, + { + "epoch": 0.14146891230503691, + "grad_norm": 2.6143224239349365, + "learning_rate": 9.980287478348192e-06, + "loss": 3.2991, + "step": 143600 + }, + { + "epoch": 0.141518170282859, + "grad_norm": 2.608013153076172, + "learning_rate": 9.980273747951734e-06, + "loss": 3.3001, + "step": 143650 + }, + { + "epoch": 0.1415674282606811, + "grad_norm": 2.6112849712371826, + "learning_rate": 9.980260012784565e-06, + "loss": 3.3152, + "step": 143700 + }, + { + "epoch": 0.14161668623850318, + "grad_norm": 2.4555299282073975, + "learning_rate": 9.980246272846696e-06, + "loss": 3.2798, + "step": 143750 + }, + { + "epoch": 0.14166594421632528, + "grad_norm": 2.323209524154663, + "learning_rate": 9.980232528138143e-06, + "loss": 3.321, + "step": 143800 + }, + { + "epoch": 0.14171520219414738, + "grad_norm": 2.4593558311462402, + "learning_rate": 9.980218778658914e-06, + "loss": 3.2553, + "step": 
143850 + }, + { + "epoch": 0.14176446017196945, + "grad_norm": 2.4153497219085693, + "learning_rate": 9.980205024409028e-06, + "loss": 3.3317, + "step": 143900 + }, + { + "epoch": 0.14181371814979155, + "grad_norm": 2.6605224609375, + "learning_rate": 9.980191265388494e-06, + "loss": 3.3507, + "step": 143950 + }, + { + "epoch": 0.14186297612761362, + "grad_norm": 2.712503671646118, + "learning_rate": 9.98017750159733e-06, + "loss": 3.3041, + "step": 144000 + }, + { + "epoch": 0.14191223410543571, + "grad_norm": 2.367201089859009, + "learning_rate": 9.980163733035543e-06, + "loss": 3.2922, + "step": 144050 + }, + { + "epoch": 0.1419614920832578, + "grad_norm": 2.682112216949463, + "learning_rate": 9.98014995970315e-06, + "loss": 3.3373, + "step": 144100 + }, + { + "epoch": 0.14201075006107988, + "grad_norm": 2.3647031784057617, + "learning_rate": 9.980136181600162e-06, + "loss": 3.3124, + "step": 144150 + }, + { + "epoch": 0.14206000803890198, + "grad_norm": 2.3073883056640625, + "learning_rate": 9.980122398726596e-06, + "loss": 3.3142, + "step": 144200 + }, + { + "epoch": 0.14210926601672408, + "grad_norm": 2.36147403717041, + "learning_rate": 9.98010861108246e-06, + "loss": 3.3034, + "step": 144250 + }, + { + "epoch": 0.14215852399454615, + "grad_norm": 2.4521796703338623, + "learning_rate": 9.980094818667772e-06, + "loss": 3.2424, + "step": 144300 + }, + { + "epoch": 0.14220778197236825, + "grad_norm": 2.544224739074707, + "learning_rate": 9.980081021482542e-06, + "loss": 3.3594, + "step": 144350 + }, + { + "epoch": 0.14225703995019034, + "grad_norm": 2.52839732170105, + "learning_rate": 9.980067219526784e-06, + "loss": 3.316, + "step": 144400 + }, + { + "epoch": 0.14230629792801242, + "grad_norm": 2.4467718601226807, + "learning_rate": 9.980053412800512e-06, + "loss": 3.298, + "step": 144450 + }, + { + "epoch": 0.1423555559058345, + "grad_norm": 2.5469326972961426, + "learning_rate": 9.980039601303739e-06, + "loss": 3.3457, + "step": 144500 + }, + { + "epoch": 
0.14240481388365658, + "grad_norm": 2.413027048110962, + "learning_rate": 9.980025785036477e-06, + "loss": 3.3194, + "step": 144550 + }, + { + "epoch": 0.14245407186147868, + "grad_norm": 2.489638090133667, + "learning_rate": 9.980011963998742e-06, + "loss": 3.2625, + "step": 144600 + }, + { + "epoch": 0.14250332983930078, + "grad_norm": 2.381376266479492, + "learning_rate": 9.979998138190546e-06, + "loss": 3.2212, + "step": 144650 + }, + { + "epoch": 0.14255258781712285, + "grad_norm": 2.418457508087158, + "learning_rate": 9.979984307611901e-06, + "loss": 3.3328, + "step": 144700 + }, + { + "epoch": 0.14260184579494495, + "grad_norm": 2.3505351543426514, + "learning_rate": 9.97997047226282e-06, + "loss": 3.2762, + "step": 144750 + }, + { + "epoch": 0.14265110377276705, + "grad_norm": 2.9312517642974854, + "learning_rate": 9.979956632143317e-06, + "loss": 3.2973, + "step": 144800 + }, + { + "epoch": 0.14270036175058912, + "grad_norm": 2.4469308853149414, + "learning_rate": 9.979942787253408e-06, + "loss": 3.2861, + "step": 144850 + }, + { + "epoch": 0.14274961972841121, + "grad_norm": 2.610560417175293, + "learning_rate": 9.979928937593101e-06, + "loss": 3.2806, + "step": 144900 + }, + { + "epoch": 0.1427988777062333, + "grad_norm": 2.4830546379089355, + "learning_rate": 9.979915083162413e-06, + "loss": 3.3391, + "step": 144950 + }, + { + "epoch": 0.14284813568405538, + "grad_norm": 2.3902671337127686, + "learning_rate": 9.979901223961358e-06, + "loss": 3.3256, + "step": 145000 + }, + { + "epoch": 0.14289739366187748, + "grad_norm": 2.420818328857422, + "learning_rate": 9.979887359989945e-06, + "loss": 3.3159, + "step": 145050 + }, + { + "epoch": 0.14294665163969958, + "grad_norm": 2.5970120429992676, + "learning_rate": 9.979873491248191e-06, + "loss": 3.3146, + "step": 145100 + }, + { + "epoch": 0.14299590961752165, + "grad_norm": 2.414000988006592, + "learning_rate": 9.979859617736108e-06, + "loss": 3.2436, + "step": 145150 + }, + { + "epoch": 
0.14304516759534375, + "grad_norm": 2.2196333408355713, + "learning_rate": 9.979845739453709e-06, + "loss": 3.2837, + "step": 145200 + }, + { + "epoch": 0.14309442557316582, + "grad_norm": 2.356818914413452, + "learning_rate": 9.979831856401008e-06, + "loss": 3.2604, + "step": 145250 + }, + { + "epoch": 0.14314368355098792, + "grad_norm": 2.592839241027832, + "learning_rate": 9.979817968578018e-06, + "loss": 3.1863, + "step": 145300 + }, + { + "epoch": 0.14319294152881, + "grad_norm": 2.6258652210235596, + "learning_rate": 9.979804075984754e-06, + "loss": 3.3675, + "step": 145350 + }, + { + "epoch": 0.14324219950663208, + "grad_norm": 2.339650869369507, + "learning_rate": 9.979790178621225e-06, + "loss": 3.3662, + "step": 145400 + }, + { + "epoch": 0.14329145748445418, + "grad_norm": 2.4103870391845703, + "learning_rate": 9.97977627648745e-06, + "loss": 3.2871, + "step": 145450 + }, + { + "epoch": 0.14334071546227628, + "grad_norm": 2.3108596801757812, + "learning_rate": 9.979762369583436e-06, + "loss": 3.3147, + "step": 145500 + }, + { + "epoch": 0.14338997344009835, + "grad_norm": 2.426852226257324, + "learning_rate": 9.979748457909201e-06, + "loss": 3.2642, + "step": 145550 + }, + { + "epoch": 0.14343923141792045, + "grad_norm": 2.2711663246154785, + "learning_rate": 9.979734541464758e-06, + "loss": 3.3544, + "step": 145600 + }, + { + "epoch": 0.14348848939574255, + "grad_norm": 2.4099419116973877, + "learning_rate": 9.979720620250118e-06, + "loss": 3.2756, + "step": 145650 + }, + { + "epoch": 0.14353774737356462, + "grad_norm": 2.439168930053711, + "learning_rate": 9.979706694265295e-06, + "loss": 3.2948, + "step": 145700 + }, + { + "epoch": 0.14358700535138672, + "grad_norm": 2.604386806488037, + "learning_rate": 9.979692763510305e-06, + "loss": 3.2835, + "step": 145750 + }, + { + "epoch": 0.14363626332920879, + "grad_norm": 2.438323736190796, + "learning_rate": 9.979678827985157e-06, + "loss": 3.3061, + "step": 145800 + }, + { + "epoch": 0.14368552130703088, 
+ "grad_norm": 2.368328094482422, + "learning_rate": 9.979664887689868e-06, + "loss": 3.3573, + "step": 145850 + }, + { + "epoch": 0.14373477928485298, + "grad_norm": 2.4357810020446777, + "learning_rate": 9.979650942624451e-06, + "loss": 3.2772, + "step": 145900 + }, + { + "epoch": 0.14378403726267505, + "grad_norm": 2.2624030113220215, + "learning_rate": 9.979636992788915e-06, + "loss": 3.3247, + "step": 145950 + }, + { + "epoch": 0.14383329524049715, + "grad_norm": 2.5204734802246094, + "learning_rate": 9.97962303818328e-06, + "loss": 3.2395, + "step": 146000 + }, + { + "epoch": 0.14388255321831925, + "grad_norm": 2.54004168510437, + "learning_rate": 9.979609078807556e-06, + "loss": 3.2424, + "step": 146050 + }, + { + "epoch": 0.14393181119614132, + "grad_norm": 2.509308099746704, + "learning_rate": 9.979595114661754e-06, + "loss": 3.3464, + "step": 146100 + }, + { + "epoch": 0.14398106917396342, + "grad_norm": 2.679945468902588, + "learning_rate": 9.979581145745891e-06, + "loss": 3.246, + "step": 146150 + }, + { + "epoch": 0.14403032715178551, + "grad_norm": 2.6270086765289307, + "learning_rate": 9.97956717205998e-06, + "loss": 3.2905, + "step": 146200 + }, + { + "epoch": 0.14407958512960758, + "grad_norm": 2.493008613586426, + "learning_rate": 9.979553193604032e-06, + "loss": 3.2916, + "step": 146250 + }, + { + "epoch": 0.14412884310742968, + "grad_norm": 2.4219274520874023, + "learning_rate": 9.979539210378063e-06, + "loss": 3.2347, + "step": 146300 + }, + { + "epoch": 0.14417810108525178, + "grad_norm": 3.127563953399658, + "learning_rate": 9.979525222382085e-06, + "loss": 3.3334, + "step": 146350 + }, + { + "epoch": 0.14422735906307385, + "grad_norm": 2.3080618381500244, + "learning_rate": 9.979511229616112e-06, + "loss": 3.3062, + "step": 146400 + }, + { + "epoch": 0.14427661704089595, + "grad_norm": 2.4787940979003906, + "learning_rate": 9.979497232080156e-06, + "loss": 3.2242, + "step": 146450 + }, + { + "epoch": 0.14432587501871802, + "grad_norm": 
2.26639986038208, + "learning_rate": 9.979483229774235e-06, + "loss": 3.2811, + "step": 146500 + }, + { + "epoch": 0.14437513299654012, + "grad_norm": 2.4112660884857178, + "learning_rate": 9.979469222698355e-06, + "loss": 3.2658, + "step": 146550 + }, + { + "epoch": 0.14442439097436222, + "grad_norm": 2.349797248840332, + "learning_rate": 9.979455210852535e-06, + "loss": 3.3028, + "step": 146600 + }, + { + "epoch": 0.14447364895218429, + "grad_norm": 2.6167349815368652, + "learning_rate": 9.979441194236788e-06, + "loss": 3.3002, + "step": 146650 + }, + { + "epoch": 0.14452290693000638, + "grad_norm": 2.5947835445404053, + "learning_rate": 9.979427172851124e-06, + "loss": 3.2312, + "step": 146700 + }, + { + "epoch": 0.14457216490782848, + "grad_norm": 2.3929131031036377, + "learning_rate": 9.97941314669556e-06, + "loss": 3.2819, + "step": 146750 + }, + { + "epoch": 0.14462142288565055, + "grad_norm": 2.3144378662109375, + "learning_rate": 9.979399115770107e-06, + "loss": 3.2715, + "step": 146800 + }, + { + "epoch": 0.14467068086347265, + "grad_norm": 2.5739693641662598, + "learning_rate": 9.979385080074782e-06, + "loss": 3.3078, + "step": 146850 + }, + { + "epoch": 0.14471993884129475, + "grad_norm": 2.396960735321045, + "learning_rate": 9.979371039609594e-06, + "loss": 3.2647, + "step": 146900 + }, + { + "epoch": 0.14476919681911682, + "grad_norm": 2.5673041343688965, + "learning_rate": 9.979356994374558e-06, + "loss": 3.2826, + "step": 146950 + }, + { + "epoch": 0.14481845479693892, + "grad_norm": 2.5432779788970947, + "learning_rate": 9.97934294436969e-06, + "loss": 3.2812, + "step": 147000 + }, + { + "epoch": 0.144867712774761, + "grad_norm": 2.698669910430908, + "learning_rate": 9.979328889595e-06, + "loss": 3.278, + "step": 147050 + }, + { + "epoch": 0.14491697075258309, + "grad_norm": 2.3214752674102783, + "learning_rate": 9.979314830050504e-06, + "loss": 3.34, + "step": 147100 + }, + { + "epoch": 0.14496622873040518, + "grad_norm": 2.4732139110565186, + 
"learning_rate": 9.979300765736214e-06, + "loss": 3.2034, + "step": 147150 + }, + { + "epoch": 0.14501548670822725, + "grad_norm": 2.45590877532959, + "learning_rate": 9.979286696652142e-06, + "loss": 3.3291, + "step": 147200 + }, + { + "epoch": 0.14506474468604935, + "grad_norm": 2.368332624435425, + "learning_rate": 9.979272622798304e-06, + "loss": 3.2576, + "step": 147250 + }, + { + "epoch": 0.14511400266387145, + "grad_norm": 2.514828681945801, + "learning_rate": 9.979258544174714e-06, + "loss": 3.2614, + "step": 147300 + }, + { + "epoch": 0.14516326064169352, + "grad_norm": 2.501635789871216, + "learning_rate": 9.979244460781383e-06, + "loss": 3.306, + "step": 147350 + }, + { + "epoch": 0.14521251861951562, + "grad_norm": 2.827171802520752, + "learning_rate": 9.979230372618328e-06, + "loss": 3.2906, + "step": 147400 + }, + { + "epoch": 0.14526177659733772, + "grad_norm": 2.3729114532470703, + "learning_rate": 9.979216279685558e-06, + "loss": 3.3687, + "step": 147450 + }, + { + "epoch": 0.1453110345751598, + "grad_norm": 2.6385669708251953, + "learning_rate": 9.979202181983087e-06, + "loss": 3.2612, + "step": 147500 + }, + { + "epoch": 0.14536029255298188, + "grad_norm": 2.4721758365631104, + "learning_rate": 9.979188079510933e-06, + "loss": 3.2914, + "step": 147550 + }, + { + "epoch": 0.14540955053080398, + "grad_norm": 2.2799479961395264, + "learning_rate": 9.979173972269107e-06, + "loss": 3.2835, + "step": 147600 + }, + { + "epoch": 0.14545880850862605, + "grad_norm": 2.340970277786255, + "learning_rate": 9.97915986025762e-06, + "loss": 3.2837, + "step": 147650 + }, + { + "epoch": 0.14550806648644815, + "grad_norm": 2.490550994873047, + "learning_rate": 9.979145743476489e-06, + "loss": 3.2892, + "step": 147700 + }, + { + "epoch": 0.14555732446427022, + "grad_norm": 2.597238063812256, + "learning_rate": 9.979131621925726e-06, + "loss": 3.305, + "step": 147750 + }, + { + "epoch": 0.14560658244209232, + "grad_norm": 2.574496269226074, + "learning_rate": 
9.979117495605346e-06, + "loss": 3.2265, + "step": 147800 + }, + { + "epoch": 0.14565584041991442, + "grad_norm": 2.226213216781616, + "learning_rate": 9.979103364515361e-06, + "loss": 3.3109, + "step": 147850 + }, + { + "epoch": 0.1457050983977365, + "grad_norm": 2.3243770599365234, + "learning_rate": 9.979089228655784e-06, + "loss": 3.3261, + "step": 147900 + }, + { + "epoch": 0.14575435637555859, + "grad_norm": 2.465961456298828, + "learning_rate": 9.97907508802663e-06, + "loss": 3.289, + "step": 147950 + }, + { + "epoch": 0.14580361435338068, + "grad_norm": 2.276695966720581, + "learning_rate": 9.979060942627911e-06, + "loss": 3.3005, + "step": 148000 + }, + { + "epoch": 0.14585287233120275, + "grad_norm": 2.5324413776397705, + "learning_rate": 9.979046792459642e-06, + "loss": 3.287, + "step": 148050 + }, + { + "epoch": 0.14590213030902485, + "grad_norm": 2.4110751152038574, + "learning_rate": 9.979032637521838e-06, + "loss": 3.3059, + "step": 148100 + }, + { + "epoch": 0.14595138828684695, + "grad_norm": 2.535853624343872, + "learning_rate": 9.979018477814507e-06, + "loss": 3.326, + "step": 148150 + }, + { + "epoch": 0.14600064626466902, + "grad_norm": 2.435337543487549, + "learning_rate": 9.979004313337668e-06, + "loss": 3.2113, + "step": 148200 + }, + { + "epoch": 0.14604990424249112, + "grad_norm": 2.5442047119140625, + "learning_rate": 9.978990144091334e-06, + "loss": 3.2839, + "step": 148250 + }, + { + "epoch": 0.1460991622203132, + "grad_norm": 2.5221338272094727, + "learning_rate": 9.978975970075515e-06, + "loss": 3.3352, + "step": 148300 + }, + { + "epoch": 0.1461484201981353, + "grad_norm": 2.4341981410980225, + "learning_rate": 9.97896179129023e-06, + "loss": 3.3155, + "step": 148350 + }, + { + "epoch": 0.14619767817595739, + "grad_norm": 2.3526546955108643, + "learning_rate": 9.978947607735485e-06, + "loss": 3.3023, + "step": 148400 + }, + { + "epoch": 0.14624693615377946, + "grad_norm": 2.400481700897217, + "learning_rate": 9.978933419411302e-06, + 
"loss": 3.2909, + "step": 148450 + }, + { + "epoch": 0.14629619413160155, + "grad_norm": 2.4070205688476562, + "learning_rate": 9.978919226317689e-06, + "loss": 3.2199, + "step": 148500 + }, + { + "epoch": 0.14634545210942365, + "grad_norm": 2.480579376220703, + "learning_rate": 9.978905028454661e-06, + "loss": 3.2443, + "step": 148550 + }, + { + "epoch": 0.14639471008724572, + "grad_norm": 2.4894566535949707, + "learning_rate": 9.978890825822232e-06, + "loss": 3.2814, + "step": 148600 + }, + { + "epoch": 0.14644396806506782, + "grad_norm": 2.521941900253296, + "learning_rate": 9.978876618420416e-06, + "loss": 3.3343, + "step": 148650 + }, + { + "epoch": 0.14649322604288992, + "grad_norm": 2.449779748916626, + "learning_rate": 9.978862406249227e-06, + "loss": 3.3327, + "step": 148700 + }, + { + "epoch": 0.146542484020712, + "grad_norm": 2.2670562267303467, + "learning_rate": 9.978848189308677e-06, + "loss": 3.3424, + "step": 148750 + }, + { + "epoch": 0.1465917419985341, + "grad_norm": 2.377495527267456, + "learning_rate": 9.97883396759878e-06, + "loss": 3.2847, + "step": 148800 + }, + { + "epoch": 0.14664099997635618, + "grad_norm": 2.3170268535614014, + "learning_rate": 9.97881974111955e-06, + "loss": 3.2765, + "step": 148850 + }, + { + "epoch": 0.14669025795417825, + "grad_norm": 2.2896487712860107, + "learning_rate": 9.978805509870999e-06, + "loss": 3.288, + "step": 148900 + }, + { + "epoch": 0.14673951593200035, + "grad_norm": 2.5009007453918457, + "learning_rate": 9.978791273853145e-06, + "loss": 3.2615, + "step": 148950 + }, + { + "epoch": 0.14678877390982242, + "grad_norm": 3.0417041778564453, + "learning_rate": 9.978777033065997e-06, + "loss": 3.2558, + "step": 149000 + }, + { + "epoch": 0.14683803188764452, + "grad_norm": 2.407058000564575, + "learning_rate": 9.97876278750957e-06, + "loss": 3.2533, + "step": 149050 + }, + { + "epoch": 0.14688728986546662, + "grad_norm": 2.4789116382598877, + "learning_rate": 9.97874853718388e-06, + "loss": 3.3029, + 
"step": 149100 + }, + { + "epoch": 0.1469365478432887, + "grad_norm": 2.379430055618286, + "learning_rate": 9.978734282088937e-06, + "loss": 3.3172, + "step": 149150 + }, + { + "epoch": 0.1469858058211108, + "grad_norm": 2.349109172821045, + "learning_rate": 9.978720022224758e-06, + "loss": 3.2582, + "step": 149200 + }, + { + "epoch": 0.14703506379893289, + "grad_norm": 2.450807571411133, + "learning_rate": 9.978705757591354e-06, + "loss": 3.2481, + "step": 149250 + }, + { + "epoch": 0.14708432177675496, + "grad_norm": 2.4101028442382812, + "learning_rate": 9.97869148818874e-06, + "loss": 3.2536, + "step": 149300 + }, + { + "epoch": 0.14713357975457705, + "grad_norm": 2.4175539016723633, + "learning_rate": 9.97867721401693e-06, + "loss": 3.3054, + "step": 149350 + }, + { + "epoch": 0.14718283773239915, + "grad_norm": 2.233492136001587, + "learning_rate": 9.978662935075936e-06, + "loss": 3.3238, + "step": 149400 + }, + { + "epoch": 0.14723209571022122, + "grad_norm": 2.4466259479522705, + "learning_rate": 9.978648651365774e-06, + "loss": 3.3276, + "step": 149450 + }, + { + "epoch": 0.14728135368804332, + "grad_norm": 2.45829439163208, + "learning_rate": 9.978634362886456e-06, + "loss": 3.3413, + "step": 149500 + }, + { + "epoch": 0.1473306116658654, + "grad_norm": 2.4345948696136475, + "learning_rate": 9.978620069637997e-06, + "loss": 3.3234, + "step": 149550 + }, + { + "epoch": 0.1473798696436875, + "grad_norm": 2.7117388248443604, + "learning_rate": 9.978605771620408e-06, + "loss": 3.2672, + "step": 149600 + }, + { + "epoch": 0.1474291276215096, + "grad_norm": 2.7042853832244873, + "learning_rate": 9.978591468833705e-06, + "loss": 3.3143, + "step": 149650 + }, + { + "epoch": 0.14747838559933166, + "grad_norm": 2.5825064182281494, + "learning_rate": 9.9785771612779e-06, + "loss": 3.3432, + "step": 149700 + }, + { + "epoch": 0.14752764357715376, + "grad_norm": 2.7188947200775146, + "learning_rate": 9.97856284895301e-06, + "loss": 3.3423, + "step": 149750 + }, + { + 
"epoch": 0.14757690155497585, + "grad_norm": 2.5379478931427, + "learning_rate": 9.978548531859047e-06, + "loss": 3.3949, + "step": 149800 + }, + { + "epoch": 0.14762615953279792, + "grad_norm": 2.414306640625, + "learning_rate": 9.978534209996023e-06, + "loss": 3.2833, + "step": 149850 + }, + { + "epoch": 0.14767541751062002, + "grad_norm": 2.649200439453125, + "learning_rate": 9.978519883363955e-06, + "loss": 3.2816, + "step": 149900 + }, + { + "epoch": 0.14772467548844212, + "grad_norm": 2.369764566421509, + "learning_rate": 9.978505551962853e-06, + "loss": 3.2828, + "step": 149950 + }, + { + "epoch": 0.1477739334662642, + "grad_norm": 2.336366653442383, + "learning_rate": 9.978491215792732e-06, + "loss": 3.3031, + "step": 150000 + }, + { + "epoch": 0.1478231914440863, + "grad_norm": 2.4197518825531006, + "learning_rate": 9.97847687485361e-06, + "loss": 3.2846, + "step": 150050 + }, + { + "epoch": 0.14787244942190836, + "grad_norm": 2.4224517345428467, + "learning_rate": 9.978462529145492e-06, + "loss": 3.3006, + "step": 150100 + }, + { + "epoch": 0.14792170739973046, + "grad_norm": 2.4161429405212402, + "learning_rate": 9.978448178668399e-06, + "loss": 3.3157, + "step": 150150 + }, + { + "epoch": 0.14797096537755255, + "grad_norm": 2.299060344696045, + "learning_rate": 9.978433823422342e-06, + "loss": 3.293, + "step": 150200 + }, + { + "epoch": 0.14802022335537462, + "grad_norm": 2.337463140487671, + "learning_rate": 9.978419463407336e-06, + "loss": 3.336, + "step": 150250 + }, + { + "epoch": 0.14806948133319672, + "grad_norm": 2.403433084487915, + "learning_rate": 9.978405098623393e-06, + "loss": 3.2861, + "step": 150300 + }, + { + "epoch": 0.14811873931101882, + "grad_norm": 2.5146632194519043, + "learning_rate": 9.97839072907053e-06, + "loss": 3.3833, + "step": 150350 + }, + { + "epoch": 0.1481679972888409, + "grad_norm": 2.423896074295044, + "learning_rate": 9.978376354748756e-06, + "loss": 3.271, + "step": 150400 + }, + { + "epoch": 0.148217255266663, + 
"grad_norm": 2.517756700515747, + "learning_rate": 9.978361975658088e-06, + "loss": 3.3072, + "step": 150450 + }, + { + "epoch": 0.1482665132444851, + "grad_norm": 2.423351287841797, + "learning_rate": 9.978347591798541e-06, + "loss": 3.2788, + "step": 150500 + }, + { + "epoch": 0.14831577122230716, + "grad_norm": 2.2556159496307373, + "learning_rate": 9.978333203170123e-06, + "loss": 3.3211, + "step": 150550 + }, + { + "epoch": 0.14836502920012926, + "grad_norm": 2.3965258598327637, + "learning_rate": 9.978318809772854e-06, + "loss": 3.3346, + "step": 150600 + }, + { + "epoch": 0.14841428717795135, + "grad_norm": 2.3876726627349854, + "learning_rate": 9.978304411606745e-06, + "loss": 3.2629, + "step": 150650 + }, + { + "epoch": 0.14846354515577342, + "grad_norm": 2.68920636177063, + "learning_rate": 9.978290008671811e-06, + "loss": 3.3458, + "step": 150700 + }, + { + "epoch": 0.14851280313359552, + "grad_norm": 2.1873795986175537, + "learning_rate": 9.978275600968064e-06, + "loss": 3.3014, + "step": 150750 + }, + { + "epoch": 0.1485620611114176, + "grad_norm": 2.2328312397003174, + "learning_rate": 9.97826118849552e-06, + "loss": 3.2473, + "step": 150800 + }, + { + "epoch": 0.1486113190892397, + "grad_norm": 2.8184449672698975, + "learning_rate": 9.978246771254191e-06, + "loss": 3.3464, + "step": 150850 + }, + { + "epoch": 0.1486605770670618, + "grad_norm": 2.5048348903656006, + "learning_rate": 9.97823234924409e-06, + "loss": 3.3421, + "step": 150900 + }, + { + "epoch": 0.14870983504488386, + "grad_norm": 2.426356077194214, + "learning_rate": 9.978217922465235e-06, + "loss": 3.284, + "step": 150950 + }, + { + "epoch": 0.14875909302270596, + "grad_norm": 2.207216262817383, + "learning_rate": 9.978203490917635e-06, + "loss": 3.2471, + "step": 151000 + }, + { + "epoch": 0.14880835100052806, + "grad_norm": 2.3343188762664795, + "learning_rate": 9.978189054601306e-06, + "loss": 3.2607, + "step": 151050 + }, + { + "epoch": 0.14885760897835013, + "grad_norm": 
2.44858717918396, + "learning_rate": 9.978174613516263e-06, + "loss": 3.3335, + "step": 151100 + }, + { + "epoch": 0.14890686695617222, + "grad_norm": 2.4627461433410645, + "learning_rate": 9.978160167662517e-06, + "loss": 3.2515, + "step": 151150 + }, + { + "epoch": 0.14895612493399432, + "grad_norm": 2.604360342025757, + "learning_rate": 9.978145717040085e-06, + "loss": 3.3334, + "step": 151200 + }, + { + "epoch": 0.1490053829118164, + "grad_norm": 2.368360996246338, + "learning_rate": 9.978131261648977e-06, + "loss": 3.2695, + "step": 151250 + }, + { + "epoch": 0.1490546408896385, + "grad_norm": 2.4292421340942383, + "learning_rate": 9.978116801489211e-06, + "loss": 3.3069, + "step": 151300 + }, + { + "epoch": 0.14910389886746056, + "grad_norm": 2.3855268955230713, + "learning_rate": 9.978102336560797e-06, + "loss": 3.3683, + "step": 151350 + }, + { + "epoch": 0.14915315684528266, + "grad_norm": 2.316948413848877, + "learning_rate": 9.978087866863753e-06, + "loss": 3.3332, + "step": 151400 + }, + { + "epoch": 0.14920241482310476, + "grad_norm": 2.392646312713623, + "learning_rate": 9.97807339239809e-06, + "loss": 3.2646, + "step": 151450 + }, + { + "epoch": 0.14925167280092683, + "grad_norm": 2.8090627193450928, + "learning_rate": 9.978058913163822e-06, + "loss": 3.3435, + "step": 151500 + }, + { + "epoch": 0.14930093077874892, + "grad_norm": 2.486398220062256, + "learning_rate": 9.978044429160963e-06, + "loss": 3.3209, + "step": 151550 + }, + { + "epoch": 0.14935018875657102, + "grad_norm": 2.2862391471862793, + "learning_rate": 9.978029940389528e-06, + "loss": 3.2569, + "step": 151600 + }, + { + "epoch": 0.1493994467343931, + "grad_norm": 2.331777334213257, + "learning_rate": 9.97801544684953e-06, + "loss": 3.2582, + "step": 151650 + }, + { + "epoch": 0.1494487047122152, + "grad_norm": 2.490126132965088, + "learning_rate": 9.978000948540983e-06, + "loss": 3.3346, + "step": 151700 + }, + { + "epoch": 0.1494979626900373, + "grad_norm": 3.2594997882843018, + 
"learning_rate": 9.9779864454639e-06, + "loss": 3.2881, + "step": 151750 + }, + { + "epoch": 0.14954722066785936, + "grad_norm": 2.4209935665130615, + "learning_rate": 9.977971937618299e-06, + "loss": 3.2753, + "step": 151800 + }, + { + "epoch": 0.14959647864568146, + "grad_norm": 2.4318056106567383, + "learning_rate": 9.977957425004188e-06, + "loss": 3.2823, + "step": 151850 + }, + { + "epoch": 0.14964573662350356, + "grad_norm": 2.597581624984741, + "learning_rate": 9.977942907621583e-06, + "loss": 3.289, + "step": 151900 + }, + { + "epoch": 0.14969499460132563, + "grad_norm": 2.473238229751587, + "learning_rate": 9.9779283854705e-06, + "loss": 3.2445, + "step": 151950 + }, + { + "epoch": 0.14974425257914772, + "grad_norm": 2.4434046745300293, + "learning_rate": 9.97791385855095e-06, + "loss": 3.2827, + "step": 152000 + }, + { + "epoch": 0.1497935105569698, + "grad_norm": 2.627157688140869, + "learning_rate": 9.97789932686295e-06, + "loss": 3.3365, + "step": 152050 + }, + { + "epoch": 0.1498427685347919, + "grad_norm": 2.139827013015747, + "learning_rate": 9.97788479040651e-06, + "loss": 3.3235, + "step": 152100 + }, + { + "epoch": 0.149892026512614, + "grad_norm": 2.7437503337860107, + "learning_rate": 9.97787024918165e-06, + "loss": 3.3048, + "step": 152150 + }, + { + "epoch": 0.14994128449043606, + "grad_norm": 2.331782102584839, + "learning_rate": 9.977855703188377e-06, + "loss": 3.3181, + "step": 152200 + }, + { + "epoch": 0.14999054246825816, + "grad_norm": 2.710099220275879, + "learning_rate": 9.977841152426708e-06, + "loss": 3.2854, + "step": 152250 + }, + { + "epoch": 0.15003980044608026, + "grad_norm": 2.3837549686431885, + "learning_rate": 9.977826596896658e-06, + "loss": 3.2619, + "step": 152300 + }, + { + "epoch": 0.15008905842390233, + "grad_norm": 2.5437936782836914, + "learning_rate": 9.97781203659824e-06, + "loss": 3.235, + "step": 152350 + }, + { + "epoch": 0.15013831640172443, + "grad_norm": 3.37489914894104, + "learning_rate": 
9.977797471531468e-06, + "loss": 3.2802, + "step": 152400 + }, + { + "epoch": 0.15018757437954652, + "grad_norm": 2.6280834674835205, + "learning_rate": 9.977782901696354e-06, + "loss": 3.2462, + "step": 152450 + }, + { + "epoch": 0.1502368323573686, + "grad_norm": 2.301314115524292, + "learning_rate": 9.977768327092917e-06, + "loss": 3.2525, + "step": 152500 + }, + { + "epoch": 0.1502860903351907, + "grad_norm": 2.7387471199035645, + "learning_rate": 9.977753747721164e-06, + "loss": 3.2933, + "step": 152550 + }, + { + "epoch": 0.15033534831301276, + "grad_norm": 2.5955018997192383, + "learning_rate": 9.977739163581115e-06, + "loss": 3.2982, + "step": 152600 + }, + { + "epoch": 0.15038460629083486, + "grad_norm": 2.530768394470215, + "learning_rate": 9.977724574672781e-06, + "loss": 3.2864, + "step": 152650 + }, + { + "epoch": 0.15043386426865696, + "grad_norm": 2.3504109382629395, + "learning_rate": 9.977709980996177e-06, + "loss": 3.289, + "step": 152700 + }, + { + "epoch": 0.15048312224647903, + "grad_norm": 2.2651355266571045, + "learning_rate": 9.977695382551317e-06, + "loss": 3.2959, + "step": 152750 + }, + { + "epoch": 0.15053238022430113, + "grad_norm": 2.5837149620056152, + "learning_rate": 9.977680779338214e-06, + "loss": 3.3368, + "step": 152800 + }, + { + "epoch": 0.15058163820212322, + "grad_norm": 2.345137357711792, + "learning_rate": 9.977666171356882e-06, + "loss": 3.2261, + "step": 152850 + }, + { + "epoch": 0.1506308961799453, + "grad_norm": 2.2083981037139893, + "learning_rate": 9.977651558607336e-06, + "loss": 3.2318, + "step": 152900 + }, + { + "epoch": 0.1506801541577674, + "grad_norm": 2.286292314529419, + "learning_rate": 9.97763694108959e-06, + "loss": 3.2576, + "step": 152950 + }, + { + "epoch": 0.1507294121355895, + "grad_norm": 2.2421185970306396, + "learning_rate": 9.977622318803658e-06, + "loss": 3.2813, + "step": 153000 + }, + { + "epoch": 0.15077867011341156, + "grad_norm": 2.309407949447632, + "learning_rate": 9.977607691749553e-06, 
+ "loss": 3.3366, + "step": 153050 + }, + { + "epoch": 0.15082792809123366, + "grad_norm": 2.35783314704895, + "learning_rate": 9.97759305992729e-06, + "loss": 3.3906, + "step": 153100 + }, + { + "epoch": 0.15087718606905576, + "grad_norm": 2.681591510772705, + "learning_rate": 9.977578423336883e-06, + "loss": 3.2705, + "step": 153150 + }, + { + "epoch": 0.15092644404687783, + "grad_norm": 2.5046284198760986, + "learning_rate": 9.977563781978343e-06, + "loss": 3.2609, + "step": 153200 + }, + { + "epoch": 0.15097570202469993, + "grad_norm": 2.347092866897583, + "learning_rate": 9.977549135851689e-06, + "loss": 3.2569, + "step": 153250 + }, + { + "epoch": 0.151024960002522, + "grad_norm": 2.471653461456299, + "learning_rate": 9.977534484956932e-06, + "loss": 3.273, + "step": 153300 + }, + { + "epoch": 0.1510742179803441, + "grad_norm": 2.446086883544922, + "learning_rate": 9.977519829294088e-06, + "loss": 3.2916, + "step": 153350 + }, + { + "epoch": 0.1511234759581662, + "grad_norm": 2.4059042930603027, + "learning_rate": 9.977505168863168e-06, + "loss": 3.3143, + "step": 153400 + }, + { + "epoch": 0.15117273393598826, + "grad_norm": 2.59415864944458, + "learning_rate": 9.97749050366419e-06, + "loss": 3.3057, + "step": 153450 + }, + { + "epoch": 0.15122199191381036, + "grad_norm": 2.501274347305298, + "learning_rate": 9.977475833697163e-06, + "loss": 3.2568, + "step": 153500 + }, + { + "epoch": 0.15127124989163246, + "grad_norm": 2.516129970550537, + "learning_rate": 9.977461158962106e-06, + "loss": 3.3436, + "step": 153550 + }, + { + "epoch": 0.15132050786945453, + "grad_norm": 2.435983896255493, + "learning_rate": 9.97744647945903e-06, + "loss": 3.2691, + "step": 153600 + }, + { + "epoch": 0.15136976584727663, + "grad_norm": 2.6737070083618164, + "learning_rate": 9.977431795187951e-06, + "loss": 3.2848, + "step": 153650 + }, + { + "epoch": 0.15141902382509873, + "grad_norm": 2.5116662979125977, + "learning_rate": 9.977417106148882e-06, + "loss": 3.3348, + "step": 
153700 + }, + { + "epoch": 0.1514682818029208, + "grad_norm": 2.5109643936157227, + "learning_rate": 9.977402412341836e-06, + "loss": 3.3156, + "step": 153750 + }, + { + "epoch": 0.1515175397807429, + "grad_norm": 2.3035101890563965, + "learning_rate": 9.97738771376683e-06, + "loss": 3.2049, + "step": 153800 + }, + { + "epoch": 0.15156679775856496, + "grad_norm": 2.3835597038269043, + "learning_rate": 9.977373010423874e-06, + "loss": 3.3418, + "step": 153850 + }, + { + "epoch": 0.15161605573638706, + "grad_norm": 2.4332709312438965, + "learning_rate": 9.977358302312986e-06, + "loss": 3.2651, + "step": 153900 + }, + { + "epoch": 0.15166531371420916, + "grad_norm": 2.4367105960845947, + "learning_rate": 9.977343589434178e-06, + "loss": 3.2972, + "step": 153950 + }, + { + "epoch": 0.15171457169203123, + "grad_norm": 2.3251142501831055, + "learning_rate": 9.977328871787467e-06, + "loss": 3.3117, + "step": 154000 + }, + { + "epoch": 0.15176382966985333, + "grad_norm": 2.31186842918396, + "learning_rate": 9.977314149372861e-06, + "loss": 3.3653, + "step": 154050 + }, + { + "epoch": 0.15181308764767543, + "grad_norm": 2.3509304523468018, + "learning_rate": 9.97729942219038e-06, + "loss": 3.3388, + "step": 154100 + }, + { + "epoch": 0.1518623456254975, + "grad_norm": 2.434520959854126, + "learning_rate": 9.977284690240035e-06, + "loss": 3.2536, + "step": 154150 + }, + { + "epoch": 0.1519116036033196, + "grad_norm": 2.380880832672119, + "learning_rate": 9.977269953521842e-06, + "loss": 3.2944, + "step": 154200 + }, + { + "epoch": 0.1519608615811417, + "grad_norm": 2.4723386764526367, + "learning_rate": 9.977255212035812e-06, + "loss": 3.2684, + "step": 154250 + }, + { + "epoch": 0.15201011955896376, + "grad_norm": 2.5194687843322754, + "learning_rate": 9.977240465781964e-06, + "loss": 3.2823, + "step": 154300 + }, + { + "epoch": 0.15205937753678586, + "grad_norm": 2.4119274616241455, + "learning_rate": 9.977225714760308e-06, + "loss": 3.2884, + "step": 154350 + }, + { + 
"epoch": 0.15210863551460796, + "grad_norm": 2.4193413257598877, + "learning_rate": 9.977210958970859e-06, + "loss": 3.326, + "step": 154400 + }, + { + "epoch": 0.15215789349243003, + "grad_norm": 2.2917275428771973, + "learning_rate": 9.977196198413632e-06, + "loss": 3.2287, + "step": 154450 + }, + { + "epoch": 0.15220715147025213, + "grad_norm": 2.298164129257202, + "learning_rate": 9.97718143308864e-06, + "loss": 3.2516, + "step": 154500 + }, + { + "epoch": 0.1522564094480742, + "grad_norm": 2.438342332839966, + "learning_rate": 9.9771666629959e-06, + "loss": 3.2583, + "step": 154550 + }, + { + "epoch": 0.1523056674258963, + "grad_norm": 2.330731153488159, + "learning_rate": 9.977151888135424e-06, + "loss": 3.2195, + "step": 154600 + }, + { + "epoch": 0.1523549254037184, + "grad_norm": 2.8021440505981445, + "learning_rate": 9.977137108507224e-06, + "loss": 3.3307, + "step": 154650 + }, + { + "epoch": 0.15240418338154046, + "grad_norm": 2.300248622894287, + "learning_rate": 9.977122324111316e-06, + "loss": 3.2732, + "step": 154700 + }, + { + "epoch": 0.15245344135936256, + "grad_norm": 2.3328301906585693, + "learning_rate": 9.977107534947717e-06, + "loss": 3.2772, + "step": 154750 + }, + { + "epoch": 0.15250269933718466, + "grad_norm": 2.4833085536956787, + "learning_rate": 9.977092741016438e-06, + "loss": 3.2807, + "step": 154800 + }, + { + "epoch": 0.15255195731500673, + "grad_norm": 2.4440841674804688, + "learning_rate": 9.977077942317495e-06, + "loss": 3.2053, + "step": 154850 + }, + { + "epoch": 0.15260121529282883, + "grad_norm": 2.5936126708984375, + "learning_rate": 9.9770631388509e-06, + "loss": 3.3171, + "step": 154900 + }, + { + "epoch": 0.15265047327065093, + "grad_norm": 2.4147815704345703, + "learning_rate": 9.977048330616667e-06, + "loss": 3.2485, + "step": 154950 + }, + { + "epoch": 0.152699731248473, + "grad_norm": 3.1126468181610107, + "learning_rate": 9.977033517614814e-06, + "loss": 3.2606, + "step": 155000 + }, + { + "epoch": 
0.1527489892262951, + "grad_norm": 2.517970323562622, + "learning_rate": 9.97701869984535e-06, + "loss": 3.2546, + "step": 155050 + }, + { + "epoch": 0.15279824720411717, + "grad_norm": 2.313511848449707, + "learning_rate": 9.977003877308293e-06, + "loss": 3.3184, + "step": 155100 + }, + { + "epoch": 0.15284750518193926, + "grad_norm": 2.3873114585876465, + "learning_rate": 9.976989050003656e-06, + "loss": 3.2548, + "step": 155150 + }, + { + "epoch": 0.15289676315976136, + "grad_norm": 2.8141448497772217, + "learning_rate": 9.976974217931454e-06, + "loss": 3.2496, + "step": 155200 + }, + { + "epoch": 0.15294602113758343, + "grad_norm": 2.4146671295166016, + "learning_rate": 9.9769593810917e-06, + "loss": 3.2429, + "step": 155250 + }, + { + "epoch": 0.15299527911540553, + "grad_norm": 2.491251230239868, + "learning_rate": 9.976944539484408e-06, + "loss": 3.3404, + "step": 155300 + }, + { + "epoch": 0.15304453709322763, + "grad_norm": 2.4193804264068604, + "learning_rate": 9.976929693109594e-06, + "loss": 3.3084, + "step": 155350 + }, + { + "epoch": 0.1530937950710497, + "grad_norm": 2.3924407958984375, + "learning_rate": 9.97691484196727e-06, + "loss": 3.2557, + "step": 155400 + }, + { + "epoch": 0.1531430530488718, + "grad_norm": 2.333134174346924, + "learning_rate": 9.97689998605745e-06, + "loss": 3.2634, + "step": 155450 + }, + { + "epoch": 0.1531923110266939, + "grad_norm": 2.806565284729004, + "learning_rate": 9.976885125380151e-06, + "loss": 3.2315, + "step": 155500 + }, + { + "epoch": 0.15324156900451597, + "grad_norm": 2.208111047744751, + "learning_rate": 9.976870259935388e-06, + "loss": 3.2211, + "step": 155550 + }, + { + "epoch": 0.15329082698233806, + "grad_norm": 2.738959312438965, + "learning_rate": 9.97685538972317e-06, + "loss": 3.2976, + "step": 155600 + }, + { + "epoch": 0.15334008496016016, + "grad_norm": 2.4517202377319336, + "learning_rate": 9.976840514743515e-06, + "loss": 3.2367, + "step": 155650 + }, + { + "epoch": 0.15338934293798223, + 
"grad_norm": 2.759275436401367, + "learning_rate": 9.976825634996437e-06, + "loss": 3.3007, + "step": 155700 + }, + { + "epoch": 0.15343860091580433, + "grad_norm": 2.303468942642212, + "learning_rate": 9.976810750481948e-06, + "loss": 3.3229, + "step": 155750 + }, + { + "epoch": 0.1534878588936264, + "grad_norm": 2.4701743125915527, + "learning_rate": 9.976795861200067e-06, + "loss": 3.2357, + "step": 155800 + }, + { + "epoch": 0.1535371168714485, + "grad_norm": 2.5161702632904053, + "learning_rate": 9.976780967150802e-06, + "loss": 3.2847, + "step": 155850 + }, + { + "epoch": 0.1535863748492706, + "grad_norm": 2.3651835918426514, + "learning_rate": 9.976766068334174e-06, + "loss": 3.2832, + "step": 155900 + }, + { + "epoch": 0.15363563282709267, + "grad_norm": 2.305102825164795, + "learning_rate": 9.976751164750192e-06, + "loss": 3.2414, + "step": 155950 + }, + { + "epoch": 0.15368489080491476, + "grad_norm": 2.340178966522217, + "learning_rate": 9.976736256398871e-06, + "loss": 3.2824, + "step": 156000 + }, + { + "epoch": 0.15373414878273686, + "grad_norm": 2.452003002166748, + "learning_rate": 9.976721343280229e-06, + "loss": 3.2819, + "step": 156050 + }, + { + "epoch": 0.15378340676055893, + "grad_norm": 2.4133219718933105, + "learning_rate": 9.976706425394277e-06, + "loss": 3.2682, + "step": 156100 + }, + { + "epoch": 0.15383266473838103, + "grad_norm": 2.3131155967712402, + "learning_rate": 9.976691502741026e-06, + "loss": 3.2495, + "step": 156150 + }, + { + "epoch": 0.15388192271620313, + "grad_norm": 2.45467209815979, + "learning_rate": 9.976676575320499e-06, + "loss": 3.2347, + "step": 156200 + }, + { + "epoch": 0.1539311806940252, + "grad_norm": 2.390944719314575, + "learning_rate": 9.976661643132703e-06, + "loss": 3.2589, + "step": 156250 + }, + { + "epoch": 0.1539804386718473, + "grad_norm": 2.56076717376709, + "learning_rate": 9.976646706177658e-06, + "loss": 3.2709, + "step": 156300 + }, + { + "epoch": 0.15402969664966937, + "grad_norm": 
2.5402331352233887, + "learning_rate": 9.976631764455372e-06, + "loss": 3.2341, + "step": 156350 + }, + { + "epoch": 0.15407895462749147, + "grad_norm": 2.519803762435913, + "learning_rate": 9.976616817965862e-06, + "loss": 3.2426, + "step": 156400 + }, + { + "epoch": 0.15412821260531356, + "grad_norm": 2.48053240776062, + "learning_rate": 9.976601866709146e-06, + "loss": 3.2325, + "step": 156450 + }, + { + "epoch": 0.15417747058313563, + "grad_norm": 2.451362133026123, + "learning_rate": 9.976586910685232e-06, + "loss": 3.1817, + "step": 156500 + }, + { + "epoch": 0.15422672856095773, + "grad_norm": 2.366558074951172, + "learning_rate": 9.976571949894139e-06, + "loss": 3.2684, + "step": 156550 + }, + { + "epoch": 0.15427598653877983, + "grad_norm": 2.3876562118530273, + "learning_rate": 9.97655698433588e-06, + "loss": 3.2911, + "step": 156600 + }, + { + "epoch": 0.1543252445166019, + "grad_norm": 2.2565135955810547, + "learning_rate": 9.976542014010469e-06, + "loss": 3.2218, + "step": 156650 + }, + { + "epoch": 0.154374502494424, + "grad_norm": 2.590651512145996, + "learning_rate": 9.97652703891792e-06, + "loss": 3.2119, + "step": 156700 + }, + { + "epoch": 0.1544237604722461, + "grad_norm": 2.3665263652801514, + "learning_rate": 9.976512059058247e-06, + "loss": 3.3024, + "step": 156750 + }, + { + "epoch": 0.15447301845006817, + "grad_norm": 2.303668260574341, + "learning_rate": 9.976497074431465e-06, + "loss": 3.254, + "step": 156800 + }, + { + "epoch": 0.15452227642789026, + "grad_norm": 2.3221631050109863, + "learning_rate": 9.97648208503759e-06, + "loss": 3.3035, + "step": 156850 + }, + { + "epoch": 0.15457153440571236, + "grad_norm": 2.301067590713501, + "learning_rate": 9.976467090876637e-06, + "loss": 3.2774, + "step": 156900 + }, + { + "epoch": 0.15462079238353443, + "grad_norm": 2.55307674407959, + "learning_rate": 9.976452091948613e-06, + "loss": 3.3182, + "step": 156950 + }, + { + "epoch": 0.15467005036135653, + "grad_norm": 2.391819477081299, + 
"learning_rate": 9.976437088253541e-06, + "loss": 3.289, + "step": 157000 + }, + { + "epoch": 0.1547193083391786, + "grad_norm": 2.4568984508514404, + "learning_rate": 9.976422079791432e-06, + "loss": 3.2943, + "step": 157050 + }, + { + "epoch": 0.1547685663170007, + "grad_norm": 2.361287832260132, + "learning_rate": 9.9764070665623e-06, + "loss": 3.2702, + "step": 157100 + }, + { + "epoch": 0.1548178242948228, + "grad_norm": 2.4821529388427734, + "learning_rate": 9.976392048566158e-06, + "loss": 3.2573, + "step": 157150 + }, + { + "epoch": 0.15486708227264487, + "grad_norm": 2.383223533630371, + "learning_rate": 9.976377025803023e-06, + "loss": 3.2692, + "step": 157200 + }, + { + "epoch": 0.15491634025046697, + "grad_norm": 2.370103597640991, + "learning_rate": 9.97636199827291e-06, + "loss": 3.2354, + "step": 157250 + }, + { + "epoch": 0.15496559822828906, + "grad_norm": 2.358902931213379, + "learning_rate": 9.97634696597583e-06, + "loss": 3.2772, + "step": 157300 + }, + { + "epoch": 0.15501485620611113, + "grad_norm": 2.5299248695373535, + "learning_rate": 9.9763319289118e-06, + "loss": 3.2773, + "step": 157350 + }, + { + "epoch": 0.15506411418393323, + "grad_norm": 2.514885187149048, + "learning_rate": 9.976316887080833e-06, + "loss": 3.2201, + "step": 157400 + }, + { + "epoch": 0.15511337216175533, + "grad_norm": 2.425171136856079, + "learning_rate": 9.976301840482946e-06, + "loss": 3.2306, + "step": 157450 + }, + { + "epoch": 0.1551626301395774, + "grad_norm": 2.463677167892456, + "learning_rate": 9.97628678911815e-06, + "loss": 3.2793, + "step": 157500 + }, + { + "epoch": 0.1552118881173995, + "grad_norm": 2.603466033935547, + "learning_rate": 9.976271732986461e-06, + "loss": 3.2586, + "step": 157550 + }, + { + "epoch": 0.15526114609522157, + "grad_norm": 2.4229319095611572, + "learning_rate": 9.976256672087893e-06, + "loss": 3.3244, + "step": 157600 + }, + { + "epoch": 0.15531040407304367, + "grad_norm": 2.3020153045654297, + "learning_rate": 
9.976241606422463e-06, + "loss": 3.3079, + "step": 157650 + }, + { + "epoch": 0.15535966205086577, + "grad_norm": 2.6308183670043945, + "learning_rate": 9.976226535990181e-06, + "loss": 3.2728, + "step": 157700 + }, + { + "epoch": 0.15540892002868784, + "grad_norm": 2.3429410457611084, + "learning_rate": 9.976211460791063e-06, + "loss": 3.2405, + "step": 157750 + }, + { + "epoch": 0.15545817800650993, + "grad_norm": 2.586076021194458, + "learning_rate": 9.976196380825126e-06, + "loss": 3.2596, + "step": 157800 + }, + { + "epoch": 0.15550743598433203, + "grad_norm": 2.5446383953094482, + "learning_rate": 9.976181296092382e-06, + "loss": 3.2657, + "step": 157850 + }, + { + "epoch": 0.1555566939621541, + "grad_norm": 2.7898037433624268, + "learning_rate": 9.976166206592845e-06, + "loss": 3.2571, + "step": 157900 + }, + { + "epoch": 0.1556059519399762, + "grad_norm": 2.2260541915893555, + "learning_rate": 9.976151112326532e-06, + "loss": 3.3077, + "step": 157950 + }, + { + "epoch": 0.1556552099177983, + "grad_norm": 2.329389810562134, + "learning_rate": 9.976136013293455e-06, + "loss": 3.32, + "step": 158000 + }, + { + "epoch": 0.15570446789562037, + "grad_norm": 2.417072296142578, + "learning_rate": 9.97612090949363e-06, + "loss": 3.2907, + "step": 158050 + }, + { + "epoch": 0.15575372587344247, + "grad_norm": 2.41607666015625, + "learning_rate": 9.97610580092707e-06, + "loss": 3.2577, + "step": 158100 + }, + { + "epoch": 0.15580298385126454, + "grad_norm": 2.471095085144043, + "learning_rate": 9.976090687593791e-06, + "loss": 3.2543, + "step": 158150 + }, + { + "epoch": 0.15585224182908664, + "grad_norm": 2.5231313705444336, + "learning_rate": 9.976075569493806e-06, + "loss": 3.2247, + "step": 158200 + }, + { + "epoch": 0.15590149980690873, + "grad_norm": 2.5799996852874756, + "learning_rate": 9.976060446627132e-06, + "loss": 3.254, + "step": 158250 + }, + { + "epoch": 0.1559507577847308, + "grad_norm": 2.439098834991455, + "learning_rate": 9.976045318993779e-06, + 
"loss": 3.247, + "step": 158300 + }, + { + "epoch": 0.1560000157625529, + "grad_norm": 2.2421817779541016, + "learning_rate": 9.976030186593767e-06, + "loss": 3.2679, + "step": 158350 + }, + { + "epoch": 0.156049273740375, + "grad_norm": 2.361891269683838, + "learning_rate": 9.976015049427105e-06, + "loss": 3.2192, + "step": 158400 + }, + { + "epoch": 0.15609853171819707, + "grad_norm": 2.5467379093170166, + "learning_rate": 9.975999907493811e-06, + "loss": 3.2446, + "step": 158450 + }, + { + "epoch": 0.15614778969601917, + "grad_norm": 2.68713116645813, + "learning_rate": 9.975984760793899e-06, + "loss": 3.286, + "step": 158500 + }, + { + "epoch": 0.15619704767384127, + "grad_norm": 2.6045749187469482, + "learning_rate": 9.975969609327385e-06, + "loss": 3.2655, + "step": 158550 + }, + { + "epoch": 0.15624630565166334, + "grad_norm": 2.1771633625030518, + "learning_rate": 9.975954453094281e-06, + "loss": 3.2546, + "step": 158600 + }, + { + "epoch": 0.15629556362948543, + "grad_norm": 2.3881595134735107, + "learning_rate": 9.975939292094601e-06, + "loss": 3.193, + "step": 158650 + }, + { + "epoch": 0.15634482160730753, + "grad_norm": 2.473228931427002, + "learning_rate": 9.975924126328362e-06, + "loss": 3.2561, + "step": 158700 + }, + { + "epoch": 0.1563940795851296, + "grad_norm": 2.3984456062316895, + "learning_rate": 9.975908955795577e-06, + "loss": 3.3194, + "step": 158750 + }, + { + "epoch": 0.1564433375629517, + "grad_norm": 2.685887336730957, + "learning_rate": 9.97589378049626e-06, + "loss": 3.3194, + "step": 158800 + }, + { + "epoch": 0.15649259554077377, + "grad_norm": 2.4245779514312744, + "learning_rate": 9.975878600430428e-06, + "loss": 3.2909, + "step": 158850 + }, + { + "epoch": 0.15654185351859587, + "grad_norm": 2.189951181411743, + "learning_rate": 9.975863415598092e-06, + "loss": 3.3106, + "step": 158900 + }, + { + "epoch": 0.15659111149641797, + "grad_norm": 2.4818761348724365, + "learning_rate": 9.97584822599927e-06, + "loss": 3.2693, + "step": 
158950 + }, + { + "epoch": 0.15664036947424004, + "grad_norm": 2.373173236846924, + "learning_rate": 9.975833031633974e-06, + "loss": 3.2634, + "step": 159000 + }, + { + "epoch": 0.15668962745206214, + "grad_norm": 2.3683037757873535, + "learning_rate": 9.97581783250222e-06, + "loss": 3.3133, + "step": 159050 + }, + { + "epoch": 0.15673888542988423, + "grad_norm": 2.558323621749878, + "learning_rate": 9.975802628604023e-06, + "loss": 3.2526, + "step": 159100 + }, + { + "epoch": 0.1567881434077063, + "grad_norm": 2.6029269695281982, + "learning_rate": 9.975787419939396e-06, + "loss": 3.3302, + "step": 159150 + }, + { + "epoch": 0.1568374013855284, + "grad_norm": 2.276942253112793, + "learning_rate": 9.975772206508354e-06, + "loss": 3.2402, + "step": 159200 + }, + { + "epoch": 0.1568866593633505, + "grad_norm": 2.692389726638794, + "learning_rate": 9.975756988310912e-06, + "loss": 3.2812, + "step": 159250 + }, + { + "epoch": 0.15693591734117257, + "grad_norm": 2.3732175827026367, + "learning_rate": 9.975741765347086e-06, + "loss": 3.2499, + "step": 159300 + }, + { + "epoch": 0.15698517531899467, + "grad_norm": 2.4103550910949707, + "learning_rate": 9.975726537616888e-06, + "loss": 3.3174, + "step": 159350 + }, + { + "epoch": 0.15703443329681674, + "grad_norm": 2.1734941005706787, + "learning_rate": 9.975711305120333e-06, + "loss": 3.2564, + "step": 159400 + }, + { + "epoch": 0.15708369127463884, + "grad_norm": 2.4341249465942383, + "learning_rate": 9.975696067857436e-06, + "loss": 3.3383, + "step": 159450 + }, + { + "epoch": 0.15713294925246094, + "grad_norm": 2.3326504230499268, + "learning_rate": 9.975680825828213e-06, + "loss": 3.2316, + "step": 159500 + }, + { + "epoch": 0.157182207230283, + "grad_norm": 2.4679226875305176, + "learning_rate": 9.975665579032676e-06, + "loss": 3.3058, + "step": 159550 + }, + { + "epoch": 0.1572314652081051, + "grad_norm": 2.559429407119751, + "learning_rate": 9.975650327470842e-06, + "loss": 3.2462, + "step": 159600 + }, + { + 
"epoch": 0.1572807231859272, + "grad_norm": 2.2775092124938965, + "learning_rate": 9.975635071142725e-06, + "loss": 3.279, + "step": 159650 + }, + { + "epoch": 0.15732998116374927, + "grad_norm": 2.461078643798828, + "learning_rate": 9.975619810048338e-06, + "loss": 3.2095, + "step": 159700 + }, + { + "epoch": 0.15737923914157137, + "grad_norm": 2.501760721206665, + "learning_rate": 9.975604544187697e-06, + "loss": 3.2993, + "step": 159750 + }, + { + "epoch": 0.15742849711939347, + "grad_norm": 2.6077377796173096, + "learning_rate": 9.975589273560816e-06, + "loss": 3.2149, + "step": 159800 + }, + { + "epoch": 0.15747775509721554, + "grad_norm": 2.513956308364868, + "learning_rate": 9.97557399816771e-06, + "loss": 3.2562, + "step": 159850 + }, + { + "epoch": 0.15752701307503764, + "grad_norm": 2.3805456161499023, + "learning_rate": 9.975558718008396e-06, + "loss": 3.2899, + "step": 159900 + }, + { + "epoch": 0.15757627105285973, + "grad_norm": 2.356175184249878, + "learning_rate": 9.975543433082886e-06, + "loss": 3.2726, + "step": 159950 + }, + { + "epoch": 0.1576255290306818, + "grad_norm": 2.30304217338562, + "learning_rate": 9.975528143391193e-06, + "loss": 3.2843, + "step": 160000 + }, + { + "epoch": 0.1576747870085039, + "grad_norm": 2.495638847351074, + "learning_rate": 9.975512848933336e-06, + "loss": 3.3027, + "step": 160050 + }, + { + "epoch": 0.15772404498632597, + "grad_norm": 2.4209985733032227, + "learning_rate": 9.975497549709326e-06, + "loss": 3.2801, + "step": 160100 + }, + { + "epoch": 0.15777330296414807, + "grad_norm": 2.4127249717712402, + "learning_rate": 9.97548224571918e-06, + "loss": 3.2352, + "step": 160150 + }, + { + "epoch": 0.15782256094197017, + "grad_norm": 2.3141283988952637, + "learning_rate": 9.97546693696291e-06, + "loss": 3.2455, + "step": 160200 + }, + { + "epoch": 0.15787181891979224, + "grad_norm": 2.4455366134643555, + "learning_rate": 9.975451623440533e-06, + "loss": 3.2768, + "step": 160250 + }, + { + "epoch": 
0.15792107689761434, + "grad_norm": 2.34306001663208, + "learning_rate": 9.975436305152062e-06, + "loss": 3.2844, + "step": 160300 + }, + { + "epoch": 0.15797033487543644, + "grad_norm": 2.4881815910339355, + "learning_rate": 9.975420982097513e-06, + "loss": 3.2288, + "step": 160350 + }, + { + "epoch": 0.1580195928532585, + "grad_norm": 2.367155075073242, + "learning_rate": 9.975405654276902e-06, + "loss": 3.2429, + "step": 160400 + }, + { + "epoch": 0.1580688508310806, + "grad_norm": 2.293860912322998, + "learning_rate": 9.975390321690239e-06, + "loss": 3.2559, + "step": 160450 + }, + { + "epoch": 0.1581181088089027, + "grad_norm": 2.5184576511383057, + "learning_rate": 9.975374984337545e-06, + "loss": 3.1892, + "step": 160500 + }, + { + "epoch": 0.15816736678672477, + "grad_norm": 2.3683156967163086, + "learning_rate": 9.97535964221883e-06, + "loss": 3.268, + "step": 160550 + }, + { + "epoch": 0.15821662476454687, + "grad_norm": 2.399445056915283, + "learning_rate": 9.975344295334108e-06, + "loss": 3.261, + "step": 160600 + }, + { + "epoch": 0.15826588274236894, + "grad_norm": 2.7420685291290283, + "learning_rate": 9.975328943683399e-06, + "loss": 3.2957, + "step": 160650 + }, + { + "epoch": 0.15831514072019104, + "grad_norm": 2.550940990447998, + "learning_rate": 9.975313587266712e-06, + "loss": 3.2946, + "step": 160700 + }, + { + "epoch": 0.15836439869801314, + "grad_norm": 2.3411035537719727, + "learning_rate": 9.975298226084066e-06, + "loss": 3.2642, + "step": 160750 + }, + { + "epoch": 0.1584136566758352, + "grad_norm": 2.8769683837890625, + "learning_rate": 9.975282860135473e-06, + "loss": 3.2598, + "step": 160800 + }, + { + "epoch": 0.1584629146536573, + "grad_norm": 2.3809103965759277, + "learning_rate": 9.975267489420948e-06, + "loss": 3.2752, + "step": 160850 + }, + { + "epoch": 0.1585121726314794, + "grad_norm": 2.5599234104156494, + "learning_rate": 9.975252113940508e-06, + "loss": 3.2836, + "step": 160900 + }, + { + "epoch": 0.15856143060930147, + 
"grad_norm": 2.575878858566284, + "learning_rate": 9.975236733694165e-06, + "loss": 3.2309, + "step": 160950 + }, + { + "epoch": 0.15861068858712357, + "grad_norm": 2.320521593093872, + "learning_rate": 9.975221348681936e-06, + "loss": 3.2435, + "step": 161000 + }, + { + "epoch": 0.15865994656494567, + "grad_norm": 2.399873733520508, + "learning_rate": 9.975205958903834e-06, + "loss": 3.2466, + "step": 161050 + }, + { + "epoch": 0.15870920454276774, + "grad_norm": 2.5881803035736084, + "learning_rate": 9.975190564359874e-06, + "loss": 3.2574, + "step": 161100 + }, + { + "epoch": 0.15875846252058984, + "grad_norm": 2.3451972007751465, + "learning_rate": 9.97517516505007e-06, + "loss": 3.1979, + "step": 161150 + }, + { + "epoch": 0.15880772049841194, + "grad_norm": 2.441636323928833, + "learning_rate": 9.975159760974437e-06, + "loss": 3.3197, + "step": 161200 + }, + { + "epoch": 0.158856978476234, + "grad_norm": 2.2847506999969482, + "learning_rate": 9.975144352132993e-06, + "loss": 3.2379, + "step": 161250 + }, + { + "epoch": 0.1589062364540561, + "grad_norm": 2.575822353363037, + "learning_rate": 9.97512893852575e-06, + "loss": 3.2633, + "step": 161300 + }, + { + "epoch": 0.15895549443187817, + "grad_norm": 2.2760426998138428, + "learning_rate": 9.975113520152722e-06, + "loss": 3.329, + "step": 161350 + }, + { + "epoch": 0.15900475240970027, + "grad_norm": 2.451220989227295, + "learning_rate": 9.975098097013926e-06, + "loss": 3.1977, + "step": 161400 + }, + { + "epoch": 0.15905401038752237, + "grad_norm": 3.0568602085113525, + "learning_rate": 9.975082669109375e-06, + "loss": 3.2664, + "step": 161450 + }, + { + "epoch": 0.15910326836534444, + "grad_norm": 2.5147864818573, + "learning_rate": 9.975067236439085e-06, + "loss": 3.2469, + "step": 161500 + }, + { + "epoch": 0.15915252634316654, + "grad_norm": 2.6028192043304443, + "learning_rate": 9.975051799003069e-06, + "loss": 3.2653, + "step": 161550 + }, + { + "epoch": 0.15920178432098864, + "grad_norm": 
2.234363079071045, + "learning_rate": 9.975036356801344e-06, + "loss": 3.2553, + "step": 161600 + }, + { + "epoch": 0.1592510422988107, + "grad_norm": 2.45231556892395, + "learning_rate": 9.975020909833923e-06, + "loss": 3.2516, + "step": 161650 + }, + { + "epoch": 0.1593003002766328, + "grad_norm": 2.5472731590270996, + "learning_rate": 9.975005458100821e-06, + "loss": 3.2428, + "step": 161700 + }, + { + "epoch": 0.1593495582544549, + "grad_norm": 2.3638665676116943, + "learning_rate": 9.974990001602054e-06, + "loss": 3.2315, + "step": 161750 + }, + { + "epoch": 0.15939881623227697, + "grad_norm": 2.5635008811950684, + "learning_rate": 9.974974540337636e-06, + "loss": 3.2552, + "step": 161800 + }, + { + "epoch": 0.15944807421009907, + "grad_norm": 2.347587823867798, + "learning_rate": 9.974959074307583e-06, + "loss": 3.2654, + "step": 161850 + }, + { + "epoch": 0.15949733218792114, + "grad_norm": 2.297929048538208, + "learning_rate": 9.97494360351191e-06, + "loss": 3.2206, + "step": 161900 + }, + { + "epoch": 0.15954659016574324, + "grad_norm": 2.5409791469573975, + "learning_rate": 9.974928127950627e-06, + "loss": 3.2447, + "step": 161950 + }, + { + "epoch": 0.15959584814356534, + "grad_norm": 2.4456892013549805, + "learning_rate": 9.974912647623754e-06, + "loss": 3.2503, + "step": 162000 + }, + { + "epoch": 0.1596451061213874, + "grad_norm": 2.463228464126587, + "learning_rate": 9.974897162531304e-06, + "loss": 3.3117, + "step": 162050 + }, + { + "epoch": 0.1596943640992095, + "grad_norm": 2.5095057487487793, + "learning_rate": 9.974881672673293e-06, + "loss": 3.2543, + "step": 162100 + }, + { + "epoch": 0.1597436220770316, + "grad_norm": 2.442728042602539, + "learning_rate": 9.974866178049734e-06, + "loss": 3.218, + "step": 162150 + }, + { + "epoch": 0.15979288005485368, + "grad_norm": 2.33260178565979, + "learning_rate": 9.974850678660644e-06, + "loss": 3.2395, + "step": 162200 + }, + { + "epoch": 0.15984213803267577, + "grad_norm": 2.700822591781616, + 
"learning_rate": 9.974835174506035e-06, + "loss": 3.2238, + "step": 162250 + }, + { + "epoch": 0.15989139601049787, + "grad_norm": 2.4309475421905518, + "learning_rate": 9.974819665585923e-06, + "loss": 3.2515, + "step": 162300 + }, + { + "epoch": 0.15994065398831994, + "grad_norm": 2.7000277042388916, + "learning_rate": 9.974804151900325e-06, + "loss": 3.2912, + "step": 162350 + }, + { + "epoch": 0.15998991196614204, + "grad_norm": 2.5332977771759033, + "learning_rate": 9.974788633449255e-06, + "loss": 3.2354, + "step": 162400 + }, + { + "epoch": 0.16003916994396414, + "grad_norm": 2.52994966506958, + "learning_rate": 9.974773110232725e-06, + "loss": 3.2703, + "step": 162450 + }, + { + "epoch": 0.1600884279217862, + "grad_norm": 2.4180970191955566, + "learning_rate": 9.974757582250753e-06, + "loss": 3.258, + "step": 162500 + }, + { + "epoch": 0.1601376858996083, + "grad_norm": 2.184600830078125, + "learning_rate": 9.974742049503353e-06, + "loss": 3.2045, + "step": 162550 + }, + { + "epoch": 0.16018694387743038, + "grad_norm": 2.323749303817749, + "learning_rate": 9.974726511990538e-06, + "loss": 3.2194, + "step": 162600 + }, + { + "epoch": 0.16023620185525247, + "grad_norm": 2.2821106910705566, + "learning_rate": 9.974710969712326e-06, + "loss": 3.2347, + "step": 162650 + }, + { + "epoch": 0.16028545983307457, + "grad_norm": 2.6876988410949707, + "learning_rate": 9.97469542266873e-06, + "loss": 3.1942, + "step": 162700 + }, + { + "epoch": 0.16033471781089664, + "grad_norm": 2.5631306171417236, + "learning_rate": 9.974679870859764e-06, + "loss": 3.2672, + "step": 162750 + }, + { + "epoch": 0.16038397578871874, + "grad_norm": 2.278031826019287, + "learning_rate": 9.974664314285447e-06, + "loss": 3.2874, + "step": 162800 + }, + { + "epoch": 0.16043323376654084, + "grad_norm": 2.370387554168701, + "learning_rate": 9.97464875294579e-06, + "loss": 3.2074, + "step": 162850 + }, + { + "epoch": 0.1604824917443629, + "grad_norm": 2.5671796798706055, + "learning_rate": 
9.974633186840809e-06, + "loss": 3.2763, + "step": 162900 + }, + { + "epoch": 0.160531749722185, + "grad_norm": 2.334472894668579, + "learning_rate": 9.974617615970519e-06, + "loss": 3.2144, + "step": 162950 + }, + { + "epoch": 0.1605810077000071, + "grad_norm": 2.337481737136841, + "learning_rate": 9.974602040334935e-06, + "loss": 3.1672, + "step": 163000 + }, + { + "epoch": 0.16063026567782918, + "grad_norm": 2.287097454071045, + "learning_rate": 9.97458645993407e-06, + "loss": 3.2364, + "step": 163050 + }, + { + "epoch": 0.16067952365565127, + "grad_norm": 2.4239590167999268, + "learning_rate": 9.974570874767942e-06, + "loss": 3.3133, + "step": 163100 + }, + { + "epoch": 0.16072878163347334, + "grad_norm": 2.4097015857696533, + "learning_rate": 9.974555284836566e-06, + "loss": 3.2568, + "step": 163150 + }, + { + "epoch": 0.16077803961129544, + "grad_norm": 2.3915295600891113, + "learning_rate": 9.974539690139954e-06, + "loss": 3.2289, + "step": 163200 + }, + { + "epoch": 0.16082729758911754, + "grad_norm": 2.7900233268737793, + "learning_rate": 9.974524090678123e-06, + "loss": 3.2097, + "step": 163250 + }, + { + "epoch": 0.1608765555669396, + "grad_norm": 2.655670404434204, + "learning_rate": 9.974508486451087e-06, + "loss": 3.2298, + "step": 163300 + }, + { + "epoch": 0.1609258135447617, + "grad_norm": 2.619340419769287, + "learning_rate": 9.974492877458863e-06, + "loss": 3.2728, + "step": 163350 + }, + { + "epoch": 0.1609750715225838, + "grad_norm": 2.752850294113159, + "learning_rate": 9.974477263701461e-06, + "loss": 3.2741, + "step": 163400 + }, + { + "epoch": 0.16102432950040588, + "grad_norm": 2.250701427459717, + "learning_rate": 9.974461645178903e-06, + "loss": 3.287, + "step": 163450 + }, + { + "epoch": 0.16107358747822798, + "grad_norm": 2.434692859649658, + "learning_rate": 9.974446021891197e-06, + "loss": 3.2272, + "step": 163500 + }, + { + "epoch": 0.16112284545605007, + "grad_norm": 2.444840908050537, + "learning_rate": 9.974430393838363e-06, + 
"loss": 3.2033, + "step": 163550 + }, + { + "epoch": 0.16117210343387214, + "grad_norm": 2.3131494522094727, + "learning_rate": 9.974414761020416e-06, + "loss": 3.225, + "step": 163600 + }, + { + "epoch": 0.16122136141169424, + "grad_norm": 2.7039568424224854, + "learning_rate": 9.974399123437367e-06, + "loss": 3.2521, + "step": 163650 + }, + { + "epoch": 0.16127061938951634, + "grad_norm": 2.6730096340179443, + "learning_rate": 9.974383481089234e-06, + "loss": 3.2567, + "step": 163700 + }, + { + "epoch": 0.1613198773673384, + "grad_norm": 2.6406149864196777, + "learning_rate": 9.974367833976031e-06, + "loss": 3.2566, + "step": 163750 + }, + { + "epoch": 0.1613691353451605, + "grad_norm": 2.492889881134033, + "learning_rate": 9.974352182097771e-06, + "loss": 3.2304, + "step": 163800 + }, + { + "epoch": 0.16141839332298258, + "grad_norm": 2.5191712379455566, + "learning_rate": 9.974336525454474e-06, + "loss": 3.235, + "step": 163850 + }, + { + "epoch": 0.16146765130080468, + "grad_norm": 2.555583953857422, + "learning_rate": 9.974320864046151e-06, + "loss": 3.188, + "step": 163900 + }, + { + "epoch": 0.16151690927862677, + "grad_norm": 2.5691208839416504, + "learning_rate": 9.974305197872818e-06, + "loss": 3.2567, + "step": 163950 + }, + { + "epoch": 0.16156616725644884, + "grad_norm": 2.5336766242980957, + "learning_rate": 9.97428952693449e-06, + "loss": 3.1847, + "step": 164000 + }, + { + "epoch": 0.16161542523427094, + "grad_norm": 2.3060240745544434, + "learning_rate": 9.974273851231183e-06, + "loss": 3.1885, + "step": 164050 + }, + { + "epoch": 0.16166468321209304, + "grad_norm": 2.2539544105529785, + "learning_rate": 9.974258170762912e-06, + "loss": 3.2804, + "step": 164100 + }, + { + "epoch": 0.1617139411899151, + "grad_norm": 2.66815185546875, + "learning_rate": 9.974242485529689e-06, + "loss": 3.1809, + "step": 164150 + }, + { + "epoch": 0.1617631991677372, + "grad_norm": 2.692131757736206, + "learning_rate": 9.974226795531533e-06, + "loss": 3.2172, + 
"step": 164200 + }, + { + "epoch": 0.1618124571455593, + "grad_norm": 2.503390073776245, + "learning_rate": 9.974211100768456e-06, + "loss": 3.2218, + "step": 164250 + }, + { + "epoch": 0.16186171512338138, + "grad_norm": 2.422560453414917, + "learning_rate": 9.974195401240474e-06, + "loss": 3.1958, + "step": 164300 + }, + { + "epoch": 0.16191097310120348, + "grad_norm": 2.437238931655884, + "learning_rate": 9.974179696947604e-06, + "loss": 3.298, + "step": 164350 + }, + { + "epoch": 0.16196023107902555, + "grad_norm": 2.9633736610412598, + "learning_rate": 9.974163987889858e-06, + "loss": 3.2335, + "step": 164400 + }, + { + "epoch": 0.16200948905684764, + "grad_norm": 2.436521530151367, + "learning_rate": 9.974148274067252e-06, + "loss": 3.2471, + "step": 164450 + }, + { + "epoch": 0.16205874703466974, + "grad_norm": 2.5256268978118896, + "learning_rate": 9.974132555479801e-06, + "loss": 3.2861, + "step": 164500 + }, + { + "epoch": 0.1621080050124918, + "grad_norm": 2.4144959449768066, + "learning_rate": 9.974116832127523e-06, + "loss": 3.2956, + "step": 164550 + }, + { + "epoch": 0.1621572629903139, + "grad_norm": 2.510390043258667, + "learning_rate": 9.97410110401043e-06, + "loss": 3.3113, + "step": 164600 + }, + { + "epoch": 0.162206520968136, + "grad_norm": 2.3978402614593506, + "learning_rate": 9.974085371128535e-06, + "loss": 3.2662, + "step": 164650 + }, + { + "epoch": 0.16225577894595808, + "grad_norm": 2.358368396759033, + "learning_rate": 9.974069633481859e-06, + "loss": 3.2522, + "step": 164700 + }, + { + "epoch": 0.16230503692378018, + "grad_norm": 2.4287681579589844, + "learning_rate": 9.974053891070412e-06, + "loss": 3.2661, + "step": 164750 + }, + { + "epoch": 0.16235429490160228, + "grad_norm": 2.3747003078460693, + "learning_rate": 9.974038143894212e-06, + "loss": 3.3153, + "step": 164800 + }, + { + "epoch": 0.16240355287942435, + "grad_norm": 2.2973694801330566, + "learning_rate": 9.974022391953271e-06, + "loss": 3.217, + "step": 164850 + }, + { 
+ "epoch": 0.16245281085724644, + "grad_norm": 2.376401901245117, + "learning_rate": 9.974006635247608e-06, + "loss": 3.2057, + "step": 164900 + }, + { + "epoch": 0.1625020688350685, + "grad_norm": 2.4252877235412598, + "learning_rate": 9.973990873777236e-06, + "loss": 3.2184, + "step": 164950 + }, + { + "epoch": 0.1625513268128906, + "grad_norm": 2.3587210178375244, + "learning_rate": 9.973975107542169e-06, + "loss": 3.2529, + "step": 165000 + }, + { + "epoch": 0.1626005847907127, + "grad_norm": 2.354672431945801, + "learning_rate": 9.973959336542425e-06, + "loss": 3.2774, + "step": 165050 + }, + { + "epoch": 0.16264984276853478, + "grad_norm": 2.5408666133880615, + "learning_rate": 9.973943560778016e-06, + "loss": 3.2704, + "step": 165100 + }, + { + "epoch": 0.16269910074635688, + "grad_norm": 2.3218014240264893, + "learning_rate": 9.973927780248958e-06, + "loss": 3.1652, + "step": 165150 + }, + { + "epoch": 0.16274835872417898, + "grad_norm": 2.471491575241089, + "learning_rate": 9.973911994955267e-06, + "loss": 3.2896, + "step": 165200 + }, + { + "epoch": 0.16279761670200105, + "grad_norm": 2.339040756225586, + "learning_rate": 9.97389620489696e-06, + "loss": 3.2435, + "step": 165250 + }, + { + "epoch": 0.16284687467982314, + "grad_norm": 2.364563226699829, + "learning_rate": 9.973880410074049e-06, + "loss": 3.232, + "step": 165300 + }, + { + "epoch": 0.16289613265764524, + "grad_norm": 2.2789087295532227, + "learning_rate": 9.973864610486548e-06, + "loss": 3.2473, + "step": 165350 + }, + { + "epoch": 0.1629453906354673, + "grad_norm": 2.5302774906158447, + "learning_rate": 9.973848806134477e-06, + "loss": 3.2654, + "step": 165400 + }, + { + "epoch": 0.1629946486132894, + "grad_norm": 2.4698848724365234, + "learning_rate": 9.973832997017847e-06, + "loss": 3.215, + "step": 165450 + }, + { + "epoch": 0.1630439065911115, + "grad_norm": 2.4532675743103027, + "learning_rate": 9.973817183136674e-06, + "loss": 3.2503, + "step": 165500 + }, + { + "epoch": 
0.16309316456893358, + "grad_norm": 2.4959073066711426, + "learning_rate": 9.973801364490973e-06, + "loss": 3.2812, + "step": 165550 + }, + { + "epoch": 0.16314242254675568, + "grad_norm": 2.468905448913574, + "learning_rate": 9.973785541080762e-06, + "loss": 3.1966, + "step": 165600 + }, + { + "epoch": 0.16319168052457775, + "grad_norm": 2.474400758743286, + "learning_rate": 9.973769712906054e-06, + "loss": 3.3249, + "step": 165650 + }, + { + "epoch": 0.16324093850239985, + "grad_norm": 2.4777016639709473, + "learning_rate": 9.973753879966861e-06, + "loss": 3.2167, + "step": 165700 + }, + { + "epoch": 0.16329019648022194, + "grad_norm": 2.321035861968994, + "learning_rate": 9.973738042263203e-06, + "loss": 3.2395, + "step": 165750 + }, + { + "epoch": 0.16333945445804401, + "grad_norm": 2.303624391555786, + "learning_rate": 9.973722199795095e-06, + "loss": 3.2329, + "step": 165800 + }, + { + "epoch": 0.1633887124358661, + "grad_norm": 2.3610687255859375, + "learning_rate": 9.973706352562549e-06, + "loss": 3.1736, + "step": 165850 + }, + { + "epoch": 0.1634379704136882, + "grad_norm": 2.414618968963623, + "learning_rate": 9.973690500565582e-06, + "loss": 3.2749, + "step": 165900 + }, + { + "epoch": 0.16348722839151028, + "grad_norm": 2.5510413646698, + "learning_rate": 9.973674643804208e-06, + "loss": 3.1891, + "step": 165950 + }, + { + "epoch": 0.16353648636933238, + "grad_norm": 2.2854127883911133, + "learning_rate": 9.973658782278445e-06, + "loss": 3.2653, + "step": 166000 + }, + { + "epoch": 0.16358574434715448, + "grad_norm": 2.2825701236724854, + "learning_rate": 9.973642915988306e-06, + "loss": 3.2607, + "step": 166050 + }, + { + "epoch": 0.16363500232497655, + "grad_norm": 2.5323050022125244, + "learning_rate": 9.973627044933806e-06, + "loss": 3.2294, + "step": 166100 + }, + { + "epoch": 0.16368426030279865, + "grad_norm": 2.3168442249298096, + "learning_rate": 9.97361116911496e-06, + "loss": 3.2706, + "step": 166150 + }, + { + "epoch": 0.16373351828062072, 
+ "grad_norm": 2.4082138538360596, + "learning_rate": 9.973595288531785e-06, + "loss": 3.2644, + "step": 166200 + }, + { + "epoch": 0.1637827762584428, + "grad_norm": 2.4204163551330566, + "learning_rate": 9.973579403184294e-06, + "loss": 3.2273, + "step": 166250 + }, + { + "epoch": 0.1638320342362649, + "grad_norm": 2.5267386436462402, + "learning_rate": 9.973563513072506e-06, + "loss": 3.2145, + "step": 166300 + }, + { + "epoch": 0.16388129221408698, + "grad_norm": 2.5250329971313477, + "learning_rate": 9.973547618196431e-06, + "loss": 3.3321, + "step": 166350 + }, + { + "epoch": 0.16393055019190908, + "grad_norm": 2.3203165531158447, + "learning_rate": 9.97353171855609e-06, + "loss": 3.253, + "step": 166400 + }, + { + "epoch": 0.16397980816973118, + "grad_norm": 2.511007070541382, + "learning_rate": 9.97351581415149e-06, + "loss": 3.251, + "step": 166450 + }, + { + "epoch": 0.16402906614755325, + "grad_norm": 2.4214298725128174, + "learning_rate": 9.973499904982655e-06, + "loss": 3.2116, + "step": 166500 + }, + { + "epoch": 0.16407832412537535, + "grad_norm": 2.253725528717041, + "learning_rate": 9.973483991049596e-06, + "loss": 3.2826, + "step": 166550 + }, + { + "epoch": 0.16412758210319744, + "grad_norm": 2.6337883472442627, + "learning_rate": 9.973468072352327e-06, + "loss": 3.261, + "step": 166600 + }, + { + "epoch": 0.16417684008101951, + "grad_norm": 3.005277156829834, + "learning_rate": 9.973452148890868e-06, + "loss": 3.2677, + "step": 166650 + }, + { + "epoch": 0.1642260980588416, + "grad_norm": 2.4036245346069336, + "learning_rate": 9.97343622066523e-06, + "loss": 3.245, + "step": 166700 + }, + { + "epoch": 0.1642753560366637, + "grad_norm": 2.2724854946136475, + "learning_rate": 9.97342028767543e-06, + "loss": 3.2354, + "step": 166750 + }, + { + "epoch": 0.16432461401448578, + "grad_norm": 2.4631028175354004, + "learning_rate": 9.973404349921482e-06, + "loss": 3.271, + "step": 166800 + }, + { + "epoch": 0.16437387199230788, + "grad_norm": 
2.475844621658325, + "learning_rate": 9.973388407403402e-06, + "loss": 3.2628, + "step": 166850 + }, + { + "epoch": 0.16442312997012995, + "grad_norm": 2.3671717643737793, + "learning_rate": 9.973372460121206e-06, + "loss": 3.2566, + "step": 166900 + }, + { + "epoch": 0.16447238794795205, + "grad_norm": 2.570286989212036, + "learning_rate": 9.973356508074907e-06, + "loss": 3.2782, + "step": 166950 + }, + { + "epoch": 0.16452164592577415, + "grad_norm": 2.316213846206665, + "learning_rate": 9.973340551264525e-06, + "loss": 3.2138, + "step": 167000 + }, + { + "epoch": 0.16457090390359622, + "grad_norm": 2.358488082885742, + "learning_rate": 9.97332458969007e-06, + "loss": 3.2498, + "step": 167050 + }, + { + "epoch": 0.16462016188141831, + "grad_norm": 2.455321788787842, + "learning_rate": 9.97330862335156e-06, + "loss": 3.2599, + "step": 167100 + }, + { + "epoch": 0.1646694198592404, + "grad_norm": 2.295921564102173, + "learning_rate": 9.97329265224901e-06, + "loss": 3.2884, + "step": 167150 + }, + { + "epoch": 0.16471867783706248, + "grad_norm": 2.2220418453216553, + "learning_rate": 9.973276676382433e-06, + "loss": 3.2416, + "step": 167200 + }, + { + "epoch": 0.16476793581488458, + "grad_norm": 2.505247116088867, + "learning_rate": 9.97326069575185e-06, + "loss": 3.2174, + "step": 167250 + }, + { + "epoch": 0.16481719379270668, + "grad_norm": 2.403751850128174, + "learning_rate": 9.973244710357271e-06, + "loss": 3.2767, + "step": 167300 + }, + { + "epoch": 0.16486645177052875, + "grad_norm": 2.3857574462890625, + "learning_rate": 9.973228720198713e-06, + "loss": 3.2768, + "step": 167350 + }, + { + "epoch": 0.16491570974835085, + "grad_norm": 2.4741947650909424, + "learning_rate": 9.97321272527619e-06, + "loss": 3.2976, + "step": 167400 + }, + { + "epoch": 0.16496496772617292, + "grad_norm": 2.4477157592773438, + "learning_rate": 9.97319672558972e-06, + "loss": 3.2816, + "step": 167450 + }, + { + "epoch": 0.16501422570399502, + "grad_norm": 2.4478719234466553, + 
"learning_rate": 9.973180721139318e-06, + "loss": 3.203, + "step": 167500 + }, + { + "epoch": 0.1650634836818171, + "grad_norm": 2.3998541831970215, + "learning_rate": 9.973164711924996e-06, + "loss": 3.2041, + "step": 167550 + }, + { + "epoch": 0.16511274165963918, + "grad_norm": 2.541332960128784, + "learning_rate": 9.973148697946772e-06, + "loss": 3.1557, + "step": 167600 + }, + { + "epoch": 0.16516199963746128, + "grad_norm": 2.291151285171509, + "learning_rate": 9.973132679204662e-06, + "loss": 3.2109, + "step": 167650 + }, + { + "epoch": 0.16521125761528338, + "grad_norm": 2.589271068572998, + "learning_rate": 9.97311665569868e-06, + "loss": 3.2819, + "step": 167700 + }, + { + "epoch": 0.16526051559310545, + "grad_norm": 2.297220468521118, + "learning_rate": 9.973100627428842e-06, + "loss": 3.179, + "step": 167750 + }, + { + "epoch": 0.16530977357092755, + "grad_norm": 2.5937283039093018, + "learning_rate": 9.973084594395163e-06, + "loss": 3.2171, + "step": 167800 + }, + { + "epoch": 0.16535903154874965, + "grad_norm": 2.7462685108184814, + "learning_rate": 9.973068556597659e-06, + "loss": 3.1943, + "step": 167850 + }, + { + "epoch": 0.16540828952657172, + "grad_norm": 2.5025150775909424, + "learning_rate": 9.973052514036343e-06, + "loss": 3.1982, + "step": 167900 + }, + { + "epoch": 0.16545754750439381, + "grad_norm": 2.4069015979766846, + "learning_rate": 9.973036466711232e-06, + "loss": 3.3, + "step": 167950 + }, + { + "epoch": 0.1655068054822159, + "grad_norm": 2.2212586402893066, + "learning_rate": 9.973020414622344e-06, + "loss": 3.2683, + "step": 168000 + }, + { + "epoch": 0.16555606346003798, + "grad_norm": 2.361391067504883, + "learning_rate": 9.973004357769688e-06, + "loss": 3.2019, + "step": 168050 + }, + { + "epoch": 0.16560532143786008, + "grad_norm": 2.3977742195129395, + "learning_rate": 9.972988296153288e-06, + "loss": 3.221, + "step": 168100 + }, + { + "epoch": 0.16565457941568215, + "grad_norm": 2.4179306030273438, + "learning_rate": 
9.972972229773151e-06, + "loss": 3.228, + "step": 168150 + }, + { + "epoch": 0.16570383739350425, + "grad_norm": 2.4009392261505127, + "learning_rate": 9.972956158629297e-06, + "loss": 3.22, + "step": 168200 + }, + { + "epoch": 0.16575309537132635, + "grad_norm": 2.5648458003997803, + "learning_rate": 9.972940082721741e-06, + "loss": 3.2073, + "step": 168250 + }, + { + "epoch": 0.16580235334914842, + "grad_norm": 2.255211353302002, + "learning_rate": 9.972924002050497e-06, + "loss": 3.2738, + "step": 168300 + }, + { + "epoch": 0.16585161132697052, + "grad_norm": 2.3374712467193604, + "learning_rate": 9.972907916615582e-06, + "loss": 3.337, + "step": 168350 + }, + { + "epoch": 0.16590086930479261, + "grad_norm": 2.404176712036133, + "learning_rate": 9.97289182641701e-06, + "loss": 3.2574, + "step": 168400 + }, + { + "epoch": 0.16595012728261468, + "grad_norm": 2.205357789993286, + "learning_rate": 9.972875731454797e-06, + "loss": 3.3072, + "step": 168450 + }, + { + "epoch": 0.16599938526043678, + "grad_norm": 2.372446298599243, + "learning_rate": 9.97285963172896e-06, + "loss": 3.1796, + "step": 168500 + }, + { + "epoch": 0.16604864323825888, + "grad_norm": 2.388124942779541, + "learning_rate": 9.97284352723951e-06, + "loss": 3.1837, + "step": 168550 + }, + { + "epoch": 0.16609790121608095, + "grad_norm": 3.4314990043640137, + "learning_rate": 9.972827417986468e-06, + "loss": 3.2411, + "step": 168600 + }, + { + "epoch": 0.16614715919390305, + "grad_norm": 2.3243260383605957, + "learning_rate": 9.972811303969846e-06, + "loss": 3.2424, + "step": 168650 + }, + { + "epoch": 0.16619641717172512, + "grad_norm": 2.891624927520752, + "learning_rate": 9.97279518518966e-06, + "loss": 3.1539, + "step": 168700 + }, + { + "epoch": 0.16624567514954722, + "grad_norm": 2.4437644481658936, + "learning_rate": 9.972779061645926e-06, + "loss": 3.245, + "step": 168750 + }, + { + "epoch": 0.16629493312736932, + "grad_norm": 2.3860108852386475, + "learning_rate": 9.972762933338657e-06, + 
"loss": 3.3329, + "step": 168800 + }, + { + "epoch": 0.16634419110519139, + "grad_norm": 2.1905527114868164, + "learning_rate": 9.972746800267873e-06, + "loss": 3.2195, + "step": 168850 + }, + { + "epoch": 0.16639344908301348, + "grad_norm": 2.473388433456421, + "learning_rate": 9.972730662433586e-06, + "loss": 3.2742, + "step": 168900 + }, + { + "epoch": 0.16644270706083558, + "grad_norm": 2.5832114219665527, + "learning_rate": 9.972714519835812e-06, + "loss": 3.2479, + "step": 168950 + }, + { + "epoch": 0.16649196503865765, + "grad_norm": 2.5272881984710693, + "learning_rate": 9.972698372474568e-06, + "loss": 3.2061, + "step": 169000 + }, + { + "epoch": 0.16654122301647975, + "grad_norm": 2.3357207775115967, + "learning_rate": 9.972682220349868e-06, + "loss": 3.2282, + "step": 169050 + }, + { + "epoch": 0.16659048099430185, + "grad_norm": 2.3276467323303223, + "learning_rate": 9.972666063461727e-06, + "loss": 3.231, + "step": 169100 + }, + { + "epoch": 0.16663973897212392, + "grad_norm": 2.5087954998016357, + "learning_rate": 9.972649901810162e-06, + "loss": 3.243, + "step": 169150 + }, + { + "epoch": 0.16668899694994602, + "grad_norm": 2.5327253341674805, + "learning_rate": 9.972633735395188e-06, + "loss": 3.2477, + "step": 169200 + }, + { + "epoch": 0.16673825492776811, + "grad_norm": 2.278111219406128, + "learning_rate": 9.97261756421682e-06, + "loss": 3.2326, + "step": 169250 + }, + { + "epoch": 0.16678751290559019, + "grad_norm": 2.4280636310577393, + "learning_rate": 9.972601388275074e-06, + "loss": 3.1736, + "step": 169300 + }, + { + "epoch": 0.16683677088341228, + "grad_norm": 2.5307745933532715, + "learning_rate": 9.972585207569964e-06, + "loss": 3.1667, + "step": 169350 + }, + { + "epoch": 0.16688602886123435, + "grad_norm": 2.3883166313171387, + "learning_rate": 9.972569022101509e-06, + "loss": 3.262, + "step": 169400 + }, + { + "epoch": 0.16693528683905645, + "grad_norm": 2.391712188720703, + "learning_rate": 9.97255283186972e-06, + "loss": 3.2528, + 
"step": 169450 + }, + { + "epoch": 0.16698454481687855, + "grad_norm": 2.478403329849243, + "learning_rate": 9.972536636874615e-06, + "loss": 3.2707, + "step": 169500 + }, + { + "epoch": 0.16703380279470062, + "grad_norm": 2.325885772705078, + "learning_rate": 9.97252043711621e-06, + "loss": 3.3226, + "step": 169550 + }, + { + "epoch": 0.16708306077252272, + "grad_norm": 2.742633819580078, + "learning_rate": 9.97250423259452e-06, + "loss": 3.248, + "step": 169600 + }, + { + "epoch": 0.16713231875034482, + "grad_norm": 2.2895331382751465, + "learning_rate": 9.97248802330956e-06, + "loss": 3.2303, + "step": 169650 + }, + { + "epoch": 0.1671815767281669, + "grad_norm": 2.2983646392822266, + "learning_rate": 9.972471809261346e-06, + "loss": 3.2391, + "step": 169700 + }, + { + "epoch": 0.16723083470598898, + "grad_norm": 2.427401065826416, + "learning_rate": 9.972455590449893e-06, + "loss": 3.2274, + "step": 169750 + }, + { + "epoch": 0.16728009268381108, + "grad_norm": 2.306028127670288, + "learning_rate": 9.972439366875217e-06, + "loss": 3.2631, + "step": 169800 + }, + { + "epoch": 0.16732935066163315, + "grad_norm": 2.405992031097412, + "learning_rate": 9.972423138537335e-06, + "loss": 3.2077, + "step": 169850 + }, + { + "epoch": 0.16737860863945525, + "grad_norm": 2.325310707092285, + "learning_rate": 9.97240690543626e-06, + "loss": 3.2439, + "step": 169900 + }, + { + "epoch": 0.16742786661727732, + "grad_norm": 2.519519567489624, + "learning_rate": 9.972390667572008e-06, + "loss": 3.2298, + "step": 169950 + }, + { + "epoch": 0.16747712459509942, + "grad_norm": 2.4212136268615723, + "learning_rate": 9.972374424944595e-06, + "loss": 3.1672, + "step": 170000 + }, + { + "epoch": 0.16752638257292152, + "grad_norm": 2.4617903232574463, + "learning_rate": 9.972358177554036e-06, + "loss": 3.2561, + "step": 170050 + }, + { + "epoch": 0.1675756405507436, + "grad_norm": 2.2961173057556152, + "learning_rate": 9.972341925400349e-06, + "loss": 3.2511, + "step": 170100 + }, + { + 
"epoch": 0.16762489852856569, + "grad_norm": 2.411914825439453, + "learning_rate": 9.972325668483546e-06, + "loss": 3.1909, + "step": 170150 + }, + { + "epoch": 0.16767415650638778, + "grad_norm": 2.7543323040008545, + "learning_rate": 9.972309406803647e-06, + "loss": 3.2086, + "step": 170200 + }, + { + "epoch": 0.16772341448420985, + "grad_norm": 2.360491991043091, + "learning_rate": 9.972293140360663e-06, + "loss": 3.205, + "step": 170250 + }, + { + "epoch": 0.16777267246203195, + "grad_norm": 2.433039665222168, + "learning_rate": 9.972276869154612e-06, + "loss": 3.2174, + "step": 170300 + }, + { + "epoch": 0.16782193043985405, + "grad_norm": 2.3675684928894043, + "learning_rate": 9.97226059318551e-06, + "loss": 3.2572, + "step": 170350 + }, + { + "epoch": 0.16787118841767612, + "grad_norm": 2.5406646728515625, + "learning_rate": 9.972244312453369e-06, + "loss": 3.281, + "step": 170400 + }, + { + "epoch": 0.16792044639549822, + "grad_norm": 2.286390542984009, + "learning_rate": 9.97222802695821e-06, + "loss": 3.1883, + "step": 170450 + }, + { + "epoch": 0.16796970437332032, + "grad_norm": 2.4245963096618652, + "learning_rate": 9.972211736700046e-06, + "loss": 3.2696, + "step": 170500 + }, + { + "epoch": 0.1680189623511424, + "grad_norm": 2.3237249851226807, + "learning_rate": 9.972195441678892e-06, + "loss": 3.2409, + "step": 170550 + }, + { + "epoch": 0.16806822032896448, + "grad_norm": 2.3382763862609863, + "learning_rate": 9.972179141894763e-06, + "loss": 3.2172, + "step": 170600 + }, + { + "epoch": 0.16811747830678656, + "grad_norm": 2.44527268409729, + "learning_rate": 9.972162837347676e-06, + "loss": 3.2409, + "step": 170650 + }, + { + "epoch": 0.16816673628460865, + "grad_norm": 2.413429021835327, + "learning_rate": 9.972146528037647e-06, + "loss": 3.1964, + "step": 170700 + }, + { + "epoch": 0.16821599426243075, + "grad_norm": 2.869380235671997, + "learning_rate": 9.972130213964691e-06, + "loss": 3.1853, + "step": 170750 + }, + { + "epoch": 
0.16826525224025282, + "grad_norm": 2.30489444732666, + "learning_rate": 9.972113895128823e-06, + "loss": 3.1096, + "step": 170800 + }, + { + "epoch": 0.16831451021807492, + "grad_norm": 2.1066465377807617, + "learning_rate": 9.97209757153006e-06, + "loss": 3.2576, + "step": 170850 + }, + { + "epoch": 0.16836376819589702, + "grad_norm": 2.743096113204956, + "learning_rate": 9.972081243168416e-06, + "loss": 3.2896, + "step": 170900 + }, + { + "epoch": 0.1684130261737191, + "grad_norm": 2.644369602203369, + "learning_rate": 9.972064910043908e-06, + "loss": 3.2405, + "step": 170950 + }, + { + "epoch": 0.1684622841515412, + "grad_norm": 2.267193555831909, + "learning_rate": 9.972048572156551e-06, + "loss": 3.2235, + "step": 171000 + }, + { + "epoch": 0.16851154212936328, + "grad_norm": 2.5574398040771484, + "learning_rate": 9.972032229506362e-06, + "loss": 3.2429, + "step": 171050 + }, + { + "epoch": 0.16856080010718535, + "grad_norm": 2.498792886734009, + "learning_rate": 9.972015882093353e-06, + "loss": 3.2312, + "step": 171100 + }, + { + "epoch": 0.16861005808500745, + "grad_norm": 2.3705992698669434, + "learning_rate": 9.971999529917545e-06, + "loss": 3.2427, + "step": 171150 + }, + { + "epoch": 0.16865931606282952, + "grad_norm": 2.2677624225616455, + "learning_rate": 9.971983172978948e-06, + "loss": 3.1684, + "step": 171200 + }, + { + "epoch": 0.16870857404065162, + "grad_norm": 2.2330195903778076, + "learning_rate": 9.971966811277583e-06, + "loss": 3.2471, + "step": 171250 + }, + { + "epoch": 0.16875783201847372, + "grad_norm": 2.7185702323913574, + "learning_rate": 9.97195044481346e-06, + "loss": 3.2494, + "step": 171300 + }, + { + "epoch": 0.1688070899962958, + "grad_norm": 2.5487043857574463, + "learning_rate": 9.9719340735866e-06, + "loss": 3.2224, + "step": 171350 + }, + { + "epoch": 0.1688563479741179, + "grad_norm": 2.3917417526245117, + "learning_rate": 9.971917697597016e-06, + "loss": 3.1809, + "step": 171400 + }, + { + "epoch": 0.16890560595193999, + 
"grad_norm": 2.423706531524658, + "learning_rate": 9.971901316844725e-06, + "loss": 3.2835, + "step": 171450 + }, + { + "epoch": 0.16895486392976206, + "grad_norm": 2.3908448219299316, + "learning_rate": 9.97188493132974e-06, + "loss": 3.2845, + "step": 171500 + }, + { + "epoch": 0.16900412190758415, + "grad_norm": 2.406096935272217, + "learning_rate": 9.971868541052081e-06, + "loss": 3.288, + "step": 171550 + }, + { + "epoch": 0.16905337988540625, + "grad_norm": 2.4399733543395996, + "learning_rate": 9.97185214601176e-06, + "loss": 3.1303, + "step": 171600 + }, + { + "epoch": 0.16910263786322832, + "grad_norm": 2.4967050552368164, + "learning_rate": 9.971835746208795e-06, + "loss": 3.2488, + "step": 171650 + }, + { + "epoch": 0.16915189584105042, + "grad_norm": 2.177173137664795, + "learning_rate": 9.9718193416432e-06, + "loss": 3.2137, + "step": 171700 + }, + { + "epoch": 0.16920115381887252, + "grad_norm": 2.3297157287597656, + "learning_rate": 9.97180293231499e-06, + "loss": 3.1859, + "step": 171750 + }, + { + "epoch": 0.1692504117966946, + "grad_norm": 2.5912609100341797, + "learning_rate": 9.971786518224183e-06, + "loss": 3.2549, + "step": 171800 + }, + { + "epoch": 0.1692996697745167, + "grad_norm": 2.3210597038269043, + "learning_rate": 9.971770099370795e-06, + "loss": 3.2719, + "step": 171850 + }, + { + "epoch": 0.16934892775233876, + "grad_norm": 2.3501977920532227, + "learning_rate": 9.97175367575484e-06, + "loss": 3.25, + "step": 171900 + }, + { + "epoch": 0.16939818573016086, + "grad_norm": 2.4376003742218018, + "learning_rate": 9.971737247376332e-06, + "loss": 3.2787, + "step": 171950 + }, + { + "epoch": 0.16944744370798295, + "grad_norm": 2.7095160484313965, + "learning_rate": 9.971720814235292e-06, + "loss": 3.2282, + "step": 172000 + }, + { + "epoch": 0.16949670168580502, + "grad_norm": 2.4647998809814453, + "learning_rate": 9.971704376331732e-06, + "loss": 3.2155, + "step": 172050 + }, + { + "epoch": 0.16954595966362712, + "grad_norm": 
2.3711771965026855, + "learning_rate": 9.971687933665669e-06, + "loss": 3.2271, + "step": 172100 + }, + { + "epoch": 0.16959521764144922, + "grad_norm": 2.3051702976226807, + "learning_rate": 9.971671486237117e-06, + "loss": 3.2281, + "step": 172150 + }, + { + "epoch": 0.1696444756192713, + "grad_norm": 2.2700905799865723, + "learning_rate": 9.971655034046093e-06, + "loss": 3.2093, + "step": 172200 + }, + { + "epoch": 0.1696937335970934, + "grad_norm": 2.195577383041382, + "learning_rate": 9.971638577092614e-06, + "loss": 3.2203, + "step": 172250 + }, + { + "epoch": 0.16974299157491549, + "grad_norm": 2.416840076446533, + "learning_rate": 9.971622115376694e-06, + "loss": 3.2358, + "step": 172300 + }, + { + "epoch": 0.16979224955273756, + "grad_norm": 2.474241018295288, + "learning_rate": 9.97160564889835e-06, + "loss": 3.2703, + "step": 172350 + }, + { + "epoch": 0.16984150753055965, + "grad_norm": 2.5900232791900635, + "learning_rate": 9.971589177657596e-06, + "loss": 3.2078, + "step": 172400 + }, + { + "epoch": 0.16989076550838172, + "grad_norm": 2.2069942951202393, + "learning_rate": 9.971572701654448e-06, + "loss": 3.2314, + "step": 172450 + }, + { + "epoch": 0.16994002348620382, + "grad_norm": 2.443166732788086, + "learning_rate": 9.971556220888924e-06, + "loss": 3.2652, + "step": 172500 + }, + { + "epoch": 0.16998928146402592, + "grad_norm": 2.646296977996826, + "learning_rate": 9.97153973536104e-06, + "loss": 3.2193, + "step": 172550 + }, + { + "epoch": 0.170038539441848, + "grad_norm": 2.4543447494506836, + "learning_rate": 9.971523245070808e-06, + "loss": 3.2515, + "step": 172600 + }, + { + "epoch": 0.1700877974196701, + "grad_norm": 2.4075093269348145, + "learning_rate": 9.971506750018246e-06, + "loss": 3.2378, + "step": 172650 + }, + { + "epoch": 0.1701370553974922, + "grad_norm": 2.411224365234375, + "learning_rate": 9.97149025020337e-06, + "loss": 3.2964, + "step": 172700 + }, + { + "epoch": 0.17018631337531426, + "grad_norm": 2.372020721435547, + 
"learning_rate": 9.971473745626197e-06, + "loss": 3.2409, + "step": 172750 + }, + { + "epoch": 0.17023557135313636, + "grad_norm": 2.430980682373047, + "learning_rate": 9.97145723628674e-06, + "loss": 3.2, + "step": 172800 + }, + { + "epoch": 0.17028482933095845, + "grad_norm": 2.405514717102051, + "learning_rate": 9.971440722185017e-06, + "loss": 3.1711, + "step": 172850 + }, + { + "epoch": 0.17033408730878052, + "grad_norm": 2.162614583969116, + "learning_rate": 9.971424203321043e-06, + "loss": 3.2052, + "step": 172900 + }, + { + "epoch": 0.17038334528660262, + "grad_norm": 2.4492428302764893, + "learning_rate": 9.971407679694834e-06, + "loss": 3.259, + "step": 172950 + }, + { + "epoch": 0.1704326032644247, + "grad_norm": 2.3386659622192383, + "learning_rate": 9.971391151306406e-06, + "loss": 3.1712, + "step": 173000 + }, + { + "epoch": 0.1704818612422468, + "grad_norm": 2.463663101196289, + "learning_rate": 9.971374618155773e-06, + "loss": 3.2603, + "step": 173050 + }, + { + "epoch": 0.1705311192200689, + "grad_norm": 2.4989259243011475, + "learning_rate": 9.971358080242954e-06, + "loss": 3.2787, + "step": 173100 + }, + { + "epoch": 0.17058037719789096, + "grad_norm": 2.508099317550659, + "learning_rate": 9.971341537567962e-06, + "loss": 3.2207, + "step": 173150 + }, + { + "epoch": 0.17062963517571306, + "grad_norm": 2.3569538593292236, + "learning_rate": 9.971324990130817e-06, + "loss": 3.2078, + "step": 173200 + }, + { + "epoch": 0.17067889315353516, + "grad_norm": 2.2047271728515625, + "learning_rate": 9.971308437931528e-06, + "loss": 3.2143, + "step": 173250 + }, + { + "epoch": 0.17072815113135723, + "grad_norm": 2.886317253112793, + "learning_rate": 9.971291880970118e-06, + "loss": 3.1974, + "step": 173300 + }, + { + "epoch": 0.17077740910917932, + "grad_norm": 2.2679877281188965, + "learning_rate": 9.971275319246598e-06, + "loss": 3.2848, + "step": 173350 + }, + { + "epoch": 0.17082666708700142, + "grad_norm": 2.631575584411621, + "learning_rate": 
9.971258752760986e-06, + "loss": 3.2026, + "step": 173400 + }, + { + "epoch": 0.1708759250648235, + "grad_norm": 2.408205986022949, + "learning_rate": 9.971242181513296e-06, + "loss": 3.1683, + "step": 173450 + }, + { + "epoch": 0.1709251830426456, + "grad_norm": 2.502215623855591, + "learning_rate": 9.971225605503547e-06, + "loss": 3.2891, + "step": 173500 + }, + { + "epoch": 0.1709744410204677, + "grad_norm": 2.265805244445801, + "learning_rate": 9.971209024731753e-06, + "loss": 3.2223, + "step": 173550 + }, + { + "epoch": 0.17102369899828976, + "grad_norm": 2.2860562801361084, + "learning_rate": 9.97119243919793e-06, + "loss": 3.2067, + "step": 173600 + }, + { + "epoch": 0.17107295697611186, + "grad_norm": 2.3920576572418213, + "learning_rate": 9.971175848902093e-06, + "loss": 3.2394, + "step": 173650 + }, + { + "epoch": 0.17112221495393393, + "grad_norm": 2.372739791870117, + "learning_rate": 9.97115925384426e-06, + "loss": 3.1576, + "step": 173700 + }, + { + "epoch": 0.17117147293175602, + "grad_norm": 2.5407564640045166, + "learning_rate": 9.971142654024445e-06, + "loss": 3.2554, + "step": 173750 + }, + { + "epoch": 0.17122073090957812, + "grad_norm": 2.4803049564361572, + "learning_rate": 9.971126049442665e-06, + "loss": 3.2712, + "step": 173800 + }, + { + "epoch": 0.1712699888874002, + "grad_norm": 2.1661553382873535, + "learning_rate": 9.971109440098934e-06, + "loss": 3.2123, + "step": 173850 + }, + { + "epoch": 0.1713192468652223, + "grad_norm": 2.424905776977539, + "learning_rate": 9.971092825993272e-06, + "loss": 3.2324, + "step": 173900 + }, + { + "epoch": 0.1713685048430444, + "grad_norm": 2.721795082092285, + "learning_rate": 9.971076207125691e-06, + "loss": 3.1998, + "step": 173950 + }, + { + "epoch": 0.17141776282086646, + "grad_norm": 2.3044917583465576, + "learning_rate": 9.971059583496208e-06, + "loss": 3.201, + "step": 174000 + }, + { + "epoch": 0.17146702079868856, + "grad_norm": 2.494201898574829, + "learning_rate": 9.971042955104839e-06, + 
"loss": 3.1993, + "step": 174050 + }, + { + "epoch": 0.17151627877651066, + "grad_norm": 2.578601598739624, + "learning_rate": 9.9710263219516e-06, + "loss": 3.2697, + "step": 174100 + }, + { + "epoch": 0.17156553675433273, + "grad_norm": 2.9186549186706543, + "learning_rate": 9.971009684036508e-06, + "loss": 3.2119, + "step": 174150 + }, + { + "epoch": 0.17161479473215482, + "grad_norm": 2.5611016750335693, + "learning_rate": 9.970993041359578e-06, + "loss": 3.2495, + "step": 174200 + }, + { + "epoch": 0.1716640527099769, + "grad_norm": 2.428990364074707, + "learning_rate": 9.970976393920827e-06, + "loss": 3.2782, + "step": 174250 + }, + { + "epoch": 0.171713310687799, + "grad_norm": 2.3071579933166504, + "learning_rate": 9.970959741720268e-06, + "loss": 3.2636, + "step": 174300 + }, + { + "epoch": 0.1717625686656211, + "grad_norm": 2.4201884269714355, + "learning_rate": 9.970943084757919e-06, + "loss": 3.1933, + "step": 174350 + }, + { + "epoch": 0.17181182664344316, + "grad_norm": 2.4167661666870117, + "learning_rate": 9.970926423033796e-06, + "loss": 3.2427, + "step": 174400 + }, + { + "epoch": 0.17186108462126526, + "grad_norm": 2.4131710529327393, + "learning_rate": 9.970909756547913e-06, + "loss": 3.1972, + "step": 174450 + }, + { + "epoch": 0.17191034259908736, + "grad_norm": 2.4808688163757324, + "learning_rate": 9.970893085300291e-06, + "loss": 3.1644, + "step": 174500 + }, + { + "epoch": 0.17195960057690943, + "grad_norm": 2.653364896774292, + "learning_rate": 9.97087640929094e-06, + "loss": 3.1963, + "step": 174550 + }, + { + "epoch": 0.17200885855473153, + "grad_norm": 2.542815923690796, + "learning_rate": 9.970859728519881e-06, + "loss": 3.2296, + "step": 174600 + }, + { + "epoch": 0.17205811653255362, + "grad_norm": 2.347154378890991, + "learning_rate": 9.970843042987126e-06, + "loss": 3.2584, + "step": 174650 + }, + { + "epoch": 0.1721073745103757, + "grad_norm": 2.307035446166992, + "learning_rate": 9.970826352692694e-06, + "loss": 3.19, + "step": 
174700 + }, + { + "epoch": 0.1721566324881978, + "grad_norm": 2.4512135982513428, + "learning_rate": 9.970809657636598e-06, + "loss": 3.2042, + "step": 174750 + }, + { + "epoch": 0.1722058904660199, + "grad_norm": 2.3631134033203125, + "learning_rate": 9.970792957818856e-06, + "loss": 3.2662, + "step": 174800 + }, + { + "epoch": 0.17225514844384196, + "grad_norm": 2.411156415939331, + "learning_rate": 9.970776253239484e-06, + "loss": 3.2078, + "step": 174850 + }, + { + "epoch": 0.17230440642166406, + "grad_norm": 2.5532026290893555, + "learning_rate": 9.970759543898496e-06, + "loss": 3.2319, + "step": 174900 + }, + { + "epoch": 0.17235366439948613, + "grad_norm": 2.4078595638275146, + "learning_rate": 9.970742829795912e-06, + "loss": 3.286, + "step": 174950 + }, + { + "epoch": 0.17240292237730823, + "grad_norm": 2.187547206878662, + "learning_rate": 9.970726110931745e-06, + "loss": 3.173, + "step": 175000 + }, + { + "epoch": 0.17245218035513032, + "grad_norm": 2.5770387649536133, + "learning_rate": 9.97070938730601e-06, + "loss": 3.2262, + "step": 175050 + }, + { + "epoch": 0.1725014383329524, + "grad_norm": 2.3505170345306396, + "learning_rate": 9.970692658918726e-06, + "loss": 3.2261, + "step": 175100 + }, + { + "epoch": 0.1725506963107745, + "grad_norm": 2.297321081161499, + "learning_rate": 9.970675925769909e-06, + "loss": 3.1537, + "step": 175150 + }, + { + "epoch": 0.1725999542885966, + "grad_norm": 2.5150694847106934, + "learning_rate": 9.970659187859572e-06, + "loss": 3.234, + "step": 175200 + }, + { + "epoch": 0.17264921226641866, + "grad_norm": 2.377392053604126, + "learning_rate": 9.970642445187733e-06, + "loss": 3.224, + "step": 175250 + }, + { + "epoch": 0.17269847024424076, + "grad_norm": 2.3507533073425293, + "learning_rate": 9.970625697754408e-06, + "loss": 3.2202, + "step": 175300 + }, + { + "epoch": 0.17274772822206286, + "grad_norm": 2.198974609375, + "learning_rate": 9.970608945559612e-06, + "loss": 3.2034, + "step": 175350 + }, + { + "epoch": 
0.17279698619988493, + "grad_norm": 2.4746742248535156, + "learning_rate": 9.970592188603364e-06, + "loss": 3.1561, + "step": 175400 + }, + { + "epoch": 0.17284624417770703, + "grad_norm": 2.1418864727020264, + "learning_rate": 9.970575426885675e-06, + "loss": 3.2814, + "step": 175450 + }, + { + "epoch": 0.1728955021555291, + "grad_norm": 2.5270841121673584, + "learning_rate": 9.970558660406565e-06, + "loss": 3.1718, + "step": 175500 + }, + { + "epoch": 0.1729447601333512, + "grad_norm": 2.8102076053619385, + "learning_rate": 9.97054188916605e-06, + "loss": 3.2014, + "step": 175550 + }, + { + "epoch": 0.1729940181111733, + "grad_norm": 2.409632682800293, + "learning_rate": 9.970525113164144e-06, + "loss": 3.2273, + "step": 175600 + }, + { + "epoch": 0.17304327608899536, + "grad_norm": 2.176866054534912, + "learning_rate": 9.970508332400864e-06, + "loss": 3.2491, + "step": 175650 + }, + { + "epoch": 0.17309253406681746, + "grad_norm": 2.5835020542144775, + "learning_rate": 9.970491546876227e-06, + "loss": 3.2277, + "step": 175700 + }, + { + "epoch": 0.17314179204463956, + "grad_norm": 2.4079623222351074, + "learning_rate": 9.97047475659025e-06, + "loss": 3.2639, + "step": 175750 + }, + { + "epoch": 0.17319105002246163, + "grad_norm": 2.363938808441162, + "learning_rate": 9.970457961542945e-06, + "loss": 3.1784, + "step": 175800 + }, + { + "epoch": 0.17324030800028373, + "grad_norm": 2.384348154067993, + "learning_rate": 9.97044116173433e-06, + "loss": 3.2732, + "step": 175850 + }, + { + "epoch": 0.17328956597810583, + "grad_norm": 2.3531157970428467, + "learning_rate": 9.970424357164423e-06, + "loss": 3.2295, + "step": 175900 + }, + { + "epoch": 0.1733388239559279, + "grad_norm": 2.329005241394043, + "learning_rate": 9.970407547833239e-06, + "loss": 3.1685, + "step": 175950 + }, + { + "epoch": 0.17338808193375, + "grad_norm": 2.4844858646392822, + "learning_rate": 9.970390733740793e-06, + "loss": 3.2319, + "step": 176000 + }, + { + "epoch": 0.1734373399115721, + 
"grad_norm": 2.6110851764678955, + "learning_rate": 9.970373914887101e-06, + "loss": 3.2685, + "step": 176050 + }, + { + "epoch": 0.17348659788939416, + "grad_norm": 2.810617685317993, + "learning_rate": 9.970357091272182e-06, + "loss": 3.1943, + "step": 176100 + }, + { + "epoch": 0.17353585586721626, + "grad_norm": 2.3159124851226807, + "learning_rate": 9.970340262896048e-06, + "loss": 3.2544, + "step": 176150 + }, + { + "epoch": 0.17358511384503833, + "grad_norm": 2.3879196643829346, + "learning_rate": 9.97032342975872e-06, + "loss": 3.2139, + "step": 176200 + }, + { + "epoch": 0.17363437182286043, + "grad_norm": 2.4451446533203125, + "learning_rate": 9.970306591860208e-06, + "loss": 3.1877, + "step": 176250 + }, + { + "epoch": 0.17368362980068253, + "grad_norm": 2.2923426628112793, + "learning_rate": 9.970289749200535e-06, + "loss": 3.2136, + "step": 176300 + }, + { + "epoch": 0.1737328877785046, + "grad_norm": 2.2988176345825195, + "learning_rate": 9.970272901779712e-06, + "loss": 3.1644, + "step": 176350 + }, + { + "epoch": 0.1737821457563267, + "grad_norm": 2.6148171424865723, + "learning_rate": 9.970256049597755e-06, + "loss": 3.2128, + "step": 176400 + }, + { + "epoch": 0.1738314037341488, + "grad_norm": 2.283486843109131, + "learning_rate": 9.970239192654684e-06, + "loss": 3.245, + "step": 176450 + }, + { + "epoch": 0.17388066171197086, + "grad_norm": 2.37431001663208, + "learning_rate": 9.970222330950513e-06, + "loss": 3.1857, + "step": 176500 + }, + { + "epoch": 0.17392991968979296, + "grad_norm": 2.206726312637329, + "learning_rate": 9.970205464485259e-06, + "loss": 3.2206, + "step": 176550 + }, + { + "epoch": 0.17397917766761506, + "grad_norm": 2.302769184112549, + "learning_rate": 9.970188593258934e-06, + "loss": 3.1743, + "step": 176600 + }, + { + "epoch": 0.17402843564543713, + "grad_norm": 2.218107223510742, + "learning_rate": 9.970171717271561e-06, + "loss": 3.1986, + "step": 176650 + }, + { + "epoch": 0.17407769362325923, + "grad_norm": 
2.3868567943573, + "learning_rate": 9.970154836523151e-06, + "loss": 3.2332, + "step": 176700 + }, + { + "epoch": 0.1741269516010813, + "grad_norm": 2.3658347129821777, + "learning_rate": 9.970137951013723e-06, + "loss": 3.2347, + "step": 176750 + }, + { + "epoch": 0.1741762095789034, + "grad_norm": 2.488234519958496, + "learning_rate": 9.970121060743291e-06, + "loss": 3.3221, + "step": 176800 + }, + { + "epoch": 0.1742254675567255, + "grad_norm": 2.5332071781158447, + "learning_rate": 9.970104165711871e-06, + "loss": 3.1868, + "step": 176850 + }, + { + "epoch": 0.17427472553454756, + "grad_norm": 2.3807594776153564, + "learning_rate": 9.970087265919484e-06, + "loss": 3.1987, + "step": 176900 + }, + { + "epoch": 0.17432398351236966, + "grad_norm": 2.5645101070404053, + "learning_rate": 9.97007036136614e-06, + "loss": 3.2314, + "step": 176950 + }, + { + "epoch": 0.17437324149019176, + "grad_norm": 2.656818389892578, + "learning_rate": 9.970053452051858e-06, + "loss": 3.2458, + "step": 177000 + }, + { + "epoch": 0.17442249946801383, + "grad_norm": 2.353442907333374, + "learning_rate": 9.970036537976654e-06, + "loss": 3.2499, + "step": 177050 + }, + { + "epoch": 0.17447175744583593, + "grad_norm": 2.27417254447937, + "learning_rate": 9.970019619140545e-06, + "loss": 3.2293, + "step": 177100 + }, + { + "epoch": 0.17452101542365803, + "grad_norm": 2.4451217651367188, + "learning_rate": 9.970002695543546e-06, + "loss": 3.2549, + "step": 177150 + }, + { + "epoch": 0.1745702734014801, + "grad_norm": 2.4593799114227295, + "learning_rate": 9.969985767185672e-06, + "loss": 3.2287, + "step": 177200 + }, + { + "epoch": 0.1746195313793022, + "grad_norm": 2.2430336475372314, + "learning_rate": 9.969968834066943e-06, + "loss": 3.2072, + "step": 177250 + }, + { + "epoch": 0.1746687893571243, + "grad_norm": 2.453826665878296, + "learning_rate": 9.969951896187372e-06, + "loss": 3.2609, + "step": 177300 + }, + { + "epoch": 0.17471804733494636, + "grad_norm": 2.6128177642822266, + 
"learning_rate": 9.969934953546977e-06, + "loss": 3.3027, + "step": 177350 + }, + { + "epoch": 0.17476730531276846, + "grad_norm": 2.5761077404022217, + "learning_rate": 9.969918006145773e-06, + "loss": 3.1796, + "step": 177400 + }, + { + "epoch": 0.17481656329059053, + "grad_norm": 2.2695202827453613, + "learning_rate": 9.969901053983777e-06, + "loss": 3.3303, + "step": 177450 + }, + { + "epoch": 0.17486582126841263, + "grad_norm": 2.2156848907470703, + "learning_rate": 9.969884097061004e-06, + "loss": 3.1921, + "step": 177500 + }, + { + "epoch": 0.17491507924623473, + "grad_norm": 2.415022134780884, + "learning_rate": 9.969867135377473e-06, + "loss": 3.2027, + "step": 177550 + }, + { + "epoch": 0.1749643372240568, + "grad_norm": 2.372105121612549, + "learning_rate": 9.969850168933197e-06, + "loss": 3.2477, + "step": 177600 + }, + { + "epoch": 0.1750135952018789, + "grad_norm": 2.373159646987915, + "learning_rate": 9.969833197728194e-06, + "loss": 3.3252, + "step": 177650 + }, + { + "epoch": 0.175062853179701, + "grad_norm": 2.375117778778076, + "learning_rate": 9.969816221762481e-06, + "loss": 3.1769, + "step": 177700 + }, + { + "epoch": 0.17511211115752306, + "grad_norm": 2.3999218940734863, + "learning_rate": 9.96979924103607e-06, + "loss": 3.239, + "step": 177750 + }, + { + "epoch": 0.17516136913534516, + "grad_norm": 2.683366060256958, + "learning_rate": 9.969782255548984e-06, + "loss": 3.2696, + "step": 177800 + }, + { + "epoch": 0.17521062711316726, + "grad_norm": 2.558255195617676, + "learning_rate": 9.969765265301234e-06, + "loss": 3.2513, + "step": 177850 + }, + { + "epoch": 0.17525988509098933, + "grad_norm": 2.459028720855713, + "learning_rate": 9.96974827029284e-06, + "loss": 3.212, + "step": 177900 + }, + { + "epoch": 0.17530914306881143, + "grad_norm": 2.330308675765991, + "learning_rate": 9.969731270523814e-06, + "loss": 3.1923, + "step": 177950 + }, + { + "epoch": 0.1753584010466335, + "grad_norm": 2.7446229457855225, + "learning_rate": 
9.969714265994175e-06, + "loss": 3.2111, + "step": 178000 + }, + { + "epoch": 0.1754076590244556, + "grad_norm": 2.47710919380188, + "learning_rate": 9.96969725670394e-06, + "loss": 3.1435, + "step": 178050 + }, + { + "epoch": 0.1754569170022777, + "grad_norm": 2.5674662590026855, + "learning_rate": 9.969680242653124e-06, + "loss": 3.1494, + "step": 178100 + }, + { + "epoch": 0.17550617498009977, + "grad_norm": 2.2957444190979004, + "learning_rate": 9.969663223841744e-06, + "loss": 3.2169, + "step": 178150 + }, + { + "epoch": 0.17555543295792186, + "grad_norm": 2.4439327716827393, + "learning_rate": 9.969646200269813e-06, + "loss": 3.221, + "step": 178200 + }, + { + "epoch": 0.17560469093574396, + "grad_norm": 2.365339994430542, + "learning_rate": 9.969629171937354e-06, + "loss": 3.2101, + "step": 178250 + }, + { + "epoch": 0.17565394891356603, + "grad_norm": 2.4139089584350586, + "learning_rate": 9.969612138844377e-06, + "loss": 3.1817, + "step": 178300 + }, + { + "epoch": 0.17570320689138813, + "grad_norm": 2.3424696922302246, + "learning_rate": 9.969595100990901e-06, + "loss": 3.167, + "step": 178350 + }, + { + "epoch": 0.17575246486921023, + "grad_norm": 2.3044238090515137, + "learning_rate": 9.969578058376942e-06, + "loss": 3.1844, + "step": 178400 + }, + { + "epoch": 0.1758017228470323, + "grad_norm": 2.4840033054351807, + "learning_rate": 9.969561011002516e-06, + "loss": 3.1937, + "step": 178450 + }, + { + "epoch": 0.1758509808248544, + "grad_norm": 2.502654552459717, + "learning_rate": 9.96954395886764e-06, + "loss": 3.2016, + "step": 178500 + }, + { + "epoch": 0.1759002388026765, + "grad_norm": 2.57059907913208, + "learning_rate": 9.96952690197233e-06, + "loss": 3.0883, + "step": 178550 + }, + { + "epoch": 0.17594949678049857, + "grad_norm": 2.252640724182129, + "learning_rate": 9.969509840316604e-06, + "loss": 3.1544, + "step": 178600 + }, + { + "epoch": 0.17599875475832066, + "grad_norm": 2.509791612625122, + "learning_rate": 9.969492773900475e-06, + 
"loss": 3.2039, + "step": 178650 + }, + { + "epoch": 0.17604801273614273, + "grad_norm": 2.2449185848236084, + "learning_rate": 9.969475702723961e-06, + "loss": 3.2125, + "step": 178700 + }, + { + "epoch": 0.17609727071396483, + "grad_norm": 2.456995964050293, + "learning_rate": 9.96945862678708e-06, + "loss": 3.2734, + "step": 178750 + }, + { + "epoch": 0.17614652869178693, + "grad_norm": 2.3344812393188477, + "learning_rate": 9.969441546089846e-06, + "loss": 3.2524, + "step": 178800 + }, + { + "epoch": 0.176195786669609, + "grad_norm": 2.600053548812866, + "learning_rate": 9.969424460632275e-06, + "loss": 3.2894, + "step": 178850 + }, + { + "epoch": 0.1762450446474311, + "grad_norm": 2.3503122329711914, + "learning_rate": 9.969407370414386e-06, + "loss": 3.1809, + "step": 178900 + }, + { + "epoch": 0.1762943026252532, + "grad_norm": 2.4465696811676025, + "learning_rate": 9.969390275436193e-06, + "loss": 3.2239, + "step": 178950 + }, + { + "epoch": 0.17634356060307527, + "grad_norm": 2.2553763389587402, + "learning_rate": 9.969373175697712e-06, + "loss": 3.1909, + "step": 179000 + }, + { + "epoch": 0.17639281858089736, + "grad_norm": 2.6784815788269043, + "learning_rate": 9.969356071198963e-06, + "loss": 3.215, + "step": 179050 + }, + { + "epoch": 0.17644207655871946, + "grad_norm": 2.4496383666992188, + "learning_rate": 9.969338961939958e-06, + "loss": 3.1194, + "step": 179100 + }, + { + "epoch": 0.17649133453654153, + "grad_norm": 2.2548818588256836, + "learning_rate": 9.969321847920718e-06, + "loss": 3.2143, + "step": 179150 + }, + { + "epoch": 0.17654059251436363, + "grad_norm": 2.3630807399749756, + "learning_rate": 9.969304729141256e-06, + "loss": 3.2393, + "step": 179200 + }, + { + "epoch": 0.1765898504921857, + "grad_norm": 2.304321765899658, + "learning_rate": 9.969287605601587e-06, + "loss": 3.2695, + "step": 179250 + }, + { + "epoch": 0.1766391084700078, + "grad_norm": 2.2405922412872314, + "learning_rate": 9.969270477301732e-06, + "loss": 3.1952, + 
"step": 179300 + }, + { + "epoch": 0.1766883664478299, + "grad_norm": 2.3130648136138916, + "learning_rate": 9.969253344241702e-06, + "loss": 3.2475, + "step": 179350 + }, + { + "epoch": 0.17673762442565197, + "grad_norm": 2.4354708194732666, + "learning_rate": 9.96923620642152e-06, + "loss": 3.1491, + "step": 179400 + }, + { + "epoch": 0.17678688240347407, + "grad_norm": 2.2851667404174805, + "learning_rate": 9.969219063841198e-06, + "loss": 3.2361, + "step": 179450 + }, + { + "epoch": 0.17683614038129616, + "grad_norm": 2.1669671535491943, + "learning_rate": 9.969201916500752e-06, + "loss": 3.2251, + "step": 179500 + }, + { + "epoch": 0.17688539835911823, + "grad_norm": 2.455143690109253, + "learning_rate": 9.9691847644002e-06, + "loss": 3.2062, + "step": 179550 + }, + { + "epoch": 0.17693465633694033, + "grad_norm": 2.3827309608459473, + "learning_rate": 9.969167607539558e-06, + "loss": 3.2083, + "step": 179600 + }, + { + "epoch": 0.17698391431476243, + "grad_norm": 2.3525800704956055, + "learning_rate": 9.969150445918843e-06, + "loss": 3.2226, + "step": 179650 + }, + { + "epoch": 0.1770331722925845, + "grad_norm": 2.357990026473999, + "learning_rate": 9.969133279538072e-06, + "loss": 3.1874, + "step": 179700 + }, + { + "epoch": 0.1770824302704066, + "grad_norm": 2.433995008468628, + "learning_rate": 9.969116108397259e-06, + "loss": 3.2163, + "step": 179750 + }, + { + "epoch": 0.1771316882482287, + "grad_norm": 2.3474621772766113, + "learning_rate": 9.969098932496422e-06, + "loss": 3.166, + "step": 179800 + }, + { + "epoch": 0.17718094622605077, + "grad_norm": 2.3695600032806396, + "learning_rate": 9.969081751835577e-06, + "loss": 3.1432, + "step": 179850 + }, + { + "epoch": 0.17723020420387287, + "grad_norm": 2.4250152111053467, + "learning_rate": 9.969064566414742e-06, + "loss": 3.2001, + "step": 179900 + }, + { + "epoch": 0.17727946218169494, + "grad_norm": 2.7788491249084473, + "learning_rate": 9.969047376233932e-06, + "loss": 3.1954, + "step": 179950 + }, + 
{ + "epoch": 0.17732872015951703, + "grad_norm": 2.843271493911743, + "learning_rate": 9.969030181293163e-06, + "loss": 3.2637, + "step": 180000 + }, + { + "epoch": 0.17737797813733913, + "grad_norm": 2.479048728942871, + "learning_rate": 9.969012981592452e-06, + "loss": 3.1628, + "step": 180050 + }, + { + "epoch": 0.1774272361151612, + "grad_norm": 2.4869651794433594, + "learning_rate": 9.968995777131816e-06, + "loss": 3.2622, + "step": 180100 + }, + { + "epoch": 0.1774764940929833, + "grad_norm": 2.5936601161956787, + "learning_rate": 9.968978567911272e-06, + "loss": 3.289, + "step": 180150 + }, + { + "epoch": 0.1775257520708054, + "grad_norm": 2.46224045753479, + "learning_rate": 9.968961353930835e-06, + "loss": 3.1471, + "step": 180200 + }, + { + "epoch": 0.17757501004862747, + "grad_norm": 2.42905330657959, + "learning_rate": 9.968944135190523e-06, + "loss": 3.2288, + "step": 180250 + }, + { + "epoch": 0.17762426802644957, + "grad_norm": 2.3414146900177, + "learning_rate": 9.968926911690352e-06, + "loss": 3.2019, + "step": 180300 + }, + { + "epoch": 0.17767352600427166, + "grad_norm": 2.439530611038208, + "learning_rate": 9.968909683430334e-06, + "loss": 3.136, + "step": 180350 + }, + { + "epoch": 0.17772278398209373, + "grad_norm": 2.425395965576172, + "learning_rate": 9.968892450410493e-06, + "loss": 3.1511, + "step": 180400 + }, + { + "epoch": 0.17777204195991583, + "grad_norm": 2.3070318698883057, + "learning_rate": 9.968875212630842e-06, + "loss": 3.172, + "step": 180450 + }, + { + "epoch": 0.1778212999377379, + "grad_norm": 2.6527514457702637, + "learning_rate": 9.968857970091398e-06, + "loss": 3.193, + "step": 180500 + }, + { + "epoch": 0.17787055791556, + "grad_norm": 2.4171979427337646, + "learning_rate": 9.968840722792175e-06, + "loss": 3.187, + "step": 180550 + }, + { + "epoch": 0.1779198158933821, + "grad_norm": 2.6169016361236572, + "learning_rate": 9.968823470733193e-06, + "loss": 3.2338, + "step": 180600 + }, + { + "epoch": 0.17796907387120417, 
+ "grad_norm": 2.244690418243408, + "learning_rate": 9.968806213914468e-06, + "loss": 3.2431, + "step": 180650 + }, + { + "epoch": 0.17801833184902627, + "grad_norm": 2.3776443004608154, + "learning_rate": 9.968788952336014e-06, + "loss": 3.2088, + "step": 180700 + }, + { + "epoch": 0.17806758982684837, + "grad_norm": 2.2827038764953613, + "learning_rate": 9.968771685997851e-06, + "loss": 3.2046, + "step": 180750 + }, + { + "epoch": 0.17811684780467044, + "grad_norm": 3.0177316665649414, + "learning_rate": 9.968754414899993e-06, + "loss": 3.2659, + "step": 180800 + }, + { + "epoch": 0.17816610578249253, + "grad_norm": 2.5790157318115234, + "learning_rate": 9.968737139042458e-06, + "loss": 3.2466, + "step": 180850 + }, + { + "epoch": 0.17821536376031463, + "grad_norm": 2.415644884109497, + "learning_rate": 9.968719858425263e-06, + "loss": 3.228, + "step": 180900 + }, + { + "epoch": 0.1782646217381367, + "grad_norm": 2.376086473464966, + "learning_rate": 9.968702573048421e-06, + "loss": 3.1735, + "step": 180950 + }, + { + "epoch": 0.1783138797159588, + "grad_norm": 2.4414310455322266, + "learning_rate": 9.968685282911953e-06, + "loss": 3.1912, + "step": 181000 + }, + { + "epoch": 0.17836313769378087, + "grad_norm": 2.4331605434417725, + "learning_rate": 9.96866798801587e-06, + "loss": 3.2045, + "step": 181050 + }, + { + "epoch": 0.17841239567160297, + "grad_norm": 2.17183780670166, + "learning_rate": 9.968650688360195e-06, + "loss": 3.2325, + "step": 181100 + }, + { + "epoch": 0.17846165364942507, + "grad_norm": 2.289301872253418, + "learning_rate": 9.968633383944941e-06, + "loss": 3.2067, + "step": 181150 + }, + { + "epoch": 0.17851091162724714, + "grad_norm": 2.3786370754241943, + "learning_rate": 9.968616074770126e-06, + "loss": 3.2623, + "step": 181200 + }, + { + "epoch": 0.17856016960506924, + "grad_norm": 2.4174704551696777, + "learning_rate": 9.968598760835765e-06, + "loss": 3.1445, + "step": 181250 + }, + { + "epoch": 0.17860942758289133, + "grad_norm": 
2.3299131393432617, + "learning_rate": 9.968581442141877e-06, + "loss": 3.2231, + "step": 181300 + }, + { + "epoch": 0.1786586855607134, + "grad_norm": 2.3166391849517822, + "learning_rate": 9.968564118688477e-06, + "loss": 3.1527, + "step": 181350 + }, + { + "epoch": 0.1787079435385355, + "grad_norm": 2.3757858276367188, + "learning_rate": 9.96854679047558e-06, + "loss": 3.2625, + "step": 181400 + }, + { + "epoch": 0.1787572015163576, + "grad_norm": 2.4631245136260986, + "learning_rate": 9.968529457503204e-06, + "loss": 3.2558, + "step": 181450 + }, + { + "epoch": 0.17880645949417967, + "grad_norm": 2.584956645965576, + "learning_rate": 9.968512119771367e-06, + "loss": 3.2225, + "step": 181500 + }, + { + "epoch": 0.17885571747200177, + "grad_norm": 2.3392624855041504, + "learning_rate": 9.968494777280084e-06, + "loss": 3.2828, + "step": 181550 + }, + { + "epoch": 0.17890497544982387, + "grad_norm": 2.4581894874572754, + "learning_rate": 9.968477430029373e-06, + "loss": 3.2916, + "step": 181600 + }, + { + "epoch": 0.17895423342764594, + "grad_norm": 2.4843897819519043, + "learning_rate": 9.96846007801925e-06, + "loss": 3.1885, + "step": 181650 + }, + { + "epoch": 0.17900349140546803, + "grad_norm": 2.4048242568969727, + "learning_rate": 9.96844272124973e-06, + "loss": 3.217, + "step": 181700 + }, + { + "epoch": 0.1790527493832901, + "grad_norm": 2.4753775596618652, + "learning_rate": 9.96842535972083e-06, + "loss": 3.1802, + "step": 181750 + }, + { + "epoch": 0.1791020073611122, + "grad_norm": 2.256366729736328, + "learning_rate": 9.96840799343257e-06, + "loss": 3.2358, + "step": 181800 + }, + { + "epoch": 0.1791512653389343, + "grad_norm": 2.349601984024048, + "learning_rate": 9.968390622384963e-06, + "loss": 3.2442, + "step": 181850 + }, + { + "epoch": 0.17920052331675637, + "grad_norm": 2.2636196613311768, + "learning_rate": 9.968373246578025e-06, + "loss": 3.1425, + "step": 181900 + }, + { + "epoch": 0.17924978129457847, + "grad_norm": 2.643313407897949, + 
"learning_rate": 9.968355866011777e-06, + "loss": 3.146, + "step": 181950 + }, + { + "epoch": 0.17929903927240057, + "grad_norm": 2.3444671630859375, + "learning_rate": 9.968338480686232e-06, + "loss": 3.1962, + "step": 182000 + }, + { + "epoch": 0.17934829725022264, + "grad_norm": 2.5086519718170166, + "learning_rate": 9.968321090601409e-06, + "loss": 3.1536, + "step": 182050 + }, + { + "epoch": 0.17939755522804474, + "grad_norm": 2.286423444747925, + "learning_rate": 9.968303695757322e-06, + "loss": 3.2251, + "step": 182100 + }, + { + "epoch": 0.17944681320586683, + "grad_norm": 2.624016284942627, + "learning_rate": 9.96828629615399e-06, + "loss": 3.2075, + "step": 182150 + }, + { + "epoch": 0.1794960711836889, + "grad_norm": 3.246455192565918, + "learning_rate": 9.968268891791428e-06, + "loss": 3.2645, + "step": 182200 + }, + { + "epoch": 0.179545329161511, + "grad_norm": 2.625476598739624, + "learning_rate": 9.968251482669655e-06, + "loss": 3.1777, + "step": 182250 + }, + { + "epoch": 0.17959458713933307, + "grad_norm": 2.3698625564575195, + "learning_rate": 9.968234068788684e-06, + "loss": 3.1453, + "step": 182300 + }, + { + "epoch": 0.17964384511715517, + "grad_norm": 2.3047168254852295, + "learning_rate": 9.968216650148537e-06, + "loss": 3.2371, + "step": 182350 + }, + { + "epoch": 0.17969310309497727, + "grad_norm": 2.2632524967193604, + "learning_rate": 9.968199226749225e-06, + "loss": 3.2292, + "step": 182400 + }, + { + "epoch": 0.17974236107279934, + "grad_norm": 2.403829574584961, + "learning_rate": 9.968181798590767e-06, + "loss": 3.2815, + "step": 182450 + }, + { + "epoch": 0.17979161905062144, + "grad_norm": 2.308981418609619, + "learning_rate": 9.96816436567318e-06, + "loss": 3.2758, + "step": 182500 + }, + { + "epoch": 0.17984087702844354, + "grad_norm": 2.242492198944092, + "learning_rate": 9.968146927996481e-06, + "loss": 3.2051, + "step": 182550 + }, + { + "epoch": 0.1798901350062656, + "grad_norm": 2.318098783493042, + "learning_rate": 
9.968129485560686e-06, + "loss": 3.221, + "step": 182600 + }, + { + "epoch": 0.1799393929840877, + "grad_norm": 2.26322078704834, + "learning_rate": 9.968112038365813e-06, + "loss": 3.1868, + "step": 182650 + }, + { + "epoch": 0.1799886509619098, + "grad_norm": 2.604257822036743, + "learning_rate": 9.968094586411877e-06, + "loss": 3.2324, + "step": 182700 + }, + { + "epoch": 0.18003790893973187, + "grad_norm": 2.2347121238708496, + "learning_rate": 9.968077129698897e-06, + "loss": 3.234, + "step": 182750 + }, + { + "epoch": 0.18008716691755397, + "grad_norm": 2.424950122833252, + "learning_rate": 9.968059668226886e-06, + "loss": 3.3001, + "step": 182800 + }, + { + "epoch": 0.18013642489537607, + "grad_norm": 2.4661848545074463, + "learning_rate": 9.968042201995862e-06, + "loss": 3.2215, + "step": 182850 + }, + { + "epoch": 0.18018568287319814, + "grad_norm": 3.518190622329712, + "learning_rate": 9.968024731005845e-06, + "loss": 3.1993, + "step": 182900 + }, + { + "epoch": 0.18023494085102024, + "grad_norm": 2.3831968307495117, + "learning_rate": 9.968007255256849e-06, + "loss": 3.1478, + "step": 182950 + }, + { + "epoch": 0.1802841988288423, + "grad_norm": 2.472097873687744, + "learning_rate": 9.96798977474889e-06, + "loss": 3.2736, + "step": 183000 + }, + { + "epoch": 0.1803334568066644, + "grad_norm": 2.2536277770996094, + "learning_rate": 9.967972289481987e-06, + "loss": 3.1764, + "step": 183050 + }, + { + "epoch": 0.1803827147844865, + "grad_norm": 2.588365077972412, + "learning_rate": 9.967954799456154e-06, + "loss": 3.211, + "step": 183100 + }, + { + "epoch": 0.18043197276230857, + "grad_norm": 2.477708578109741, + "learning_rate": 9.96793730467141e-06, + "loss": 3.1815, + "step": 183150 + }, + { + "epoch": 0.18048123074013067, + "grad_norm": 2.4470369815826416, + "learning_rate": 9.967919805127771e-06, + "loss": 3.2072, + "step": 183200 + }, + { + "epoch": 0.18053048871795277, + "grad_norm": 2.3543145656585693, + "learning_rate": 9.967902300825254e-06, + 
"loss": 3.2764, + "step": 183250 + }, + { + "epoch": 0.18057974669577484, + "grad_norm": 2.187976837158203, + "learning_rate": 9.967884791763876e-06, + "loss": 3.2268, + "step": 183300 + }, + { + "epoch": 0.18062900467359694, + "grad_norm": 2.2675974369049072, + "learning_rate": 9.967867277943653e-06, + "loss": 3.1894, + "step": 183350 + }, + { + "epoch": 0.18067826265141904, + "grad_norm": 2.4269113540649414, + "learning_rate": 9.967849759364602e-06, + "loss": 3.1766, + "step": 183400 + }, + { + "epoch": 0.1807275206292411, + "grad_norm": 2.1761116981506348, + "learning_rate": 9.967832236026741e-06, + "loss": 3.2333, + "step": 183450 + }, + { + "epoch": 0.1807767786070632, + "grad_norm": 2.30753755569458, + "learning_rate": 9.967814707930086e-06, + "loss": 3.2127, + "step": 183500 + }, + { + "epoch": 0.18082603658488527, + "grad_norm": 2.636326789855957, + "learning_rate": 9.96779717507465e-06, + "loss": 3.2021, + "step": 183550 + }, + { + "epoch": 0.18087529456270737, + "grad_norm": 2.421142339706421, + "learning_rate": 9.967779637460458e-06, + "loss": 3.2146, + "step": 183600 + }, + { + "epoch": 0.18092455254052947, + "grad_norm": 2.531524658203125, + "learning_rate": 9.96776209508752e-06, + "loss": 3.212, + "step": 183650 + }, + { + "epoch": 0.18097381051835154, + "grad_norm": 2.468977689743042, + "learning_rate": 9.967744547955854e-06, + "loss": 3.2537, + "step": 183700 + }, + { + "epoch": 0.18102306849617364, + "grad_norm": 2.3957114219665527, + "learning_rate": 9.96772699606548e-06, + "loss": 3.197, + "step": 183750 + }, + { + "epoch": 0.18107232647399574, + "grad_norm": 2.337158203125, + "learning_rate": 9.967709439416411e-06, + "loss": 3.1421, + "step": 183800 + }, + { + "epoch": 0.1811215844518178, + "grad_norm": 4.121609687805176, + "learning_rate": 9.967691878008666e-06, + "loss": 3.2154, + "step": 183850 + }, + { + "epoch": 0.1811708424296399, + "grad_norm": 2.4127163887023926, + "learning_rate": 9.967674311842259e-06, + "loss": 3.202, + "step": 183900 
+ }, + { + "epoch": 0.181220100407462, + "grad_norm": 2.3678977489471436, + "learning_rate": 9.967656740917211e-06, + "loss": 3.2303, + "step": 183950 + }, + { + "epoch": 0.18126935838528407, + "grad_norm": 2.325740337371826, + "learning_rate": 9.967639165233535e-06, + "loss": 3.2165, + "step": 184000 + }, + { + "epoch": 0.18131861636310617, + "grad_norm": 2.4106318950653076, + "learning_rate": 9.967621584791252e-06, + "loss": 3.116, + "step": 184050 + }, + { + "epoch": 0.18136787434092827, + "grad_norm": 2.3828835487365723, + "learning_rate": 9.967603999590377e-06, + "loss": 3.2087, + "step": 184100 + }, + { + "epoch": 0.18141713231875034, + "grad_norm": 2.370549201965332, + "learning_rate": 9.967586409630925e-06, + "loss": 3.2244, + "step": 184150 + }, + { + "epoch": 0.18146639029657244, + "grad_norm": 2.5792245864868164, + "learning_rate": 9.967568814912914e-06, + "loss": 3.1708, + "step": 184200 + }, + { + "epoch": 0.1815156482743945, + "grad_norm": 2.433742046356201, + "learning_rate": 9.967551215436362e-06, + "loss": 3.2196, + "step": 184250 + }, + { + "epoch": 0.1815649062522166, + "grad_norm": 2.4566173553466797, + "learning_rate": 9.967533611201283e-06, + "loss": 3.2788, + "step": 184300 + }, + { + "epoch": 0.1816141642300387, + "grad_norm": 2.915496826171875, + "learning_rate": 9.967516002207698e-06, + "loss": 3.2244, + "step": 184350 + }, + { + "epoch": 0.18166342220786078, + "grad_norm": 2.6744544506073, + "learning_rate": 9.96749838845562e-06, + "loss": 3.2732, + "step": 184400 + }, + { + "epoch": 0.18171268018568287, + "grad_norm": 2.5147130489349365, + "learning_rate": 9.967480769945068e-06, + "loss": 3.2233, + "step": 184450 + }, + { + "epoch": 0.18176193816350497, + "grad_norm": 2.3343405723571777, + "learning_rate": 9.96746314667606e-06, + "loss": 3.2019, + "step": 184500 + }, + { + "epoch": 0.18181119614132704, + "grad_norm": 2.734468460083008, + "learning_rate": 9.96744551864861e-06, + "loss": 3.2, + "step": 184550 + }, + { + "epoch": 
0.18186045411914914, + "grad_norm": 2.3590869903564453, + "learning_rate": 9.967427885862736e-06, + "loss": 3.2323, + "step": 184600 + }, + { + "epoch": 0.18190971209697124, + "grad_norm": 2.3478872776031494, + "learning_rate": 9.967410248318453e-06, + "loss": 3.2005, + "step": 184650 + }, + { + "epoch": 0.1819589700747933, + "grad_norm": 2.3960773944854736, + "learning_rate": 9.967392606015784e-06, + "loss": 3.1868, + "step": 184700 + }, + { + "epoch": 0.1820082280526154, + "grad_norm": 2.218552827835083, + "learning_rate": 9.96737495895474e-06, + "loss": 3.2673, + "step": 184750 + }, + { + "epoch": 0.18205748603043748, + "grad_norm": 2.4703712463378906, + "learning_rate": 9.96735730713534e-06, + "loss": 3.1731, + "step": 184800 + }, + { + "epoch": 0.18210674400825957, + "grad_norm": 2.463500499725342, + "learning_rate": 9.9673396505576e-06, + "loss": 3.2466, + "step": 184850 + }, + { + "epoch": 0.18215600198608167, + "grad_norm": 2.333935499191284, + "learning_rate": 9.967321989221538e-06, + "loss": 3.2262, + "step": 184900 + }, + { + "epoch": 0.18220525996390374, + "grad_norm": 2.3653900623321533, + "learning_rate": 9.967304323127172e-06, + "loss": 3.247, + "step": 184950 + }, + { + "epoch": 0.18225451794172584, + "grad_norm": 2.3154373168945312, + "learning_rate": 9.967286652274515e-06, + "loss": 3.1941, + "step": 185000 + }, + { + "epoch": 0.18230377591954794, + "grad_norm": 2.414435863494873, + "learning_rate": 9.967268976663587e-06, + "loss": 3.1564, + "step": 185050 + }, + { + "epoch": 0.18235303389737, + "grad_norm": 2.322814702987671, + "learning_rate": 9.967251296294406e-06, + "loss": 3.2223, + "step": 185100 + }, + { + "epoch": 0.1824022918751921, + "grad_norm": 2.5436880588531494, + "learning_rate": 9.967233611166986e-06, + "loss": 3.1892, + "step": 185150 + }, + { + "epoch": 0.1824515498530142, + "grad_norm": 2.2896182537078857, + "learning_rate": 9.967215921281344e-06, + "loss": 3.136, + "step": 185200 + }, + { + "epoch": 0.18250080783083628, + 
"grad_norm": 2.4659423828125, + "learning_rate": 9.9671982266375e-06, + "loss": 3.2351, + "step": 185250 + }, + { + "epoch": 0.18255006580865837, + "grad_norm": 2.7327942848205566, + "learning_rate": 9.967180527235467e-06, + "loss": 3.1901, + "step": 185300 + }, + { + "epoch": 0.18259932378648047, + "grad_norm": 2.3278703689575195, + "learning_rate": 9.967162823075265e-06, + "loss": 3.2248, + "step": 185350 + }, + { + "epoch": 0.18264858176430254, + "grad_norm": 2.4119722843170166, + "learning_rate": 9.96714511415691e-06, + "loss": 3.2229, + "step": 185400 + }, + { + "epoch": 0.18269783974212464, + "grad_norm": 2.525313138961792, + "learning_rate": 9.96712740048042e-06, + "loss": 3.2026, + "step": 185450 + }, + { + "epoch": 0.1827470977199467, + "grad_norm": 2.8143012523651123, + "learning_rate": 9.967109682045811e-06, + "loss": 3.2227, + "step": 185500 + }, + { + "epoch": 0.1827963556977688, + "grad_norm": 2.178868055343628, + "learning_rate": 9.967091958853099e-06, + "loss": 3.2126, + "step": 185550 + }, + { + "epoch": 0.1828456136755909, + "grad_norm": 2.2754158973693848, + "learning_rate": 9.967074230902302e-06, + "loss": 3.1576, + "step": 185600 + }, + { + "epoch": 0.18289487165341298, + "grad_norm": 2.245694875717163, + "learning_rate": 9.967056498193437e-06, + "loss": 3.2209, + "step": 185650 + }, + { + "epoch": 0.18294412963123508, + "grad_norm": 2.593104839324951, + "learning_rate": 9.96703876072652e-06, + "loss": 3.157, + "step": 185700 + }, + { + "epoch": 0.18299338760905717, + "grad_norm": 2.3891561031341553, + "learning_rate": 9.96702101850157e-06, + "loss": 3.2507, + "step": 185750 + }, + { + "epoch": 0.18304264558687924, + "grad_norm": 2.28312087059021, + "learning_rate": 9.967003271518601e-06, + "loss": 3.2289, + "step": 185800 + }, + { + "epoch": 0.18309190356470134, + "grad_norm": 2.6639416217803955, + "learning_rate": 9.966985519777633e-06, + "loss": 3.2002, + "step": 185850 + }, + { + "epoch": 0.18314116154252344, + "grad_norm": 
2.5683982372283936, + "learning_rate": 9.966967763278683e-06, + "loss": 3.1509, + "step": 185900 + }, + { + "epoch": 0.1831904195203455, + "grad_norm": 2.3612489700317383, + "learning_rate": 9.966950002021766e-06, + "loss": 3.1809, + "step": 185950 + }, + { + "epoch": 0.1832396774981676, + "grad_norm": 2.1621787548065186, + "learning_rate": 9.966932236006898e-06, + "loss": 3.2045, + "step": 186000 + }, + { + "epoch": 0.18328893547598968, + "grad_norm": 2.1562697887420654, + "learning_rate": 9.9669144652341e-06, + "loss": 3.1925, + "step": 186050 + }, + { + "epoch": 0.18333819345381178, + "grad_norm": 2.486769676208496, + "learning_rate": 9.966896689703386e-06, + "loss": 3.178, + "step": 186100 + }, + { + "epoch": 0.18338745143163387, + "grad_norm": 2.630861282348633, + "learning_rate": 9.966878909414775e-06, + "loss": 3.1672, + "step": 186150 + }, + { + "epoch": 0.18343670940945594, + "grad_norm": 2.4195058345794678, + "learning_rate": 9.966861124368282e-06, + "loss": 3.1956, + "step": 186200 + }, + { + "epoch": 0.18348596738727804, + "grad_norm": 2.488677740097046, + "learning_rate": 9.966843334563923e-06, + "loss": 3.2247, + "step": 186250 + }, + { + "epoch": 0.18353522536510014, + "grad_norm": 2.3713600635528564, + "learning_rate": 9.96682554000172e-06, + "loss": 3.1821, + "step": 186300 + }, + { + "epoch": 0.1835844833429222, + "grad_norm": 2.3867037296295166, + "learning_rate": 9.966807740681686e-06, + "loss": 3.1999, + "step": 186350 + }, + { + "epoch": 0.1836337413207443, + "grad_norm": 2.70941162109375, + "learning_rate": 9.96678993660384e-06, + "loss": 3.2407, + "step": 186400 + }, + { + "epoch": 0.1836829992985664, + "grad_norm": 2.4856951236724854, + "learning_rate": 9.966772127768196e-06, + "loss": 3.1999, + "step": 186450 + }, + { + "epoch": 0.18373225727638848, + "grad_norm": 3.1221067905426025, + "learning_rate": 9.966754314174776e-06, + "loss": 3.1413, + "step": 186500 + }, + { + "epoch": 0.18378151525421058, + "grad_norm": 2.358966827392578, + 
"learning_rate": 9.966736495823593e-06, + "loss": 3.2191, + "step": 186550 + }, + { + "epoch": 0.18383077323203267, + "grad_norm": 2.2385661602020264, + "learning_rate": 9.966718672714664e-06, + "loss": 3.1604, + "step": 186600 + }, + { + "epoch": 0.18388003120985474, + "grad_norm": 2.799252510070801, + "learning_rate": 9.96670084484801e-06, + "loss": 3.2351, + "step": 186650 + }, + { + "epoch": 0.18392928918767684, + "grad_norm": 2.5816032886505127, + "learning_rate": 9.966683012223643e-06, + "loss": 3.1631, + "step": 186700 + }, + { + "epoch": 0.1839785471654989, + "grad_norm": 2.298856735229492, + "learning_rate": 9.966665174841585e-06, + "loss": 3.2295, + "step": 186750 + }, + { + "epoch": 0.184027805143321, + "grad_norm": 2.563558578491211, + "learning_rate": 9.966647332701849e-06, + "loss": 3.2209, + "step": 186800 + }, + { + "epoch": 0.1840770631211431, + "grad_norm": 2.4518649578094482, + "learning_rate": 9.966629485804454e-06, + "loss": 3.2239, + "step": 186850 + }, + { + "epoch": 0.18412632109896518, + "grad_norm": 2.378243923187256, + "learning_rate": 9.966611634149418e-06, + "loss": 3.2289, + "step": 186900 + }, + { + "epoch": 0.18417557907678728, + "grad_norm": 2.168172836303711, + "learning_rate": 9.966593777736755e-06, + "loss": 3.2019, + "step": 186950 + }, + { + "epoch": 0.18422483705460937, + "grad_norm": 2.4742865562438965, + "learning_rate": 9.966575916566485e-06, + "loss": 3.146, + "step": 187000 + }, + { + "epoch": 0.18427409503243145, + "grad_norm": 2.487452268600464, + "learning_rate": 9.966558050638627e-06, + "loss": 3.1706, + "step": 187050 + }, + { + "epoch": 0.18432335301025354, + "grad_norm": 2.2697088718414307, + "learning_rate": 9.966540179953191e-06, + "loss": 3.1917, + "step": 187100 + }, + { + "epoch": 0.18437261098807564, + "grad_norm": 2.2541401386260986, + "learning_rate": 9.9665223045102e-06, + "loss": 3.2077, + "step": 187150 + }, + { + "epoch": 0.1844218689658977, + "grad_norm": 2.3923799991607666, + "learning_rate": 
9.96650442430967e-06, + "loss": 3.1846, + "step": 187200 + }, + { + "epoch": 0.1844711269437198, + "grad_norm": 2.406975507736206, + "learning_rate": 9.966486539351618e-06, + "loss": 3.1576, + "step": 187250 + }, + { + "epoch": 0.18452038492154188, + "grad_norm": 2.3095078468322754, + "learning_rate": 9.96646864963606e-06, + "loss": 3.2195, + "step": 187300 + }, + { + "epoch": 0.18456964289936398, + "grad_norm": 2.487431049346924, + "learning_rate": 9.966450755163014e-06, + "loss": 3.2506, + "step": 187350 + }, + { + "epoch": 0.18461890087718608, + "grad_norm": 2.5561208724975586, + "learning_rate": 9.966432855932498e-06, + "loss": 3.2144, + "step": 187400 + }, + { + "epoch": 0.18466815885500815, + "grad_norm": 2.329491138458252, + "learning_rate": 9.966414951944527e-06, + "loss": 3.156, + "step": 187450 + }, + { + "epoch": 0.18471741683283024, + "grad_norm": 2.2726519107818604, + "learning_rate": 9.96639704319912e-06, + "loss": 3.1701, + "step": 187500 + }, + { + "epoch": 0.18476667481065234, + "grad_norm": 2.3236093521118164, + "learning_rate": 9.966379129696293e-06, + "loss": 3.1886, + "step": 187550 + }, + { + "epoch": 0.1848159327884744, + "grad_norm": 2.2540907859802246, + "learning_rate": 9.966361211436064e-06, + "loss": 3.1292, + "step": 187600 + }, + { + "epoch": 0.1848651907662965, + "grad_norm": 2.315483570098877, + "learning_rate": 9.96634328841845e-06, + "loss": 3.2044, + "step": 187650 + }, + { + "epoch": 0.1849144487441186, + "grad_norm": 2.3866240978240967, + "learning_rate": 9.966325360643467e-06, + "loss": 3.1911, + "step": 187700 + }, + { + "epoch": 0.18496370672194068, + "grad_norm": 2.2230470180511475, + "learning_rate": 9.966307428111135e-06, + "loss": 3.239, + "step": 187750 + }, + { + "epoch": 0.18501296469976278, + "grad_norm": 2.3880510330200195, + "learning_rate": 9.966289490821467e-06, + "loss": 3.2173, + "step": 187800 + }, + { + "epoch": 0.18506222267758485, + "grad_norm": 2.368213415145874, + "learning_rate": 9.966271548774485e-06, + 
"loss": 3.2433, + "step": 187850 + }, + { + "epoch": 0.18511148065540695, + "grad_norm": 2.3240973949432373, + "learning_rate": 9.966253601970203e-06, + "loss": 3.2442, + "step": 187900 + }, + { + "epoch": 0.18516073863322904, + "grad_norm": 2.543519973754883, + "learning_rate": 9.966235650408638e-06, + "loss": 3.1855, + "step": 187950 + }, + { + "epoch": 0.18520999661105111, + "grad_norm": 2.2920472621917725, + "learning_rate": 9.96621769408981e-06, + "loss": 3.1459, + "step": 188000 + }, + { + "epoch": 0.1852592545888732, + "grad_norm": 2.3326361179351807, + "learning_rate": 9.966199733013732e-06, + "loss": 3.273, + "step": 188050 + }, + { + "epoch": 0.1853085125666953, + "grad_norm": 2.4187192916870117, + "learning_rate": 9.966181767180425e-06, + "loss": 3.1906, + "step": 188100 + }, + { + "epoch": 0.18535777054451738, + "grad_norm": 2.8203608989715576, + "learning_rate": 9.966163796589904e-06, + "loss": 3.2051, + "step": 188150 + }, + { + "epoch": 0.18540702852233948, + "grad_norm": 2.429426670074463, + "learning_rate": 9.966145821242187e-06, + "loss": 3.1885, + "step": 188200 + }, + { + "epoch": 0.18545628650016158, + "grad_norm": 2.077145576477051, + "learning_rate": 9.966127841137293e-06, + "loss": 3.1737, + "step": 188250 + }, + { + "epoch": 0.18550554447798365, + "grad_norm": 2.3485257625579834, + "learning_rate": 9.966109856275236e-06, + "loss": 3.192, + "step": 188300 + }, + { + "epoch": 0.18555480245580575, + "grad_norm": 2.307839870452881, + "learning_rate": 9.966091866656034e-06, + "loss": 3.1629, + "step": 188350 + }, + { + "epoch": 0.18560406043362784, + "grad_norm": 2.415091037750244, + "learning_rate": 9.966073872279707e-06, + "loss": 3.2074, + "step": 188400 + }, + { + "epoch": 0.1856533184114499, + "grad_norm": 2.17326283454895, + "learning_rate": 9.966055873146268e-06, + "loss": 3.1449, + "step": 188450 + }, + { + "epoch": 0.185702576389272, + "grad_norm": 2.466304063796997, + "learning_rate": 9.966037869255736e-06, + "loss": 3.2399, + "step": 
188500 + }, + { + "epoch": 0.18575183436709408, + "grad_norm": 2.375298023223877, + "learning_rate": 9.966019860608132e-06, + "loss": 3.2807, + "step": 188550 + }, + { + "epoch": 0.18580109234491618, + "grad_norm": 2.267178773880005, + "learning_rate": 9.966001847203465e-06, + "loss": 3.1547, + "step": 188600 + }, + { + "epoch": 0.18585035032273828, + "grad_norm": 2.364042043685913, + "learning_rate": 9.965983829041762e-06, + "loss": 3.1687, + "step": 188650 + }, + { + "epoch": 0.18589960830056035, + "grad_norm": 2.257005214691162, + "learning_rate": 9.965965806123033e-06, + "loss": 3.2198, + "step": 188700 + }, + { + "epoch": 0.18594886627838245, + "grad_norm": 2.5347790718078613, + "learning_rate": 9.965947778447296e-06, + "loss": 3.199, + "step": 188750 + }, + { + "epoch": 0.18599812425620454, + "grad_norm": 2.306288480758667, + "learning_rate": 9.965929746014571e-06, + "loss": 3.2225, + "step": 188800 + }, + { + "epoch": 0.18604738223402661, + "grad_norm": 2.2567124366760254, + "learning_rate": 9.965911708824876e-06, + "loss": 3.1695, + "step": 188850 + }, + { + "epoch": 0.1860966402118487, + "grad_norm": 2.20658016204834, + "learning_rate": 9.965893666878224e-06, + "loss": 3.179, + "step": 188900 + }, + { + "epoch": 0.1861458981896708, + "grad_norm": 2.458904266357422, + "learning_rate": 9.965875620174636e-06, + "loss": 3.2378, + "step": 188950 + }, + { + "epoch": 0.18619515616749288, + "grad_norm": 2.3616299629211426, + "learning_rate": 9.965857568714128e-06, + "loss": 3.2432, + "step": 189000 + }, + { + "epoch": 0.18624441414531498, + "grad_norm": 2.572533130645752, + "learning_rate": 9.965839512496718e-06, + "loss": 3.1845, + "step": 189050 + }, + { + "epoch": 0.18629367212313705, + "grad_norm": 2.3329010009765625, + "learning_rate": 9.965821451522421e-06, + "loss": 3.2116, + "step": 189100 + }, + { + "epoch": 0.18634293010095915, + "grad_norm": 2.4752893447875977, + "learning_rate": 9.965803385791258e-06, + "loss": 3.2225, + "step": 189150 + }, + { + 
"epoch": 0.18639218807878125, + "grad_norm": 2.3398358821868896, + "learning_rate": 9.965785315303242e-06, + "loss": 3.1822, + "step": 189200 + }, + { + "epoch": 0.18644144605660332, + "grad_norm": 2.3595540523529053, + "learning_rate": 9.965767240058392e-06, + "loss": 3.1536, + "step": 189250 + }, + { + "epoch": 0.18649070403442541, + "grad_norm": 2.615919351577759, + "learning_rate": 9.965749160056727e-06, + "loss": 3.1804, + "step": 189300 + }, + { + "epoch": 0.1865399620122475, + "grad_norm": 2.7051188945770264, + "learning_rate": 9.965731075298264e-06, + "loss": 3.22, + "step": 189350 + }, + { + "epoch": 0.18658921999006958, + "grad_norm": 2.365379571914673, + "learning_rate": 9.965712985783019e-06, + "loss": 3.2254, + "step": 189400 + }, + { + "epoch": 0.18663847796789168, + "grad_norm": 2.526567220687866, + "learning_rate": 9.96569489151101e-06, + "loss": 3.2379, + "step": 189450 + }, + { + "epoch": 0.18668773594571378, + "grad_norm": 2.423424243927002, + "learning_rate": 9.965676792482255e-06, + "loss": 3.2253, + "step": 189500 + }, + { + "epoch": 0.18673699392353585, + "grad_norm": 2.3560242652893066, + "learning_rate": 9.965658688696769e-06, + "loss": 3.1556, + "step": 189550 + }, + { + "epoch": 0.18678625190135795, + "grad_norm": 2.2108547687530518, + "learning_rate": 9.96564058015457e-06, + "loss": 3.2213, + "step": 189600 + }, + { + "epoch": 0.18683550987918005, + "grad_norm": 2.2721002101898193, + "learning_rate": 9.965622466855679e-06, + "loss": 3.1578, + "step": 189650 + }, + { + "epoch": 0.18688476785700212, + "grad_norm": 2.401252508163452, + "learning_rate": 9.965604348800107e-06, + "loss": 3.1111, + "step": 189700 + }, + { + "epoch": 0.1869340258348242, + "grad_norm": 2.2952773571014404, + "learning_rate": 9.965586225987877e-06, + "loss": 3.2204, + "step": 189750 + }, + { + "epoch": 0.18698328381264628, + "grad_norm": 2.318816661834717, + "learning_rate": 9.965568098419004e-06, + "loss": 3.2496, + "step": 189800 + }, + { + "epoch": 
0.18703254179046838, + "grad_norm": 2.376627206802368, + "learning_rate": 9.965549966093505e-06, + "loss": 3.1094, + "step": 189850 + }, + { + "epoch": 0.18708179976829048, + "grad_norm": 2.3444344997406006, + "learning_rate": 9.9655318290114e-06, + "loss": 3.2271, + "step": 189900 + }, + { + "epoch": 0.18713105774611255, + "grad_norm": 2.2698018550872803, + "learning_rate": 9.9655136871727e-06, + "loss": 3.1162, + "step": 189950 + }, + { + "epoch": 0.18718031572393465, + "grad_norm": 2.229180335998535, + "learning_rate": 9.965495540577431e-06, + "loss": 3.205, + "step": 190000 + }, + { + "epoch": 0.18722957370175675, + "grad_norm": 2.3924190998077393, + "learning_rate": 9.965477389225604e-06, + "loss": 3.1147, + "step": 190050 + }, + { + "epoch": 0.18727883167957882, + "grad_norm": 2.9361519813537598, + "learning_rate": 9.965459233117239e-06, + "loss": 3.2207, + "step": 190100 + }, + { + "epoch": 0.18732808965740091, + "grad_norm": 2.4314749240875244, + "learning_rate": 9.965441072252353e-06, + "loss": 3.1731, + "step": 190150 + }, + { + "epoch": 0.187377347635223, + "grad_norm": 2.196521282196045, + "learning_rate": 9.965422906630964e-06, + "loss": 3.1942, + "step": 190200 + }, + { + "epoch": 0.18742660561304508, + "grad_norm": 2.4408762454986572, + "learning_rate": 9.965404736253088e-06, + "loss": 3.1712, + "step": 190250 + }, + { + "epoch": 0.18747586359086718, + "grad_norm": 2.3225667476654053, + "learning_rate": 9.965386561118742e-06, + "loss": 3.1283, + "step": 190300 + }, + { + "epoch": 0.18752512156868925, + "grad_norm": 2.425530433654785, + "learning_rate": 9.965368381227947e-06, + "loss": 3.2405, + "step": 190350 + }, + { + "epoch": 0.18757437954651135, + "grad_norm": 2.404256820678711, + "learning_rate": 9.965350196580717e-06, + "loss": 3.1879, + "step": 190400 + }, + { + "epoch": 0.18762363752433345, + "grad_norm": 2.251185655593872, + "learning_rate": 9.96533200717707e-06, + "loss": 3.2216, + "step": 190450 + }, + { + "epoch": 0.18767289550215552, + 
"grad_norm": 2.3930776119232178, + "learning_rate": 9.965313813017022e-06, + "loss": 3.1793, + "step": 190500 + }, + { + "epoch": 0.18772215347997762, + "grad_norm": 2.379326820373535, + "learning_rate": 9.965295614100593e-06, + "loss": 3.2168, + "step": 190550 + }, + { + "epoch": 0.18777141145779971, + "grad_norm": 2.487004280090332, + "learning_rate": 9.9652774104278e-06, + "loss": 3.1719, + "step": 190600 + }, + { + "epoch": 0.18782066943562178, + "grad_norm": 2.306688070297241, + "learning_rate": 9.965259201998661e-06, + "loss": 3.1912, + "step": 190650 + }, + { + "epoch": 0.18786992741344388, + "grad_norm": 2.4894628524780273, + "learning_rate": 9.965240988813191e-06, + "loss": 3.2002, + "step": 190700 + }, + { + "epoch": 0.18791918539126598, + "grad_norm": 2.3999686241149902, + "learning_rate": 9.965222770871412e-06, + "loss": 3.1331, + "step": 190750 + }, + { + "epoch": 0.18796844336908805, + "grad_norm": 2.290255069732666, + "learning_rate": 9.965204548173335e-06, + "loss": 3.1921, + "step": 190800 + }, + { + "epoch": 0.18801770134691015, + "grad_norm": 2.900500774383545, + "learning_rate": 9.965186320718982e-06, + "loss": 3.178, + "step": 190850 + }, + { + "epoch": 0.18806695932473225, + "grad_norm": 2.4093830585479736, + "learning_rate": 9.96516808850837e-06, + "loss": 3.2521, + "step": 190900 + }, + { + "epoch": 0.18811621730255432, + "grad_norm": 2.7893965244293213, + "learning_rate": 9.965149851541515e-06, + "loss": 3.203, + "step": 190950 + }, + { + "epoch": 0.18816547528037642, + "grad_norm": 2.3622825145721436, + "learning_rate": 9.965131609818435e-06, + "loss": 3.255, + "step": 191000 + }, + { + "epoch": 0.18821473325819849, + "grad_norm": 2.3034400939941406, + "learning_rate": 9.965113363339148e-06, + "loss": 3.1967, + "step": 191050 + }, + { + "epoch": 0.18826399123602058, + "grad_norm": 2.238585948944092, + "learning_rate": 9.96509511210367e-06, + "loss": 3.2214, + "step": 191100 + }, + { + "epoch": 0.18831324921384268, + "grad_norm": 
2.2969276905059814, + "learning_rate": 9.965076856112022e-06, + "loss": 3.2484, + "step": 191150 + }, + { + "epoch": 0.18836250719166475, + "grad_norm": 2.5052690505981445, + "learning_rate": 9.96505859536422e-06, + "loss": 3.1633, + "step": 191200 + }, + { + "epoch": 0.18841176516948685, + "grad_norm": 2.4098780155181885, + "learning_rate": 9.965040329860278e-06, + "loss": 3.1874, + "step": 191250 + }, + { + "epoch": 0.18846102314730895, + "grad_norm": 2.2313461303710938, + "learning_rate": 9.965022059600217e-06, + "loss": 3.2293, + "step": 191300 + }, + { + "epoch": 0.18851028112513102, + "grad_norm": 2.6183159351348877, + "learning_rate": 9.965003784584053e-06, + "loss": 3.1966, + "step": 191350 + }, + { + "epoch": 0.18855953910295312, + "grad_norm": 2.1891672611236572, + "learning_rate": 9.964985504811807e-06, + "loss": 3.2499, + "step": 191400 + }, + { + "epoch": 0.18860879708077521, + "grad_norm": 2.337702751159668, + "learning_rate": 9.96496722028349e-06, + "loss": 3.1831, + "step": 191450 + }, + { + "epoch": 0.18865805505859728, + "grad_norm": 2.3634822368621826, + "learning_rate": 9.964948930999125e-06, + "loss": 3.2413, + "step": 191500 + }, + { + "epoch": 0.18870731303641938, + "grad_norm": 2.456507921218872, + "learning_rate": 9.964930636958728e-06, + "loss": 3.1044, + "step": 191550 + }, + { + "epoch": 0.18875657101424145, + "grad_norm": 2.2775163650512695, + "learning_rate": 9.964912338162315e-06, + "loss": 3.1492, + "step": 191600 + }, + { + "epoch": 0.18880582899206355, + "grad_norm": 2.2085235118865967, + "learning_rate": 9.964894034609904e-06, + "loss": 3.1629, + "step": 191650 + }, + { + "epoch": 0.18885508696988565, + "grad_norm": 2.4059901237487793, + "learning_rate": 9.964875726301514e-06, + "loss": 3.2244, + "step": 191700 + }, + { + "epoch": 0.18890434494770772, + "grad_norm": 2.2727034091949463, + "learning_rate": 9.964857413237164e-06, + "loss": 3.1553, + "step": 191750 + }, + { + "epoch": 0.18895360292552982, + "grad_norm": 
2.2478978633880615, + "learning_rate": 9.964839095416869e-06, + "loss": 3.1959, + "step": 191800 + }, + { + "epoch": 0.18900286090335192, + "grad_norm": 2.448654890060425, + "learning_rate": 9.964820772840645e-06, + "loss": 3.1776, + "step": 191850 + }, + { + "epoch": 0.18905211888117399, + "grad_norm": 2.516153335571289, + "learning_rate": 9.964802445508512e-06, + "loss": 3.2046, + "step": 191900 + }, + { + "epoch": 0.18910137685899608, + "grad_norm": 2.467266082763672, + "learning_rate": 9.964784113420488e-06, + "loss": 3.1473, + "step": 191950 + }, + { + "epoch": 0.18915063483681818, + "grad_norm": 2.2957606315612793, + "learning_rate": 9.964765776576588e-06, + "loss": 3.165, + "step": 192000 + }, + { + "epoch": 0.18919989281464025, + "grad_norm": 2.3886754512786865, + "learning_rate": 9.964747434976833e-06, + "loss": 3.1599, + "step": 192050 + }, + { + "epoch": 0.18924915079246235, + "grad_norm": 2.3692195415496826, + "learning_rate": 9.964729088621238e-06, + "loss": 3.1689, + "step": 192100 + }, + { + "epoch": 0.18929840877028445, + "grad_norm": 2.3151657581329346, + "learning_rate": 9.96471073750982e-06, + "loss": 3.153, + "step": 192150 + }, + { + "epoch": 0.18934766674810652, + "grad_norm": 2.4680593013763428, + "learning_rate": 9.9646923816426e-06, + "loss": 3.1025, + "step": 192200 + }, + { + "epoch": 0.18939692472592862, + "grad_norm": 2.4993388652801514, + "learning_rate": 9.964674021019592e-06, + "loss": 3.1516, + "step": 192250 + }, + { + "epoch": 0.1894461827037507, + "grad_norm": 2.2265734672546387, + "learning_rate": 9.964655655640816e-06, + "loss": 3.2583, + "step": 192300 + }, + { + "epoch": 0.18949544068157279, + "grad_norm": 2.234433889389038, + "learning_rate": 9.964637285506288e-06, + "loss": 3.1798, + "step": 192350 + }, + { + "epoch": 0.18954469865939488, + "grad_norm": 2.4456393718719482, + "learning_rate": 9.964618910616026e-06, + "loss": 3.1588, + "step": 192400 + }, + { + "epoch": 0.18959395663721695, + "grad_norm": 2.2508795261383057, 
+ "learning_rate": 9.96460053097005e-06, + "loss": 3.1929, + "step": 192450 + }, + { + "epoch": 0.18964321461503905, + "grad_norm": 2.156420946121216, + "learning_rate": 9.964582146568372e-06, + "loss": 3.2179, + "step": 192500 + }, + { + "epoch": 0.18969247259286115, + "grad_norm": 2.350403070449829, + "learning_rate": 9.964563757411015e-06, + "loss": 3.2007, + "step": 192550 + }, + { + "epoch": 0.18974173057068322, + "grad_norm": 2.8115296363830566, + "learning_rate": 9.964545363497995e-06, + "loss": 3.2324, + "step": 192600 + }, + { + "epoch": 0.18979098854850532, + "grad_norm": 2.27908992767334, + "learning_rate": 9.964526964829328e-06, + "loss": 3.1823, + "step": 192650 + }, + { + "epoch": 0.18984024652632742, + "grad_norm": 2.3734939098358154, + "learning_rate": 9.964508561405034e-06, + "loss": 3.2604, + "step": 192700 + }, + { + "epoch": 0.1898895045041495, + "grad_norm": 2.5990357398986816, + "learning_rate": 9.96449015322513e-06, + "loss": 3.1182, + "step": 192750 + }, + { + "epoch": 0.18993876248197158, + "grad_norm": 2.4322078227996826, + "learning_rate": 9.964471740289632e-06, + "loss": 3.1366, + "step": 192800 + }, + { + "epoch": 0.18998802045979366, + "grad_norm": 2.651874303817749, + "learning_rate": 9.96445332259856e-06, + "loss": 3.2434, + "step": 192850 + }, + { + "epoch": 0.19003727843761575, + "grad_norm": 2.3832454681396484, + "learning_rate": 9.964434900151929e-06, + "loss": 3.1514, + "step": 192900 + }, + { + "epoch": 0.19008653641543785, + "grad_norm": 2.2981324195861816, + "learning_rate": 9.964416472949758e-06, + "loss": 3.1424, + "step": 192950 + }, + { + "epoch": 0.19013579439325992, + "grad_norm": 2.4571800231933594, + "learning_rate": 9.964398040992067e-06, + "loss": 3.1966, + "step": 193000 + }, + { + "epoch": 0.19018505237108202, + "grad_norm": 2.3524985313415527, + "learning_rate": 9.96437960427887e-06, + "loss": 3.2319, + "step": 193050 + }, + { + "epoch": 0.19023431034890412, + "grad_norm": 2.2744481563568115, + "learning_rate": 
9.964361162810186e-06, + "loss": 3.1967, + "step": 193100 + }, + { + "epoch": 0.1902835683267262, + "grad_norm": 2.3375165462493896, + "learning_rate": 9.964342716586033e-06, + "loss": 3.1686, + "step": 193150 + }, + { + "epoch": 0.19033282630454829, + "grad_norm": 2.194582462310791, + "learning_rate": 9.964324265606428e-06, + "loss": 3.2426, + "step": 193200 + }, + { + "epoch": 0.19038208428237038, + "grad_norm": 2.7396366596221924, + "learning_rate": 9.964305809871388e-06, + "loss": 3.1897, + "step": 193250 + }, + { + "epoch": 0.19043134226019245, + "grad_norm": 2.732534885406494, + "learning_rate": 9.964287349380935e-06, + "loss": 3.173, + "step": 193300 + }, + { + "epoch": 0.19048060023801455, + "grad_norm": 2.403519868850708, + "learning_rate": 9.96426888413508e-06, + "loss": 3.1772, + "step": 193350 + }, + { + "epoch": 0.19052985821583665, + "grad_norm": 2.4294614791870117, + "learning_rate": 9.964250414133848e-06, + "loss": 3.2305, + "step": 193400 + }, + { + "epoch": 0.19057911619365872, + "grad_norm": 2.2275023460388184, + "learning_rate": 9.964231939377248e-06, + "loss": 3.2027, + "step": 193450 + }, + { + "epoch": 0.19062837417148082, + "grad_norm": 2.46925950050354, + "learning_rate": 9.964213459865306e-06, + "loss": 3.1574, + "step": 193500 + }, + { + "epoch": 0.1906776321493029, + "grad_norm": 2.3353970050811768, + "learning_rate": 9.964194975598036e-06, + "loss": 3.2669, + "step": 193550 + }, + { + "epoch": 0.190726890127125, + "grad_norm": 2.2026402950286865, + "learning_rate": 9.964176486575454e-06, + "loss": 3.1596, + "step": 193600 + }, + { + "epoch": 0.19077614810494709, + "grad_norm": 2.497058868408203, + "learning_rate": 9.964157992797581e-06, + "loss": 3.2443, + "step": 193650 + }, + { + "epoch": 0.19082540608276916, + "grad_norm": 2.2916629314422607, + "learning_rate": 9.964139494264435e-06, + "loss": 3.2155, + "step": 193700 + }, + { + "epoch": 0.19087466406059125, + "grad_norm": 2.5768330097198486, + "learning_rate": 9.96412099097603e-06, 
+ "loss": 3.1433, + "step": 193750 + }, + { + "epoch": 0.19092392203841335, + "grad_norm": 2.4778990745544434, + "learning_rate": 9.964102482932385e-06, + "loss": 3.1687, + "step": 193800 + }, + { + "epoch": 0.19097318001623542, + "grad_norm": 2.578822374343872, + "learning_rate": 9.96408397013352e-06, + "loss": 3.2199, + "step": 193850 + }, + { + "epoch": 0.19102243799405752, + "grad_norm": 2.301558256149292, + "learning_rate": 9.964065452579452e-06, + "loss": 3.177, + "step": 193900 + }, + { + "epoch": 0.19107169597187962, + "grad_norm": 2.457897663116455, + "learning_rate": 9.964046930270196e-06, + "loss": 3.2086, + "step": 193950 + }, + { + "epoch": 0.1911209539497017, + "grad_norm": 2.283489942550659, + "learning_rate": 9.964028403205774e-06, + "loss": 3.2617, + "step": 194000 + }, + { + "epoch": 0.1911702119275238, + "grad_norm": 2.175835609436035, + "learning_rate": 9.9640098713862e-06, + "loss": 3.1478, + "step": 194050 + }, + { + "epoch": 0.19121946990534586, + "grad_norm": 2.514758348464966, + "learning_rate": 9.963991334811494e-06, + "loss": 3.2253, + "step": 194100 + }, + { + "epoch": 0.19126872788316795, + "grad_norm": 2.4461724758148193, + "learning_rate": 9.963972793481671e-06, + "loss": 3.1813, + "step": 194150 + }, + { + "epoch": 0.19131798586099005, + "grad_norm": 2.3089981079101562, + "learning_rate": 9.963954247396754e-06, + "loss": 3.2122, + "step": 194200 + }, + { + "epoch": 0.19136724383881212, + "grad_norm": 2.3478996753692627, + "learning_rate": 9.963935696556755e-06, + "loss": 3.1946, + "step": 194250 + }, + { + "epoch": 0.19141650181663422, + "grad_norm": 2.3709185123443604, + "learning_rate": 9.963917140961694e-06, + "loss": 3.1467, + "step": 194300 + }, + { + "epoch": 0.19146575979445632, + "grad_norm": 2.412412405014038, + "learning_rate": 9.963898580611592e-06, + "loss": 3.1587, + "step": 194350 + }, + { + "epoch": 0.1915150177722784, + "grad_norm": 2.4064080715179443, + "learning_rate": 9.963880015506463e-06, + "loss": 3.228, + 
"step": 194400 + }, + { + "epoch": 0.1915642757501005, + "grad_norm": 2.3433806896209717, + "learning_rate": 9.963861445646324e-06, + "loss": 3.1675, + "step": 194450 + }, + { + "epoch": 0.19161353372792259, + "grad_norm": 2.7242088317871094, + "learning_rate": 9.963842871031196e-06, + "loss": 3.1884, + "step": 194500 + }, + { + "epoch": 0.19166279170574466, + "grad_norm": 2.316920757293701, + "learning_rate": 9.963824291661093e-06, + "loss": 3.1764, + "step": 194550 + }, + { + "epoch": 0.19171204968356675, + "grad_norm": 2.3244409561157227, + "learning_rate": 9.963805707536036e-06, + "loss": 3.2085, + "step": 194600 + }, + { + "epoch": 0.19176130766138885, + "grad_norm": 2.331098794937134, + "learning_rate": 9.963787118656043e-06, + "loss": 3.1673, + "step": 194650 + }, + { + "epoch": 0.19181056563921092, + "grad_norm": 2.3229801654815674, + "learning_rate": 9.96376852502113e-06, + "loss": 3.1482, + "step": 194700 + }, + { + "epoch": 0.19185982361703302, + "grad_norm": 2.285534381866455, + "learning_rate": 9.963749926631315e-06, + "loss": 3.1665, + "step": 194750 + }, + { + "epoch": 0.1919090815948551, + "grad_norm": 2.4967353343963623, + "learning_rate": 9.963731323486618e-06, + "loss": 3.2219, + "step": 194800 + }, + { + "epoch": 0.1919583395726772, + "grad_norm": 2.5189368724823, + "learning_rate": 9.963712715587053e-06, + "loss": 3.1594, + "step": 194850 + }, + { + "epoch": 0.1920075975504993, + "grad_norm": 2.631378173828125, + "learning_rate": 9.963694102932641e-06, + "loss": 3.2156, + "step": 194900 + }, + { + "epoch": 0.19205685552832136, + "grad_norm": 2.2416679859161377, + "learning_rate": 9.9636754855234e-06, + "loss": 3.1704, + "step": 194950 + }, + { + "epoch": 0.19210611350614346, + "grad_norm": 2.335855484008789, + "learning_rate": 9.963656863359343e-06, + "loss": 3.1867, + "step": 195000 + }, + { + "epoch": 0.19215537148396555, + "grad_norm": 2.530660629272461, + "learning_rate": 9.963638236440494e-06, + "loss": 3.1466, + "step": 195050 + }, + { + 
"epoch": 0.19220462946178762, + "grad_norm": 2.464677333831787, + "learning_rate": 9.963619604766868e-06, + "loss": 3.2254, + "step": 195100 + }, + { + "epoch": 0.19225388743960972, + "grad_norm": 2.373227119445801, + "learning_rate": 9.963600968338483e-06, + "loss": 3.192, + "step": 195150 + }, + { + "epoch": 0.19230314541743182, + "grad_norm": 2.301301956176758, + "learning_rate": 9.963582327155356e-06, + "loss": 3.1967, + "step": 195200 + }, + { + "epoch": 0.1923524033952539, + "grad_norm": 2.333597183227539, + "learning_rate": 9.963563681217506e-06, + "loss": 3.1943, + "step": 195250 + }, + { + "epoch": 0.192401661373076, + "grad_norm": 2.222482919692993, + "learning_rate": 9.963545030524952e-06, + "loss": 3.2003, + "step": 195300 + }, + { + "epoch": 0.19245091935089806, + "grad_norm": 2.518244743347168, + "learning_rate": 9.96352637507771e-06, + "loss": 3.1716, + "step": 195350 + }, + { + "epoch": 0.19250017732872016, + "grad_norm": 2.437635898590088, + "learning_rate": 9.963507714875797e-06, + "loss": 3.2102, + "step": 195400 + }, + { + "epoch": 0.19254943530654225, + "grad_norm": 2.578450918197632, + "learning_rate": 9.963489049919233e-06, + "loss": 3.0361, + "step": 195450 + }, + { + "epoch": 0.19259869328436433, + "grad_norm": 2.291053533554077, + "learning_rate": 9.963470380208036e-06, + "loss": 3.1768, + "step": 195500 + }, + { + "epoch": 0.19264795126218642, + "grad_norm": 2.284888505935669, + "learning_rate": 9.963451705742224e-06, + "loss": 3.1621, + "step": 195550 + }, + { + "epoch": 0.19269720924000852, + "grad_norm": 2.4152355194091797, + "learning_rate": 9.963433026521812e-06, + "loss": 3.1914, + "step": 195600 + }, + { + "epoch": 0.1927464672178306, + "grad_norm": 2.288513660430908, + "learning_rate": 9.96341434254682e-06, + "loss": 3.1319, + "step": 195650 + }, + { + "epoch": 0.1927957251956527, + "grad_norm": 2.371650457382202, + "learning_rate": 9.963395653817268e-06, + "loss": 3.1691, + "step": 195700 + }, + { + "epoch": 0.1928449831734748, + 
"grad_norm": 2.560469388961792, + "learning_rate": 9.963376960333168e-06, + "loss": 3.2114, + "step": 195750 + }, + { + "epoch": 0.19289424115129686, + "grad_norm": 2.23199200630188, + "learning_rate": 9.963358262094544e-06, + "loss": 3.1571, + "step": 195800 + }, + { + "epoch": 0.19294349912911896, + "grad_norm": 2.3587799072265625, + "learning_rate": 9.96333955910141e-06, + "loss": 3.1663, + "step": 195850 + }, + { + "epoch": 0.19299275710694103, + "grad_norm": 2.535485029220581, + "learning_rate": 9.963320851353786e-06, + "loss": 3.1396, + "step": 195900 + }, + { + "epoch": 0.19304201508476312, + "grad_norm": 2.4983019828796387, + "learning_rate": 9.963302138851689e-06, + "loss": 3.2006, + "step": 195950 + }, + { + "epoch": 0.19309127306258522, + "grad_norm": 2.294947862625122, + "learning_rate": 9.963283421595139e-06, + "loss": 3.2082, + "step": 196000 + }, + { + "epoch": 0.1931405310404073, + "grad_norm": 2.242189884185791, + "learning_rate": 9.96326469958415e-06, + "loss": 3.155, + "step": 196050 + }, + { + "epoch": 0.1931897890182294, + "grad_norm": 2.2374038696289062, + "learning_rate": 9.963245972818742e-06, + "loss": 3.1288, + "step": 196100 + }, + { + "epoch": 0.1932390469960515, + "grad_norm": 2.4462459087371826, + "learning_rate": 9.963227241298935e-06, + "loss": 3.1916, + "step": 196150 + }, + { + "epoch": 0.19328830497387356, + "grad_norm": 2.417543411254883, + "learning_rate": 9.963208505024745e-06, + "loss": 3.1696, + "step": 196200 + }, + { + "epoch": 0.19333756295169566, + "grad_norm": 2.306180238723755, + "learning_rate": 9.963189763996188e-06, + "loss": 3.1968, + "step": 196250 + }, + { + "epoch": 0.19338682092951776, + "grad_norm": 2.5113680362701416, + "learning_rate": 9.963171018213284e-06, + "loss": 3.2388, + "step": 196300 + }, + { + "epoch": 0.19343607890733983, + "grad_norm": 2.5705857276916504, + "learning_rate": 9.963152267676052e-06, + "loss": 3.1763, + "step": 196350 + }, + { + "epoch": 0.19348533688516192, + "grad_norm": 
2.7328579425811768, + "learning_rate": 9.963133512384507e-06, + "loss": 3.1389, + "step": 196400 + }, + { + "epoch": 0.19353459486298402, + "grad_norm": 2.161534309387207, + "learning_rate": 9.963114752338671e-06, + "loss": 3.19, + "step": 196450 + }, + { + "epoch": 0.1935838528408061, + "grad_norm": 2.388536214828491, + "learning_rate": 9.963095987538558e-06, + "loss": 3.1997, + "step": 196500 + }, + { + "epoch": 0.1936331108186282, + "grad_norm": 2.399711847305298, + "learning_rate": 9.963077217984188e-06, + "loss": 3.1649, + "step": 196550 + }, + { + "epoch": 0.19368236879645026, + "grad_norm": 2.1534523963928223, + "learning_rate": 9.96305844367558e-06, + "loss": 3.2177, + "step": 196600 + }, + { + "epoch": 0.19373162677427236, + "grad_norm": 2.15371036529541, + "learning_rate": 9.96303966461275e-06, + "loss": 3.2178, + "step": 196650 + }, + { + "epoch": 0.19378088475209446, + "grad_norm": 2.2764859199523926, + "learning_rate": 9.963020880795715e-06, + "loss": 3.1579, + "step": 196700 + }, + { + "epoch": 0.19383014272991653, + "grad_norm": 2.4023356437683105, + "learning_rate": 9.963002092224496e-06, + "loss": 3.1625, + "step": 196750 + }, + { + "epoch": 0.19387940070773862, + "grad_norm": 2.2914934158325195, + "learning_rate": 9.962983298899109e-06, + "loss": 3.1439, + "step": 196800 + }, + { + "epoch": 0.19392865868556072, + "grad_norm": 2.465825319290161, + "learning_rate": 9.962964500819574e-06, + "loss": 3.2253, + "step": 196850 + }, + { + "epoch": 0.1939779166633828, + "grad_norm": 2.3274152278900146, + "learning_rate": 9.962945697985907e-06, + "loss": 3.2058, + "step": 196900 + }, + { + "epoch": 0.1940271746412049, + "grad_norm": 2.3699562549591064, + "learning_rate": 9.962926890398126e-06, + "loss": 3.1411, + "step": 196950 + }, + { + "epoch": 0.194076432619027, + "grad_norm": 2.378976821899414, + "learning_rate": 9.96290807805625e-06, + "loss": 3.1929, + "step": 197000 + }, + { + "epoch": 0.19412569059684906, + "grad_norm": 2.5655336380004883, + 
"learning_rate": 9.962889260960298e-06, + "loss": 3.2295, + "step": 197050 + }, + { + "epoch": 0.19417494857467116, + "grad_norm": 2.3621158599853516, + "learning_rate": 9.962870439110284e-06, + "loss": 3.0955, + "step": 197100 + }, + { + "epoch": 0.19422420655249323, + "grad_norm": 2.3521695137023926, + "learning_rate": 9.96285161250623e-06, + "loss": 3.1352, + "step": 197150 + }, + { + "epoch": 0.19427346453031533, + "grad_norm": 2.62026047706604, + "learning_rate": 9.962832781148154e-06, + "loss": 3.1936, + "step": 197200 + }, + { + "epoch": 0.19432272250813742, + "grad_norm": 2.4014358520507812, + "learning_rate": 9.962813945036072e-06, + "loss": 3.19, + "step": 197250 + }, + { + "epoch": 0.1943719804859595, + "grad_norm": 2.4127185344696045, + "learning_rate": 9.962795104170003e-06, + "loss": 3.2728, + "step": 197300 + }, + { + "epoch": 0.1944212384637816, + "grad_norm": 2.575002431869507, + "learning_rate": 9.962776258549965e-06, + "loss": 3.1712, + "step": 197350 + }, + { + "epoch": 0.1944704964416037, + "grad_norm": 2.2262978553771973, + "learning_rate": 9.962757408175975e-06, + "loss": 3.2059, + "step": 197400 + }, + { + "epoch": 0.19451975441942576, + "grad_norm": 2.4179749488830566, + "learning_rate": 9.962738553048053e-06, + "loss": 3.1328, + "step": 197450 + }, + { + "epoch": 0.19456901239724786, + "grad_norm": 2.385349750518799, + "learning_rate": 9.962719693166215e-06, + "loss": 3.1755, + "step": 197500 + }, + { + "epoch": 0.19461827037506996, + "grad_norm": 2.4968371391296387, + "learning_rate": 9.96270082853048e-06, + "loss": 3.1833, + "step": 197550 + }, + { + "epoch": 0.19466752835289203, + "grad_norm": 2.2525041103363037, + "learning_rate": 9.962681959140868e-06, + "loss": 3.1664, + "step": 197600 + }, + { + "epoch": 0.19471678633071413, + "grad_norm": 2.8358733654022217, + "learning_rate": 9.962663084997396e-06, + "loss": 3.2166, + "step": 197650 + }, + { + "epoch": 0.19476604430853622, + "grad_norm": 2.465688467025757, + "learning_rate": 
9.962644206100078e-06, + "loss": 3.2715, + "step": 197700 + }, + { + "epoch": 0.1948153022863583, + "grad_norm": 2.3951239585876465, + "learning_rate": 9.962625322448938e-06, + "loss": 3.1547, + "step": 197750 + }, + { + "epoch": 0.1948645602641804, + "grad_norm": 2.2760398387908936, + "learning_rate": 9.96260643404399e-06, + "loss": 3.1814, + "step": 197800 + }, + { + "epoch": 0.19491381824200246, + "grad_norm": 2.5958476066589355, + "learning_rate": 9.962587540885255e-06, + "loss": 3.1697, + "step": 197850 + }, + { + "epoch": 0.19496307621982456, + "grad_norm": 2.7002389430999756, + "learning_rate": 9.962568642972748e-06, + "loss": 3.1944, + "step": 197900 + }, + { + "epoch": 0.19501233419764666, + "grad_norm": 2.3151636123657227, + "learning_rate": 9.962549740306492e-06, + "loss": 3.1758, + "step": 197950 + }, + { + "epoch": 0.19506159217546873, + "grad_norm": 2.4793667793273926, + "learning_rate": 9.9625308328865e-06, + "loss": 3.1716, + "step": 198000 + }, + { + "epoch": 0.19511085015329083, + "grad_norm": 2.3488872051239014, + "learning_rate": 9.96251192071279e-06, + "loss": 3.2181, + "step": 198050 + }, + { + "epoch": 0.19516010813111292, + "grad_norm": 2.5180134773254395, + "learning_rate": 9.962493003785386e-06, + "loss": 3.1714, + "step": 198100 + }, + { + "epoch": 0.195209366108935, + "grad_norm": 2.4682064056396484, + "learning_rate": 9.962474082104298e-06, + "loss": 3.1384, + "step": 198150 + }, + { + "epoch": 0.1952586240867571, + "grad_norm": 2.598789691925049, + "learning_rate": 9.962455155669553e-06, + "loss": 3.1643, + "step": 198200 + }, + { + "epoch": 0.1953078820645792, + "grad_norm": 2.2681994438171387, + "learning_rate": 9.962436224481161e-06, + "loss": 3.1153, + "step": 198250 + }, + { + "epoch": 0.19535714004240126, + "grad_norm": 2.3721683025360107, + "learning_rate": 9.962417288539146e-06, + "loss": 3.1922, + "step": 198300 + }, + { + "epoch": 0.19540639802022336, + "grad_norm": 2.248358964920044, + "learning_rate": 9.962398347843523e-06, 
+ "loss": 3.1414, + "step": 198350 + }, + { + "epoch": 0.19545565599804543, + "grad_norm": 2.37998628616333, + "learning_rate": 9.962379402394312e-06, + "loss": 3.216, + "step": 198400 + }, + { + "epoch": 0.19550491397586753, + "grad_norm": 2.3856618404388428, + "learning_rate": 9.962360452191528e-06, + "loss": 3.1578, + "step": 198450 + }, + { + "epoch": 0.19555417195368963, + "grad_norm": 2.5816261768341064, + "learning_rate": 9.962341497235191e-06, + "loss": 3.1771, + "step": 198500 + }, + { + "epoch": 0.1956034299315117, + "grad_norm": 2.229011297225952, + "learning_rate": 9.962322537525322e-06, + "loss": 3.19, + "step": 198550 + }, + { + "epoch": 0.1956526879093338, + "grad_norm": 2.4047539234161377, + "learning_rate": 9.962303573061934e-06, + "loss": 3.1503, + "step": 198600 + }, + { + "epoch": 0.1957019458871559, + "grad_norm": 2.409688711166382, + "learning_rate": 9.962284603845049e-06, + "loss": 3.2172, + "step": 198650 + }, + { + "epoch": 0.19575120386497796, + "grad_norm": 2.312586784362793, + "learning_rate": 9.962265629874684e-06, + "loss": 3.2165, + "step": 198700 + }, + { + "epoch": 0.19580046184280006, + "grad_norm": 2.4302942752838135, + "learning_rate": 9.962246651150857e-06, + "loss": 3.2028, + "step": 198750 + }, + { + "epoch": 0.19584971982062216, + "grad_norm": 2.301647186279297, + "learning_rate": 9.962227667673587e-06, + "loss": 3.1915, + "step": 198800 + }, + { + "epoch": 0.19589897779844423, + "grad_norm": 2.356356382369995, + "learning_rate": 9.96220867944289e-06, + "loss": 3.1627, + "step": 198850 + }, + { + "epoch": 0.19594823577626633, + "grad_norm": 2.259392499923706, + "learning_rate": 9.962189686458788e-06, + "loss": 3.1367, + "step": 198900 + }, + { + "epoch": 0.19599749375408843, + "grad_norm": 2.43422532081604, + "learning_rate": 9.962170688721296e-06, + "loss": 3.1738, + "step": 198950 + }, + { + "epoch": 0.1960467517319105, + "grad_norm": 2.4379403591156006, + "learning_rate": 9.96215168623043e-06, + "loss": 3.1603, + "step": 
199000 + }, + { + "epoch": 0.1960960097097326, + "grad_norm": 2.1605894565582275, + "learning_rate": 9.962132678986215e-06, + "loss": 3.1717, + "step": 199050 + }, + { + "epoch": 0.19614526768755466, + "grad_norm": 2.6138803958892822, + "learning_rate": 9.962113666988664e-06, + "loss": 3.1361, + "step": 199100 + }, + { + "epoch": 0.19619452566537676, + "grad_norm": 2.4692981243133545, + "learning_rate": 9.962094650237797e-06, + "loss": 3.1057, + "step": 199150 + }, + { + "epoch": 0.19624378364319886, + "grad_norm": 2.5575692653656006, + "learning_rate": 9.962075628733631e-06, + "loss": 3.1856, + "step": 199200 + }, + { + "epoch": 0.19629304162102093, + "grad_norm": 2.375133514404297, + "learning_rate": 9.962056602476187e-06, + "loss": 3.1883, + "step": 199250 + }, + { + "epoch": 0.19634229959884303, + "grad_norm": 2.3268232345581055, + "learning_rate": 9.962037571465479e-06, + "loss": 3.1597, + "step": 199300 + }, + { + "epoch": 0.19639155757666513, + "grad_norm": 2.4489972591400146, + "learning_rate": 9.96201853570153e-06, + "loss": 3.2092, + "step": 199350 + }, + { + "epoch": 0.1964408155544872, + "grad_norm": 2.386378526687622, + "learning_rate": 9.961999495184354e-06, + "loss": 3.1872, + "step": 199400 + }, + { + "epoch": 0.1964900735323093, + "grad_norm": 2.471327543258667, + "learning_rate": 9.961980449913972e-06, + "loss": 3.2072, + "step": 199450 + }, + { + "epoch": 0.1965393315101314, + "grad_norm": 2.1944525241851807, + "learning_rate": 9.961961399890401e-06, + "loss": 3.1711, + "step": 199500 + }, + { + "epoch": 0.19658858948795346, + "grad_norm": 2.346181631088257, + "learning_rate": 9.96194234511366e-06, + "loss": 3.2251, + "step": 199550 + }, + { + "epoch": 0.19663784746577556, + "grad_norm": 2.348905563354492, + "learning_rate": 9.961923285583765e-06, + "loss": 3.2123, + "step": 199600 + }, + { + "epoch": 0.19668710544359763, + "grad_norm": 2.3777413368225098, + "learning_rate": 9.961904221300738e-06, + "loss": 3.167, + "step": 199650 + }, + { + 
"epoch": 0.19673636342141973, + "grad_norm": 2.30802321434021, + "learning_rate": 9.961885152264595e-06, + "loss": 3.1885, + "step": 199700 + }, + { + "epoch": 0.19678562139924183, + "grad_norm": 2.48321270942688, + "learning_rate": 9.961866078475353e-06, + "loss": 3.1363, + "step": 199750 + }, + { + "epoch": 0.1968348793770639, + "grad_norm": 2.4374709129333496, + "learning_rate": 9.961846999933032e-06, + "loss": 3.2174, + "step": 199800 + }, + { + "epoch": 0.196884137354886, + "grad_norm": 2.846295118331909, + "learning_rate": 9.961827916637652e-06, + "loss": 3.1916, + "step": 199850 + }, + { + "epoch": 0.1969333953327081, + "grad_norm": 2.375593423843384, + "learning_rate": 9.961808828589229e-06, + "loss": 3.1647, + "step": 199900 + }, + { + "epoch": 0.19698265331053016, + "grad_norm": 2.3127782344818115, + "learning_rate": 9.96178973578778e-06, + "loss": 3.1941, + "step": 199950 + }, + { + "epoch": 0.19703191128835226, + "grad_norm": 2.329901695251465, + "learning_rate": 9.961770638233325e-06, + "loss": 3.1717, + "step": 200000 + }, + { + "epoch": 0.19708116926617436, + "grad_norm": 2.3490004539489746, + "learning_rate": 9.961751535925885e-06, + "loss": 3.1307, + "step": 200050 + }, + { + "epoch": 0.19713042724399643, + "grad_norm": 2.347968578338623, + "learning_rate": 9.961732428865474e-06, + "loss": 3.2119, + "step": 200100 + }, + { + "epoch": 0.19717968522181853, + "grad_norm": 2.2133829593658447, + "learning_rate": 9.961713317052112e-06, + "loss": 3.1709, + "step": 200150 + }, + { + "epoch": 0.19722894319964063, + "grad_norm": 2.3734090328216553, + "learning_rate": 9.961694200485816e-06, + "loss": 3.1486, + "step": 200200 + }, + { + "epoch": 0.1972782011774627, + "grad_norm": 2.529090166091919, + "learning_rate": 9.961675079166608e-06, + "loss": 3.2172, + "step": 200250 + }, + { + "epoch": 0.1973274591552848, + "grad_norm": 2.2635481357574463, + "learning_rate": 9.961655953094501e-06, + "loss": 3.187, + "step": 200300 + }, + { + "epoch": 
0.19737671713310687, + "grad_norm": 2.3745622634887695, + "learning_rate": 9.961636822269519e-06, + "loss": 3.1741, + "step": 200350 + }, + { + "epoch": 0.19742597511092896, + "grad_norm": 2.3730173110961914, + "learning_rate": 9.961617686691675e-06, + "loss": 3.2237, + "step": 200400 + }, + { + "epoch": 0.19747523308875106, + "grad_norm": 2.2681336402893066, + "learning_rate": 9.96159854636099e-06, + "loss": 3.136, + "step": 200450 + }, + { + "epoch": 0.19752449106657313, + "grad_norm": 2.6144564151763916, + "learning_rate": 9.961579401277484e-06, + "loss": 3.1863, + "step": 200500 + }, + { + "epoch": 0.19757374904439523, + "grad_norm": 2.2651357650756836, + "learning_rate": 9.961560251441172e-06, + "loss": 3.1832, + "step": 200550 + }, + { + "epoch": 0.19762300702221733, + "grad_norm": 2.178575038909912, + "learning_rate": 9.961541096852073e-06, + "loss": 3.172, + "step": 200600 + }, + { + "epoch": 0.1976722650000394, + "grad_norm": 2.2683420181274414, + "learning_rate": 9.961521937510208e-06, + "loss": 3.1787, + "step": 200650 + }, + { + "epoch": 0.1977215229778615, + "grad_norm": 2.2300939559936523, + "learning_rate": 9.961502773415593e-06, + "loss": 3.165, + "step": 200700 + }, + { + "epoch": 0.1977707809556836, + "grad_norm": 2.563718318939209, + "learning_rate": 9.961483604568244e-06, + "loss": 3.1805, + "step": 200750 + }, + { + "epoch": 0.19782003893350567, + "grad_norm": 2.2559280395507812, + "learning_rate": 9.961464430968186e-06, + "loss": 3.1962, + "step": 200800 + }, + { + "epoch": 0.19786929691132776, + "grad_norm": 2.5315680503845215, + "learning_rate": 9.961445252615434e-06, + "loss": 3.1266, + "step": 200850 + }, + { + "epoch": 0.19791855488914983, + "grad_norm": 2.3059022426605225, + "learning_rate": 9.961426069510002e-06, + "loss": 3.1499, + "step": 200900 + }, + { + "epoch": 0.19796781286697193, + "grad_norm": 2.3210031986236572, + "learning_rate": 9.961406881651916e-06, + "loss": 3.1193, + "step": 200950 + }, + { + "epoch": 
0.19801707084479403, + "grad_norm": 2.5008418560028076, + "learning_rate": 9.96138768904119e-06, + "loss": 3.1331, + "step": 201000 + }, + { + "epoch": 0.1980663288226161, + "grad_norm": 2.245042085647583, + "learning_rate": 9.961368491677842e-06, + "loss": 3.1097, + "step": 201050 + }, + { + "epoch": 0.1981155868004382, + "grad_norm": 2.249685049057007, + "learning_rate": 9.961349289561892e-06, + "loss": 3.1945, + "step": 201100 + }, + { + "epoch": 0.1981648447782603, + "grad_norm": 2.6442301273345947, + "learning_rate": 9.961330082693356e-06, + "loss": 3.154, + "step": 201150 + }, + { + "epoch": 0.19821410275608237, + "grad_norm": 2.5741302967071533, + "learning_rate": 9.961310871072259e-06, + "loss": 3.1217, + "step": 201200 + }, + { + "epoch": 0.19826336073390446, + "grad_norm": 2.356677770614624, + "learning_rate": 9.96129165469861e-06, + "loss": 3.2068, + "step": 201250 + }, + { + "epoch": 0.19831261871172656, + "grad_norm": 2.3624699115753174, + "learning_rate": 9.961272433572434e-06, + "loss": 3.1714, + "step": 201300 + }, + { + "epoch": 0.19836187668954863, + "grad_norm": 2.552502393722534, + "learning_rate": 9.961253207693748e-06, + "loss": 3.1822, + "step": 201350 + }, + { + "epoch": 0.19841113466737073, + "grad_norm": 2.6163618564605713, + "learning_rate": 9.96123397706257e-06, + "loss": 3.2711, + "step": 201400 + }, + { + "epoch": 0.19846039264519283, + "grad_norm": 2.3790595531463623, + "learning_rate": 9.961214741678917e-06, + "loss": 3.1501, + "step": 201450 + }, + { + "epoch": 0.1985096506230149, + "grad_norm": 2.5507352352142334, + "learning_rate": 9.96119550154281e-06, + "loss": 3.1609, + "step": 201500 + }, + { + "epoch": 0.198558908600837, + "grad_norm": 2.3799893856048584, + "learning_rate": 9.961176256654266e-06, + "loss": 3.2122, + "step": 201550 + }, + { + "epoch": 0.19860816657865907, + "grad_norm": 2.3326478004455566, + "learning_rate": 9.961157007013302e-06, + "loss": 3.1995, + "step": 201600 + }, + { + "epoch": 0.19865742455648117, + 
"grad_norm": 2.434476375579834, + "learning_rate": 9.96113775261994e-06, + "loss": 3.1316, + "step": 201650 + }, + { + "epoch": 0.19870668253430326, + "grad_norm": 2.277432680130005, + "learning_rate": 9.961118493474195e-06, + "loss": 3.1644, + "step": 201700 + }, + { + "epoch": 0.19875594051212533, + "grad_norm": 2.6430203914642334, + "learning_rate": 9.961099229576089e-06, + "loss": 3.1755, + "step": 201750 + }, + { + "epoch": 0.19880519848994743, + "grad_norm": 2.235264301300049, + "learning_rate": 9.961079960925637e-06, + "loss": 3.1889, + "step": 201800 + }, + { + "epoch": 0.19885445646776953, + "grad_norm": 2.3129382133483887, + "learning_rate": 9.96106068752286e-06, + "loss": 3.187, + "step": 201850 + }, + { + "epoch": 0.1989037144455916, + "grad_norm": 2.230006456375122, + "learning_rate": 9.961041409367773e-06, + "loss": 3.0785, + "step": 201900 + }, + { + "epoch": 0.1989529724234137, + "grad_norm": 2.4203357696533203, + "learning_rate": 9.9610221264604e-06, + "loss": 3.1672, + "step": 201950 + }, + { + "epoch": 0.1990022304012358, + "grad_norm": 2.3013617992401123, + "learning_rate": 9.961002838800755e-06, + "loss": 3.1561, + "step": 202000 + }, + { + "epoch": 0.19905148837905787, + "grad_norm": 2.3852150440216064, + "learning_rate": 9.960983546388855e-06, + "loss": 3.1547, + "step": 202050 + }, + { + "epoch": 0.19910074635687997, + "grad_norm": 2.3615288734436035, + "learning_rate": 9.960964249224725e-06, + "loss": 3.1832, + "step": 202100 + }, + { + "epoch": 0.19915000433470204, + "grad_norm": 2.3716626167297363, + "learning_rate": 9.960944947308377e-06, + "loss": 3.1837, + "step": 202150 + }, + { + "epoch": 0.19919926231252413, + "grad_norm": 2.329118490219116, + "learning_rate": 9.960925640639834e-06, + "loss": 3.1237, + "step": 202200 + }, + { + "epoch": 0.19924852029034623, + "grad_norm": 3.411515474319458, + "learning_rate": 9.960906329219114e-06, + "loss": 3.1649, + "step": 202250 + }, + { + "epoch": 0.1992977782681683, + "grad_norm": 
2.3965423107147217, + "learning_rate": 9.960887013046233e-06, + "loss": 3.0808, + "step": 202300 + }, + { + "epoch": 0.1993470362459904, + "grad_norm": 2.792933225631714, + "learning_rate": 9.960867692121209e-06, + "loss": 3.1782, + "step": 202350 + }, + { + "epoch": 0.1993962942238125, + "grad_norm": 2.4136593341827393, + "learning_rate": 9.960848366444064e-06, + "loss": 3.1281, + "step": 202400 + }, + { + "epoch": 0.19944555220163457, + "grad_norm": 2.5002527236938477, + "learning_rate": 9.960829036014815e-06, + "loss": 3.1564, + "step": 202450 + }, + { + "epoch": 0.19949481017945667, + "grad_norm": 2.3341305255889893, + "learning_rate": 9.960809700833477e-06, + "loss": 3.1716, + "step": 202500 + }, + { + "epoch": 0.19954406815727876, + "grad_norm": 2.292515516281128, + "learning_rate": 9.960790360900075e-06, + "loss": 3.1689, + "step": 202550 + }, + { + "epoch": 0.19959332613510083, + "grad_norm": 2.278526782989502, + "learning_rate": 9.960771016214623e-06, + "loss": 3.1087, + "step": 202600 + }, + { + "epoch": 0.19964258411292293, + "grad_norm": 2.190265655517578, + "learning_rate": 9.960751666777142e-06, + "loss": 3.153, + "step": 202650 + }, + { + "epoch": 0.19969184209074503, + "grad_norm": 2.379063844680786, + "learning_rate": 9.960732312587648e-06, + "loss": 3.1745, + "step": 202700 + }, + { + "epoch": 0.1997411000685671, + "grad_norm": 2.3188283443450928, + "learning_rate": 9.960712953646163e-06, + "loss": 3.1328, + "step": 202750 + }, + { + "epoch": 0.1997903580463892, + "grad_norm": 2.407381296157837, + "learning_rate": 9.960693589952702e-06, + "loss": 3.2262, + "step": 202800 + }, + { + "epoch": 0.19983961602421127, + "grad_norm": 2.347661018371582, + "learning_rate": 9.960674221507284e-06, + "loss": 3.196, + "step": 202850 + }, + { + "epoch": 0.19988887400203337, + "grad_norm": 2.281912088394165, + "learning_rate": 9.96065484830993e-06, + "loss": 3.1594, + "step": 202900 + }, + { + "epoch": 0.19993813197985547, + "grad_norm": 2.600001573562622, + 
"learning_rate": 9.960635470360657e-06, + "loss": 3.1384, + "step": 202950 + }, + { + "epoch": 0.19998738995767754, + "grad_norm": 2.632993698120117, + "learning_rate": 9.960616087659483e-06, + "loss": 3.0998, + "step": 203000 + }, + { + "epoch": 0.20003664793549963, + "grad_norm": 2.7869346141815186, + "learning_rate": 9.960596700206428e-06, + "loss": 3.1866, + "step": 203050 + }, + { + "epoch": 0.20008590591332173, + "grad_norm": 2.527700185775757, + "learning_rate": 9.96057730800151e-06, + "loss": 3.1591, + "step": 203100 + }, + { + "epoch": 0.2001351638911438, + "grad_norm": 2.3003110885620117, + "learning_rate": 9.960557911044747e-06, + "loss": 3.1136, + "step": 203150 + }, + { + "epoch": 0.2001844218689659, + "grad_norm": 2.369945764541626, + "learning_rate": 9.960538509336156e-06, + "loss": 3.2229, + "step": 203200 + }, + { + "epoch": 0.200233679846788, + "grad_norm": 2.3069117069244385, + "learning_rate": 9.960519102875759e-06, + "loss": 3.1869, + "step": 203250 + }, + { + "epoch": 0.20028293782461007, + "grad_norm": 2.6560218334198, + "learning_rate": 9.960499691663574e-06, + "loss": 3.1957, + "step": 203300 + }, + { + "epoch": 0.20033219580243217, + "grad_norm": 2.306450366973877, + "learning_rate": 9.960480275699618e-06, + "loss": 3.1803, + "step": 203350 + }, + { + "epoch": 0.20038145378025424, + "grad_norm": 2.3503198623657227, + "learning_rate": 9.960460854983911e-06, + "loss": 3.1251, + "step": 203400 + }, + { + "epoch": 0.20043071175807634, + "grad_norm": 2.3734047412872314, + "learning_rate": 9.96044142951647e-06, + "loss": 3.1241, + "step": 203450 + }, + { + "epoch": 0.20047996973589843, + "grad_norm": 2.2274351119995117, + "learning_rate": 9.960421999297316e-06, + "loss": 3.1808, + "step": 203500 + }, + { + "epoch": 0.2005292277137205, + "grad_norm": 2.2439446449279785, + "learning_rate": 9.960402564326464e-06, + "loss": 3.1348, + "step": 203550 + }, + { + "epoch": 0.2005784856915426, + "grad_norm": 2.335048198699951, + "learning_rate": 
9.960383124603936e-06, + "loss": 3.1355, + "step": 203600 + }, + { + "epoch": 0.2006277436693647, + "grad_norm": 2.2605020999908447, + "learning_rate": 9.96036368012975e-06, + "loss": 3.1636, + "step": 203650 + }, + { + "epoch": 0.20067700164718677, + "grad_norm": 2.4001994132995605, + "learning_rate": 9.960344230903923e-06, + "loss": 3.1298, + "step": 203700 + }, + { + "epoch": 0.20072625962500887, + "grad_norm": 2.2080001831054688, + "learning_rate": 9.960324776926476e-06, + "loss": 3.1332, + "step": 203750 + }, + { + "epoch": 0.20077551760283097, + "grad_norm": 2.4068219661712646, + "learning_rate": 9.960305318197424e-06, + "loss": 3.1914, + "step": 203800 + }, + { + "epoch": 0.20082477558065304, + "grad_norm": 2.207080602645874, + "learning_rate": 9.960285854716789e-06, + "loss": 3.1311, + "step": 203850 + }, + { + "epoch": 0.20087403355847513, + "grad_norm": 2.424910068511963, + "learning_rate": 9.96026638648459e-06, + "loss": 3.1171, + "step": 203900 + }, + { + "epoch": 0.2009232915362972, + "grad_norm": 2.374262809753418, + "learning_rate": 9.960246913500841e-06, + "loss": 3.1307, + "step": 203950 + }, + { + "epoch": 0.2009725495141193, + "grad_norm": 2.262049436569214, + "learning_rate": 9.960227435765567e-06, + "loss": 3.1365, + "step": 204000 + }, + { + "epoch": 0.2010218074919414, + "grad_norm": 2.3736376762390137, + "learning_rate": 9.960207953278783e-06, + "loss": 3.1488, + "step": 204050 + }, + { + "epoch": 0.20107106546976347, + "grad_norm": 2.353731870651245, + "learning_rate": 9.960188466040508e-06, + "loss": 3.1999, + "step": 204100 + }, + { + "epoch": 0.20112032344758557, + "grad_norm": 2.6353390216827393, + "learning_rate": 9.96016897405076e-06, + "loss": 3.1685, + "step": 204150 + }, + { + "epoch": 0.20116958142540767, + "grad_norm": 2.5656416416168213, + "learning_rate": 9.960149477309558e-06, + "loss": 3.1138, + "step": 204200 + }, + { + "epoch": 0.20121883940322974, + "grad_norm": 2.6922476291656494, + "learning_rate": 9.960129975816923e-06, 
+ "loss": 3.1546, + "step": 204250 + }, + { + "epoch": 0.20126809738105184, + "grad_norm": 2.360347032546997, + "learning_rate": 9.96011046957287e-06, + "loss": 3.214, + "step": 204300 + }, + { + "epoch": 0.20131735535887393, + "grad_norm": 2.2466230392456055, + "learning_rate": 9.960090958577423e-06, + "loss": 3.1493, + "step": 204350 + }, + { + "epoch": 0.201366613336696, + "grad_norm": 2.248671531677246, + "learning_rate": 9.960071442830594e-06, + "loss": 3.2063, + "step": 204400 + }, + { + "epoch": 0.2014158713145181, + "grad_norm": 2.4156911373138428, + "learning_rate": 9.960051922332405e-06, + "loss": 3.2141, + "step": 204450 + }, + { + "epoch": 0.2014651292923402, + "grad_norm": 2.4040310382843018, + "learning_rate": 9.960032397082876e-06, + "loss": 3.2051, + "step": 204500 + }, + { + "epoch": 0.20151438727016227, + "grad_norm": 2.28920841217041, + "learning_rate": 9.960012867082025e-06, + "loss": 3.1104, + "step": 204550 + }, + { + "epoch": 0.20156364524798437, + "grad_norm": 2.74505877494812, + "learning_rate": 9.959993332329869e-06, + "loss": 3.1529, + "step": 204600 + }, + { + "epoch": 0.20161290322580644, + "grad_norm": 2.2143354415893555, + "learning_rate": 9.959973792826427e-06, + "loss": 3.1713, + "step": 204650 + }, + { + "epoch": 0.20166216120362854, + "grad_norm": 2.4952709674835205, + "learning_rate": 9.959954248571719e-06, + "loss": 3.1975, + "step": 204700 + }, + { + "epoch": 0.20171141918145064, + "grad_norm": 2.220656633377075, + "learning_rate": 9.959934699565764e-06, + "loss": 3.1876, + "step": 204750 + }, + { + "epoch": 0.2017606771592727, + "grad_norm": 2.295731544494629, + "learning_rate": 9.959915145808579e-06, + "loss": 3.138, + "step": 204800 + }, + { + "epoch": 0.2018099351370948, + "grad_norm": 2.709906816482544, + "learning_rate": 9.959895587300185e-06, + "loss": 3.1562, + "step": 204850 + }, + { + "epoch": 0.2018591931149169, + "grad_norm": 2.338934898376465, + "learning_rate": 9.959876024040598e-06, + "loss": 3.1895, + "step": 
204900 + }, + { + "epoch": 0.20190845109273897, + "grad_norm": 2.4069290161132812, + "learning_rate": 9.959856456029839e-06, + "loss": 3.233, + "step": 204950 + }, + { + "epoch": 0.20195770907056107, + "grad_norm": 3.108976364135742, + "learning_rate": 9.959836883267927e-06, + "loss": 3.1659, + "step": 205000 + }, + { + "epoch": 0.20200696704838317, + "grad_norm": 2.4821436405181885, + "learning_rate": 9.959817305754878e-06, + "loss": 3.2422, + "step": 205050 + }, + { + "epoch": 0.20205622502620524, + "grad_norm": 2.3242716789245605, + "learning_rate": 9.959797723490711e-06, + "loss": 3.1409, + "step": 205100 + }, + { + "epoch": 0.20210548300402734, + "grad_norm": 2.4353396892547607, + "learning_rate": 9.959778136475448e-06, + "loss": 3.1642, + "step": 205150 + }, + { + "epoch": 0.2021547409818494, + "grad_norm": 2.2855384349823, + "learning_rate": 9.959758544709107e-06, + "loss": 3.2181, + "step": 205200 + }, + { + "epoch": 0.2022039989596715, + "grad_norm": 2.340528726577759, + "learning_rate": 9.959738948191705e-06, + "loss": 3.1685, + "step": 205250 + }, + { + "epoch": 0.2022532569374936, + "grad_norm": 2.4178950786590576, + "learning_rate": 9.95971934692326e-06, + "loss": 3.1245, + "step": 205300 + }, + { + "epoch": 0.20230251491531567, + "grad_norm": 2.2373764514923096, + "learning_rate": 9.959699740903794e-06, + "loss": 3.204, + "step": 205350 + }, + { + "epoch": 0.20235177289313777, + "grad_norm": 2.7442729473114014, + "learning_rate": 9.959680130133322e-06, + "loss": 3.1916, + "step": 205400 + }, + { + "epoch": 0.20240103087095987, + "grad_norm": 2.2657668590545654, + "learning_rate": 9.959660514611866e-06, + "loss": 3.1174, + "step": 205450 + }, + { + "epoch": 0.20245028884878194, + "grad_norm": 2.1790337562561035, + "learning_rate": 9.959640894339444e-06, + "loss": 3.1234, + "step": 205500 + }, + { + "epoch": 0.20249954682660404, + "grad_norm": 2.4008233547210693, + "learning_rate": 9.959621269316075e-06, + "loss": 3.171, + "step": 205550 + }, + { + 
"epoch": 0.20254880480442614, + "grad_norm": 2.408280372619629, + "learning_rate": 9.959601639541776e-06, + "loss": 3.2001, + "step": 205600 + }, + { + "epoch": 0.2025980627822482, + "grad_norm": 2.35282564163208, + "learning_rate": 9.959582005016567e-06, + "loss": 3.1774, + "step": 205650 + }, + { + "epoch": 0.2026473207600703, + "grad_norm": 2.2874209880828857, + "learning_rate": 9.959562365740468e-06, + "loss": 3.0817, + "step": 205700 + }, + { + "epoch": 0.2026965787378924, + "grad_norm": 4.549248695373535, + "learning_rate": 9.959542721713496e-06, + "loss": 3.237, + "step": 205750 + }, + { + "epoch": 0.20274583671571447, + "grad_norm": 2.7091376781463623, + "learning_rate": 9.959523072935669e-06, + "loss": 3.1151, + "step": 205800 + }, + { + "epoch": 0.20279509469353657, + "grad_norm": 2.4826924800872803, + "learning_rate": 9.959503419407009e-06, + "loss": 3.1787, + "step": 205850 + }, + { + "epoch": 0.20284435267135864, + "grad_norm": 2.314362049102783, + "learning_rate": 9.959483761127532e-06, + "loss": 3.1894, + "step": 205900 + }, + { + "epoch": 0.20289361064918074, + "grad_norm": 2.2838730812072754, + "learning_rate": 9.959464098097257e-06, + "loss": 3.1106, + "step": 205950 + }, + { + "epoch": 0.20294286862700284, + "grad_norm": 2.597059965133667, + "learning_rate": 9.959444430316207e-06, + "loss": 3.203, + "step": 206000 + }, + { + "epoch": 0.2029921266048249, + "grad_norm": 2.4173622131347656, + "learning_rate": 9.959424757784395e-06, + "loss": 3.158, + "step": 206050 + }, + { + "epoch": 0.203041384582647, + "grad_norm": 2.3075854778289795, + "learning_rate": 9.959405080501842e-06, + "loss": 3.1513, + "step": 206100 + }, + { + "epoch": 0.2030906425604691, + "grad_norm": 2.429185390472412, + "learning_rate": 9.959385398468571e-06, + "loss": 3.1259, + "step": 206150 + }, + { + "epoch": 0.20313990053829117, + "grad_norm": 2.219280958175659, + "learning_rate": 9.959365711684595e-06, + "loss": 3.1441, + "step": 206200 + }, + { + "epoch": 
0.20318915851611327, + "grad_norm": 3.4745967388153076, + "learning_rate": 9.959346020149933e-06, + "loss": 3.1287, + "step": 206250 + }, + { + "epoch": 0.20323841649393537, + "grad_norm": 2.303740978240967, + "learning_rate": 9.959326323864608e-06, + "loss": 3.1943, + "step": 206300 + }, + { + "epoch": 0.20328767447175744, + "grad_norm": 2.2190935611724854, + "learning_rate": 9.959306622828636e-06, + "loss": 3.163, + "step": 206350 + }, + { + "epoch": 0.20333693244957954, + "grad_norm": 2.2731540203094482, + "learning_rate": 9.959286917042038e-06, + "loss": 3.2071, + "step": 206400 + }, + { + "epoch": 0.2033861904274016, + "grad_norm": 2.3589987754821777, + "learning_rate": 9.95926720650483e-06, + "loss": 3.1965, + "step": 206450 + }, + { + "epoch": 0.2034354484052237, + "grad_norm": 2.3886425495147705, + "learning_rate": 9.959247491217032e-06, + "loss": 3.1303, + "step": 206500 + }, + { + "epoch": 0.2034847063830458, + "grad_norm": 2.4407129287719727, + "learning_rate": 9.959227771178663e-06, + "loss": 3.214, + "step": 206550 + }, + { + "epoch": 0.20353396436086788, + "grad_norm": 2.655928373336792, + "learning_rate": 9.959208046389743e-06, + "loss": 3.132, + "step": 206600 + }, + { + "epoch": 0.20358322233868997, + "grad_norm": 2.5741915702819824, + "learning_rate": 9.95918831685029e-06, + "loss": 3.1545, + "step": 206650 + }, + { + "epoch": 0.20363248031651207, + "grad_norm": 2.2618424892425537, + "learning_rate": 9.959168582560323e-06, + "loss": 3.1544, + "step": 206700 + }, + { + "epoch": 0.20368173829433414, + "grad_norm": 2.370654821395874, + "learning_rate": 9.959148843519862e-06, + "loss": 3.1963, + "step": 206750 + }, + { + "epoch": 0.20373099627215624, + "grad_norm": 2.3089559078216553, + "learning_rate": 9.959129099728924e-06, + "loss": 3.14, + "step": 206800 + }, + { + "epoch": 0.20378025424997834, + "grad_norm": 2.3521053791046143, + "learning_rate": 9.959109351187528e-06, + "loss": 3.1428, + "step": 206850 + }, + { + "epoch": 0.2038295122278004, + 
"grad_norm": 2.2573249340057373, + "learning_rate": 9.959089597895693e-06, + "loss": 3.0746, + "step": 206900 + }, + { + "epoch": 0.2038787702056225, + "grad_norm": 2.378023147583008, + "learning_rate": 9.95906983985344e-06, + "loss": 3.1807, + "step": 206950 + }, + { + "epoch": 0.2039280281834446, + "grad_norm": 2.686462879180908, + "learning_rate": 9.959050077060785e-06, + "loss": 3.1223, + "step": 207000 + }, + { + "epoch": 0.20397728616126667, + "grad_norm": 2.35837984085083, + "learning_rate": 9.95903030951775e-06, + "loss": 3.218, + "step": 207050 + }, + { + "epoch": 0.20402654413908877, + "grad_norm": 2.211061954498291, + "learning_rate": 9.959010537224351e-06, + "loss": 3.1777, + "step": 207100 + }, + { + "epoch": 0.20407580211691084, + "grad_norm": 2.2865025997161865, + "learning_rate": 9.958990760180608e-06, + "loss": 3.1164, + "step": 207150 + }, + { + "epoch": 0.20412506009473294, + "grad_norm": 2.413261890411377, + "learning_rate": 9.958970978386541e-06, + "loss": 3.1057, + "step": 207200 + }, + { + "epoch": 0.20417431807255504, + "grad_norm": 2.3077125549316406, + "learning_rate": 9.958951191842168e-06, + "loss": 3.1341, + "step": 207250 + }, + { + "epoch": 0.2042235760503771, + "grad_norm": 2.389791965484619, + "learning_rate": 9.958931400547507e-06, + "loss": 3.1285, + "step": 207300 + }, + { + "epoch": 0.2042728340281992, + "grad_norm": 2.2830917835235596, + "learning_rate": 9.958911604502578e-06, + "loss": 3.1478, + "step": 207350 + }, + { + "epoch": 0.2043220920060213, + "grad_norm": 2.4379444122314453, + "learning_rate": 9.958891803707403e-06, + "loss": 3.1836, + "step": 207400 + }, + { + "epoch": 0.20437134998384338, + "grad_norm": 2.4105894565582275, + "learning_rate": 9.958871998161995e-06, + "loss": 3.1238, + "step": 207450 + }, + { + "epoch": 0.20442060796166547, + "grad_norm": 2.8004002571105957, + "learning_rate": 9.958852187866377e-06, + "loss": 3.1343, + "step": 207500 + }, + { + "epoch": 0.20446986593948757, + "grad_norm": 
2.2657265663146973, + "learning_rate": 9.958832372820567e-06, + "loss": 3.1498, + "step": 207550 + }, + { + "epoch": 0.20451912391730964, + "grad_norm": 2.269650936126709, + "learning_rate": 9.958812553024584e-06, + "loss": 3.1549, + "step": 207600 + }, + { + "epoch": 0.20456838189513174, + "grad_norm": 2.3662571907043457, + "learning_rate": 9.958792728478445e-06, + "loss": 3.0788, + "step": 207650 + }, + { + "epoch": 0.2046176398729538, + "grad_norm": 2.6087539196014404, + "learning_rate": 9.958772899182174e-06, + "loss": 3.1854, + "step": 207700 + }, + { + "epoch": 0.2046668978507759, + "grad_norm": 2.4934215545654297, + "learning_rate": 9.958753065135785e-06, + "loss": 3.1649, + "step": 207750 + }, + { + "epoch": 0.204716155828598, + "grad_norm": 2.3540399074554443, + "learning_rate": 9.9587332263393e-06, + "loss": 3.1151, + "step": 207800 + }, + { + "epoch": 0.20476541380642008, + "grad_norm": 2.2416234016418457, + "learning_rate": 9.958713382792734e-06, + "loss": 3.1338, + "step": 207850 + }, + { + "epoch": 0.20481467178424217, + "grad_norm": 2.452441692352295, + "learning_rate": 9.958693534496112e-06, + "loss": 3.0862, + "step": 207900 + }, + { + "epoch": 0.20486392976206427, + "grad_norm": 2.31312894821167, + "learning_rate": 9.958673681449449e-06, + "loss": 3.169, + "step": 207950 + }, + { + "epoch": 0.20491318773988634, + "grad_norm": 2.4456894397735596, + "learning_rate": 9.958653823652766e-06, + "loss": 3.1959, + "step": 208000 + }, + { + "epoch": 0.20496244571770844, + "grad_norm": 2.508714437484741, + "learning_rate": 9.95863396110608e-06, + "loss": 3.1874, + "step": 208050 + }, + { + "epoch": 0.20501170369553054, + "grad_norm": 2.3797824382781982, + "learning_rate": 9.958614093809409e-06, + "loss": 3.162, + "step": 208100 + }, + { + "epoch": 0.2050609616733526, + "grad_norm": 2.405336618423462, + "learning_rate": 9.958594221762777e-06, + "loss": 3.1123, + "step": 208150 + }, + { + "epoch": 0.2051102196511747, + "grad_norm": 2.263030767440796, + 
"learning_rate": 9.958574344966198e-06, + "loss": 3.1963, + "step": 208200 + }, + { + "epoch": 0.2051594776289968, + "grad_norm": 2.330038547515869, + "learning_rate": 9.958554463419693e-06, + "loss": 3.1642, + "step": 208250 + }, + { + "epoch": 0.20520873560681888, + "grad_norm": 2.2958781719207764, + "learning_rate": 9.958534577123282e-06, + "loss": 3.1471, + "step": 208300 + }, + { + "epoch": 0.20525799358464097, + "grad_norm": 2.2572388648986816, + "learning_rate": 9.958514686076983e-06, + "loss": 3.152, + "step": 208350 + }, + { + "epoch": 0.20530725156246304, + "grad_norm": 2.3831839561462402, + "learning_rate": 9.958494790280815e-06, + "loss": 3.149, + "step": 208400 + }, + { + "epoch": 0.20535650954028514, + "grad_norm": 2.436060667037964, + "learning_rate": 9.958474889734797e-06, + "loss": 3.1494, + "step": 208450 + }, + { + "epoch": 0.20540576751810724, + "grad_norm": 2.349781036376953, + "learning_rate": 9.95845498443895e-06, + "loss": 3.0828, + "step": 208500 + }, + { + "epoch": 0.2054550254959293, + "grad_norm": 2.2536070346832275, + "learning_rate": 9.95843507439329e-06, + "loss": 3.1674, + "step": 208550 + }, + { + "epoch": 0.2055042834737514, + "grad_norm": 2.520420551300049, + "learning_rate": 9.958415159597837e-06, + "loss": 3.1144, + "step": 208600 + }, + { + "epoch": 0.2055535414515735, + "grad_norm": 2.385207176208496, + "learning_rate": 9.958395240052613e-06, + "loss": 3.1462, + "step": 208650 + }, + { + "epoch": 0.20560279942939558, + "grad_norm": 2.2822673320770264, + "learning_rate": 9.958375315757631e-06, + "loss": 3.1023, + "step": 208700 + }, + { + "epoch": 0.20565205740721768, + "grad_norm": 2.1631195545196533, + "learning_rate": 9.958355386712917e-06, + "loss": 3.1703, + "step": 208750 + }, + { + "epoch": 0.20570131538503977, + "grad_norm": 2.463300943374634, + "learning_rate": 9.958335452918485e-06, + "loss": 3.1238, + "step": 208800 + }, + { + "epoch": 0.20575057336286184, + "grad_norm": 2.2513768672943115, + "learning_rate": 
9.958315514374356e-06, + "loss": 3.1655, + "step": 208850 + }, + { + "epoch": 0.20579983134068394, + "grad_norm": 2.274393320083618, + "learning_rate": 9.958295571080549e-06, + "loss": 3.1471, + "step": 208900 + }, + { + "epoch": 0.205849089318506, + "grad_norm": 3.0230369567871094, + "learning_rate": 9.958275623037084e-06, + "loss": 3.0998, + "step": 208950 + }, + { + "epoch": 0.2058983472963281, + "grad_norm": 2.436112642288208, + "learning_rate": 9.958255670243979e-06, + "loss": 3.0369, + "step": 209000 + }, + { + "epoch": 0.2059476052741502, + "grad_norm": 2.2775697708129883, + "learning_rate": 9.958235712701252e-06, + "loss": 3.0889, + "step": 209050 + }, + { + "epoch": 0.20599686325197228, + "grad_norm": 2.29316782951355, + "learning_rate": 9.958215750408924e-06, + "loss": 3.1651, + "step": 209100 + }, + { + "epoch": 0.20604612122979438, + "grad_norm": 2.417280435562134, + "learning_rate": 9.958195783367015e-06, + "loss": 3.1964, + "step": 209150 + }, + { + "epoch": 0.20609537920761647, + "grad_norm": 2.307659149169922, + "learning_rate": 9.95817581157554e-06, + "loss": 3.1335, + "step": 209200 + }, + { + "epoch": 0.20614463718543855, + "grad_norm": 2.6780643463134766, + "learning_rate": 9.958155835034525e-06, + "loss": 3.1626, + "step": 209250 + }, + { + "epoch": 0.20619389516326064, + "grad_norm": 2.239384889602661, + "learning_rate": 9.958135853743982e-06, + "loss": 3.1304, + "step": 209300 + }, + { + "epoch": 0.20624315314108274, + "grad_norm": 2.462625503540039, + "learning_rate": 9.958115867703932e-06, + "loss": 3.1578, + "step": 209350 + }, + { + "epoch": 0.2062924111189048, + "grad_norm": 2.3173625469207764, + "learning_rate": 9.958095876914398e-06, + "loss": 3.1559, + "step": 209400 + }, + { + "epoch": 0.2063416690967269, + "grad_norm": 2.2476139068603516, + "learning_rate": 9.958075881375395e-06, + "loss": 3.1745, + "step": 209450 + }, + { + "epoch": 0.206390927074549, + "grad_norm": 2.606207847595215, + "learning_rate": 9.958055881086944e-06, + 
"loss": 3.1699, + "step": 209500 + }, + { + "epoch": 0.20644018505237108, + "grad_norm": 2.232581615447998, + "learning_rate": 9.958035876049063e-06, + "loss": 3.1725, + "step": 209550 + }, + { + "epoch": 0.20648944303019318, + "grad_norm": 2.16389536857605, + "learning_rate": 9.958015866261772e-06, + "loss": 3.0922, + "step": 209600 + }, + { + "epoch": 0.20653870100801525, + "grad_norm": 3.8807289600372314, + "learning_rate": 9.957995851725091e-06, + "loss": 3.1558, + "step": 209650 + }, + { + "epoch": 0.20658795898583734, + "grad_norm": 2.299062967300415, + "learning_rate": 9.957975832439037e-06, + "loss": 3.1826, + "step": 209700 + }, + { + "epoch": 0.20663721696365944, + "grad_norm": 2.3832263946533203, + "learning_rate": 9.957955808403632e-06, + "loss": 3.1828, + "step": 209750 + }, + { + "epoch": 0.2066864749414815, + "grad_norm": 2.3467345237731934, + "learning_rate": 9.957935779618892e-06, + "loss": 3.1411, + "step": 209800 + }, + { + "epoch": 0.2067357329193036, + "grad_norm": 4.1672682762146, + "learning_rate": 9.957915746084839e-06, + "loss": 3.1023, + "step": 209850 + }, + { + "epoch": 0.2067849908971257, + "grad_norm": 2.2828493118286133, + "learning_rate": 9.957895707801491e-06, + "loss": 3.1391, + "step": 209900 + }, + { + "epoch": 0.20683424887494778, + "grad_norm": 2.3025710582733154, + "learning_rate": 9.957875664768865e-06, + "loss": 3.1494, + "step": 209950 + }, + { + "epoch": 0.20688350685276988, + "grad_norm": 2.445500373840332, + "learning_rate": 9.957855616986985e-06, + "loss": 3.1046, + "step": 210000 + }, + { + "epoch": 0.20693276483059198, + "grad_norm": 2.5231399536132812, + "learning_rate": 9.957835564455867e-06, + "loss": 3.1231, + "step": 210050 + }, + { + "epoch": 0.20698202280841405, + "grad_norm": 2.439889669418335, + "learning_rate": 9.95781550717553e-06, + "loss": 3.1485, + "step": 210100 + }, + { + "epoch": 0.20703128078623614, + "grad_norm": 2.306947708129883, + "learning_rate": 9.957795445145993e-06, + "loss": 3.21, + "step": 
210150 + }, + { + "epoch": 0.20708053876405821, + "grad_norm": 2.2984092235565186, + "learning_rate": 9.957775378367278e-06, + "loss": 3.122, + "step": 210200 + }, + { + "epoch": 0.2071297967418803, + "grad_norm": 3.2085721492767334, + "learning_rate": 9.957755306839401e-06, + "loss": 3.1883, + "step": 210250 + }, + { + "epoch": 0.2071790547197024, + "grad_norm": 2.2393877506256104, + "learning_rate": 9.957735230562384e-06, + "loss": 3.1944, + "step": 210300 + }, + { + "epoch": 0.20722831269752448, + "grad_norm": 2.356269359588623, + "learning_rate": 9.957715149536244e-06, + "loss": 3.1975, + "step": 210350 + }, + { + "epoch": 0.20727757067534658, + "grad_norm": 2.43263840675354, + "learning_rate": 9.957695063761002e-06, + "loss": 3.215, + "step": 210400 + }, + { + "epoch": 0.20732682865316868, + "grad_norm": 2.500180959701538, + "learning_rate": 9.957674973236676e-06, + "loss": 3.1726, + "step": 210450 + }, + { + "epoch": 0.20737608663099075, + "grad_norm": 2.353097438812256, + "learning_rate": 9.957654877963286e-06, + "loss": 3.1555, + "step": 210500 + }, + { + "epoch": 0.20742534460881284, + "grad_norm": 2.604830026626587, + "learning_rate": 9.957634777940848e-06, + "loss": 3.192, + "step": 210550 + }, + { + "epoch": 0.20747460258663494, + "grad_norm": 2.407069683074951, + "learning_rate": 9.957614673169387e-06, + "loss": 3.1395, + "step": 210600 + }, + { + "epoch": 0.207523860564457, + "grad_norm": 2.3856699466705322, + "learning_rate": 9.957594563648918e-06, + "loss": 3.1816, + "step": 210650 + }, + { + "epoch": 0.2075731185422791, + "grad_norm": 2.3091938495635986, + "learning_rate": 9.957574449379464e-06, + "loss": 3.1091, + "step": 210700 + }, + { + "epoch": 0.20762237652010118, + "grad_norm": 2.2650439739227295, + "learning_rate": 9.957554330361039e-06, + "loss": 3.1794, + "step": 210750 + }, + { + "epoch": 0.20767163449792328, + "grad_norm": 2.3910062313079834, + "learning_rate": 9.957534206593666e-06, + "loss": 3.167, + "step": 210800 + }, + { + "epoch": 
0.20772089247574538, + "grad_norm": 2.3552258014678955, + "learning_rate": 9.957514078077363e-06, + "loss": 3.1997, + "step": 210850 + }, + { + "epoch": 0.20777015045356745, + "grad_norm": 2.6385338306427, + "learning_rate": 9.95749394481215e-06, + "loss": 3.111, + "step": 210900 + }, + { + "epoch": 0.20781940843138955, + "grad_norm": 2.424082040786743, + "learning_rate": 9.957473806798047e-06, + "loss": 3.1163, + "step": 210950 + }, + { + "epoch": 0.20786866640921164, + "grad_norm": 2.508225679397583, + "learning_rate": 9.957453664035073e-06, + "loss": 3.0891, + "step": 211000 + }, + { + "epoch": 0.20791792438703371, + "grad_norm": 2.3564515113830566, + "learning_rate": 9.957433516523245e-06, + "loss": 3.1648, + "step": 211050 + }, + { + "epoch": 0.2079671823648558, + "grad_norm": 2.7111833095550537, + "learning_rate": 9.957413364262585e-06, + "loss": 3.1986, + "step": 211100 + }, + { + "epoch": 0.2080164403426779, + "grad_norm": 2.1761960983276367, + "learning_rate": 9.95739320725311e-06, + "loss": 3.1546, + "step": 211150 + }, + { + "epoch": 0.20806569832049998, + "grad_norm": 2.746267318725586, + "learning_rate": 9.957373045494841e-06, + "loss": 3.1315, + "step": 211200 + }, + { + "epoch": 0.20811495629832208, + "grad_norm": 2.45660138130188, + "learning_rate": 9.957352878987796e-06, + "loss": 3.1415, + "step": 211250 + }, + { + "epoch": 0.20816421427614418, + "grad_norm": 2.3345420360565186, + "learning_rate": 9.957332707731995e-06, + "loss": 3.1407, + "step": 211300 + }, + { + "epoch": 0.20821347225396625, + "grad_norm": 2.318936824798584, + "learning_rate": 9.957312531727458e-06, + "loss": 3.1814, + "step": 211350 + }, + { + "epoch": 0.20826273023178835, + "grad_norm": 2.367290496826172, + "learning_rate": 9.957292350974206e-06, + "loss": 3.1919, + "step": 211400 + }, + { + "epoch": 0.20831198820961042, + "grad_norm": 2.4204750061035156, + "learning_rate": 9.957272165472254e-06, + "loss": 3.2131, + "step": 211450 + }, + { + "epoch": 0.20836124618743251, + 
"grad_norm": 2.4520339965820312, + "learning_rate": 9.957251975221624e-06, + "loss": 3.1692, + "step": 211500 + }, + { + "epoch": 0.2084105041652546, + "grad_norm": 2.4482266902923584, + "learning_rate": 9.957231780222334e-06, + "loss": 3.1847, + "step": 211550 + }, + { + "epoch": 0.20845976214307668, + "grad_norm": 2.4644150733947754, + "learning_rate": 9.957211580474405e-06, + "loss": 3.1375, + "step": 211600 + }, + { + "epoch": 0.20850902012089878, + "grad_norm": 2.382768392562866, + "learning_rate": 9.957191375977855e-06, + "loss": 3.2548, + "step": 211650 + }, + { + "epoch": 0.20855827809872088, + "grad_norm": 2.2712690830230713, + "learning_rate": 9.957171166732704e-06, + "loss": 3.1579, + "step": 211700 + }, + { + "epoch": 0.20860753607654295, + "grad_norm": 2.4542791843414307, + "learning_rate": 9.95715095273897e-06, + "loss": 3.2126, + "step": 211750 + }, + { + "epoch": 0.20865679405436505, + "grad_norm": 2.5927412509918213, + "learning_rate": 9.957130733996677e-06, + "loss": 3.1121, + "step": 211800 + }, + { + "epoch": 0.20870605203218714, + "grad_norm": 2.4330270290374756, + "learning_rate": 9.957110510505836e-06, + "loss": 3.0841, + "step": 211850 + }, + { + "epoch": 0.20875531001000922, + "grad_norm": 2.5410919189453125, + "learning_rate": 9.957090282266475e-06, + "loss": 3.2166, + "step": 211900 + }, + { + "epoch": 0.2088045679878313, + "grad_norm": 2.316696882247925, + "learning_rate": 9.95707004927861e-06, + "loss": 3.2068, + "step": 211950 + }, + { + "epoch": 0.20885382596565338, + "grad_norm": 2.508714199066162, + "learning_rate": 9.957049811542257e-06, + "loss": 3.217, + "step": 212000 + }, + { + "epoch": 0.20890308394347548, + "grad_norm": 2.615920066833496, + "learning_rate": 9.95702956905744e-06, + "loss": 3.1073, + "step": 212050 + }, + { + "epoch": 0.20895234192129758, + "grad_norm": 2.485020875930786, + "learning_rate": 9.957009321824177e-06, + "loss": 3.1421, + "step": 212100 + }, + { + "epoch": 0.20900159989911965, + "grad_norm": 
2.317331314086914, + "learning_rate": 9.956989069842488e-06, + "loss": 3.1377, + "step": 212150 + }, + { + "epoch": 0.20905085787694175, + "grad_norm": 2.4104042053222656, + "learning_rate": 9.95696881311239e-06, + "loss": 3.1834, + "step": 212200 + }, + { + "epoch": 0.20910011585476385, + "grad_norm": 2.2977495193481445, + "learning_rate": 9.956948551633904e-06, + "loss": 3.1784, + "step": 212250 + }, + { + "epoch": 0.20914937383258592, + "grad_norm": 2.410491943359375, + "learning_rate": 9.956928285407051e-06, + "loss": 3.1973, + "step": 212300 + }, + { + "epoch": 0.20919863181040801, + "grad_norm": 2.5964248180389404, + "learning_rate": 9.956908014431849e-06, + "loss": 3.0851, + "step": 212350 + }, + { + "epoch": 0.2092478897882301, + "grad_norm": 2.3136489391326904, + "learning_rate": 9.956887738708317e-06, + "loss": 3.1582, + "step": 212400 + }, + { + "epoch": 0.20929714776605218, + "grad_norm": 2.3279008865356445, + "learning_rate": 9.956867458236474e-06, + "loss": 3.1703, + "step": 212450 + }, + { + "epoch": 0.20934640574387428, + "grad_norm": 2.316629409790039, + "learning_rate": 9.95684717301634e-06, + "loss": 3.1768, + "step": 212500 + }, + { + "epoch": 0.20939566372169638, + "grad_norm": 2.3444039821624756, + "learning_rate": 9.956826883047935e-06, + "loss": 3.1223, + "step": 212550 + }, + { + "epoch": 0.20944492169951845, + "grad_norm": 2.3323521614074707, + "learning_rate": 9.956806588331278e-06, + "loss": 3.1383, + "step": 212600 + }, + { + "epoch": 0.20949417967734055, + "grad_norm": 2.2402830123901367, + "learning_rate": 9.956786288866388e-06, + "loss": 3.1904, + "step": 212650 + }, + { + "epoch": 0.20954343765516262, + "grad_norm": 2.4704461097717285, + "learning_rate": 9.956765984653286e-06, + "loss": 3.0773, + "step": 212700 + }, + { + "epoch": 0.20959269563298472, + "grad_norm": 2.4164011478424072, + "learning_rate": 9.956745675691987e-06, + "loss": 3.1093, + "step": 212750 + }, + { + "epoch": 0.2096419536108068, + "grad_norm": 
2.4975688457489014, + "learning_rate": 9.956725361982517e-06, + "loss": 3.1407, + "step": 212800 + }, + { + "epoch": 0.20969121158862888, + "grad_norm": 2.3983428478240967, + "learning_rate": 9.956705043524892e-06, + "loss": 3.1702, + "step": 212850 + }, + { + "epoch": 0.20974046956645098, + "grad_norm": 2.6149449348449707, + "learning_rate": 9.956684720319133e-06, + "loss": 3.0528, + "step": 212900 + }, + { + "epoch": 0.20978972754427308, + "grad_norm": 2.452202081680298, + "learning_rate": 9.956664392365255e-06, + "loss": 3.1428, + "step": 212950 + }, + { + "epoch": 0.20983898552209515, + "grad_norm": 2.463357448577881, + "learning_rate": 9.956644059663282e-06, + "loss": 3.1138, + "step": 213000 + }, + { + "epoch": 0.20988824349991725, + "grad_norm": 2.39921498298645, + "learning_rate": 9.956623722213232e-06, + "loss": 3.1898, + "step": 213050 + }, + { + "epoch": 0.20993750147773935, + "grad_norm": 2.4862189292907715, + "learning_rate": 9.956603380015127e-06, + "loss": 3.1352, + "step": 213100 + }, + { + "epoch": 0.20998675945556142, + "grad_norm": 2.2902393341064453, + "learning_rate": 9.95658303306898e-06, + "loss": 3.1974, + "step": 213150 + }, + { + "epoch": 0.21003601743338352, + "grad_norm": 2.321004629135132, + "learning_rate": 9.956562681374817e-06, + "loss": 3.1051, + "step": 213200 + }, + { + "epoch": 0.21008527541120559, + "grad_norm": 2.657411813735962, + "learning_rate": 9.956542324932654e-06, + "loss": 3.0804, + "step": 213250 + }, + { + "epoch": 0.21013453338902768, + "grad_norm": 2.3638172149658203, + "learning_rate": 9.956521963742513e-06, + "loss": 3.2135, + "step": 213300 + }, + { + "epoch": 0.21018379136684978, + "grad_norm": 2.3436832427978516, + "learning_rate": 9.95650159780441e-06, + "loss": 3.1552, + "step": 213350 + }, + { + "epoch": 0.21023304934467185, + "grad_norm": 2.4611406326293945, + "learning_rate": 9.956481227118368e-06, + "loss": 3.1274, + "step": 213400 + }, + { + "epoch": 0.21028230732249395, + "grad_norm": 
2.5019917488098145, + "learning_rate": 9.956460851684406e-06, + "loss": 3.1272, + "step": 213450 + }, + { + "epoch": 0.21033156530031605, + "grad_norm": 2.1739938259124756, + "learning_rate": 9.956440471502541e-06, + "loss": 3.1333, + "step": 213500 + }, + { + "epoch": 0.21038082327813812, + "grad_norm": 2.354860544204712, + "learning_rate": 9.956420086572796e-06, + "loss": 3.1623, + "step": 213550 + }, + { + "epoch": 0.21043008125596022, + "grad_norm": 2.492450475692749, + "learning_rate": 9.956399696895187e-06, + "loss": 3.2089, + "step": 213600 + }, + { + "epoch": 0.21047933923378231, + "grad_norm": 2.3187363147735596, + "learning_rate": 9.956379302469735e-06, + "loss": 3.1257, + "step": 213650 + }, + { + "epoch": 0.21052859721160438, + "grad_norm": 2.3188271522521973, + "learning_rate": 9.956358903296462e-06, + "loss": 3.1792, + "step": 213700 + }, + { + "epoch": 0.21057785518942648, + "grad_norm": 2.148580551147461, + "learning_rate": 9.956338499375381e-06, + "loss": 3.0928, + "step": 213750 + }, + { + "epoch": 0.21062711316724858, + "grad_norm": 2.216648578643799, + "learning_rate": 9.956318090706521e-06, + "loss": 3.1286, + "step": 213800 + }, + { + "epoch": 0.21067637114507065, + "grad_norm": 2.379702568054199, + "learning_rate": 9.956297677289893e-06, + "loss": 3.1446, + "step": 213850 + }, + { + "epoch": 0.21072562912289275, + "grad_norm": 2.4406378269195557, + "learning_rate": 9.95627725912552e-06, + "loss": 3.1297, + "step": 213900 + }, + { + "epoch": 0.21077488710071482, + "grad_norm": 2.1440553665161133, + "learning_rate": 9.956256836213422e-06, + "loss": 3.1833, + "step": 213950 + }, + { + "epoch": 0.21082414507853692, + "grad_norm": 2.341656446456909, + "learning_rate": 9.95623640855362e-06, + "loss": 3.1513, + "step": 214000 + }, + { + "epoch": 0.21087340305635902, + "grad_norm": 2.363743782043457, + "learning_rate": 9.95621597614613e-06, + "loss": 3.1907, + "step": 214050 + }, + { + "epoch": 0.21092266103418109, + "grad_norm": 2.3875539302825928, 
+ "learning_rate": 9.956195538990972e-06, + "loss": 3.1761, + "step": 214100 + }, + { + "epoch": 0.21097191901200318, + "grad_norm": 2.2219841480255127, + "learning_rate": 9.956175097088168e-06, + "loss": 3.1153, + "step": 214150 + }, + { + "epoch": 0.21102117698982528, + "grad_norm": 2.3501782417297363, + "learning_rate": 9.956154650437737e-06, + "loss": 3.179, + "step": 214200 + }, + { + "epoch": 0.21107043496764735, + "grad_norm": 2.7596614360809326, + "learning_rate": 9.956134199039697e-06, + "loss": 3.1574, + "step": 214250 + }, + { + "epoch": 0.21111969294546945, + "grad_norm": 2.302449941635132, + "learning_rate": 9.95611374289407e-06, + "loss": 3.1358, + "step": 214300 + }, + { + "epoch": 0.21116895092329155, + "grad_norm": 2.3484699726104736, + "learning_rate": 9.956093282000872e-06, + "loss": 3.1819, + "step": 214350 + }, + { + "epoch": 0.21121820890111362, + "grad_norm": 2.340998649597168, + "learning_rate": 9.956072816360125e-06, + "loss": 3.0957, + "step": 214400 + }, + { + "epoch": 0.21126746687893572, + "grad_norm": 2.5022034645080566, + "learning_rate": 9.956052345971849e-06, + "loss": 3.0428, + "step": 214450 + }, + { + "epoch": 0.2113167248567578, + "grad_norm": 2.374725103378296, + "learning_rate": 9.956031870836065e-06, + "loss": 3.1453, + "step": 214500 + }, + { + "epoch": 0.21136598283457989, + "grad_norm": 2.2930655479431152, + "learning_rate": 9.956011390952786e-06, + "loss": 3.1399, + "step": 214550 + }, + { + "epoch": 0.21141524081240198, + "grad_norm": 2.9443299770355225, + "learning_rate": 9.95599090632204e-06, + "loss": 3.1262, + "step": 214600 + }, + { + "epoch": 0.21146449879022405, + "grad_norm": 2.274686813354492, + "learning_rate": 9.95597041694384e-06, + "loss": 3.1728, + "step": 214650 + }, + { + "epoch": 0.21151375676804615, + "grad_norm": 2.355478048324585, + "learning_rate": 9.955949922818211e-06, + "loss": 3.1904, + "step": 214700 + }, + { + "epoch": 0.21156301474586825, + "grad_norm": 2.6392462253570557, + "learning_rate": 
9.95592942394517e-06, + "loss": 3.1428, + "step": 214750 + }, + { + "epoch": 0.21161227272369032, + "grad_norm": 2.5454142093658447, + "learning_rate": 9.955908920324735e-06, + "loss": 3.0802, + "step": 214800 + }, + { + "epoch": 0.21166153070151242, + "grad_norm": 2.505828857421875, + "learning_rate": 9.955888411956928e-06, + "loss": 3.1451, + "step": 214850 + }, + { + "epoch": 0.21171078867933452, + "grad_norm": 2.3444368839263916, + "learning_rate": 9.955867898841769e-06, + "loss": 3.1234, + "step": 214900 + }, + { + "epoch": 0.2117600466571566, + "grad_norm": 2.2742414474487305, + "learning_rate": 9.955847380979276e-06, + "loss": 3.1028, + "step": 214950 + }, + { + "epoch": 0.21180930463497868, + "grad_norm": 2.391993761062622, + "learning_rate": 9.955826858369468e-06, + "loss": 3.1144, + "step": 215000 + }, + { + "epoch": 0.21185856261280078, + "grad_norm": 2.29498553276062, + "learning_rate": 9.955806331012366e-06, + "loss": 3.1345, + "step": 215050 + }, + { + "epoch": 0.21190782059062285, + "grad_norm": 2.5411782264709473, + "learning_rate": 9.95578579890799e-06, + "loss": 3.1415, + "step": 215100 + }, + { + "epoch": 0.21195707856844495, + "grad_norm": 2.423549175262451, + "learning_rate": 9.95576526205636e-06, + "loss": 3.1123, + "step": 215150 + }, + { + "epoch": 0.21200633654626702, + "grad_norm": 2.340252161026001, + "learning_rate": 9.955744720457496e-06, + "loss": 3.1728, + "step": 215200 + }, + { + "epoch": 0.21205559452408912, + "grad_norm": 2.3448429107666016, + "learning_rate": 9.955724174111415e-06, + "loss": 3.1435, + "step": 215250 + }, + { + "epoch": 0.21210485250191122, + "grad_norm": 2.2902934551239014, + "learning_rate": 9.955703623018139e-06, + "loss": 3.1518, + "step": 215300 + }, + { + "epoch": 0.2121541104797333, + "grad_norm": 2.551823377609253, + "learning_rate": 9.955683067177685e-06, + "loss": 3.1701, + "step": 215350 + }, + { + "epoch": 0.21220336845755539, + "grad_norm": 2.425600051879883, + "learning_rate": 9.955662506590078e-06, 
+ "loss": 3.1578, + "step": 215400 + }, + { + "epoch": 0.21225262643537748, + "grad_norm": 2.277099132537842, + "learning_rate": 9.955641941255333e-06, + "loss": 3.148, + "step": 215450 + }, + { + "epoch": 0.21230188441319955, + "grad_norm": 2.4369056224823, + "learning_rate": 9.955621371173469e-06, + "loss": 3.1499, + "step": 215500 + }, + { + "epoch": 0.21235114239102165, + "grad_norm": 2.231456756591797, + "learning_rate": 9.95560079634451e-06, + "loss": 3.2129, + "step": 215550 + }, + { + "epoch": 0.21240040036884375, + "grad_norm": 2.5540106296539307, + "learning_rate": 9.955580216768473e-06, + "loss": 3.1958, + "step": 215600 + }, + { + "epoch": 0.21244965834666582, + "grad_norm": 2.1821067333221436, + "learning_rate": 9.955559632445378e-06, + "loss": 3.1299, + "step": 215650 + }, + { + "epoch": 0.21249891632448792, + "grad_norm": 2.3998944759368896, + "learning_rate": 9.955539043375246e-06, + "loss": 3.1231, + "step": 215700 + }, + { + "epoch": 0.21254817430231, + "grad_norm": 2.3447794914245605, + "learning_rate": 9.955518449558094e-06, + "loss": 3.1306, + "step": 215750 + }, + { + "epoch": 0.2125974322801321, + "grad_norm": 2.2837648391723633, + "learning_rate": 9.955497850993944e-06, + "loss": 3.1577, + "step": 215800 + }, + { + "epoch": 0.21264669025795419, + "grad_norm": 2.363300323486328, + "learning_rate": 9.955477247682817e-06, + "loss": 3.1911, + "step": 215850 + }, + { + "epoch": 0.21269594823577626, + "grad_norm": 2.305295944213867, + "learning_rate": 9.955456639624727e-06, + "loss": 3.1229, + "step": 215900 + }, + { + "epoch": 0.21274520621359835, + "grad_norm": 2.3401949405670166, + "learning_rate": 9.9554360268197e-06, + "loss": 3.1725, + "step": 215950 + }, + { + "epoch": 0.21279446419142045, + "grad_norm": 2.3277621269226074, + "learning_rate": 9.955415409267754e-06, + "loss": 3.1695, + "step": 216000 + }, + { + "epoch": 0.21284372216924252, + "grad_norm": 2.3886282444000244, + "learning_rate": 9.955394786968907e-06, + "loss": 3.0797, + 
"step": 216050 + }, + { + "epoch": 0.21289298014706462, + "grad_norm": 2.3659415245056152, + "learning_rate": 9.95537415992318e-06, + "loss": 3.1734, + "step": 216100 + }, + { + "epoch": 0.21294223812488672, + "grad_norm": 2.3833136558532715, + "learning_rate": 9.955353528130592e-06, + "loss": 3.124, + "step": 216150 + }, + { + "epoch": 0.2129914961027088, + "grad_norm": 2.4051971435546875, + "learning_rate": 9.955332891591165e-06, + "loss": 3.0935, + "step": 216200 + }, + { + "epoch": 0.2130407540805309, + "grad_norm": 2.4027061462402344, + "learning_rate": 9.955312250304916e-06, + "loss": 3.1842, + "step": 216250 + }, + { + "epoch": 0.21309001205835298, + "grad_norm": 2.247666835784912, + "learning_rate": 9.955291604271866e-06, + "loss": 3.0883, + "step": 216300 + }, + { + "epoch": 0.21313927003617505, + "grad_norm": 2.373809814453125, + "learning_rate": 9.955270953492033e-06, + "loss": 3.124, + "step": 216350 + }, + { + "epoch": 0.21318852801399715, + "grad_norm": 2.354877471923828, + "learning_rate": 9.955250297965441e-06, + "loss": 3.1447, + "step": 216400 + }, + { + "epoch": 0.21323778599181922, + "grad_norm": 2.1966781616210938, + "learning_rate": 9.955229637692107e-06, + "loss": 3.139, + "step": 216450 + }, + { + "epoch": 0.21328704396964132, + "grad_norm": 2.311089515686035, + "learning_rate": 9.95520897267205e-06, + "loss": 3.1044, + "step": 216500 + }, + { + "epoch": 0.21333630194746342, + "grad_norm": 2.391268253326416, + "learning_rate": 9.95518830290529e-06, + "loss": 3.1744, + "step": 216550 + }, + { + "epoch": 0.2133855599252855, + "grad_norm": 2.3440258502960205, + "learning_rate": 9.95516762839185e-06, + "loss": 3.1646, + "step": 216600 + }, + { + "epoch": 0.2134348179031076, + "grad_norm": 2.2114803791046143, + "learning_rate": 9.955146949131747e-06, + "loss": 3.1433, + "step": 216650 + }, + { + "epoch": 0.21348407588092969, + "grad_norm": 2.4254446029663086, + "learning_rate": 9.955126265124999e-06, + "loss": 3.1888, + "step": 216700 + }, + { + 
"epoch": 0.21353333385875176, + "grad_norm": 2.3717868328094482, + "learning_rate": 9.95510557637163e-06, + "loss": 3.1973, + "step": 216750 + }, + { + "epoch": 0.21358259183657385, + "grad_norm": 2.3486387729644775, + "learning_rate": 9.955084882871657e-06, + "loss": 3.1151, + "step": 216800 + }, + { + "epoch": 0.21363184981439595, + "grad_norm": 2.321690559387207, + "learning_rate": 9.955064184625102e-06, + "loss": 3.1479, + "step": 216850 + }, + { + "epoch": 0.21368110779221802, + "grad_norm": 2.254225015640259, + "learning_rate": 9.955043481631981e-06, + "loss": 3.1283, + "step": 216900 + }, + { + "epoch": 0.21373036577004012, + "grad_norm": 2.2769339084625244, + "learning_rate": 9.955022773892319e-06, + "loss": 3.1623, + "step": 216950 + }, + { + "epoch": 0.2137796237478622, + "grad_norm": 2.4090428352355957, + "learning_rate": 9.955002061406132e-06, + "loss": 3.1394, + "step": 217000 + }, + { + "epoch": 0.2138288817256843, + "grad_norm": 2.3981375694274902, + "learning_rate": 9.954981344173442e-06, + "loss": 3.1286, + "step": 217050 + }, + { + "epoch": 0.2138781397035064, + "grad_norm": 2.3273844718933105, + "learning_rate": 9.954960622194268e-06, + "loss": 3.1975, + "step": 217100 + }, + { + "epoch": 0.21392739768132846, + "grad_norm": 2.683159351348877, + "learning_rate": 9.954939895468627e-06, + "loss": 3.1231, + "step": 217150 + }, + { + "epoch": 0.21397665565915056, + "grad_norm": 2.527832508087158, + "learning_rate": 9.954919163996543e-06, + "loss": 3.1402, + "step": 217200 + }, + { + "epoch": 0.21402591363697265, + "grad_norm": 2.373608112335205, + "learning_rate": 9.954898427778035e-06, + "loss": 3.1545, + "step": 217250 + }, + { + "epoch": 0.21407517161479472, + "grad_norm": 2.315764904022217, + "learning_rate": 9.954877686813122e-06, + "loss": 3.1775, + "step": 217300 + }, + { + "epoch": 0.21412442959261682, + "grad_norm": 2.359273910522461, + "learning_rate": 9.954856941101824e-06, + "loss": 3.1475, + "step": 217350 + }, + { + "epoch": 
0.21417368757043892, + "grad_norm": 2.2855985164642334, + "learning_rate": 9.954836190644162e-06, + "loss": 3.2016, + "step": 217400 + }, + { + "epoch": 0.214222945548261, + "grad_norm": 2.2812869548797607, + "learning_rate": 9.954815435440156e-06, + "loss": 3.1394, + "step": 217450 + }, + { + "epoch": 0.2142722035260831, + "grad_norm": 2.519676685333252, + "learning_rate": 9.954794675489822e-06, + "loss": 3.1994, + "step": 217500 + }, + { + "epoch": 0.2143214615039052, + "grad_norm": 2.229118585586548, + "learning_rate": 9.954773910793184e-06, + "loss": 3.1741, + "step": 217550 + }, + { + "epoch": 0.21437071948172726, + "grad_norm": 2.3279454708099365, + "learning_rate": 9.95475314135026e-06, + "loss": 3.1672, + "step": 217600 + }, + { + "epoch": 0.21441997745954935, + "grad_norm": 2.404555559158325, + "learning_rate": 9.95473236716107e-06, + "loss": 3.0889, + "step": 217650 + }, + { + "epoch": 0.21446923543737142, + "grad_norm": 2.3408584594726562, + "learning_rate": 9.954711588225637e-06, + "loss": 3.1585, + "step": 217700 + }, + { + "epoch": 0.21451849341519352, + "grad_norm": 2.3164286613464355, + "learning_rate": 9.954690804543976e-06, + "loss": 3.1577, + "step": 217750 + }, + { + "epoch": 0.21456775139301562, + "grad_norm": 2.459487199783325, + "learning_rate": 9.95467001611611e-06, + "loss": 3.1648, + "step": 217800 + }, + { + "epoch": 0.2146170093708377, + "grad_norm": 2.415505886077881, + "learning_rate": 9.954649222942057e-06, + "loss": 3.0993, + "step": 217850 + }, + { + "epoch": 0.2146662673486598, + "grad_norm": 2.3815271854400635, + "learning_rate": 9.954628425021839e-06, + "loss": 3.1695, + "step": 217900 + }, + { + "epoch": 0.2147155253264819, + "grad_norm": 2.5812854766845703, + "learning_rate": 9.954607622355476e-06, + "loss": 3.1023, + "step": 217950 + }, + { + "epoch": 0.21476478330430396, + "grad_norm": 2.1780426502227783, + "learning_rate": 9.954586814942985e-06, + "loss": 3.0459, + "step": 218000 + }, + { + "epoch": 0.21481404128212606, + 
"grad_norm": 2.362535238265991, + "learning_rate": 9.954566002784389e-06, + "loss": 3.1574, + "step": 218050 + }, + { + "epoch": 0.21486329925994815, + "grad_norm": 2.682753562927246, + "learning_rate": 9.954545185879706e-06, + "loss": 3.1285, + "step": 218100 + }, + { + "epoch": 0.21491255723777022, + "grad_norm": 2.333970069885254, + "learning_rate": 9.954524364228956e-06, + "loss": 3.2008, + "step": 218150 + }, + { + "epoch": 0.21496181521559232, + "grad_norm": 2.6193277835845947, + "learning_rate": 9.954503537832161e-06, + "loss": 3.1268, + "step": 218200 + }, + { + "epoch": 0.2150110731934144, + "grad_norm": 2.4867894649505615, + "learning_rate": 9.954482706689339e-06, + "loss": 3.1436, + "step": 218250 + }, + { + "epoch": 0.2150603311712365, + "grad_norm": 2.3212709426879883, + "learning_rate": 9.954461870800511e-06, + "loss": 3.1234, + "step": 218300 + }, + { + "epoch": 0.2151095891490586, + "grad_norm": 2.4020416736602783, + "learning_rate": 9.954441030165696e-06, + "loss": 3.1108, + "step": 218350 + }, + { + "epoch": 0.21515884712688066, + "grad_norm": 2.3771960735321045, + "learning_rate": 9.954420184784914e-06, + "loss": 3.1683, + "step": 218400 + }, + { + "epoch": 0.21520810510470276, + "grad_norm": 2.5260322093963623, + "learning_rate": 9.954399334658187e-06, + "loss": 3.1934, + "step": 218450 + }, + { + "epoch": 0.21525736308252486, + "grad_norm": 2.575911045074463, + "learning_rate": 9.954378479785532e-06, + "loss": 3.0993, + "step": 218500 + }, + { + "epoch": 0.21530662106034693, + "grad_norm": 2.280452013015747, + "learning_rate": 9.954357620166971e-06, + "loss": 3.1692, + "step": 218550 + }, + { + "epoch": 0.21535587903816902, + "grad_norm": 2.222432851791382, + "learning_rate": 9.954336755802522e-06, + "loss": 3.1447, + "step": 218600 + }, + { + "epoch": 0.21540513701599112, + "grad_norm": 2.273552656173706, + "learning_rate": 9.954315886692209e-06, + "loss": 3.104, + "step": 218650 + }, + { + "epoch": 0.2154543949938132, + "grad_norm": 
2.3567557334899902, + "learning_rate": 9.954295012836048e-06, + "loss": 3.1258, + "step": 218700 + }, + { + "epoch": 0.2155036529716353, + "grad_norm": 2.269564151763916, + "learning_rate": 9.954274134234059e-06, + "loss": 3.1448, + "step": 218750 + }, + { + "epoch": 0.21555291094945736, + "grad_norm": 2.487272262573242, + "learning_rate": 9.954253250886265e-06, + "loss": 3.1848, + "step": 218800 + }, + { + "epoch": 0.21560216892727946, + "grad_norm": 2.23667049407959, + "learning_rate": 9.954232362792685e-06, + "loss": 3.1018, + "step": 218850 + }, + { + "epoch": 0.21565142690510156, + "grad_norm": 2.143541097640991, + "learning_rate": 9.954211469953336e-06, + "loss": 3.1883, + "step": 218900 + }, + { + "epoch": 0.21570068488292363, + "grad_norm": 2.367692232131958, + "learning_rate": 9.954190572368243e-06, + "loss": 3.1031, + "step": 218950 + }, + { + "epoch": 0.21574994286074572, + "grad_norm": 2.2872705459594727, + "learning_rate": 9.954169670037421e-06, + "loss": 3.1586, + "step": 219000 + }, + { + "epoch": 0.21579920083856782, + "grad_norm": 2.273003101348877, + "learning_rate": 9.954148762960893e-06, + "loss": 3.1623, + "step": 219050 + }, + { + "epoch": 0.2158484588163899, + "grad_norm": 2.415879249572754, + "learning_rate": 9.95412785113868e-06, + "loss": 3.1327, + "step": 219100 + }, + { + "epoch": 0.215897716794212, + "grad_norm": 2.309433698654175, + "learning_rate": 9.954106934570799e-06, + "loss": 3.1841, + "step": 219150 + }, + { + "epoch": 0.2159469747720341, + "grad_norm": 2.239039421081543, + "learning_rate": 9.954086013257272e-06, + "loss": 3.182, + "step": 219200 + }, + { + "epoch": 0.21599623274985616, + "grad_norm": 2.336247444152832, + "learning_rate": 9.954065087198118e-06, + "loss": 3.0992, + "step": 219250 + }, + { + "epoch": 0.21604549072767826, + "grad_norm": 2.232131242752075, + "learning_rate": 9.954044156393358e-06, + "loss": 3.1573, + "step": 219300 + }, + { + "epoch": 0.21609474870550036, + "grad_norm": 2.3979811668395996, + 
"learning_rate": 9.95402322084301e-06, + "loss": 3.1947, + "step": 219350 + }, + { + "epoch": 0.21614400668332243, + "grad_norm": 2.395092248916626, + "learning_rate": 9.954002280547097e-06, + "loss": 3.2343, + "step": 219400 + }, + { + "epoch": 0.21619326466114452, + "grad_norm": 2.3550307750701904, + "learning_rate": 9.953981335505638e-06, + "loss": 3.1316, + "step": 219450 + }, + { + "epoch": 0.2162425226389666, + "grad_norm": 2.328139305114746, + "learning_rate": 9.953960385718654e-06, + "loss": 3.1179, + "step": 219500 + }, + { + "epoch": 0.2162917806167887, + "grad_norm": 2.4443392753601074, + "learning_rate": 9.953939431186162e-06, + "loss": 3.1583, + "step": 219550 + }, + { + "epoch": 0.2163410385946108, + "grad_norm": 2.2851271629333496, + "learning_rate": 9.953918471908185e-06, + "loss": 3.1012, + "step": 219600 + }, + { + "epoch": 0.21639029657243286, + "grad_norm": 2.4817416667938232, + "learning_rate": 9.953897507884743e-06, + "loss": 3.113, + "step": 219650 + }, + { + "epoch": 0.21643955455025496, + "grad_norm": 2.2895359992980957, + "learning_rate": 9.953876539115851e-06, + "loss": 3.1865, + "step": 219700 + }, + { + "epoch": 0.21648881252807706, + "grad_norm": 2.3296163082122803, + "learning_rate": 9.953855565601538e-06, + "loss": 3.0697, + "step": 219750 + }, + { + "epoch": 0.21653807050589913, + "grad_norm": 2.4292235374450684, + "learning_rate": 9.953834587341816e-06, + "loss": 3.0956, + "step": 219800 + }, + { + "epoch": 0.21658732848372123, + "grad_norm": 2.1216650009155273, + "learning_rate": 9.95381360433671e-06, + "loss": 3.1872, + "step": 219850 + }, + { + "epoch": 0.21663658646154332, + "grad_norm": 2.415778160095215, + "learning_rate": 9.953792616586237e-06, + "loss": 3.1604, + "step": 219900 + }, + { + "epoch": 0.2166858444393654, + "grad_norm": 2.371203899383545, + "learning_rate": 9.95377162409042e-06, + "loss": 3.15, + "step": 219950 + }, + { + "epoch": 0.2167351024171875, + "grad_norm": 2.323850631713867, + "learning_rate": 
9.953750626849278e-06, + "loss": 3.1505, + "step": 220000 + }, + { + "epoch": 0.21678436039500956, + "grad_norm": 2.3198366165161133, + "learning_rate": 9.953729624862831e-06, + "loss": 3.0807, + "step": 220050 + }, + { + "epoch": 0.21683361837283166, + "grad_norm": 2.310755729675293, + "learning_rate": 9.953708618131097e-06, + "loss": 3.1506, + "step": 220100 + }, + { + "epoch": 0.21688287635065376, + "grad_norm": 2.394498825073242, + "learning_rate": 9.9536876066541e-06, + "loss": 3.1061, + "step": 220150 + }, + { + "epoch": 0.21693213432847583, + "grad_norm": 2.35528302192688, + "learning_rate": 9.953666590431856e-06, + "loss": 3.1864, + "step": 220200 + }, + { + "epoch": 0.21698139230629793, + "grad_norm": 2.4034018516540527, + "learning_rate": 9.95364556946439e-06, + "loss": 3.1117, + "step": 220250 + }, + { + "epoch": 0.21703065028412002, + "grad_norm": 2.4296813011169434, + "learning_rate": 9.953624543751718e-06, + "loss": 3.1317, + "step": 220300 + }, + { + "epoch": 0.2170799082619421, + "grad_norm": 2.4014899730682373, + "learning_rate": 9.95360351329386e-06, + "loss": 3.113, + "step": 220350 + }, + { + "epoch": 0.2171291662397642, + "grad_norm": 2.2216415405273438, + "learning_rate": 9.95358247809084e-06, + "loss": 3.0968, + "step": 220400 + }, + { + "epoch": 0.2171784242175863, + "grad_norm": 2.3313658237457275, + "learning_rate": 9.953561438142677e-06, + "loss": 3.2323, + "step": 220450 + }, + { + "epoch": 0.21722768219540836, + "grad_norm": 2.508573055267334, + "learning_rate": 9.953540393449387e-06, + "loss": 3.1242, + "step": 220500 + }, + { + "epoch": 0.21727694017323046, + "grad_norm": 3.3420238494873047, + "learning_rate": 9.953519344010994e-06, + "loss": 3.1206, + "step": 220550 + }, + { + "epoch": 0.21732619815105256, + "grad_norm": 2.354785680770874, + "learning_rate": 9.953498289827519e-06, + "loss": 3.165, + "step": 220600 + }, + { + "epoch": 0.21737545612887463, + "grad_norm": 2.5638415813446045, + "learning_rate": 9.953477230898978e-06, + 
"loss": 3.1726, + "step": 220650 + }, + { + "epoch": 0.21742471410669673, + "grad_norm": 2.3629157543182373, + "learning_rate": 9.953456167225397e-06, + "loss": 3.1529, + "step": 220700 + }, + { + "epoch": 0.2174739720845188, + "grad_norm": 2.4882609844207764, + "learning_rate": 9.953435098806789e-06, + "loss": 3.1201, + "step": 220750 + }, + { + "epoch": 0.2175232300623409, + "grad_norm": 2.2484898567199707, + "learning_rate": 9.95341402564318e-06, + "loss": 3.0933, + "step": 220800 + }, + { + "epoch": 0.217572488040163, + "grad_norm": 2.5583443641662598, + "learning_rate": 9.953392947734589e-06, + "loss": 3.1514, + "step": 220850 + }, + { + "epoch": 0.21762174601798506, + "grad_norm": 2.3598177433013916, + "learning_rate": 9.953371865081035e-06, + "loss": 3.1507, + "step": 220900 + }, + { + "epoch": 0.21767100399580716, + "grad_norm": 2.334414005279541, + "learning_rate": 9.953350777682538e-06, + "loss": 3.1289, + "step": 220950 + }, + { + "epoch": 0.21772026197362926, + "grad_norm": 2.203545093536377, + "learning_rate": 9.953329685539119e-06, + "loss": 3.1154, + "step": 221000 + }, + { + "epoch": 0.21776951995145133, + "grad_norm": 2.596688747406006, + "learning_rate": 9.953308588650799e-06, + "loss": 3.1456, + "step": 221050 + }, + { + "epoch": 0.21781877792927343, + "grad_norm": 2.4641153812408447, + "learning_rate": 9.953287487017596e-06, + "loss": 3.1684, + "step": 221100 + }, + { + "epoch": 0.21786803590709553, + "grad_norm": 2.335426092147827, + "learning_rate": 9.953266380639532e-06, + "loss": 3.1684, + "step": 221150 + }, + { + "epoch": 0.2179172938849176, + "grad_norm": 2.3259992599487305, + "learning_rate": 9.953245269516627e-06, + "loss": 3.2126, + "step": 221200 + }, + { + "epoch": 0.2179665518627397, + "grad_norm": 2.4024343490600586, + "learning_rate": 9.9532241536489e-06, + "loss": 3.1573, + "step": 221250 + }, + { + "epoch": 0.21801580984056176, + "grad_norm": 2.225503921508789, + "learning_rate": 9.953203033036373e-06, + "loss": 3.1949, + 
"step": 221300 + }, + { + "epoch": 0.21806506781838386, + "grad_norm": 2.3264174461364746, + "learning_rate": 9.953181907679066e-06, + "loss": 3.1425, + "step": 221350 + }, + { + "epoch": 0.21811432579620596, + "grad_norm": 2.407752752304077, + "learning_rate": 9.953160777576996e-06, + "loss": 3.0791, + "step": 221400 + }, + { + "epoch": 0.21816358377402803, + "grad_norm": 2.1915769577026367, + "learning_rate": 9.953139642730189e-06, + "loss": 3.0761, + "step": 221450 + }, + { + "epoch": 0.21821284175185013, + "grad_norm": 2.535818338394165, + "learning_rate": 9.95311850313866e-06, + "loss": 3.2509, + "step": 221500 + }, + { + "epoch": 0.21826209972967223, + "grad_norm": 2.2756717205047607, + "learning_rate": 9.953097358802433e-06, + "loss": 3.106, + "step": 221550 + }, + { + "epoch": 0.2183113577074943, + "grad_norm": 2.342862606048584, + "learning_rate": 9.953076209721525e-06, + "loss": 3.1443, + "step": 221600 + }, + { + "epoch": 0.2183606156853164, + "grad_norm": 2.4161016941070557, + "learning_rate": 9.953055055895956e-06, + "loss": 3.2093, + "step": 221650 + }, + { + "epoch": 0.2184098736631385, + "grad_norm": 2.646806478500366, + "learning_rate": 9.953033897325752e-06, + "loss": 3.163, + "step": 221700 + }, + { + "epoch": 0.21845913164096056, + "grad_norm": 2.5922157764434814, + "learning_rate": 9.953012734010928e-06, + "loss": 3.1306, + "step": 221750 + }, + { + "epoch": 0.21850838961878266, + "grad_norm": 2.2392547130584717, + "learning_rate": 9.952991565951506e-06, + "loss": 3.1159, + "step": 221800 + }, + { + "epoch": 0.21855764759660476, + "grad_norm": 2.319308280944824, + "learning_rate": 9.952970393147506e-06, + "loss": 3.1296, + "step": 221850 + }, + { + "epoch": 0.21860690557442683, + "grad_norm": 2.368163585662842, + "learning_rate": 9.952949215598947e-06, + "loss": 3.1177, + "step": 221900 + }, + { + "epoch": 0.21865616355224893, + "grad_norm": 2.1927247047424316, + "learning_rate": 9.952928033305851e-06, + "loss": 3.2009, + "step": 221950 + }, + 
{ + "epoch": 0.218705421530071, + "grad_norm": 2.3409321308135986, + "learning_rate": 9.952906846268238e-06, + "loss": 3.0918, + "step": 222000 + }, + { + "epoch": 0.2187546795078931, + "grad_norm": 2.5363574028015137, + "learning_rate": 9.952885654486127e-06, + "loss": 3.1061, + "step": 222050 + }, + { + "epoch": 0.2188039374857152, + "grad_norm": 2.292142152786255, + "learning_rate": 9.952864457959541e-06, + "loss": 3.0607, + "step": 222100 + }, + { + "epoch": 0.21885319546353726, + "grad_norm": Infinity, + "learning_rate": 9.952843256688498e-06, + "loss": 3.1004, + "step": 222150 + }, + { + "epoch": 0.21890245344135936, + "grad_norm": 2.2631335258483887, + "learning_rate": 9.952822050673018e-06, + "loss": 3.2112, + "step": 222200 + }, + { + "epoch": 0.21895171141918146, + "grad_norm": 2.38161301612854, + "learning_rate": 9.952800839913124e-06, + "loss": 3.1215, + "step": 222250 + }, + { + "epoch": 0.21900096939700353, + "grad_norm": 2.3659276962280273, + "learning_rate": 9.952779624408834e-06, + "loss": 3.1205, + "step": 222300 + }, + { + "epoch": 0.21905022737482563, + "grad_norm": 2.298375129699707, + "learning_rate": 9.95275840416017e-06, + "loss": 3.1116, + "step": 222350 + }, + { + "epoch": 0.21909948535264773, + "grad_norm": 2.338192939758301, + "learning_rate": 9.952737179167148e-06, + "loss": 3.1552, + "step": 222400 + }, + { + "epoch": 0.2191487433304698, + "grad_norm": 2.4458112716674805, + "learning_rate": 9.952715949429796e-06, + "loss": 3.0976, + "step": 222450 + }, + { + "epoch": 0.2191980013082919, + "grad_norm": 2.276179075241089, + "learning_rate": 9.952694714948128e-06, + "loss": 3.1573, + "step": 222500 + }, + { + "epoch": 0.21924725928611397, + "grad_norm": 2.357313871383667, + "learning_rate": 9.952673475722165e-06, + "loss": 3.1271, + "step": 222550 + }, + { + "epoch": 0.21929651726393606, + "grad_norm": 2.2970969676971436, + "learning_rate": 9.95265223175193e-06, + "loss": 3.1294, + "step": 222600 + }, + { + "epoch": 0.21934577524175816, + 
"grad_norm": 2.5114800930023193, + "learning_rate": 9.952630983037442e-06, + "loss": 3.1834, + "step": 222650 + }, + { + "epoch": 0.21939503321958023, + "grad_norm": 2.2983362674713135, + "learning_rate": 9.952609729578722e-06, + "loss": 3.1314, + "step": 222700 + }, + { + "epoch": 0.21944429119740233, + "grad_norm": 2.9122931957244873, + "learning_rate": 9.952588471375787e-06, + "loss": 3.1006, + "step": 222750 + }, + { + "epoch": 0.21949354917522443, + "grad_norm": 2.6576688289642334, + "learning_rate": 9.952567208428662e-06, + "loss": 3.1942, + "step": 222800 + }, + { + "epoch": 0.2195428071530465, + "grad_norm": 2.4808766841888428, + "learning_rate": 9.952545940737366e-06, + "loss": 3.083, + "step": 222850 + }, + { + "epoch": 0.2195920651308686, + "grad_norm": 2.299445867538452, + "learning_rate": 9.952524668301918e-06, + "loss": 3.0873, + "step": 222900 + }, + { + "epoch": 0.2196413231086907, + "grad_norm": 2.335651397705078, + "learning_rate": 9.952503391122339e-06, + "loss": 3.1505, + "step": 222950 + }, + { + "epoch": 0.21969058108651277, + "grad_norm": 2.324120044708252, + "learning_rate": 9.952482109198651e-06, + "loss": 3.1599, + "step": 223000 + }, + { + "epoch": 0.21973983906433486, + "grad_norm": 2.2573792934417725, + "learning_rate": 9.952460822530872e-06, + "loss": 3.0968, + "step": 223050 + }, + { + "epoch": 0.21978909704215696, + "grad_norm": 2.2530925273895264, + "learning_rate": 9.952439531119023e-06, + "loss": 3.0953, + "step": 223100 + }, + { + "epoch": 0.21983835501997903, + "grad_norm": 2.6181163787841797, + "learning_rate": 9.952418234963124e-06, + "loss": 3.0904, + "step": 223150 + }, + { + "epoch": 0.21988761299780113, + "grad_norm": 2.2858657836914062, + "learning_rate": 9.952396934063198e-06, + "loss": 3.0963, + "step": 223200 + }, + { + "epoch": 0.2199368709756232, + "grad_norm": 2.3564040660858154, + "learning_rate": 9.952375628419263e-06, + "loss": 3.1173, + "step": 223250 + }, + { + "epoch": 0.2199861289534453, + "grad_norm": 
2.4005532264709473, + "learning_rate": 9.952354318031338e-06, + "loss": 3.0248, + "step": 223300 + }, + { + "epoch": 0.2200353869312674, + "grad_norm": 2.2955896854400635, + "learning_rate": 9.952333002899446e-06, + "loss": 3.0786, + "step": 223350 + }, + { + "epoch": 0.22008464490908947, + "grad_norm": 2.2909789085388184, + "learning_rate": 9.952311683023609e-06, + "loss": 3.1705, + "step": 223400 + }, + { + "epoch": 0.22013390288691156, + "grad_norm": 2.2800896167755127, + "learning_rate": 9.952290358403844e-06, + "loss": 3.0696, + "step": 223450 + }, + { + "epoch": 0.22018316086473366, + "grad_norm": 2.145925521850586, + "learning_rate": 9.952269029040172e-06, + "loss": 3.0791, + "step": 223500 + }, + { + "epoch": 0.22023241884255573, + "grad_norm": 2.243391513824463, + "learning_rate": 9.952247694932614e-06, + "loss": 3.1067, + "step": 223550 + }, + { + "epoch": 0.22028167682037783, + "grad_norm": 2.2909889221191406, + "learning_rate": 9.952226356081191e-06, + "loss": 3.136, + "step": 223600 + }, + { + "epoch": 0.22033093479819993, + "grad_norm": 2.319020986557007, + "learning_rate": 9.952205012485924e-06, + "loss": 3.0825, + "step": 223650 + }, + { + "epoch": 0.220380192776022, + "grad_norm": 2.3813540935516357, + "learning_rate": 9.95218366414683e-06, + "loss": 3.1717, + "step": 223700 + }, + { + "epoch": 0.2204294507538441, + "grad_norm": 2.2313458919525146, + "learning_rate": 9.952162311063934e-06, + "loss": 3.1216, + "step": 223750 + }, + { + "epoch": 0.22047870873166617, + "grad_norm": 2.2755343914031982, + "learning_rate": 9.952140953237254e-06, + "loss": 3.1561, + "step": 223800 + }, + { + "epoch": 0.22052796670948827, + "grad_norm": 2.4507675170898438, + "learning_rate": 9.95211959066681e-06, + "loss": 3.1193, + "step": 223850 + }, + { + "epoch": 0.22057722468731036, + "grad_norm": 2.338200330734253, + "learning_rate": 9.952098223352624e-06, + "loss": 3.119, + "step": 223900 + }, + { + "epoch": 0.22062648266513243, + "grad_norm": 2.416537284851074, + 
"learning_rate": 9.952076851294714e-06, + "loss": 3.1345, + "step": 223950 + }, + { + "epoch": 0.22067574064295453, + "grad_norm": 2.496049404144287, + "learning_rate": 9.952055474493104e-06, + "loss": 3.1462, + "step": 224000 + }, + { + "epoch": 0.22072499862077663, + "grad_norm": 2.4390504360198975, + "learning_rate": 9.952034092947812e-06, + "loss": 3.1478, + "step": 224050 + }, + { + "epoch": 0.2207742565985987, + "grad_norm": 2.342205286026001, + "learning_rate": 9.95201270665886e-06, + "loss": 3.1594, + "step": 224100 + }, + { + "epoch": 0.2208235145764208, + "grad_norm": 2.2451493740081787, + "learning_rate": 9.951991315626266e-06, + "loss": 3.1198, + "step": 224150 + }, + { + "epoch": 0.2208727725542429, + "grad_norm": Infinity, + "learning_rate": 9.951969919850053e-06, + "loss": 3.1493, + "step": 224200 + }, + { + "epoch": 0.22092203053206497, + "grad_norm": 2.3551137447357178, + "learning_rate": 9.95194851933024e-06, + "loss": 3.1719, + "step": 224250 + }, + { + "epoch": 0.22097128850988706, + "grad_norm": 2.408876657485962, + "learning_rate": 9.951927114066848e-06, + "loss": 3.1749, + "step": 224300 + }, + { + "epoch": 0.22102054648770916, + "grad_norm": 2.739084482192993, + "learning_rate": 9.951905704059899e-06, + "loss": 3.1445, + "step": 224350 + }, + { + "epoch": 0.22106980446553123, + "grad_norm": 2.425157308578491, + "learning_rate": 9.95188428930941e-06, + "loss": 3.1426, + "step": 224400 + }, + { + "epoch": 0.22111906244335333, + "grad_norm": 2.4225034713745117, + "learning_rate": 9.951862869815406e-06, + "loss": 3.19, + "step": 224450 + }, + { + "epoch": 0.2211683204211754, + "grad_norm": 2.413776397705078, + "learning_rate": 9.951841445577903e-06, + "loss": 3.0905, + "step": 224500 + }, + { + "epoch": 0.2212175783989975, + "grad_norm": 2.4735167026519775, + "learning_rate": 9.951820016596924e-06, + "loss": 3.1404, + "step": 224550 + }, + { + "epoch": 0.2212668363768196, + "grad_norm": 4.925982475280762, + "learning_rate": 9.95179858287249e-06, 
+ "loss": 3.1675, + "step": 224600 + }, + { + "epoch": 0.22131609435464167, + "grad_norm": 2.3830573558807373, + "learning_rate": 9.951777144404622e-06, + "loss": 3.1558, + "step": 224650 + }, + { + "epoch": 0.22136535233246377, + "grad_norm": 2.386718988418579, + "learning_rate": 9.951755701193337e-06, + "loss": 3.1669, + "step": 224700 + }, + { + "epoch": 0.22141461031028586, + "grad_norm": 2.4135544300079346, + "learning_rate": 9.951734253238658e-06, + "loss": 3.0971, + "step": 224750 + }, + { + "epoch": 0.22146386828810793, + "grad_norm": 2.173279047012329, + "learning_rate": 9.951712800540606e-06, + "loss": 3.1893, + "step": 224800 + }, + { + "epoch": 0.22151312626593003, + "grad_norm": 2.1849546432495117, + "learning_rate": 9.951691343099202e-06, + "loss": 3.1764, + "step": 224850 + }, + { + "epoch": 0.22156238424375213, + "grad_norm": 2.4604570865631104, + "learning_rate": 9.951669880914463e-06, + "loss": 3.1355, + "step": 224900 + }, + { + "epoch": 0.2216116422215742, + "grad_norm": 2.727764129638672, + "learning_rate": 9.951648413986414e-06, + "loss": 3.1551, + "step": 224950 + }, + { + "epoch": 0.2216609001993963, + "grad_norm": 2.3390331268310547, + "learning_rate": 9.951626942315071e-06, + "loss": 3.0982, + "step": 225000 + }, + { + "epoch": 0.22171015817721837, + "grad_norm": 2.509833335876465, + "learning_rate": 9.95160546590046e-06, + "loss": 3.1597, + "step": 225050 + }, + { + "epoch": 0.22175941615504047, + "grad_norm": 2.329866647720337, + "learning_rate": 9.951583984742598e-06, + "loss": 3.1881, + "step": 225100 + }, + { + "epoch": 0.22180867413286257, + "grad_norm": 2.327477216720581, + "learning_rate": 9.951562498841505e-06, + "loss": 3.1524, + "step": 225150 + }, + { + "epoch": 0.22185793211068464, + "grad_norm": 2.3664917945861816, + "learning_rate": 9.951541008197204e-06, + "loss": 3.128, + "step": 225200 + }, + { + "epoch": 0.22190719008850673, + "grad_norm": 2.2758588790893555, + "learning_rate": 9.951519512809714e-06, + "loss": 3.0813, + 
"step": 225250 + }, + { + "epoch": 0.22195644806632883, + "grad_norm": 2.476743221282959, + "learning_rate": 9.951498012679057e-06, + "loss": 3.1529, + "step": 225300 + }, + { + "epoch": 0.2220057060441509, + "grad_norm": 2.308541774749756, + "learning_rate": 9.951476507805251e-06, + "loss": 3.0256, + "step": 225350 + }, + { + "epoch": 0.222054964021973, + "grad_norm": 2.2983767986297607, + "learning_rate": 9.951454998188318e-06, + "loss": 3.1596, + "step": 225400 + }, + { + "epoch": 0.2221042219997951, + "grad_norm": 2.537550687789917, + "learning_rate": 9.951433483828282e-06, + "loss": 3.1456, + "step": 225450 + }, + { + "epoch": 0.22215347997761717, + "grad_norm": 2.4512693881988525, + "learning_rate": 9.951411964725158e-06, + "loss": 3.0909, + "step": 225500 + }, + { + "epoch": 0.22220273795543927, + "grad_norm": 2.366459846496582, + "learning_rate": 9.95139044087897e-06, + "loss": 3.1937, + "step": 225550 + }, + { + "epoch": 0.22225199593326136, + "grad_norm": 2.3706185817718506, + "learning_rate": 9.951368912289736e-06, + "loss": 3.1219, + "step": 225600 + }, + { + "epoch": 0.22230125391108344, + "grad_norm": 2.306251287460327, + "learning_rate": 9.95134737895748e-06, + "loss": 3.0447, + "step": 225650 + }, + { + "epoch": 0.22235051188890553, + "grad_norm": 2.2847890853881836, + "learning_rate": 9.95132584088222e-06, + "loss": 3.1282, + "step": 225700 + }, + { + "epoch": 0.2223997698667276, + "grad_norm": 2.4151692390441895, + "learning_rate": 9.951304298063979e-06, + "loss": 3.1488, + "step": 225750 + }, + { + "epoch": 0.2224490278445497, + "grad_norm": 2.6397173404693604, + "learning_rate": 9.951282750502774e-06, + "loss": 3.1177, + "step": 225800 + }, + { + "epoch": 0.2224982858223718, + "grad_norm": 2.3319664001464844, + "learning_rate": 9.95126119819863e-06, + "loss": 3.0724, + "step": 225850 + }, + { + "epoch": 0.22254754380019387, + "grad_norm": 2.1995129585266113, + "learning_rate": 9.951239641151564e-06, + "loss": 3.0403, + "step": 225900 + }, + { + 
"epoch": 0.22259680177801597, + "grad_norm": 2.340365171432495, + "learning_rate": 9.951218079361598e-06, + "loss": 3.0753, + "step": 225950 + }, + { + "epoch": 0.22264605975583807, + "grad_norm": 2.412123918533325, + "learning_rate": 9.951196512828754e-06, + "loss": 3.1151, + "step": 226000 + }, + { + "epoch": 0.22269531773366014, + "grad_norm": 2.294193744659424, + "learning_rate": 9.95117494155305e-06, + "loss": 3.1075, + "step": 226050 + }, + { + "epoch": 0.22274457571148223, + "grad_norm": 2.4633636474609375, + "learning_rate": 9.951153365534509e-06, + "loss": 3.1404, + "step": 226100 + }, + { + "epoch": 0.22279383368930433, + "grad_norm": 2.410461902618408, + "learning_rate": 9.951131784773151e-06, + "loss": 3.1913, + "step": 226150 + }, + { + "epoch": 0.2228430916671264, + "grad_norm": 2.4326319694519043, + "learning_rate": 9.951110199268996e-06, + "loss": 3.1108, + "step": 226200 + }, + { + "epoch": 0.2228923496449485, + "grad_norm": 2.282093048095703, + "learning_rate": 9.951088609022065e-06, + "loss": 3.2022, + "step": 226250 + }, + { + "epoch": 0.22294160762277057, + "grad_norm": 2.2904813289642334, + "learning_rate": 9.95106701403238e-06, + "loss": 3.1781, + "step": 226300 + }, + { + "epoch": 0.22299086560059267, + "grad_norm": 2.441413164138794, + "learning_rate": 9.95104541429996e-06, + "loss": 3.0676, + "step": 226350 + }, + { + "epoch": 0.22304012357841477, + "grad_norm": 2.216420888900757, + "learning_rate": 9.951023809824826e-06, + "loss": 3.1796, + "step": 226400 + }, + { + "epoch": 0.22308938155623684, + "grad_norm": 2.2211849689483643, + "learning_rate": 9.951002200606998e-06, + "loss": 3.1065, + "step": 226450 + }, + { + "epoch": 0.22313863953405894, + "grad_norm": 2.300203323364258, + "learning_rate": 9.950980586646499e-06, + "loss": 3.1481, + "step": 226500 + }, + { + "epoch": 0.22318789751188103, + "grad_norm": 2.251762628555298, + "learning_rate": 9.950958967943347e-06, + "loss": 3.1811, + "step": 226550 + }, + { + "epoch": 
0.2232371554897031, + "grad_norm": 2.272516965866089, + "learning_rate": 9.950937344497566e-06, + "loss": 3.1904, + "step": 226600 + }, + { + "epoch": 0.2232864134675252, + "grad_norm": 2.5223546028137207, + "learning_rate": 9.950915716309173e-06, + "loss": 3.0872, + "step": 226650 + }, + { + "epoch": 0.2233356714453473, + "grad_norm": 2.9933032989501953, + "learning_rate": 9.950894083378191e-06, + "loss": 3.1001, + "step": 226700 + }, + { + "epoch": 0.22338492942316937, + "grad_norm": 2.349722146987915, + "learning_rate": 9.95087244570464e-06, + "loss": 3.0759, + "step": 226750 + }, + { + "epoch": 0.22343418740099147, + "grad_norm": 2.527432680130005, + "learning_rate": 9.95085080328854e-06, + "loss": 3.0944, + "step": 226800 + }, + { + "epoch": 0.22348344537881354, + "grad_norm": 2.321079969406128, + "learning_rate": 9.950829156129913e-06, + "loss": 3.1357, + "step": 226850 + }, + { + "epoch": 0.22353270335663564, + "grad_norm": 2.3322012424468994, + "learning_rate": 9.950807504228782e-06, + "loss": 3.0767, + "step": 226900 + }, + { + "epoch": 0.22358196133445774, + "grad_norm": 2.490889549255371, + "learning_rate": 9.950785847585162e-06, + "loss": 3.1922, + "step": 226950 + }, + { + "epoch": 0.2236312193122798, + "grad_norm": 2.262770414352417, + "learning_rate": 9.950764186199078e-06, + "loss": 3.0801, + "step": 227000 + }, + { + "epoch": 0.2236804772901019, + "grad_norm": 2.309741735458374, + "learning_rate": 9.950742520070548e-06, + "loss": 3.0282, + "step": 227050 + }, + { + "epoch": 0.223729735267924, + "grad_norm": 2.244929075241089, + "learning_rate": 9.950720849199596e-06, + "loss": 3.1467, + "step": 227100 + }, + { + "epoch": 0.22377899324574607, + "grad_norm": 2.2359673976898193, + "learning_rate": 9.95069917358624e-06, + "loss": 3.1852, + "step": 227150 + }, + { + "epoch": 0.22382825122356817, + "grad_norm": 2.1796724796295166, + "learning_rate": 9.950677493230502e-06, + "loss": 3.1591, + "step": 227200 + }, + { + "epoch": 0.22387750920139027, + 
"grad_norm": 2.3265440464019775, + "learning_rate": 9.950655808132405e-06, + "loss": 3.1465, + "step": 227250 + }, + { + "epoch": 0.22392676717921234, + "grad_norm": 2.4119598865509033, + "learning_rate": 9.950634118291965e-06, + "loss": 3.0864, + "step": 227300 + }, + { + "epoch": 0.22397602515703444, + "grad_norm": 2.336982011795044, + "learning_rate": 9.950612423709206e-06, + "loss": 3.1098, + "step": 227350 + }, + { + "epoch": 0.22402528313485653, + "grad_norm": 2.3077609539031982, + "learning_rate": 9.950590724384149e-06, + "loss": 3.1326, + "step": 227400 + }, + { + "epoch": 0.2240745411126786, + "grad_norm": 2.4018473625183105, + "learning_rate": 9.950569020316811e-06, + "loss": 3.1542, + "step": 227450 + }, + { + "epoch": 0.2241237990905007, + "grad_norm": 2.499340772628784, + "learning_rate": 9.950547311507218e-06, + "loss": 3.1258, + "step": 227500 + }, + { + "epoch": 0.22417305706832277, + "grad_norm": 2.071631908416748, + "learning_rate": 9.950525597955387e-06, + "loss": 3.1326, + "step": 227550 + }, + { + "epoch": 0.22422231504614487, + "grad_norm": 2.308685064315796, + "learning_rate": 9.950503879661341e-06, + "loss": 3.1074, + "step": 227600 + }, + { + "epoch": 0.22427157302396697, + "grad_norm": 2.398099660873413, + "learning_rate": 9.9504821566251e-06, + "loss": 3.1095, + "step": 227650 + }, + { + "epoch": 0.22432083100178904, + "grad_norm": 2.3888399600982666, + "learning_rate": 9.950460428846684e-06, + "loss": 3.1259, + "step": 227700 + }, + { + "epoch": 0.22437008897961114, + "grad_norm": 2.3664770126342773, + "learning_rate": 9.950438696326114e-06, + "loss": 3.1317, + "step": 227750 + }, + { + "epoch": 0.22441934695743324, + "grad_norm": 2.326826810836792, + "learning_rate": 9.950416959063413e-06, + "loss": 3.12, + "step": 227800 + }, + { + "epoch": 0.2244686049352553, + "grad_norm": 2.8650074005126953, + "learning_rate": 9.9503952170586e-06, + "loss": 3.1232, + "step": 227850 + }, + { + "epoch": 0.2245178629130774, + "grad_norm": 
2.3764822483062744, + "learning_rate": 9.950373470311693e-06, + "loss": 3.0929, + "step": 227900 + }, + { + "epoch": 0.2245671208908995, + "grad_norm": 2.303454637527466, + "learning_rate": 9.95035171882272e-06, + "loss": 3.1039, + "step": 227950 + }, + { + "epoch": 0.22461637886872157, + "grad_norm": 2.5130581855773926, + "learning_rate": 9.950329962591697e-06, + "loss": 3.1656, + "step": 228000 + }, + { + "epoch": 0.22466563684654367, + "grad_norm": 2.46669602394104, + "learning_rate": 9.950308201618643e-06, + "loss": 3.0518, + "step": 228050 + }, + { + "epoch": 0.22471489482436574, + "grad_norm": 2.4951865673065186, + "learning_rate": 9.950286435903582e-06, + "loss": 3.1664, + "step": 228100 + }, + { + "epoch": 0.22476415280218784, + "grad_norm": 2.2968597412109375, + "learning_rate": 9.950264665446536e-06, + "loss": 3.117, + "step": 228150 + }, + { + "epoch": 0.22481341078000994, + "grad_norm": 2.4654903411865234, + "learning_rate": 9.950242890247523e-06, + "loss": 3.1624, + "step": 228200 + }, + { + "epoch": 0.224862668757832, + "grad_norm": 2.405632734298706, + "learning_rate": 9.950221110306566e-06, + "loss": 3.0874, + "step": 228250 + }, + { + "epoch": 0.2249119267356541, + "grad_norm": 2.2929255962371826, + "learning_rate": 9.950199325623684e-06, + "loss": 3.0653, + "step": 228300 + }, + { + "epoch": 0.2249611847134762, + "grad_norm": 2.4489662647247314, + "learning_rate": 9.950177536198898e-06, + "loss": 3.1353, + "step": 228350 + }, + { + "epoch": 0.22501044269129827, + "grad_norm": 2.3690357208251953, + "learning_rate": 9.950155742032231e-06, + "loss": 3.1234, + "step": 228400 + }, + { + "epoch": 0.22505970066912037, + "grad_norm": 2.2238545417785645, + "learning_rate": 9.9501339431237e-06, + "loss": 3.1678, + "step": 228450 + }, + { + "epoch": 0.22510895864694247, + "grad_norm": 2.303699493408203, + "learning_rate": 9.950112139473332e-06, + "loss": 3.1243, + "step": 228500 + }, + { + "epoch": 0.22515821662476454, + "grad_norm": 2.277594566345215, + 
"learning_rate": 9.950090331081141e-06, + "loss": 3.1293, + "step": 228550 + }, + { + "epoch": 0.22520747460258664, + "grad_norm": 2.269394874572754, + "learning_rate": 9.950068517947153e-06, + "loss": 3.1606, + "step": 228600 + }, + { + "epoch": 0.22525673258040874, + "grad_norm": 2.459094285964966, + "learning_rate": 9.950046700071385e-06, + "loss": 3.0326, + "step": 228650 + }, + { + "epoch": 0.2253059905582308, + "grad_norm": 2.3973357677459717, + "learning_rate": 9.950024877453861e-06, + "loss": 3.0976, + "step": 228700 + }, + { + "epoch": 0.2253552485360529, + "grad_norm": 2.2996439933776855, + "learning_rate": 9.9500030500946e-06, + "loss": 3.1141, + "step": 228750 + }, + { + "epoch": 0.22540450651387497, + "grad_norm": 2.450636625289917, + "learning_rate": 9.949981217993627e-06, + "loss": 3.1315, + "step": 228800 + }, + { + "epoch": 0.22545376449169707, + "grad_norm": 2.48783540725708, + "learning_rate": 9.949959381150956e-06, + "loss": 3.1283, + "step": 228850 + }, + { + "epoch": 0.22550302246951917, + "grad_norm": 2.297492742538452, + "learning_rate": 9.949937539566614e-06, + "loss": 3.0837, + "step": 228900 + }, + { + "epoch": 0.22555228044734124, + "grad_norm": 2.3832290172576904, + "learning_rate": 9.949915693240618e-06, + "loss": 3.1433, + "step": 228950 + }, + { + "epoch": 0.22560153842516334, + "grad_norm": 2.2616357803344727, + "learning_rate": 9.949893842172991e-06, + "loss": 3.1009, + "step": 229000 + }, + { + "epoch": 0.22565079640298544, + "grad_norm": 2.4085733890533447, + "learning_rate": 9.949871986363752e-06, + "loss": 3.15, + "step": 229050 + }, + { + "epoch": 0.2257000543808075, + "grad_norm": 2.7629122734069824, + "learning_rate": 9.949850125812923e-06, + "loss": 3.1108, + "step": 229100 + }, + { + "epoch": 0.2257493123586296, + "grad_norm": 2.4449000358581543, + "learning_rate": 9.949828260520527e-06, + "loss": 3.1386, + "step": 229150 + }, + { + "epoch": 0.2257985703364517, + "grad_norm": 2.276897430419922, + "learning_rate": 
9.949806390486582e-06, + "loss": 3.1602, + "step": 229200 + }, + { + "epoch": 0.22584782831427377, + "grad_norm": 2.3056507110595703, + "learning_rate": 9.949784515711111e-06, + "loss": 3.1337, + "step": 229250 + }, + { + "epoch": 0.22589708629209587, + "grad_norm": 2.4320430755615234, + "learning_rate": 9.949762636194133e-06, + "loss": 3.1411, + "step": 229300 + }, + { + "epoch": 0.22594634426991794, + "grad_norm": 2.579031229019165, + "learning_rate": 9.94974075193567e-06, + "loss": 3.1318, + "step": 229350 + }, + { + "epoch": 0.22599560224774004, + "grad_norm": 2.3198349475860596, + "learning_rate": 9.949718862935745e-06, + "loss": 3.1114, + "step": 229400 + }, + { + "epoch": 0.22604486022556214, + "grad_norm": 2.48252272605896, + "learning_rate": 9.949696969194376e-06, + "loss": 3.0974, + "step": 229450 + }, + { + "epoch": 0.2260941182033842, + "grad_norm": 2.432814836502075, + "learning_rate": 9.949675070711585e-06, + "loss": 3.081, + "step": 229500 + }, + { + "epoch": 0.2261433761812063, + "grad_norm": 2.2791616916656494, + "learning_rate": 9.949653167487391e-06, + "loss": 3.1672, + "step": 229550 + }, + { + "epoch": 0.2261926341590284, + "grad_norm": 2.407297134399414, + "learning_rate": 9.949631259521817e-06, + "loss": 3.1411, + "step": 229600 + }, + { + "epoch": 0.22624189213685048, + "grad_norm": 2.2199108600616455, + "learning_rate": 9.949609346814886e-06, + "loss": 3.0797, + "step": 229650 + }, + { + "epoch": 0.22629115011467257, + "grad_norm": 2.616917371749878, + "learning_rate": 9.949587429366617e-06, + "loss": 3.0246, + "step": 229700 + }, + { + "epoch": 0.22634040809249467, + "grad_norm": 2.2679972648620605, + "learning_rate": 9.949565507177028e-06, + "loss": 3.1543, + "step": 229750 + }, + { + "epoch": 0.22638966607031674, + "grad_norm": 2.493642568588257, + "learning_rate": 9.949543580246144e-06, + "loss": 3.0781, + "step": 229800 + }, + { + "epoch": 0.22643892404813884, + "grad_norm": 2.6296164989471436, + "learning_rate": 9.949521648573986e-06, 
+ "loss": 3.1177, + "step": 229850 + }, + { + "epoch": 0.22648818202596094, + "grad_norm": 2.3538947105407715, + "learning_rate": 9.949499712160573e-06, + "loss": 3.0368, + "step": 229900 + }, + { + "epoch": 0.226537440003783, + "grad_norm": 2.2601139545440674, + "learning_rate": 9.949477771005927e-06, + "loss": 3.0856, + "step": 229950 + }, + { + "epoch": 0.2265866979816051, + "grad_norm": 2.288181781768799, + "learning_rate": 9.949455825110068e-06, + "loss": 3.2074, + "step": 230000 + }, + { + "epoch": 0.22663595595942718, + "grad_norm": 2.32810115814209, + "learning_rate": 9.949433874473019e-06, + "loss": 3.1413, + "step": 230050 + }, + { + "epoch": 0.22668521393724927, + "grad_norm": 2.4031431674957275, + "learning_rate": 9.9494119190948e-06, + "loss": 3.1555, + "step": 230100 + }, + { + "epoch": 0.22673447191507137, + "grad_norm": 2.292767286300659, + "learning_rate": 9.94938995897543e-06, + "loss": 3.1405, + "step": 230150 + }, + { + "epoch": 0.22678372989289344, + "grad_norm": 2.2212533950805664, + "learning_rate": 9.949367994114934e-06, + "loss": 3.1567, + "step": 230200 + }, + { + "epoch": 0.22683298787071554, + "grad_norm": 2.4706010818481445, + "learning_rate": 9.94934602451333e-06, + "loss": 3.0583, + "step": 230250 + }, + { + "epoch": 0.22688224584853764, + "grad_norm": 2.3244545459747314, + "learning_rate": 9.949324050170641e-06, + "loss": 3.1692, + "step": 230300 + }, + { + "epoch": 0.2269315038263597, + "grad_norm": 2.3320229053497314, + "learning_rate": 9.949302071086888e-06, + "loss": 3.0792, + "step": 230350 + }, + { + "epoch": 0.2269807618041818, + "grad_norm": 2.6513266563415527, + "learning_rate": 9.94928008726209e-06, + "loss": 3.1239, + "step": 230400 + }, + { + "epoch": 0.2270300197820039, + "grad_norm": 2.298130512237549, + "learning_rate": 9.949258098696269e-06, + "loss": 3.1854, + "step": 230450 + }, + { + "epoch": 0.22707927775982598, + "grad_norm": 2.2413125038146973, + "learning_rate": 9.949236105389447e-06, + "loss": 3.0626, + 
"step": 230500 + }, + { + "epoch": 0.22712853573764807, + "grad_norm": 2.2524447441101074, + "learning_rate": 9.949214107341642e-06, + "loss": 3.0847, + "step": 230550 + }, + { + "epoch": 0.22717779371547014, + "grad_norm": 2.370786428451538, + "learning_rate": 9.949192104552881e-06, + "loss": 3.1872, + "step": 230600 + }, + { + "epoch": 0.22722705169329224, + "grad_norm": 2.4005978107452393, + "learning_rate": 9.94917009702318e-06, + "loss": 3.0909, + "step": 230650 + }, + { + "epoch": 0.22727630967111434, + "grad_norm": 2.480656385421753, + "learning_rate": 9.94914808475256e-06, + "loss": 3.1399, + "step": 230700 + }, + { + "epoch": 0.2273255676489364, + "grad_norm": 2.273033380508423, + "learning_rate": 9.949126067741045e-06, + "loss": 3.088, + "step": 230750 + }, + { + "epoch": 0.2273748256267585, + "grad_norm": 2.298105001449585, + "learning_rate": 9.949104045988656e-06, + "loss": 3.125, + "step": 230800 + }, + { + "epoch": 0.2274240836045806, + "grad_norm": 2.454493999481201, + "learning_rate": 9.94908201949541e-06, + "loss": 3.1131, + "step": 230850 + }, + { + "epoch": 0.22747334158240268, + "grad_norm": 2.45310378074646, + "learning_rate": 9.949059988261335e-06, + "loss": 3.1353, + "step": 230900 + }, + { + "epoch": 0.22752259956022478, + "grad_norm": 2.277022361755371, + "learning_rate": 9.949037952286444e-06, + "loss": 3.0638, + "step": 230950 + }, + { + "epoch": 0.22757185753804687, + "grad_norm": 2.3495540618896484, + "learning_rate": 9.949015911570764e-06, + "loss": 3.0889, + "step": 231000 + }, + { + "epoch": 0.22762111551586894, + "grad_norm": 2.4992523193359375, + "learning_rate": 9.948993866114314e-06, + "loss": 3.0557, + "step": 231050 + }, + { + "epoch": 0.22767037349369104, + "grad_norm": 2.291456460952759, + "learning_rate": 9.948971815917117e-06, + "loss": 3.123, + "step": 231100 + }, + { + "epoch": 0.22771963147151314, + "grad_norm": 2.622642993927002, + "learning_rate": 9.94894976097919e-06, + "loss": 3.1377, + "step": 231150 + }, + { + 
"epoch": 0.2277688894493352, + "grad_norm": 2.287047863006592, + "learning_rate": 9.948927701300559e-06, + "loss": 3.1052, + "step": 231200 + }, + { + "epoch": 0.2278181474271573, + "grad_norm": 2.2981679439544678, + "learning_rate": 9.948905636881241e-06, + "loss": 3.1601, + "step": 231250 + }, + { + "epoch": 0.22786740540497938, + "grad_norm": 2.1821305751800537, + "learning_rate": 9.94888356772126e-06, + "loss": 3.14, + "step": 231300 + }, + { + "epoch": 0.22791666338280148, + "grad_norm": 2.4427993297576904, + "learning_rate": 9.948861493820636e-06, + "loss": 3.1548, + "step": 231350 + }, + { + "epoch": 0.22796592136062357, + "grad_norm": 2.4314017295837402, + "learning_rate": 9.94883941517939e-06, + "loss": 3.1284, + "step": 231400 + }, + { + "epoch": 0.22801517933844564, + "grad_norm": 2.344003438949585, + "learning_rate": 9.948817331797542e-06, + "loss": 3.0702, + "step": 231450 + }, + { + "epoch": 0.22806443731626774, + "grad_norm": 2.3072893619537354, + "learning_rate": 9.948795243675116e-06, + "loss": 3.1284, + "step": 231500 + }, + { + "epoch": 0.22811369529408984, + "grad_norm": 2.3631937503814697, + "learning_rate": 9.948773150812132e-06, + "loss": 3.1364, + "step": 231550 + }, + { + "epoch": 0.2281629532719119, + "grad_norm": 2.160754442214966, + "learning_rate": 9.94875105320861e-06, + "loss": 3.1334, + "step": 231600 + }, + { + "epoch": 0.228212211249734, + "grad_norm": 2.5623526573181152, + "learning_rate": 9.948728950864572e-06, + "loss": 3.1225, + "step": 231650 + }, + { + "epoch": 0.2282614692275561, + "grad_norm": 2.591986656188965, + "learning_rate": 9.94870684378004e-06, + "loss": 3.1219, + "step": 231700 + }, + { + "epoch": 0.22831072720537818, + "grad_norm": 2.2456555366516113, + "learning_rate": 9.948684731955034e-06, + "loss": 3.118, + "step": 231750 + }, + { + "epoch": 0.22835998518320028, + "grad_norm": 2.309113025665283, + "learning_rate": 9.948662615389575e-06, + "loss": 3.1057, + "step": 231800 + }, + { + "epoch": 
0.22840924316102235, + "grad_norm": 2.5398659706115723, + "learning_rate": 9.948640494083686e-06, + "loss": 3.0865, + "step": 231850 + }, + { + "epoch": 0.22845850113884444, + "grad_norm": 2.056925058364868, + "learning_rate": 9.948618368037387e-06, + "loss": 3.1586, + "step": 231900 + }, + { + "epoch": 0.22850775911666654, + "grad_norm": 2.399930238723755, + "learning_rate": 9.948596237250697e-06, + "loss": 3.1255, + "step": 231950 + }, + { + "epoch": 0.2285570170944886, + "grad_norm": 2.401066780090332, + "learning_rate": 9.94857410172364e-06, + "loss": 3.0886, + "step": 232000 + }, + { + "epoch": 0.2286062750723107, + "grad_norm": 2.4520864486694336, + "learning_rate": 9.948551961456237e-06, + "loss": 3.1069, + "step": 232050 + }, + { + "epoch": 0.2286555330501328, + "grad_norm": 2.2527456283569336, + "learning_rate": 9.94852981644851e-06, + "loss": 3.1032, + "step": 232100 + }, + { + "epoch": 0.22870479102795488, + "grad_norm": 2.3150463104248047, + "learning_rate": 9.948507666700477e-06, + "loss": 3.1328, + "step": 232150 + }, + { + "epoch": 0.22875404900577698, + "grad_norm": 2.2695631980895996, + "learning_rate": 9.948485512212162e-06, + "loss": 3.1318, + "step": 232200 + }, + { + "epoch": 0.22880330698359908, + "grad_norm": 2.29229736328125, + "learning_rate": 9.948463352983585e-06, + "loss": 3.131, + "step": 232250 + }, + { + "epoch": 0.22885256496142115, + "grad_norm": 2.2613794803619385, + "learning_rate": 9.948441189014769e-06, + "loss": 3.0421, + "step": 232300 + }, + { + "epoch": 0.22890182293924324, + "grad_norm": 2.2791359424591064, + "learning_rate": 9.948419020305731e-06, + "loss": 3.1543, + "step": 232350 + }, + { + "epoch": 0.22895108091706534, + "grad_norm": 2.4636623859405518, + "learning_rate": 9.948396846856497e-06, + "loss": 3.1361, + "step": 232400 + }, + { + "epoch": 0.2290003388948874, + "grad_norm": 2.4935758113861084, + "learning_rate": 9.948374668667087e-06, + "loss": 3.078, + "step": 232450 + }, + { + "epoch": 0.2290495968727095, + 
"grad_norm": 2.2243528366088867, + "learning_rate": 9.94835248573752e-06, + "loss": 3.1416, + "step": 232500 + }, + { + "epoch": 0.22909885485053158, + "grad_norm": 2.4566490650177, + "learning_rate": 9.948330298067819e-06, + "loss": 3.0693, + "step": 232550 + }, + { + "epoch": 0.22914811282835368, + "grad_norm": 2.2690930366516113, + "learning_rate": 9.948308105658005e-06, + "loss": 3.1009, + "step": 232600 + }, + { + "epoch": 0.22919737080617578, + "grad_norm": 2.389646291732788, + "learning_rate": 9.9482859085081e-06, + "loss": 3.1227, + "step": 232650 + }, + { + "epoch": 0.22924662878399785, + "grad_norm": 2.29900860786438, + "learning_rate": 9.948263706618124e-06, + "loss": 3.0902, + "step": 232700 + }, + { + "epoch": 0.22929588676181994, + "grad_norm": 2.3441901206970215, + "learning_rate": 9.9482414999881e-06, + "loss": 3.1493, + "step": 232750 + }, + { + "epoch": 0.22934514473964204, + "grad_norm": 2.46977162361145, + "learning_rate": 9.948219288618046e-06, + "loss": 3.1013, + "step": 232800 + }, + { + "epoch": 0.2293944027174641, + "grad_norm": 2.378328800201416, + "learning_rate": 9.948197072507988e-06, + "loss": 3.0513, + "step": 232850 + }, + { + "epoch": 0.2294436606952862, + "grad_norm": 2.2376158237457275, + "learning_rate": 9.948174851657943e-06, + "loss": 3.1434, + "step": 232900 + }, + { + "epoch": 0.2294929186731083, + "grad_norm": 2.5076003074645996, + "learning_rate": 9.948152626067934e-06, + "loss": 3.1143, + "step": 232950 + }, + { + "epoch": 0.22954217665093038, + "grad_norm": 2.2947511672973633, + "learning_rate": 9.948130395737983e-06, + "loss": 3.1737, + "step": 233000 + }, + { + "epoch": 0.22959143462875248, + "grad_norm": 2.2616872787475586, + "learning_rate": 9.94810816066811e-06, + "loss": 3.1294, + "step": 233050 + }, + { + "epoch": 0.22964069260657455, + "grad_norm": 2.360733985900879, + "learning_rate": 9.948085920858338e-06, + "loss": 3.1271, + "step": 233100 + }, + { + "epoch": 0.22968995058439665, + "grad_norm": 
2.363555431365967, + "learning_rate": 9.948063676308686e-06, + "loss": 3.1363, + "step": 233150 + }, + { + "epoch": 0.22973920856221874, + "grad_norm": 2.386957883834839, + "learning_rate": 9.948041427019176e-06, + "loss": 3.0712, + "step": 233200 + }, + { + "epoch": 0.22978846654004081, + "grad_norm": 2.4780349731445312, + "learning_rate": 9.94801917298983e-06, + "loss": 3.1134, + "step": 233250 + }, + { + "epoch": 0.2298377245178629, + "grad_norm": 2.3045809268951416, + "learning_rate": 9.94799691422067e-06, + "loss": 3.1154, + "step": 233300 + }, + { + "epoch": 0.229886982495685, + "grad_norm": 2.267698049545288, + "learning_rate": 9.947974650711716e-06, + "loss": 3.124, + "step": 233350 + }, + { + "epoch": 0.22993624047350708, + "grad_norm": 2.467005729675293, + "learning_rate": 9.94795238246299e-06, + "loss": 3.1422, + "step": 233400 + }, + { + "epoch": 0.22998549845132918, + "grad_norm": 2.485252618789673, + "learning_rate": 9.947930109474513e-06, + "loss": 3.1126, + "step": 233450 + }, + { + "epoch": 0.23003475642915128, + "grad_norm": 2.386655807495117, + "learning_rate": 9.947907831746306e-06, + "loss": 3.0752, + "step": 233500 + }, + { + "epoch": 0.23008401440697335, + "grad_norm": 2.4380009174346924, + "learning_rate": 9.947885549278392e-06, + "loss": 3.1303, + "step": 233550 + }, + { + "epoch": 0.23013327238479545, + "grad_norm": 2.4312243461608887, + "learning_rate": 9.94786326207079e-06, + "loss": 3.0604, + "step": 233600 + }, + { + "epoch": 0.23018253036261754, + "grad_norm": 2.352602005004883, + "learning_rate": 9.947840970123522e-06, + "loss": 3.1294, + "step": 233650 + }, + { + "epoch": 0.2302317883404396, + "grad_norm": 2.4404258728027344, + "learning_rate": 9.947818673436608e-06, + "loss": 3.1051, + "step": 233700 + }, + { + "epoch": 0.2302810463182617, + "grad_norm": 2.5121829509735107, + "learning_rate": 9.947796372010074e-06, + "loss": 3.0941, + "step": 233750 + }, + { + "epoch": 0.23033030429608378, + "grad_norm": 2.3506901264190674, + 
"learning_rate": 9.947774065843939e-06, + "loss": 3.1169, + "step": 233800 + }, + { + "epoch": 0.23037956227390588, + "grad_norm": 2.3131022453308105, + "learning_rate": 9.947751754938222e-06, + "loss": 3.08, + "step": 233850 + }, + { + "epoch": 0.23042882025172798, + "grad_norm": 2.4042625427246094, + "learning_rate": 9.947729439292947e-06, + "loss": 3.1609, + "step": 233900 + }, + { + "epoch": 0.23047807822955005, + "grad_norm": 2.301863431930542, + "learning_rate": 9.947707118908136e-06, + "loss": 3.1099, + "step": 233950 + }, + { + "epoch": 0.23052733620737215, + "grad_norm": 2.40221905708313, + "learning_rate": 9.947684793783807e-06, + "loss": 3.0671, + "step": 234000 + }, + { + "epoch": 0.23057659418519424, + "grad_norm": 2.296137571334839, + "learning_rate": 9.947662463919985e-06, + "loss": 3.0958, + "step": 234050 + }, + { + "epoch": 0.23062585216301631, + "grad_norm": 2.352381467819214, + "learning_rate": 9.947640129316688e-06, + "loss": 3.1078, + "step": 234100 + }, + { + "epoch": 0.2306751101408384, + "grad_norm": 2.1980345249176025, + "learning_rate": 9.94761778997394e-06, + "loss": 3.0986, + "step": 234150 + }, + { + "epoch": 0.2307243681186605, + "grad_norm": 2.5247669219970703, + "learning_rate": 9.947595445891763e-06, + "loss": 3.1209, + "step": 234200 + }, + { + "epoch": 0.23077362609648258, + "grad_norm": 2.4769108295440674, + "learning_rate": 9.947573097070175e-06, + "loss": 3.1144, + "step": 234250 + }, + { + "epoch": 0.23082288407430468, + "grad_norm": 2.9955673217773438, + "learning_rate": 9.9475507435092e-06, + "loss": 3.1635, + "step": 234300 + }, + { + "epoch": 0.23087214205212675, + "grad_norm": 2.3403661251068115, + "learning_rate": 9.947528385208859e-06, + "loss": 3.1816, + "step": 234350 + }, + { + "epoch": 0.23092140002994885, + "grad_norm": 2.369025468826294, + "learning_rate": 9.947506022169174e-06, + "loss": 3.1048, + "step": 234400 + }, + { + "epoch": 0.23097065800777095, + "grad_norm": 2.6143558025360107, + "learning_rate": 
9.947483654390165e-06, + "loss": 3.0768, + "step": 234450 + }, + { + "epoch": 0.23101991598559302, + "grad_norm": 2.3820548057556152, + "learning_rate": 9.947461281871852e-06, + "loss": 3.1532, + "step": 234500 + }, + { + "epoch": 0.23106917396341511, + "grad_norm": 2.3688600063323975, + "learning_rate": 9.947438904614262e-06, + "loss": 3.0989, + "step": 234550 + }, + { + "epoch": 0.2311184319412372, + "grad_norm": 2.504629373550415, + "learning_rate": 9.94741652261741e-06, + "loss": 3.0706, + "step": 234600 + }, + { + "epoch": 0.23116768991905928, + "grad_norm": 2.3956665992736816, + "learning_rate": 9.947394135881323e-06, + "loss": 3.1152, + "step": 234650 + }, + { + "epoch": 0.23121694789688138, + "grad_norm": 2.325806140899658, + "learning_rate": 9.947371744406018e-06, + "loss": 3.1029, + "step": 234700 + }, + { + "epoch": 0.23126620587470348, + "grad_norm": 2.3691487312316895, + "learning_rate": 9.947349348191519e-06, + "loss": 3.0954, + "step": 234750 + }, + { + "epoch": 0.23131546385252555, + "grad_norm": 2.3023033142089844, + "learning_rate": 9.947326947237846e-06, + "loss": 3.1206, + "step": 234800 + }, + { + "epoch": 0.23136472183034765, + "grad_norm": 2.5985546112060547, + "learning_rate": 9.947304541545022e-06, + "loss": 3.0853, + "step": 234850 + }, + { + "epoch": 0.23141397980816972, + "grad_norm": 2.789794683456421, + "learning_rate": 9.947282131113068e-06, + "loss": 3.1346, + "step": 234900 + }, + { + "epoch": 0.23146323778599182, + "grad_norm": 2.9096155166625977, + "learning_rate": 9.947259715942003e-06, + "loss": 3.047, + "step": 234950 + }, + { + "epoch": 0.2315124957638139, + "grad_norm": 2.5125975608825684, + "learning_rate": 9.947237296031854e-06, + "loss": 3.1983, + "step": 235000 + }, + { + "epoch": 0.23156175374163598, + "grad_norm": 2.3907930850982666, + "learning_rate": 9.947214871382636e-06, + "loss": 3.1063, + "step": 235050 + }, + { + "epoch": 0.23161101171945808, + "grad_norm": 2.472198486328125, + "learning_rate": 
9.947192441994375e-06, + "loss": 3.1385, + "step": 235100 + }, + { + "epoch": 0.23166026969728018, + "grad_norm": 2.497612476348877, + "learning_rate": 9.947170007867091e-06, + "loss": 3.0985, + "step": 235150 + }, + { + "epoch": 0.23170952767510225, + "grad_norm": 2.393240213394165, + "learning_rate": 9.947147569000806e-06, + "loss": 3.0728, + "step": 235200 + }, + { + "epoch": 0.23175878565292435, + "grad_norm": 2.514873743057251, + "learning_rate": 9.947125125395539e-06, + "loss": 3.0995, + "step": 235250 + }, + { + "epoch": 0.23180804363074645, + "grad_norm": 2.194000244140625, + "learning_rate": 9.947102677051314e-06, + "loss": 3.1593, + "step": 235300 + }, + { + "epoch": 0.23185730160856852, + "grad_norm": 2.136803388595581, + "learning_rate": 9.947080223968154e-06, + "loss": 3.1685, + "step": 235350 + }, + { + "epoch": 0.23190655958639061, + "grad_norm": 2.3150687217712402, + "learning_rate": 9.947057766146077e-06, + "loss": 3.0957, + "step": 235400 + }, + { + "epoch": 0.2319558175642127, + "grad_norm": 2.1616647243499756, + "learning_rate": 9.947035303585106e-06, + "loss": 3.0776, + "step": 235450 + }, + { + "epoch": 0.23200507554203478, + "grad_norm": 2.4263975620269775, + "learning_rate": 9.947012836285264e-06, + "loss": 3.0721, + "step": 235500 + }, + { + "epoch": 0.23205433351985688, + "grad_norm": 2.2836196422576904, + "learning_rate": 9.94699036424657e-06, + "loss": 3.1284, + "step": 235550 + }, + { + "epoch": 0.23210359149767895, + "grad_norm": 2.34989070892334, + "learning_rate": 9.946967887469046e-06, + "loss": 3.136, + "step": 235600 + }, + { + "epoch": 0.23215284947550105, + "grad_norm": 2.47631573677063, + "learning_rate": 9.946945405952716e-06, + "loss": 3.0621, + "step": 235650 + }, + { + "epoch": 0.23220210745332315, + "grad_norm": 2.312878370285034, + "learning_rate": 9.9469229196976e-06, + "loss": 3.1263, + "step": 235700 + }, + { + "epoch": 0.23225136543114522, + "grad_norm": 2.360121965408325, + "learning_rate": 9.946900428703717e-06, + 
"loss": 3.1217, + "step": 235750 + }, + { + "epoch": 0.23230062340896732, + "grad_norm": 2.383042812347412, + "learning_rate": 9.94687793297109e-06, + "loss": 3.0987, + "step": 235800 + }, + { + "epoch": 0.23234988138678941, + "grad_norm": 2.354893207550049, + "learning_rate": 9.946855432499744e-06, + "loss": 3.0821, + "step": 235850 + }, + { + "epoch": 0.23239913936461148, + "grad_norm": 2.3437821865081787, + "learning_rate": 9.946832927289695e-06, + "loss": 3.143, + "step": 235900 + }, + { + "epoch": 0.23244839734243358, + "grad_norm": 2.498812198638916, + "learning_rate": 9.94681041734097e-06, + "loss": 3.0915, + "step": 235950 + }, + { + "epoch": 0.23249765532025568, + "grad_norm": 2.435222864151001, + "learning_rate": 9.946787902653588e-06, + "loss": 3.1309, + "step": 236000 + }, + { + "epoch": 0.23254691329807775, + "grad_norm": 2.7910876274108887, + "learning_rate": 9.94676538322757e-06, + "loss": 3.1228, + "step": 236050 + }, + { + "epoch": 0.23259617127589985, + "grad_norm": 1.9755998849868774, + "learning_rate": 9.946742859062937e-06, + "loss": 3.168, + "step": 236100 + }, + { + "epoch": 0.23264542925372192, + "grad_norm": 2.356219530105591, + "learning_rate": 9.946720330159713e-06, + "loss": 3.1698, + "step": 236150 + }, + { + "epoch": 0.23269468723154402, + "grad_norm": 2.49165415763855, + "learning_rate": 9.946697796517917e-06, + "loss": 3.1278, + "step": 236200 + }, + { + "epoch": 0.23274394520936612, + "grad_norm": 2.3530771732330322, + "learning_rate": 9.946675258137572e-06, + "loss": 3.1059, + "step": 236250 + }, + { + "epoch": 0.23279320318718819, + "grad_norm": 2.7169032096862793, + "learning_rate": 9.946652715018701e-06, + "loss": 3.0824, + "step": 236300 + }, + { + "epoch": 0.23284246116501028, + "grad_norm": 2.297830581665039, + "learning_rate": 9.946630167161325e-06, + "loss": 3.1732, + "step": 236350 + }, + { + "epoch": 0.23289171914283238, + "grad_norm": 2.331650733947754, + "learning_rate": 9.946607614565463e-06, + "loss": 3.0953, + 
"step": 236400 + }, + { + "epoch": 0.23294097712065445, + "grad_norm": 2.148003578186035, + "learning_rate": 9.946585057231138e-06, + "loss": 3.1492, + "step": 236450 + }, + { + "epoch": 0.23299023509847655, + "grad_norm": 2.31485915184021, + "learning_rate": 9.946562495158371e-06, + "loss": 3.095, + "step": 236500 + }, + { + "epoch": 0.23303949307629865, + "grad_norm": 2.520641565322876, + "learning_rate": 9.946539928347186e-06, + "loss": 3.112, + "step": 236550 + }, + { + "epoch": 0.23308875105412072, + "grad_norm": 2.2154181003570557, + "learning_rate": 9.946517356797601e-06, + "loss": 3.106, + "step": 236600 + }, + { + "epoch": 0.23313800903194282, + "grad_norm": 2.290479898452759, + "learning_rate": 9.946494780509642e-06, + "loss": 3.148, + "step": 236650 + }, + { + "epoch": 0.23318726700976491, + "grad_norm": 2.226475477218628, + "learning_rate": 9.946472199483327e-06, + "loss": 3.059, + "step": 236700 + }, + { + "epoch": 0.23323652498758699, + "grad_norm": 2.2525384426116943, + "learning_rate": 9.94644961371868e-06, + "loss": 3.0785, + "step": 236750 + }, + { + "epoch": 0.23328578296540908, + "grad_norm": 2.2813265323638916, + "learning_rate": 9.946427023215722e-06, + "loss": 3.1007, + "step": 236800 + }, + { + "epoch": 0.23333504094323115, + "grad_norm": 2.443009853363037, + "learning_rate": 9.946404427974472e-06, + "loss": 3.0649, + "step": 236850 + }, + { + "epoch": 0.23338429892105325, + "grad_norm": 2.413403034210205, + "learning_rate": 9.946381827994955e-06, + "loss": 3.0645, + "step": 236900 + }, + { + "epoch": 0.23343355689887535, + "grad_norm": 2.48709774017334, + "learning_rate": 9.946359223277193e-06, + "loss": 3.1155, + "step": 236950 + }, + { + "epoch": 0.23348281487669742, + "grad_norm": 2.3465940952301025, + "learning_rate": 9.946336613821204e-06, + "loss": 3.1489, + "step": 237000 + }, + { + "epoch": 0.23353207285451952, + "grad_norm": 2.4294638633728027, + "learning_rate": 9.946313999627012e-06, + "loss": 3.1112, + "step": 237050 + }, + { + 
"epoch": 0.23358133083234162, + "grad_norm": 2.2374181747436523, + "learning_rate": 9.946291380694642e-06, + "loss": 3.0955, + "step": 237100 + }, + { + "epoch": 0.2336305888101637, + "grad_norm": 2.291595935821533, + "learning_rate": 9.946268757024108e-06, + "loss": 3.1292, + "step": 237150 + }, + { + "epoch": 0.23367984678798578, + "grad_norm": 2.592600107192993, + "learning_rate": 9.946246128615438e-06, + "loss": 3.0984, + "step": 237200 + }, + { + "epoch": 0.23372910476580788, + "grad_norm": 2.2970426082611084, + "learning_rate": 9.946223495468652e-06, + "loss": 3.0784, + "step": 237250 + }, + { + "epoch": 0.23377836274362995, + "grad_norm": 2.242952585220337, + "learning_rate": 9.94620085758377e-06, + "loss": 3.0894, + "step": 237300 + }, + { + "epoch": 0.23382762072145205, + "grad_norm": 2.468498945236206, + "learning_rate": 9.946178214960815e-06, + "loss": 3.1731, + "step": 237350 + }, + { + "epoch": 0.23387687869927412, + "grad_norm": 2.1341092586517334, + "learning_rate": 9.946155567599809e-06, + "loss": 3.113, + "step": 237400 + }, + { + "epoch": 0.23392613667709622, + "grad_norm": 2.2872345447540283, + "learning_rate": 9.946132915500772e-06, + "loss": 3.1144, + "step": 237450 + }, + { + "epoch": 0.23397539465491832, + "grad_norm": 2.4731791019439697, + "learning_rate": 9.946110258663728e-06, + "loss": 3.0541, + "step": 237500 + }, + { + "epoch": 0.2340246526327404, + "grad_norm": 2.199284791946411, + "learning_rate": 9.946087597088699e-06, + "loss": 3.1344, + "step": 237550 + }, + { + "epoch": 0.23407391061056249, + "grad_norm": 2.2212328910827637, + "learning_rate": 9.946064930775703e-06, + "loss": 3.1069, + "step": 237600 + }, + { + "epoch": 0.23412316858838458, + "grad_norm": 2.3216452598571777, + "learning_rate": 9.946042259724765e-06, + "loss": 3.1191, + "step": 237650 + }, + { + "epoch": 0.23417242656620665, + "grad_norm": 2.2018797397613525, + "learning_rate": 9.946019583935906e-06, + "loss": 3.1381, + "step": 237700 + }, + { + "epoch": 
0.23422168454402875, + "grad_norm": 2.1988394260406494, + "learning_rate": 9.945996903409148e-06, + "loss": 3.1014, + "step": 237750 + }, + { + "epoch": 0.23427094252185085, + "grad_norm": 2.371311902999878, + "learning_rate": 9.945974218144511e-06, + "loss": 3.087, + "step": 237800 + }, + { + "epoch": 0.23432020049967292, + "grad_norm": 2.372830629348755, + "learning_rate": 9.945951528142019e-06, + "loss": 3.1154, + "step": 237850 + }, + { + "epoch": 0.23436945847749502, + "grad_norm": 2.418653726577759, + "learning_rate": 9.945928833401692e-06, + "loss": 3.094, + "step": 237900 + }, + { + "epoch": 0.23441871645531712, + "grad_norm": 2.2956998348236084, + "learning_rate": 9.945906133923555e-06, + "loss": 3.129, + "step": 237950 + }, + { + "epoch": 0.2344679744331392, + "grad_norm": 2.327108383178711, + "learning_rate": 9.945883429707625e-06, + "loss": 3.0912, + "step": 238000 + }, + { + "epoch": 0.23451723241096128, + "grad_norm": 2.3238492012023926, + "learning_rate": 9.945860720753926e-06, + "loss": 3.1437, + "step": 238050 + }, + { + "epoch": 0.23456649038878336, + "grad_norm": 2.3769428730010986, + "learning_rate": 9.945838007062479e-06, + "loss": 3.0836, + "step": 238100 + }, + { + "epoch": 0.23461574836660545, + "grad_norm": 2.36173415184021, + "learning_rate": 9.945815288633309e-06, + "loss": 3.103, + "step": 238150 + }, + { + "epoch": 0.23466500634442755, + "grad_norm": 2.286583185195923, + "learning_rate": 9.945792565466433e-06, + "loss": 3.0294, + "step": 238200 + }, + { + "epoch": 0.23471426432224962, + "grad_norm": 2.3353917598724365, + "learning_rate": 9.945769837561876e-06, + "loss": 3.0646, + "step": 238250 + }, + { + "epoch": 0.23476352230007172, + "grad_norm": 2.417656660079956, + "learning_rate": 9.94574710491966e-06, + "loss": 3.199, + "step": 238300 + }, + { + "epoch": 0.23481278027789382, + "grad_norm": 2.3847527503967285, + "learning_rate": 9.945724367539803e-06, + "loss": 3.087, + "step": 238350 + }, + { + "epoch": 0.2348620382557159, + 
"grad_norm": 2.3179914951324463, + "learning_rate": 9.94570162542233e-06, + "loss": 3.1376, + "step": 238400 + }, + { + "epoch": 0.234911296233538, + "grad_norm": 2.2376773357391357, + "learning_rate": 9.945678878567262e-06, + "loss": 3.0562, + "step": 238450 + }, + { + "epoch": 0.23496055421136008, + "grad_norm": 2.3091437816619873, + "learning_rate": 9.945656126974621e-06, + "loss": 3.07, + "step": 238500 + }, + { + "epoch": 0.23500981218918215, + "grad_norm": 2.4027609825134277, + "learning_rate": 9.94563337064443e-06, + "loss": 3.1072, + "step": 238550 + }, + { + "epoch": 0.23505907016700425, + "grad_norm": 2.3716773986816406, + "learning_rate": 9.945610609576707e-06, + "loss": 3.0815, + "step": 238600 + }, + { + "epoch": 0.23510832814482632, + "grad_norm": 2.5280227661132812, + "learning_rate": 9.945587843771479e-06, + "loss": 3.1652, + "step": 238650 + }, + { + "epoch": 0.23515758612264842, + "grad_norm": 2.3150553703308105, + "learning_rate": 9.945565073228764e-06, + "loss": 3.0599, + "step": 238700 + }, + { + "epoch": 0.23520684410047052, + "grad_norm": 2.2977688312530518, + "learning_rate": 9.945542297948586e-06, + "loss": 3.1061, + "step": 238750 + }, + { + "epoch": 0.2352561020782926, + "grad_norm": 2.2410190105438232, + "learning_rate": 9.945519517930964e-06, + "loss": 3.0871, + "step": 238800 + }, + { + "epoch": 0.2353053600561147, + "grad_norm": 2.341073751449585, + "learning_rate": 9.945496733175921e-06, + "loss": 3.0806, + "step": 238850 + }, + { + "epoch": 0.23535461803393679, + "grad_norm": 2.2087273597717285, + "learning_rate": 9.945473943683482e-06, + "loss": 3.128, + "step": 238900 + }, + { + "epoch": 0.23540387601175886, + "grad_norm": 2.203188180923462, + "learning_rate": 9.945451149453664e-06, + "loss": 3.1197, + "step": 238950 + }, + { + "epoch": 0.23545313398958095, + "grad_norm": 2.2984530925750732, + "learning_rate": 9.945428350486492e-06, + "loss": 3.0866, + "step": 239000 + }, + { + "epoch": 0.23550239196740305, + "grad_norm": 
2.2698686122894287, + "learning_rate": 9.945405546781987e-06, + "loss": 3.0721, + "step": 239050 + }, + { + "epoch": 0.23555164994522512, + "grad_norm": 2.4228298664093018, + "learning_rate": 9.94538273834017e-06, + "loss": 3.0622, + "step": 239100 + }, + { + "epoch": 0.23560090792304722, + "grad_norm": 2.4454684257507324, + "learning_rate": 9.945359925161065e-06, + "loss": 3.1434, + "step": 239150 + }, + { + "epoch": 0.23565016590086932, + "grad_norm": 2.1904497146606445, + "learning_rate": 9.945337107244692e-06, + "loss": 3.0895, + "step": 239200 + }, + { + "epoch": 0.2356994238786914, + "grad_norm": 2.315798282623291, + "learning_rate": 9.945314284591072e-06, + "loss": 3.0682, + "step": 239250 + }, + { + "epoch": 0.2357486818565135, + "grad_norm": 2.369689464569092, + "learning_rate": 9.94529145720023e-06, + "loss": 3.0529, + "step": 239300 + }, + { + "epoch": 0.23579793983433556, + "grad_norm": 2.3155274391174316, + "learning_rate": 9.945268625072184e-06, + "loss": 3.0613, + "step": 239350 + }, + { + "epoch": 0.23584719781215766, + "grad_norm": 2.43719744682312, + "learning_rate": 9.94524578820696e-06, + "loss": 3.0858, + "step": 239400 + }, + { + "epoch": 0.23589645578997975, + "grad_norm": 2.273902416229248, + "learning_rate": 9.945222946604575e-06, + "loss": 3.0507, + "step": 239450 + }, + { + "epoch": 0.23594571376780182, + "grad_norm": 2.4932000637054443, + "learning_rate": 9.945200100265056e-06, + "loss": 3.0097, + "step": 239500 + }, + { + "epoch": 0.23599497174562392, + "grad_norm": 2.2390365600585938, + "learning_rate": 9.945177249188423e-06, + "loss": 3.144, + "step": 239550 + }, + { + "epoch": 0.23604422972344602, + "grad_norm": 2.2054250240325928, + "learning_rate": 9.945154393374696e-06, + "loss": 3.0919, + "step": 239600 + }, + { + "epoch": 0.2360934877012681, + "grad_norm": 2.186556100845337, + "learning_rate": 9.945131532823899e-06, + "loss": 3.1275, + "step": 239650 + }, + { + "epoch": 0.2361427456790902, + "grad_norm": 2.33681321144104, + 
"learning_rate": 9.945108667536053e-06, + "loss": 3.1071, + "step": 239700 + }, + { + "epoch": 0.23619200365691229, + "grad_norm": 2.218372344970703, + "learning_rate": 9.945085797511181e-06, + "loss": 3.1533, + "step": 239750 + }, + { + "epoch": 0.23624126163473436, + "grad_norm": 2.4958925247192383, + "learning_rate": 9.945062922749302e-06, + "loss": 3.0747, + "step": 239800 + }, + { + "epoch": 0.23629051961255645, + "grad_norm": 2.375368595123291, + "learning_rate": 9.945040043250442e-06, + "loss": 3.0925, + "step": 239850 + }, + { + "epoch": 0.23633977759037852, + "grad_norm": 2.586843490600586, + "learning_rate": 9.94501715901462e-06, + "loss": 3.0692, + "step": 239900 + }, + { + "epoch": 0.23638903556820062, + "grad_norm": 2.3325328826904297, + "learning_rate": 9.944994270041858e-06, + "loss": 3.0961, + "step": 239950 + }, + { + "epoch": 0.23643829354602272, + "grad_norm": 2.5129683017730713, + "learning_rate": 9.94497137633218e-06, + "loss": 3.0944, + "step": 240000 + }, + { + "epoch": 0.2364875515238448, + "grad_norm": 2.7419254779815674, + "learning_rate": 9.944948477885606e-06, + "loss": 3.1364, + "step": 240050 + }, + { + "epoch": 0.2365368095016669, + "grad_norm": 2.284538507461548, + "learning_rate": 9.944925574702158e-06, + "loss": 3.1313, + "step": 240100 + }, + { + "epoch": 0.236586067479489, + "grad_norm": 2.3621649742126465, + "learning_rate": 9.94490266678186e-06, + "loss": 3.1105, + "step": 240150 + }, + { + "epoch": 0.23663532545731106, + "grad_norm": 2.373086929321289, + "learning_rate": 9.944879754124732e-06, + "loss": 3.1358, + "step": 240200 + }, + { + "epoch": 0.23668458343513316, + "grad_norm": 2.3248748779296875, + "learning_rate": 9.944856836730795e-06, + "loss": 3.1529, + "step": 240250 + }, + { + "epoch": 0.23673384141295525, + "grad_norm": 2.3124942779541016, + "learning_rate": 9.944833914600075e-06, + "loss": 3.0888, + "step": 240300 + }, + { + "epoch": 0.23678309939077732, + "grad_norm": 2.2947301864624023, + "learning_rate": 
9.94481098773259e-06, + "loss": 3.1118, + "step": 240350 + }, + { + "epoch": 0.23683235736859942, + "grad_norm": 2.3648996353149414, + "learning_rate": 9.944788056128363e-06, + "loss": 3.0994, + "step": 240400 + }, + { + "epoch": 0.23688161534642152, + "grad_norm": 2.7386412620544434, + "learning_rate": 9.944765119787416e-06, + "loss": 3.1072, + "step": 240450 + }, + { + "epoch": 0.2369308733242436, + "grad_norm": 2.2639384269714355, + "learning_rate": 9.944742178709772e-06, + "loss": 3.0822, + "step": 240500 + }, + { + "epoch": 0.2369801313020657, + "grad_norm": 2.700481653213501, + "learning_rate": 9.944719232895451e-06, + "loss": 3.0245, + "step": 240550 + }, + { + "epoch": 0.23702938927988776, + "grad_norm": 2.525482416152954, + "learning_rate": 9.944696282344478e-06, + "loss": 3.0619, + "step": 240600 + }, + { + "epoch": 0.23707864725770986, + "grad_norm": 2.4162042140960693, + "learning_rate": 9.94467332705687e-06, + "loss": 3.1195, + "step": 240650 + }, + { + "epoch": 0.23712790523553195, + "grad_norm": 2.306652069091797, + "learning_rate": 9.944650367032656e-06, + "loss": 3.063, + "step": 240700 + }, + { + "epoch": 0.23717716321335403, + "grad_norm": 2.3941709995269775, + "learning_rate": 9.94462740227185e-06, + "loss": 3.0593, + "step": 240750 + }, + { + "epoch": 0.23722642119117612, + "grad_norm": 2.2683653831481934, + "learning_rate": 9.944604432774483e-06, + "loss": 3.0285, + "step": 240800 + }, + { + "epoch": 0.23727567916899822, + "grad_norm": 2.3559648990631104, + "learning_rate": 9.94458145854057e-06, + "loss": 3.0575, + "step": 240850 + }, + { + "epoch": 0.2373249371468203, + "grad_norm": 2.288796901702881, + "learning_rate": 9.944558479570134e-06, + "loss": 3.1064, + "step": 240900 + }, + { + "epoch": 0.2373741951246424, + "grad_norm": 2.430438756942749, + "learning_rate": 9.9445354958632e-06, + "loss": 3.08, + "step": 240950 + }, + { + "epoch": 0.2374234531024645, + "grad_norm": 2.3012094497680664, + "learning_rate": 9.944512507419785e-06, + 
"loss": 3.0908, + "step": 241000 + }, + { + "epoch": 0.23747271108028656, + "grad_norm": 2.2216012477874756, + "learning_rate": 9.944489514239917e-06, + "loss": 3.0953, + "step": 241050 + }, + { + "epoch": 0.23752196905810866, + "grad_norm": 2.4044082164764404, + "learning_rate": 9.944466516323615e-06, + "loss": 3.1058, + "step": 241100 + }, + { + "epoch": 0.23757122703593073, + "grad_norm": 2.2939109802246094, + "learning_rate": 9.944443513670902e-06, + "loss": 3.0908, + "step": 241150 + }, + { + "epoch": 0.23762048501375282, + "grad_norm": 2.329458713531494, + "learning_rate": 9.944420506281798e-06, + "loss": 3.0783, + "step": 241200 + }, + { + "epoch": 0.23766974299157492, + "grad_norm": 2.2308907508850098, + "learning_rate": 9.944397494156325e-06, + "loss": 3.0368, + "step": 241250 + }, + { + "epoch": 0.237719000969397, + "grad_norm": 2.725761651992798, + "learning_rate": 9.944374477294509e-06, + "loss": 3.1561, + "step": 241300 + }, + { + "epoch": 0.2377682589472191, + "grad_norm": 2.354994058609009, + "learning_rate": 9.944351455696368e-06, + "loss": 3.1204, + "step": 241350 + }, + { + "epoch": 0.2378175169250412, + "grad_norm": 2.437471628189087, + "learning_rate": 9.944328429361927e-06, + "loss": 3.0021, + "step": 241400 + }, + { + "epoch": 0.23786677490286326, + "grad_norm": 2.31559681892395, + "learning_rate": 9.944305398291205e-06, + "loss": 3.0467, + "step": 241450 + }, + { + "epoch": 0.23791603288068536, + "grad_norm": 2.5003604888916016, + "learning_rate": 9.944282362484226e-06, + "loss": 3.0871, + "step": 241500 + }, + { + "epoch": 0.23796529085850746, + "grad_norm": 2.2023606300354004, + "learning_rate": 9.944259321941013e-06, + "loss": 3.0978, + "step": 241550 + }, + { + "epoch": 0.23801454883632953, + "grad_norm": 2.5916011333465576, + "learning_rate": 9.944236276661583e-06, + "loss": 3.0538, + "step": 241600 + }, + { + "epoch": 0.23806380681415162, + "grad_norm": 2.684414863586426, + "learning_rate": 9.944213226645965e-06, + "loss": 3.1453, + 
"step": 241650 + }, + { + "epoch": 0.2381130647919737, + "grad_norm": 2.5312578678131104, + "learning_rate": 9.944190171894178e-06, + "loss": 3.0608, + "step": 241700 + }, + { + "epoch": 0.2381623227697958, + "grad_norm": 2.345900535583496, + "learning_rate": 9.944167112406243e-06, + "loss": 3.0536, + "step": 241750 + }, + { + "epoch": 0.2382115807476179, + "grad_norm": 2.156029462814331, + "learning_rate": 9.944144048182182e-06, + "loss": 3.1098, + "step": 241800 + }, + { + "epoch": 0.23826083872543996, + "grad_norm": 2.43034029006958, + "learning_rate": 9.94412097922202e-06, + "loss": 3.0925, + "step": 241850 + }, + { + "epoch": 0.23831009670326206, + "grad_norm": 2.4536616802215576, + "learning_rate": 9.944097905525775e-06, + "loss": 3.1358, + "step": 241900 + }, + { + "epoch": 0.23835935468108416, + "grad_norm": 2.4177565574645996, + "learning_rate": 9.944074827093474e-06, + "loss": 3.0219, + "step": 241950 + }, + { + "epoch": 0.23840861265890623, + "grad_norm": 2.363697052001953, + "learning_rate": 9.944051743925135e-06, + "loss": 3.0701, + "step": 242000 + }, + { + "epoch": 0.23845787063672833, + "grad_norm": 2.3304929733276367, + "learning_rate": 9.944028656020783e-06, + "loss": 3.1136, + "step": 242050 + }, + { + "epoch": 0.23850712861455042, + "grad_norm": 2.4017462730407715, + "learning_rate": 9.944005563380436e-06, + "loss": 3.0388, + "step": 242100 + }, + { + "epoch": 0.2385563865923725, + "grad_norm": 2.2806389331817627, + "learning_rate": 9.943982466004121e-06, + "loss": 3.0634, + "step": 242150 + }, + { + "epoch": 0.2386056445701946, + "grad_norm": 2.1579458713531494, + "learning_rate": 9.943959363891857e-06, + "loss": 3.1295, + "step": 242200 + }, + { + "epoch": 0.2386549025480167, + "grad_norm": 2.259321451187134, + "learning_rate": 9.943936257043669e-06, + "loss": 3.1484, + "step": 242250 + }, + { + "epoch": 0.23870416052583876, + "grad_norm": 2.4302003383636475, + "learning_rate": 9.943913145459574e-06, + "loss": 3.0832, + "step": 242300 + }, + { 
+ "epoch": 0.23875341850366086, + "grad_norm": 2.4563777446746826, + "learning_rate": 9.9438900291396e-06, + "loss": 3.095, + "step": 242350 + }, + { + "epoch": 0.23880267648148293, + "grad_norm": 2.42958664894104, + "learning_rate": 9.943866908083765e-06, + "loss": 3.1334, + "step": 242400 + }, + { + "epoch": 0.23885193445930503, + "grad_norm": 2.34328293800354, + "learning_rate": 9.943843782292094e-06, + "loss": 3.0816, + "step": 242450 + }, + { + "epoch": 0.23890119243712712, + "grad_norm": 2.5655317306518555, + "learning_rate": 9.943820651764607e-06, + "loss": 3.1291, + "step": 242500 + }, + { + "epoch": 0.2389504504149492, + "grad_norm": 2.731804847717285, + "learning_rate": 9.943797516501326e-06, + "loss": 3.086, + "step": 242550 + }, + { + "epoch": 0.2389997083927713, + "grad_norm": 2.2610297203063965, + "learning_rate": 9.943774376502276e-06, + "loss": 3.0559, + "step": 242600 + }, + { + "epoch": 0.2390489663705934, + "grad_norm": 2.340064764022827, + "learning_rate": 9.943751231767475e-06, + "loss": 3.141, + "step": 242650 + }, + { + "epoch": 0.23909822434841546, + "grad_norm": 2.2533581256866455, + "learning_rate": 9.943728082296951e-06, + "loss": 3.175, + "step": 242700 + }, + { + "epoch": 0.23914748232623756, + "grad_norm": 2.355695962905884, + "learning_rate": 9.94370492809072e-06, + "loss": 3.1397, + "step": 242750 + }, + { + "epoch": 0.23919674030405966, + "grad_norm": 2.5224623680114746, + "learning_rate": 9.943681769148807e-06, + "loss": 3.0451, + "step": 242800 + }, + { + "epoch": 0.23924599828188173, + "grad_norm": 2.355323314666748, + "learning_rate": 9.943658605471234e-06, + "loss": 3.1289, + "step": 242850 + }, + { + "epoch": 0.23929525625970383, + "grad_norm": 2.353156328201294, + "learning_rate": 9.943635437058023e-06, + "loss": 3.077, + "step": 242900 + }, + { + "epoch": 0.2393445142375259, + "grad_norm": 2.290187120437622, + "learning_rate": 9.943612263909197e-06, + "loss": 3.0734, + "step": 242950 + }, + { + "epoch": 0.239393772215348, + 
"grad_norm": 2.2772364616394043, + "learning_rate": 9.943589086024777e-06, + "loss": 3.156, + "step": 243000 + }, + { + "epoch": 0.2394430301931701, + "grad_norm": 2.2556495666503906, + "learning_rate": 9.943565903404786e-06, + "loss": 3.0659, + "step": 243050 + }, + { + "epoch": 0.23949228817099216, + "grad_norm": 2.3475306034088135, + "learning_rate": 9.943542716049247e-06, + "loss": 3.0376, + "step": 243100 + }, + { + "epoch": 0.23954154614881426, + "grad_norm": 2.7231764793395996, + "learning_rate": 9.94351952395818e-06, + "loss": 3.0422, + "step": 243150 + }, + { + "epoch": 0.23959080412663636, + "grad_norm": 2.324819564819336, + "learning_rate": 9.943496327131608e-06, + "loss": 3.1233, + "step": 243200 + }, + { + "epoch": 0.23964006210445843, + "grad_norm": 2.3405513763427734, + "learning_rate": 9.943473125569553e-06, + "loss": 3.148, + "step": 243250 + }, + { + "epoch": 0.23968932008228053, + "grad_norm": 2.444082260131836, + "learning_rate": 9.943449919272041e-06, + "loss": 3.1241, + "step": 243300 + }, + { + "epoch": 0.23973857806010263, + "grad_norm": 2.4510481357574463, + "learning_rate": 9.943426708239089e-06, + "loss": 3.0944, + "step": 243350 + }, + { + "epoch": 0.2397878360379247, + "grad_norm": 2.507953405380249, + "learning_rate": 9.94340349247072e-06, + "loss": 3.1351, + "step": 243400 + }, + { + "epoch": 0.2398370940157468, + "grad_norm": 2.488377571105957, + "learning_rate": 9.94338027196696e-06, + "loss": 3.1303, + "step": 243450 + }, + { + "epoch": 0.2398863519935689, + "grad_norm": 2.2698919773101807, + "learning_rate": 9.943357046727829e-06, + "loss": 3.1505, + "step": 243500 + }, + { + "epoch": 0.23993560997139096, + "grad_norm": 2.312382459640503, + "learning_rate": 9.943333816753346e-06, + "loss": 3.0448, + "step": 243550 + }, + { + "epoch": 0.23998486794921306, + "grad_norm": 2.2288942337036133, + "learning_rate": 9.943310582043539e-06, + "loss": 3.0905, + "step": 243600 + }, + { + "epoch": 0.24003412592703513, + "grad_norm": 
2.3471872806549072, + "learning_rate": 9.943287342598426e-06, + "loss": 3.0926, + "step": 243650 + }, + { + "epoch": 0.24008338390485723, + "grad_norm": null, + "learning_rate": 9.943264098418032e-06, + "loss": 3.1906, + "step": 243700 + }, + { + "epoch": 0.24013264188267933, + "grad_norm": 2.334940195083618, + "learning_rate": 9.943240849502378e-06, + "loss": 3.1313, + "step": 243750 + }, + { + "epoch": 0.2401818998605014, + "grad_norm": 2.666944980621338, + "learning_rate": 9.943217595851486e-06, + "loss": 3.0732, + "step": 243800 + }, + { + "epoch": 0.2402311578383235, + "grad_norm": 2.1752078533172607, + "learning_rate": 9.94319433746538e-06, + "loss": 3.0713, + "step": 243850 + }, + { + "epoch": 0.2402804158161456, + "grad_norm": 2.3898892402648926, + "learning_rate": 9.943171074344079e-06, + "loss": 3.0909, + "step": 243900 + }, + { + "epoch": 0.24032967379396766, + "grad_norm": 2.4039320945739746, + "learning_rate": 9.943147806487608e-06, + "loss": 3.1199, + "step": 243950 + }, + { + "epoch": 0.24037893177178976, + "grad_norm": 2.532168388366699, + "learning_rate": 9.943124533895989e-06, + "loss": 3.056, + "step": 244000 + }, + { + "epoch": 0.24042818974961186, + "grad_norm": 2.3794476985931396, + "learning_rate": 9.943101256569242e-06, + "loss": 3.1421, + "step": 244050 + }, + { + "epoch": 0.24047744772743393, + "grad_norm": 2.511522054672241, + "learning_rate": 9.943077974507391e-06, + "loss": 3.0952, + "step": 244100 + }, + { + "epoch": 0.24052670570525603, + "grad_norm": 2.5325663089752197, + "learning_rate": 9.94305468771046e-06, + "loss": 3.0879, + "step": 244150 + }, + { + "epoch": 0.2405759636830781, + "grad_norm": 2.221858263015747, + "learning_rate": 9.94303139617847e-06, + "loss": 3.1731, + "step": 244200 + }, + { + "epoch": 0.2406252216609002, + "grad_norm": 2.549050807952881, + "learning_rate": 9.943008099911442e-06, + "loss": 3.1099, + "step": 244250 + }, + { + "epoch": 0.2406744796387223, + "grad_norm": 2.2249364852905273, + 
"learning_rate": 9.942984798909398e-06, + "loss": 3.0743, + "step": 244300 + }, + { + "epoch": 0.24072373761654436, + "grad_norm": 2.3810582160949707, + "learning_rate": 9.942961493172365e-06, + "loss": 3.0952, + "step": 244350 + }, + { + "epoch": 0.24077299559436646, + "grad_norm": 2.2447354793548584, + "learning_rate": 9.94293818270036e-06, + "loss": 3.0618, + "step": 244400 + }, + { + "epoch": 0.24082225357218856, + "grad_norm": 2.1127240657806396, + "learning_rate": 9.942914867493407e-06, + "loss": 3.0744, + "step": 244450 + }, + { + "epoch": 0.24087151155001063, + "grad_norm": 2.4088492393493652, + "learning_rate": 9.94289154755153e-06, + "loss": 3.0908, + "step": 244500 + }, + { + "epoch": 0.24092076952783273, + "grad_norm": 2.2426552772521973, + "learning_rate": 9.942868222874748e-06, + "loss": 3.0922, + "step": 244550 + }, + { + "epoch": 0.24097002750565483, + "grad_norm": 2.174344539642334, + "learning_rate": 9.942844893463086e-06, + "loss": 3.0839, + "step": 244600 + }, + { + "epoch": 0.2410192854834769, + "grad_norm": 2.3258755207061768, + "learning_rate": 9.942821559316565e-06, + "loss": 3.1183, + "step": 244650 + }, + { + "epoch": 0.241068543461299, + "grad_norm": 2.352816104888916, + "learning_rate": 9.94279822043521e-06, + "loss": 3.1089, + "step": 244700 + }, + { + "epoch": 0.2411178014391211, + "grad_norm": 2.3318135738372803, + "learning_rate": 9.942774876819038e-06, + "loss": 3.1354, + "step": 244750 + }, + { + "epoch": 0.24116705941694316, + "grad_norm": 2.3550751209259033, + "learning_rate": 9.942751528468079e-06, + "loss": 3.1324, + "step": 244800 + }, + { + "epoch": 0.24121631739476526, + "grad_norm": 2.3526620864868164, + "learning_rate": 9.942728175382348e-06, + "loss": 3.0591, + "step": 244850 + }, + { + "epoch": 0.24126557537258733, + "grad_norm": 2.287975549697876, + "learning_rate": 9.942704817561872e-06, + "loss": 3.1311, + "step": 244900 + }, + { + "epoch": 0.24131483335040943, + "grad_norm": 2.3440470695495605, + "learning_rate": 
9.94268145500667e-06, + "loss": 2.9949, + "step": 244950 + }, + { + "epoch": 0.24136409132823153, + "grad_norm": 2.3846359252929688, + "learning_rate": 9.94265808771677e-06, + "loss": 3.1672, + "step": 245000 + }, + { + "epoch": 0.2414133493060536, + "grad_norm": 2.5466792583465576, + "learning_rate": 9.942634715692188e-06, + "loss": 3.1009, + "step": 245050 + }, + { + "epoch": 0.2414626072838757, + "grad_norm": 2.335782766342163, + "learning_rate": 9.942611338932948e-06, + "loss": 3.0527, + "step": 245100 + }, + { + "epoch": 0.2415118652616978, + "grad_norm": 2.589045763015747, + "learning_rate": 9.942587957439075e-06, + "loss": 3.0058, + "step": 245150 + }, + { + "epoch": 0.24156112323951986, + "grad_norm": 2.268077850341797, + "learning_rate": 9.942564571210588e-06, + "loss": 3.1519, + "step": 245200 + }, + { + "epoch": 0.24161038121734196, + "grad_norm": 2.2644951343536377, + "learning_rate": 9.942541180247514e-06, + "loss": 3.0837, + "step": 245250 + }, + { + "epoch": 0.24165963919516406, + "grad_norm": 2.3352344036102295, + "learning_rate": 9.94251778454987e-06, + "loss": 3.1186, + "step": 245300 + }, + { + "epoch": 0.24170889717298613, + "grad_norm": 2.324587106704712, + "learning_rate": 9.942494384117682e-06, + "loss": 3.1247, + "step": 245350 + }, + { + "epoch": 0.24175815515080823, + "grad_norm": 2.2567708492279053, + "learning_rate": 9.942470978950971e-06, + "loss": 3.1065, + "step": 245400 + }, + { + "epoch": 0.2418074131286303, + "grad_norm": 2.1775035858154297, + "learning_rate": 9.942447569049761e-06, + "loss": 3.0743, + "step": 245450 + }, + { + "epoch": 0.2418566711064524, + "grad_norm": 2.361942768096924, + "learning_rate": 9.942424154414071e-06, + "loss": 3.0667, + "step": 245500 + }, + { + "epoch": 0.2419059290842745, + "grad_norm": 2.2732067108154297, + "learning_rate": 9.94240073504393e-06, + "loss": 3.0814, + "step": 245550 + }, + { + "epoch": 0.24195518706209657, + "grad_norm": 2.2713191509246826, + "learning_rate": 9.942377310939353e-06, + 
"loss": 3.1276, + "step": 245600 + }, + { + "epoch": 0.24200444503991866, + "grad_norm": 2.5068674087524414, + "learning_rate": 9.942353882100365e-06, + "loss": 3.1656, + "step": 245650 + }, + { + "epoch": 0.24205370301774076, + "grad_norm": 2.6648664474487305, + "learning_rate": 9.94233044852699e-06, + "loss": 3.0344, + "step": 245700 + }, + { + "epoch": 0.24210296099556283, + "grad_norm": 2.379760265350342, + "learning_rate": 9.94230701021925e-06, + "loss": 3.0594, + "step": 245750 + }, + { + "epoch": 0.24215221897338493, + "grad_norm": 2.6570470333099365, + "learning_rate": 9.942283567177166e-06, + "loss": 3.1055, + "step": 245800 + }, + { + "epoch": 0.24220147695120703, + "grad_norm": 2.401010513305664, + "learning_rate": 9.942260119400761e-06, + "loss": 3.0429, + "step": 245850 + }, + { + "epoch": 0.2422507349290291, + "grad_norm": 2.4101784229278564, + "learning_rate": 9.942236666890059e-06, + "loss": 3.1159, + "step": 245900 + }, + { + "epoch": 0.2422999929068512, + "grad_norm": 2.2877323627471924, + "learning_rate": 9.94221320964508e-06, + "loss": 3.0643, + "step": 245950 + }, + { + "epoch": 0.2423492508846733, + "grad_norm": 2.5058717727661133, + "learning_rate": 9.94218974766585e-06, + "loss": 3.0869, + "step": 246000 + }, + { + "epoch": 0.24239850886249537, + "grad_norm": 2.3630261421203613, + "learning_rate": 9.942166280952385e-06, + "loss": 3.1406, + "step": 246050 + }, + { + "epoch": 0.24244776684031746, + "grad_norm": 2.2738709449768066, + "learning_rate": 9.942142809504716e-06, + "loss": 3.1039, + "step": 246100 + }, + { + "epoch": 0.24249702481813953, + "grad_norm": 2.516711950302124, + "learning_rate": 9.94211933332286e-06, + "loss": 3.1241, + "step": 246150 + }, + { + "epoch": 0.24254628279596163, + "grad_norm": 2.224327802658081, + "learning_rate": 9.942095852406838e-06, + "loss": 3.085, + "step": 246200 + }, + { + "epoch": 0.24259554077378373, + "grad_norm": 2.5019657611846924, + "learning_rate": 9.942072366756678e-06, + "loss": 3.0569, + 
"step": 246250 + }, + { + "epoch": 0.2426447987516058, + "grad_norm": 2.3533637523651123, + "learning_rate": 9.942048876372399e-06, + "loss": 3.0918, + "step": 246300 + }, + { + "epoch": 0.2426940567294279, + "grad_norm": 2.9378414154052734, + "learning_rate": 9.942025381254025e-06, + "loss": 3.1072, + "step": 246350 + }, + { + "epoch": 0.24274331470725, + "grad_norm": 2.682642936706543, + "learning_rate": 9.942001881401577e-06, + "loss": 3.1172, + "step": 246400 + }, + { + "epoch": 0.24279257268507207, + "grad_norm": 2.404968500137329, + "learning_rate": 9.941978376815077e-06, + "loss": 3.1451, + "step": 246450 + }, + { + "epoch": 0.24284183066289416, + "grad_norm": 2.4377310276031494, + "learning_rate": 9.94195486749455e-06, + "loss": 3.1186, + "step": 246500 + }, + { + "epoch": 0.24289108864071626, + "grad_norm": 2.4593746662139893, + "learning_rate": 9.941931353440017e-06, + "loss": 3.0653, + "step": 246550 + }, + { + "epoch": 0.24294034661853833, + "grad_norm": 2.4421639442443848, + "learning_rate": 9.9419078346515e-06, + "loss": 3.1909, + "step": 246600 + }, + { + "epoch": 0.24298960459636043, + "grad_norm": 2.4267258644104004, + "learning_rate": 9.941884311129023e-06, + "loss": 3.0655, + "step": 246650 + }, + { + "epoch": 0.2430388625741825, + "grad_norm": 2.5625085830688477, + "learning_rate": 9.94186078287261e-06, + "loss": 2.9992, + "step": 246700 + }, + { + "epoch": 0.2430881205520046, + "grad_norm": 2.464536666870117, + "learning_rate": 9.941837249882278e-06, + "loss": 3.0667, + "step": 246750 + }, + { + "epoch": 0.2431373785298267, + "grad_norm": 2.300596237182617, + "learning_rate": 9.941813712158056e-06, + "loss": 3.14, + "step": 246800 + }, + { + "epoch": 0.24318663650764877, + "grad_norm": 2.354001045227051, + "learning_rate": 9.94179016969996e-06, + "loss": 3.0716, + "step": 246850 + }, + { + "epoch": 0.24323589448547087, + "grad_norm": 2.6441471576690674, + "learning_rate": 9.941766622508017e-06, + "loss": 3.0748, + "step": 246900 + }, + { + 
"epoch": 0.24328515246329296, + "grad_norm": 2.2656538486480713, + "learning_rate": 9.94174307058225e-06, + "loss": 3.115, + "step": 246950 + }, + { + "epoch": 0.24333441044111503, + "grad_norm": 2.654812812805176, + "learning_rate": 9.941719513922679e-06, + "loss": 3.1058, + "step": 247000 + }, + { + "epoch": 0.24338366841893713, + "grad_norm": 2.21592116355896, + "learning_rate": 9.941695952529328e-06, + "loss": 3.0971, + "step": 247050 + }, + { + "epoch": 0.24343292639675923, + "grad_norm": 2.258265495300293, + "learning_rate": 9.941672386402219e-06, + "loss": 3.1109, + "step": 247100 + }, + { + "epoch": 0.2434821843745813, + "grad_norm": 2.21028470993042, + "learning_rate": 9.941648815541375e-06, + "loss": 3.0554, + "step": 247150 + }, + { + "epoch": 0.2435314423524034, + "grad_norm": 2.3918704986572266, + "learning_rate": 9.941625239946819e-06, + "loss": 3.0069, + "step": 247200 + }, + { + "epoch": 0.2435807003302255, + "grad_norm": 2.466355562210083, + "learning_rate": 9.941601659618572e-06, + "loss": 3.1028, + "step": 247250 + }, + { + "epoch": 0.24362995830804757, + "grad_norm": 2.2478737831115723, + "learning_rate": 9.941578074556656e-06, + "loss": 3.101, + "step": 247300 + }, + { + "epoch": 0.24367921628586967, + "grad_norm": 2.276060104370117, + "learning_rate": 9.941554484761098e-06, + "loss": 3.0946, + "step": 247350 + }, + { + "epoch": 0.24372847426369174, + "grad_norm": 2.4429895877838135, + "learning_rate": 9.941530890231917e-06, + "loss": 3.1029, + "step": 247400 + }, + { + "epoch": 0.24377773224151383, + "grad_norm": 2.5371768474578857, + "learning_rate": 9.941507290969137e-06, + "loss": 3.1083, + "step": 247450 + }, + { + "epoch": 0.24382699021933593, + "grad_norm": 2.7976884841918945, + "learning_rate": 9.94148368697278e-06, + "loss": 3.1047, + "step": 247500 + }, + { + "epoch": 0.243876248197158, + "grad_norm": 2.624077320098877, + "learning_rate": 9.941460078242866e-06, + "loss": 3.1258, + "step": 247550 + }, + { + "epoch": 0.2439255061749801, 
+ "grad_norm": 2.4488766193389893, + "learning_rate": 9.941436464779422e-06, + "loss": 3.0496, + "step": 247600 + }, + { + "epoch": 0.2439747641528022, + "grad_norm": 2.2939250469207764, + "learning_rate": 9.941412846582469e-06, + "loss": 3.1059, + "step": 247650 + }, + { + "epoch": 0.24402402213062427, + "grad_norm": 2.470970869064331, + "learning_rate": 9.941389223652029e-06, + "loss": 3.1238, + "step": 247700 + }, + { + "epoch": 0.24407328010844637, + "grad_norm": 2.7596423625946045, + "learning_rate": 9.941365595988127e-06, + "loss": 3.1022, + "step": 247750 + }, + { + "epoch": 0.24412253808626846, + "grad_norm": 2.537104606628418, + "learning_rate": 9.94134196359078e-06, + "loss": 3.1173, + "step": 247800 + }, + { + "epoch": 0.24417179606409053, + "grad_norm": 2.3499011993408203, + "learning_rate": 9.941318326460017e-06, + "loss": 3.124, + "step": 247850 + }, + { + "epoch": 0.24422105404191263, + "grad_norm": 2.350213050842285, + "learning_rate": 9.941294684595856e-06, + "loss": 3.1126, + "step": 247900 + }, + { + "epoch": 0.2442703120197347, + "grad_norm": 2.3174033164978027, + "learning_rate": 9.941271037998323e-06, + "loss": 3.0122, + "step": 247950 + }, + { + "epoch": 0.2443195699975568, + "grad_norm": 2.215648651123047, + "learning_rate": 9.94124738666744e-06, + "loss": 3.1004, + "step": 248000 + }, + { + "epoch": 0.2443688279753789, + "grad_norm": 2.441850423812866, + "learning_rate": 9.941223730603229e-06, + "loss": 3.1195, + "step": 248050 + }, + { + "epoch": 0.24441808595320097, + "grad_norm": 2.470325231552124, + "learning_rate": 9.94120006980571e-06, + "loss": 3.0901, + "step": 248100 + }, + { + "epoch": 0.24446734393102307, + "grad_norm": 2.3005940914154053, + "learning_rate": 9.94117640427491e-06, + "loss": 3.1085, + "step": 248150 + }, + { + "epoch": 0.24451660190884517, + "grad_norm": 2.5016424655914307, + "learning_rate": 9.94115273401085e-06, + "loss": 3.0549, + "step": 248200 + }, + { + "epoch": 0.24456585988666724, + "grad_norm": 
2.3000781536102295, + "learning_rate": 9.941129059013553e-06, + "loss": 3.0505, + "step": 248250 + }, + { + "epoch": 0.24461511786448933, + "grad_norm": 2.508760929107666, + "learning_rate": 9.94110537928304e-06, + "loss": 3.0527, + "step": 248300 + }, + { + "epoch": 0.24466437584231143, + "grad_norm": 2.4953818321228027, + "learning_rate": 9.941081694819336e-06, + "loss": 3.0984, + "step": 248350 + }, + { + "epoch": 0.2447136338201335, + "grad_norm": 2.1743760108947754, + "learning_rate": 9.941058005622463e-06, + "loss": 3.1083, + "step": 248400 + }, + { + "epoch": 0.2447628917979556, + "grad_norm": 2.5311193466186523, + "learning_rate": 9.941034311692444e-06, + "loss": 3.0759, + "step": 248450 + }, + { + "epoch": 0.2448121497757777, + "grad_norm": 2.205291748046875, + "learning_rate": 9.941010613029298e-06, + "loss": 3.0108, + "step": 248500 + }, + { + "epoch": 0.24486140775359977, + "grad_norm": 2.2675647735595703, + "learning_rate": 9.940986909633053e-06, + "loss": 3.0623, + "step": 248550 + }, + { + "epoch": 0.24491066573142187, + "grad_norm": 2.4191153049468994, + "learning_rate": 9.940963201503728e-06, + "loss": 3.0732, + "step": 248600 + }, + { + "epoch": 0.24495992370924394, + "grad_norm": 2.2322371006011963, + "learning_rate": 9.940939488641349e-06, + "loss": 3.1583, + "step": 248650 + }, + { + "epoch": 0.24500918168706604, + "grad_norm": 2.179018497467041, + "learning_rate": 9.940915771045936e-06, + "loss": 3.1251, + "step": 248700 + }, + { + "epoch": 0.24505843966488813, + "grad_norm": 2.383819103240967, + "learning_rate": 9.940892048717513e-06, + "loss": 3.125, + "step": 248750 + }, + { + "epoch": 0.2451076976427102, + "grad_norm": 2.652226448059082, + "learning_rate": 9.940868321656102e-06, + "loss": 3.1239, + "step": 248800 + }, + { + "epoch": 0.2451569556205323, + "grad_norm": 2.601790428161621, + "learning_rate": 9.940844589861725e-06, + "loss": 3.0994, + "step": 248850 + }, + { + "epoch": 0.2452062135983544, + "grad_norm": 2.3763363361358643, + 
"learning_rate": 9.940820853334408e-06, + "loss": 3.1328, + "step": 248900 + }, + { + "epoch": 0.24525547157617647, + "grad_norm": 2.5235986709594727, + "learning_rate": 9.94079711207417e-06, + "loss": 3.0695, + "step": 248950 + }, + { + "epoch": 0.24530472955399857, + "grad_norm": 2.45487380027771, + "learning_rate": 9.940773366081036e-06, + "loss": 3.148, + "step": 249000 + }, + { + "epoch": 0.24535398753182067, + "grad_norm": 2.436323404312134, + "learning_rate": 9.940749615355028e-06, + "loss": 3.1074, + "step": 249050 + }, + { + "epoch": 0.24540324550964274, + "grad_norm": 2.496150255203247, + "learning_rate": 9.940725859896167e-06, + "loss": 3.1093, + "step": 249100 + }, + { + "epoch": 0.24545250348746483, + "grad_norm": 2.441930055618286, + "learning_rate": 9.94070209970448e-06, + "loss": 3.0142, + "step": 249150 + }, + { + "epoch": 0.2455017614652869, + "grad_norm": 2.4377288818359375, + "learning_rate": 9.940678334779985e-06, + "loss": 3.0887, + "step": 249200 + }, + { + "epoch": 0.245551019443109, + "grad_norm": 2.2656266689300537, + "learning_rate": 9.940654565122709e-06, + "loss": 3.07, + "step": 249250 + }, + { + "epoch": 0.2456002774209311, + "grad_norm": 2.2421586513519287, + "learning_rate": 9.940630790732671e-06, + "loss": 3.1638, + "step": 249300 + }, + { + "epoch": 0.24564953539875317, + "grad_norm": 2.350172519683838, + "learning_rate": 9.940607011609898e-06, + "loss": 3.0672, + "step": 249350 + }, + { + "epoch": 0.24569879337657527, + "grad_norm": 2.2374517917633057, + "learning_rate": 9.940583227754408e-06, + "loss": 3.1196, + "step": 249400 + }, + { + "epoch": 0.24574805135439737, + "grad_norm": 2.556140422821045, + "learning_rate": 9.940559439166228e-06, + "loss": 3.0958, + "step": 249450 + }, + { + "epoch": 0.24579730933221944, + "grad_norm": 2.3725037574768066, + "learning_rate": 9.940535645845377e-06, + "loss": 3.0805, + "step": 249500 + }, + { + "epoch": 0.24584656731004154, + "grad_norm": 2.8568263053894043, + "learning_rate": 
9.940511847791881e-06, + "loss": 3.0543, + "step": 249550 + }, + { + "epoch": 0.24589582528786363, + "grad_norm": 2.3095898628234863, + "learning_rate": 9.94048804500576e-06, + "loss": 3.141, + "step": 249600 + }, + { + "epoch": 0.2459450832656857, + "grad_norm": 2.4278645515441895, + "learning_rate": 9.940464237487041e-06, + "loss": 3.1736, + "step": 249650 + }, + { + "epoch": 0.2459943412435078, + "grad_norm": 2.2884814739227295, + "learning_rate": 9.940440425235742e-06, + "loss": 3.1215, + "step": 249700 + }, + { + "epoch": 0.24604359922132987, + "grad_norm": 2.3589158058166504, + "learning_rate": 9.940416608251889e-06, + "loss": 3.1271, + "step": 249750 + }, + { + "epoch": 0.24609285719915197, + "grad_norm": 2.3644821643829346, + "learning_rate": 9.940392786535505e-06, + "loss": 3.1211, + "step": 249800 + }, + { + "epoch": 0.24614211517697407, + "grad_norm": 2.3447327613830566, + "learning_rate": 9.940368960086609e-06, + "loss": 3.1404, + "step": 249850 + }, + { + "epoch": 0.24619137315479614, + "grad_norm": 2.534984827041626, + "learning_rate": 9.940345128905226e-06, + "loss": 3.0886, + "step": 249900 + }, + { + "epoch": 0.24624063113261824, + "grad_norm": 2.302534341812134, + "learning_rate": 9.940321292991381e-06, + "loss": 3.08, + "step": 249950 + }, + { + "epoch": 0.24628988911044034, + "grad_norm": 2.476647138595581, + "learning_rate": 9.940297452345096e-06, + "loss": 3.2269, + "step": 250000 + }, + { + "epoch": 0.2463391470882624, + "grad_norm": 2.394244909286499, + "learning_rate": 9.940273606966391e-06, + "loss": 3.0744, + "step": 250050 + }, + { + "epoch": 0.2463884050660845, + "grad_norm": 2.248342275619507, + "learning_rate": 9.940249756855291e-06, + "loss": 3.132, + "step": 250100 + }, + { + "epoch": 0.2464376630439066, + "grad_norm": 2.3945417404174805, + "learning_rate": 9.94022590201182e-06, + "loss": 3.1471, + "step": 250150 + }, + { + "epoch": 0.24648692102172867, + "grad_norm": 2.2157044410705566, + "learning_rate": 9.940202042435997e-06, + 
"loss": 3.1202, + "step": 250200 + }, + { + "epoch": 0.24653617899955077, + "grad_norm": 2.2267935276031494, + "learning_rate": 9.940178178127849e-06, + "loss": 3.1011, + "step": 250250 + }, + { + "epoch": 0.24658543697737287, + "grad_norm": 2.3848650455474854, + "learning_rate": 9.940154309087397e-06, + "loss": 3.111, + "step": 250300 + }, + { + "epoch": 0.24663469495519494, + "grad_norm": 2.446502447128296, + "learning_rate": 9.940130435314664e-06, + "loss": 3.0433, + "step": 250350 + }, + { + "epoch": 0.24668395293301704, + "grad_norm": 2.696394920349121, + "learning_rate": 9.940106556809672e-06, + "loss": 3.0365, + "step": 250400 + }, + { + "epoch": 0.2467332109108391, + "grad_norm": 2.2666432857513428, + "learning_rate": 9.940082673572447e-06, + "loss": 3.0728, + "step": 250450 + }, + { + "epoch": 0.2467824688886612, + "grad_norm": 2.562997341156006, + "learning_rate": 9.940058785603008e-06, + "loss": 3.0335, + "step": 250500 + }, + { + "epoch": 0.2468317268664833, + "grad_norm": 2.45611310005188, + "learning_rate": 9.940034892901378e-06, + "loss": 3.1366, + "step": 250550 + }, + { + "epoch": 0.24688098484430537, + "grad_norm": 2.1991193294525146, + "learning_rate": 9.940010995467584e-06, + "loss": 3.025, + "step": 250600 + }, + { + "epoch": 0.24693024282212747, + "grad_norm": 2.2913434505462646, + "learning_rate": 9.939987093301646e-06, + "loss": 3.1212, + "step": 250650 + }, + { + "epoch": 0.24697950079994957, + "grad_norm": 2.203089952468872, + "learning_rate": 9.939963186403586e-06, + "loss": 3.0513, + "step": 250700 + }, + { + "epoch": 0.24702875877777164, + "grad_norm": 2.4255423545837402, + "learning_rate": 9.939939274773429e-06, + "loss": 3.0374, + "step": 250750 + }, + { + "epoch": 0.24707801675559374, + "grad_norm": 2.334313154220581, + "learning_rate": 9.939915358411196e-06, + "loss": 3.0595, + "step": 250800 + }, + { + "epoch": 0.24712727473341584, + "grad_norm": 2.455349922180176, + "learning_rate": 9.939891437316912e-06, + "loss": 3.1169, + 
"step": 250850 + }, + { + "epoch": 0.2471765327112379, + "grad_norm": 2.473872184753418, + "learning_rate": 9.939867511490597e-06, + "loss": 3.0962, + "step": 250900 + }, + { + "epoch": 0.24722579068906, + "grad_norm": 2.2428600788116455, + "learning_rate": 9.939843580932279e-06, + "loss": 3.0097, + "step": 250950 + }, + { + "epoch": 0.24727504866688207, + "grad_norm": 2.6244871616363525, + "learning_rate": 9.939819645641975e-06, + "loss": 3.1197, + "step": 251000 + }, + { + "epoch": 0.24732430664470417, + "grad_norm": 2.2568256855010986, + "learning_rate": 9.939795705619711e-06, + "loss": 3.0428, + "step": 251050 + }, + { + "epoch": 0.24737356462252627, + "grad_norm": 2.41304349899292, + "learning_rate": 9.93977176086551e-06, + "loss": 3.074, + "step": 251100 + }, + { + "epoch": 0.24742282260034834, + "grad_norm": 2.391720771789551, + "learning_rate": 9.939747811379395e-06, + "loss": 3.1085, + "step": 251150 + }, + { + "epoch": 0.24747208057817044, + "grad_norm": 2.445913314819336, + "learning_rate": 9.939723857161388e-06, + "loss": 3.0879, + "step": 251200 + }, + { + "epoch": 0.24752133855599254, + "grad_norm": 2.472515106201172, + "learning_rate": 9.939699898211513e-06, + "loss": 3.0373, + "step": 251250 + }, + { + "epoch": 0.2475705965338146, + "grad_norm": 2.427020788192749, + "learning_rate": 9.939675934529791e-06, + "loss": 3.0324, + "step": 251300 + }, + { + "epoch": 0.2476198545116367, + "grad_norm": 2.411036968231201, + "learning_rate": 9.939651966116245e-06, + "loss": 3.0639, + "step": 251350 + }, + { + "epoch": 0.2476691124894588, + "grad_norm": 2.1841461658477783, + "learning_rate": 9.939627992970901e-06, + "loss": 3.0497, + "step": 251400 + }, + { + "epoch": 0.24771837046728087, + "grad_norm": 2.2170310020446777, + "learning_rate": 9.939604015093781e-06, + "loss": 3.115, + "step": 251450 + }, + { + "epoch": 0.24776762844510297, + "grad_norm": 2.244953155517578, + "learning_rate": 9.939580032484907e-06, + "loss": 3.1103, + "step": 251500 + }, + { + 
"epoch": 0.24781688642292507, + "grad_norm": 2.2516987323760986, + "learning_rate": 9.9395560451443e-06, + "loss": 3.0713, + "step": 251550 + }, + { + "epoch": 0.24786614440074714, + "grad_norm": 2.214170217514038, + "learning_rate": 9.939532053071987e-06, + "loss": 3.081, + "step": 251600 + }, + { + "epoch": 0.24791540237856924, + "grad_norm": 2.3635520935058594, + "learning_rate": 9.93950805626799e-06, + "loss": 3.1075, + "step": 251650 + }, + { + "epoch": 0.2479646603563913, + "grad_norm": 2.274703025817871, + "learning_rate": 9.93948405473233e-06, + "loss": 3.1463, + "step": 251700 + }, + { + "epoch": 0.2480139183342134, + "grad_norm": 2.226524591445923, + "learning_rate": 9.93946004846503e-06, + "loss": 3.0669, + "step": 251750 + }, + { + "epoch": 0.2480631763120355, + "grad_norm": 2.3225700855255127, + "learning_rate": 9.939436037466114e-06, + "loss": 3.0302, + "step": 251800 + }, + { + "epoch": 0.24811243428985758, + "grad_norm": 2.2212424278259277, + "learning_rate": 9.939412021735605e-06, + "loss": 3.0683, + "step": 251850 + }, + { + "epoch": 0.24816169226767967, + "grad_norm": 2.229910135269165, + "learning_rate": 9.939388001273528e-06, + "loss": 3.1137, + "step": 251900 + }, + { + "epoch": 0.24821095024550177, + "grad_norm": 2.3148345947265625, + "learning_rate": 9.939363976079903e-06, + "loss": 3.0051, + "step": 251950 + }, + { + "epoch": 0.24826020822332384, + "grad_norm": 2.599792957305908, + "learning_rate": 9.939339946154755e-06, + "loss": 3.0721, + "step": 252000 + }, + { + "epoch": 0.24830946620114594, + "grad_norm": 2.226205587387085, + "learning_rate": 9.939315911498105e-06, + "loss": 3.083, + "step": 252050 + }, + { + "epoch": 0.24835872417896804, + "grad_norm": 2.3240468502044678, + "learning_rate": 9.939291872109978e-06, + "loss": 3.0912, + "step": 252100 + }, + { + "epoch": 0.2484079821567901, + "grad_norm": 2.4869561195373535, + "learning_rate": 9.939267827990396e-06, + "loss": 3.031, + "step": 252150 + }, + { + "epoch": 0.2484572401346122, 
+ "grad_norm": 2.18959379196167, + "learning_rate": 9.939243779139381e-06, + "loss": 3.0822, + "step": 252200 + }, + { + "epoch": 0.24850649811243428, + "grad_norm": 2.275770425796509, + "learning_rate": 9.939219725556959e-06, + "loss": 3.1012, + "step": 252250 + }, + { + "epoch": 0.24855575609025637, + "grad_norm": 2.4653937816619873, + "learning_rate": 9.93919566724315e-06, + "loss": 3.0809, + "step": 252300 + }, + { + "epoch": 0.24860501406807847, + "grad_norm": 2.7099692821502686, + "learning_rate": 9.939171604197978e-06, + "loss": 3.0608, + "step": 252350 + }, + { + "epoch": 0.24865427204590054, + "grad_norm": 2.438619613647461, + "learning_rate": 9.939147536421467e-06, + "loss": 3.0793, + "step": 252400 + }, + { + "epoch": 0.24870353002372264, + "grad_norm": 2.2371585369110107, + "learning_rate": 9.93912346391364e-06, + "loss": 3.0508, + "step": 252450 + }, + { + "epoch": 0.24875278800154474, + "grad_norm": 2.340819835662842, + "learning_rate": 9.939099386674518e-06, + "loss": 3.1413, + "step": 252500 + }, + { + "epoch": 0.2488020459793668, + "grad_norm": 2.3207669258117676, + "learning_rate": 9.939075304704127e-06, + "loss": 3.146, + "step": 252550 + }, + { + "epoch": 0.2488513039571889, + "grad_norm": 2.2835347652435303, + "learning_rate": 9.939051218002488e-06, + "loss": 3.048, + "step": 252600 + }, + { + "epoch": 0.248900561935011, + "grad_norm": 2.399665117263794, + "learning_rate": 9.939027126569624e-06, + "loss": 3.1409, + "step": 252650 + }, + { + "epoch": 0.24894981991283308, + "grad_norm": 2.2042250633239746, + "learning_rate": 9.939003030405561e-06, + "loss": 3.1106, + "step": 252700 + }, + { + "epoch": 0.24899907789065517, + "grad_norm": 2.4495363235473633, + "learning_rate": 9.938978929510318e-06, + "loss": 3.1508, + "step": 252750 + }, + { + "epoch": 0.24904833586847727, + "grad_norm": 2.259352922439575, + "learning_rate": 9.93895482388392e-06, + "loss": 3.0992, + "step": 252800 + }, + { + "epoch": 0.24909759384629934, + "grad_norm": 
2.372040271759033, + "learning_rate": 9.938930713526392e-06, + "loss": 3.2258, + "step": 252850 + }, + { + "epoch": 0.24914685182412144, + "grad_norm": 2.3804309368133545, + "learning_rate": 9.938906598437751e-06, + "loss": 3.0736, + "step": 252900 + }, + { + "epoch": 0.2491961098019435, + "grad_norm": 2.170020580291748, + "learning_rate": 9.938882478618028e-06, + "loss": 3.1219, + "step": 252950 + }, + { + "epoch": 0.2492453677797656, + "grad_norm": 2.4325509071350098, + "learning_rate": 9.93885835406724e-06, + "loss": 3.1179, + "step": 253000 + }, + { + "epoch": 0.2492946257575877, + "grad_norm": 2.3336915969848633, + "learning_rate": 9.938834224785414e-06, + "loss": 3.0529, + "step": 253050 + }, + { + "epoch": 0.24934388373540978, + "grad_norm": 2.2493019104003906, + "learning_rate": 9.938810090772572e-06, + "loss": 3.2062, + "step": 253100 + }, + { + "epoch": 0.24939314171323188, + "grad_norm": 2.281674861907959, + "learning_rate": 9.938785952028735e-06, + "loss": 3.0677, + "step": 253150 + }, + { + "epoch": 0.24944239969105397, + "grad_norm": 2.3389601707458496, + "learning_rate": 9.93876180855393e-06, + "loss": 3.0819, + "step": 253200 + }, + { + "epoch": 0.24949165766887604, + "grad_norm": 2.5760600566864014, + "learning_rate": 9.938737660348175e-06, + "loss": 3.0431, + "step": 253250 + }, + { + "epoch": 0.24954091564669814, + "grad_norm": 2.4122016429901123, + "learning_rate": 9.938713507411496e-06, + "loss": 3.1449, + "step": 253300 + }, + { + "epoch": 0.24959017362452024, + "grad_norm": 2.3927791118621826, + "learning_rate": 9.938689349743919e-06, + "loss": 3.0748, + "step": 253350 + }, + { + "epoch": 0.2496394316023423, + "grad_norm": 2.356490135192871, + "learning_rate": 9.938665187345464e-06, + "loss": 3.0579, + "step": 253400 + }, + { + "epoch": 0.2496886895801644, + "grad_norm": 2.36407470703125, + "learning_rate": 9.938641020216153e-06, + "loss": 3.1203, + "step": 253450 + }, + { + "epoch": 0.24973794755798648, + "grad_norm": 2.6700825691223145, + 
"learning_rate": 9.938616848356011e-06, + "loss": 3.1075, + "step": 253500 + }, + { + "epoch": 0.24978720553580858, + "grad_norm": 2.545535087585449, + "learning_rate": 9.938592671765062e-06, + "loss": 3.1192, + "step": 253550 + }, + { + "epoch": 0.24983646351363067, + "grad_norm": 2.2854321002960205, + "learning_rate": 9.938568490443326e-06, + "loss": 3.0484, + "step": 253600 + }, + { + "epoch": 0.24988572149145274, + "grad_norm": 2.2227954864501953, + "learning_rate": 9.938544304390829e-06, + "loss": 3.0424, + "step": 253650 + }, + { + "epoch": 0.24993497946927484, + "grad_norm": 2.2075023651123047, + "learning_rate": 9.938520113607592e-06, + "loss": 3.1165, + "step": 253700 + }, + { + "epoch": 0.24998423744709694, + "grad_norm": 2.250612497329712, + "learning_rate": 9.938495918093642e-06, + "loss": 3.0227, + "step": 253750 + }, + { + "epoch": 0.25003349542491904, + "grad_norm": 2.37180233001709, + "learning_rate": 9.938471717848998e-06, + "loss": 3.1164, + "step": 253800 + }, + { + "epoch": 0.2500827534027411, + "grad_norm": 2.2624518871307373, + "learning_rate": 9.938447512873686e-06, + "loss": 3.0784, + "step": 253850 + }, + { + "epoch": 0.2501320113805632, + "grad_norm": 2.4083847999572754, + "learning_rate": 9.938423303167726e-06, + "loss": 3.1082, + "step": 253900 + }, + { + "epoch": 0.2501812693583853, + "grad_norm": 2.560121774673462, + "learning_rate": 9.938399088731145e-06, + "loss": 3.0605, + "step": 253950 + }, + { + "epoch": 0.2502305273362074, + "grad_norm": 2.244215726852417, + "learning_rate": 9.938374869563964e-06, + "loss": 3.0814, + "step": 254000 + }, + { + "epoch": 0.25027978531402945, + "grad_norm": 2.2970893383026123, + "learning_rate": 9.938350645666207e-06, + "loss": 3.0951, + "step": 254050 + }, + { + "epoch": 0.25032904329185157, + "grad_norm": 2.2271292209625244, + "learning_rate": 9.938326417037895e-06, + "loss": 3.1191, + "step": 254100 + }, + { + "epoch": 0.25037830126967364, + "grad_norm": 2.465325355529785, + "learning_rate": 
9.938302183679054e-06, + "loss": 3.1004, + "step": 254150 + }, + { + "epoch": 0.2504275592474957, + "grad_norm": 2.3096797466278076, + "learning_rate": 9.938277945589708e-06, + "loss": 3.0869, + "step": 254200 + }, + { + "epoch": 0.25047681722531784, + "grad_norm": 2.523975133895874, + "learning_rate": 9.938253702769876e-06, + "loss": 3.0681, + "step": 254250 + }, + { + "epoch": 0.2505260752031399, + "grad_norm": 2.206946849822998, + "learning_rate": 9.938229455219584e-06, + "loss": 3.1071, + "step": 254300 + }, + { + "epoch": 0.250575333180962, + "grad_norm": 2.196448564529419, + "learning_rate": 9.938205202938855e-06, + "loss": 3.046, + "step": 254350 + }, + { + "epoch": 0.25062459115878405, + "grad_norm": 2.299151659011841, + "learning_rate": 9.938180945927713e-06, + "loss": 2.9449, + "step": 254400 + }, + { + "epoch": 0.2506738491366062, + "grad_norm": 2.4217722415924072, + "learning_rate": 9.93815668418618e-06, + "loss": 3.0386, + "step": 254450 + }, + { + "epoch": 0.25072310711442825, + "grad_norm": 2.3608052730560303, + "learning_rate": 9.938132417714279e-06, + "loss": 3.1092, + "step": 254500 + }, + { + "epoch": 0.2507723650922503, + "grad_norm": 2.2930378913879395, + "learning_rate": 9.938108146512033e-06, + "loss": 3.043, + "step": 254550 + }, + { + "epoch": 0.25082162307007244, + "grad_norm": 2.4093875885009766, + "learning_rate": 9.938083870579466e-06, + "loss": 3.0264, + "step": 254600 + }, + { + "epoch": 0.2508708810478945, + "grad_norm": 2.457159996032715, + "learning_rate": 9.938059589916604e-06, + "loss": 3.0552, + "step": 254650 + }, + { + "epoch": 0.2509201390257166, + "grad_norm": 2.319514513015747, + "learning_rate": 9.938035304523465e-06, + "loss": 3.0099, + "step": 254700 + }, + { + "epoch": 0.2509693970035387, + "grad_norm": 2.348623514175415, + "learning_rate": 9.938011014400077e-06, + "loss": 3.1371, + "step": 254750 + }, + { + "epoch": 0.2510186549813608, + "grad_norm": 2.42630672454834, + "learning_rate": 9.93798671954646e-06, + "loss": 
3.0626, + "step": 254800 + }, + { + "epoch": 0.25106791295918285, + "grad_norm": 2.381896734237671, + "learning_rate": 9.937962419962638e-06, + "loss": 3.0872, + "step": 254850 + }, + { + "epoch": 0.251117170937005, + "grad_norm": 2.2345798015594482, + "learning_rate": 9.937938115648635e-06, + "loss": 3.1616, + "step": 254900 + }, + { + "epoch": 0.25116642891482704, + "grad_norm": 2.2448477745056152, + "learning_rate": 9.937913806604475e-06, + "loss": 3.1088, + "step": 254950 + }, + { + "epoch": 0.2512156868926491, + "grad_norm": 2.418372869491577, + "learning_rate": 9.93788949283018e-06, + "loss": 3.1079, + "step": 255000 + }, + { + "epoch": 0.25126494487047124, + "grad_norm": 2.391432285308838, + "learning_rate": 9.937865174325773e-06, + "loss": 3.1139, + "step": 255050 + }, + { + "epoch": 0.2513142028482933, + "grad_norm": 2.4851670265197754, + "learning_rate": 9.937840851091278e-06, + "loss": 3.1061, + "step": 255100 + }, + { + "epoch": 0.2513634608261154, + "grad_norm": 2.5608394145965576, + "learning_rate": 9.937816523126717e-06, + "loss": 3.1126, + "step": 255150 + }, + { + "epoch": 0.2514127188039375, + "grad_norm": 2.205476760864258, + "learning_rate": 9.937792190432115e-06, + "loss": 3.1022, + "step": 255200 + }, + { + "epoch": 0.2514619767817596, + "grad_norm": 2.371083974838257, + "learning_rate": 9.937767853007496e-06, + "loss": 3.0961, + "step": 255250 + }, + { + "epoch": 0.25151123475958165, + "grad_norm": 2.2045812606811523, + "learning_rate": 9.937743510852881e-06, + "loss": 3.0957, + "step": 255300 + }, + { + "epoch": 0.2515604927374038, + "grad_norm": 2.368514060974121, + "learning_rate": 9.937719163968294e-06, + "loss": 3.0679, + "step": 255350 + }, + { + "epoch": 0.25160975071522584, + "grad_norm": 2.3765487670898438, + "learning_rate": 9.93769481235376e-06, + "loss": 3.1062, + "step": 255400 + }, + { + "epoch": 0.2516590086930479, + "grad_norm": 2.5080883502960205, + "learning_rate": 9.9376704560093e-06, + "loss": 3.0998, + "step": 255450 + }, 
+ { + "epoch": 0.25170826667087004, + "grad_norm": 2.4907028675079346, + "learning_rate": 9.937646094934938e-06, + "loss": 3.1767, + "step": 255500 + }, + { + "epoch": 0.2517575246486921, + "grad_norm": 2.3081626892089844, + "learning_rate": 9.937621729130698e-06, + "loss": 3.0126, + "step": 255550 + }, + { + "epoch": 0.2518067826265142, + "grad_norm": 2.3124520778656006, + "learning_rate": 9.937597358596604e-06, + "loss": 3.1518, + "step": 255600 + }, + { + "epoch": 0.25185604060433625, + "grad_norm": 2.241363048553467, + "learning_rate": 9.937572983332678e-06, + "loss": 3.0758, + "step": 255650 + }, + { + "epoch": 0.2519052985821584, + "grad_norm": 2.295393705368042, + "learning_rate": 9.937548603338943e-06, + "loss": 3.0508, + "step": 255700 + }, + { + "epoch": 0.25195455655998045, + "grad_norm": 2.449326515197754, + "learning_rate": 9.937524218615424e-06, + "loss": 3.1305, + "step": 255750 + }, + { + "epoch": 0.2520038145378025, + "grad_norm": 2.3756635189056396, + "learning_rate": 9.937499829162143e-06, + "loss": 3.0365, + "step": 255800 + }, + { + "epoch": 0.25205307251562464, + "grad_norm": 2.17541766166687, + "learning_rate": 9.937475434979124e-06, + "loss": 3.1491, + "step": 255850 + }, + { + "epoch": 0.2521023304934467, + "grad_norm": 2.4092917442321777, + "learning_rate": 9.93745103606639e-06, + "loss": 3.0689, + "step": 255900 + }, + { + "epoch": 0.2521515884712688, + "grad_norm": 2.1509618759155273, + "learning_rate": 9.937426632423964e-06, + "loss": 3.0114, + "step": 255950 + }, + { + "epoch": 0.2522008464490909, + "grad_norm": 2.3913373947143555, + "learning_rate": 9.937402224051871e-06, + "loss": 3.0665, + "step": 256000 + }, + { + "epoch": 0.252250104426913, + "grad_norm": 2.494486093521118, + "learning_rate": 9.937377810950131e-06, + "loss": 3.0495, + "step": 256050 + }, + { + "epoch": 0.25229936240473505, + "grad_norm": 2.439788818359375, + "learning_rate": 9.937353393118772e-06, + "loss": 3.1098, + "step": 256100 + }, + { + "epoch": 
0.2523486203825572, + "grad_norm": 2.43515682220459, + "learning_rate": 9.937328970557814e-06, + "loss": 3.0703, + "step": 256150 + }, + { + "epoch": 0.25239787836037925, + "grad_norm": 2.293869972229004, + "learning_rate": 9.937304543267283e-06, + "loss": 3.114, + "step": 256200 + }, + { + "epoch": 0.2524471363382013, + "grad_norm": 2.7489066123962402, + "learning_rate": 9.937280111247199e-06, + "loss": 3.0844, + "step": 256250 + }, + { + "epoch": 0.25249639431602344, + "grad_norm": 2.326929807662964, + "learning_rate": 9.937255674497588e-06, + "loss": 3.128, + "step": 256300 + }, + { + "epoch": 0.2525456522938455, + "grad_norm": 2.3794050216674805, + "learning_rate": 9.937231233018473e-06, + "loss": 3.0979, + "step": 256350 + }, + { + "epoch": 0.2525949102716676, + "grad_norm": 2.313791275024414, + "learning_rate": 9.937206786809877e-06, + "loss": 3.0731, + "step": 256400 + }, + { + "epoch": 0.2526441682494897, + "grad_norm": 2.9497628211975098, + "learning_rate": 9.937182335871823e-06, + "loss": 3.0589, + "step": 256450 + }, + { + "epoch": 0.2526934262273118, + "grad_norm": 2.3087689876556396, + "learning_rate": 9.937157880204336e-06, + "loss": 3.0867, + "step": 256500 + }, + { + "epoch": 0.25274268420513385, + "grad_norm": 2.239593982696533, + "learning_rate": 9.937133419807437e-06, + "loss": 3.037, + "step": 256550 + }, + { + "epoch": 0.252791942182956, + "grad_norm": 2.4424500465393066, + "learning_rate": 9.937108954681152e-06, + "loss": 3.071, + "step": 256600 + }, + { + "epoch": 0.25284120016077805, + "grad_norm": 2.4858295917510986, + "learning_rate": 9.937084484825502e-06, + "loss": 3.0842, + "step": 256650 + }, + { + "epoch": 0.2528904581386001, + "grad_norm": 2.2719104290008545, + "learning_rate": 9.937060010240512e-06, + "loss": 3.1002, + "step": 256700 + }, + { + "epoch": 0.25293971611642224, + "grad_norm": 2.20900821685791, + "learning_rate": 9.937035530926207e-06, + "loss": 3.0935, + "step": 256750 + }, + { + "epoch": 0.2529889740942443, + 
"grad_norm": 2.385298252105713, + "learning_rate": 9.937011046882606e-06, + "loss": 3.0206, + "step": 256800 + }, + { + "epoch": 0.2530382320720664, + "grad_norm": 2.3431928157806396, + "learning_rate": 9.936986558109736e-06, + "loss": 3.0918, + "step": 256850 + }, + { + "epoch": 0.25308749004988845, + "grad_norm": 2.3850743770599365, + "learning_rate": 9.93696206460762e-06, + "loss": 3.0065, + "step": 256900 + }, + { + "epoch": 0.2531367480277106, + "grad_norm": 2.22607159614563, + "learning_rate": 9.93693756637628e-06, + "loss": 3.0331, + "step": 256950 + }, + { + "epoch": 0.25318600600553265, + "grad_norm": 2.354191541671753, + "learning_rate": 9.936913063415741e-06, + "loss": 3.0231, + "step": 257000 + }, + { + "epoch": 0.2532352639833547, + "grad_norm": 2.306366443634033, + "learning_rate": 9.936888555726025e-06, + "loss": 3.059, + "step": 257050 + }, + { + "epoch": 0.25328452196117685, + "grad_norm": 2.1821885108947754, + "learning_rate": 9.936864043307158e-06, + "loss": 3.1197, + "step": 257100 + }, + { + "epoch": 0.2533337799389989, + "grad_norm": 2.3182213306427, + "learning_rate": 9.936839526159162e-06, + "loss": 3.1396, + "step": 257150 + }, + { + "epoch": 0.253383037916821, + "grad_norm": 2.3195254802703857, + "learning_rate": 9.936815004282057e-06, + "loss": 3.0491, + "step": 257200 + }, + { + "epoch": 0.2534322958946431, + "grad_norm": 2.2796568870544434, + "learning_rate": 9.936790477675873e-06, + "loss": 3.0364, + "step": 257250 + }, + { + "epoch": 0.2534815538724652, + "grad_norm": 2.2190959453582764, + "learning_rate": 9.93676594634063e-06, + "loss": 3.1216, + "step": 257300 + }, + { + "epoch": 0.25353081185028725, + "grad_norm": 2.4295012950897217, + "learning_rate": 9.93674141027635e-06, + "loss": 3.1017, + "step": 257350 + }, + { + "epoch": 0.2535800698281094, + "grad_norm": 2.4229323863983154, + "learning_rate": 9.93671686948306e-06, + "loss": 3.0808, + "step": 257400 + }, + { + "epoch": 0.25362932780593145, + "grad_norm": 2.158740520477295, + 
"learning_rate": 9.936692323960782e-06, + "loss": 3.1169, + "step": 257450 + }, + { + "epoch": 0.2536785857837535, + "grad_norm": 2.242999792098999, + "learning_rate": 9.936667773709539e-06, + "loss": 3.0133, + "step": 257500 + }, + { + "epoch": 0.25372784376157564, + "grad_norm": 2.2652461528778076, + "learning_rate": 9.936643218729354e-06, + "loss": 2.9839, + "step": 257550 + }, + { + "epoch": 0.2537771017393977, + "grad_norm": 2.542161464691162, + "learning_rate": 9.936618659020252e-06, + "loss": 3.1022, + "step": 257600 + }, + { + "epoch": 0.2538263597172198, + "grad_norm": 2.307978868484497, + "learning_rate": 9.936594094582255e-06, + "loss": 3.1068, + "step": 257650 + }, + { + "epoch": 0.2538756176950419, + "grad_norm": 2.342412233352661, + "learning_rate": 9.936569525415389e-06, + "loss": 3.0855, + "step": 257700 + }, + { + "epoch": 0.253924875672864, + "grad_norm": 2.3787951469421387, + "learning_rate": 9.936544951519675e-06, + "loss": 3.1073, + "step": 257750 + }, + { + "epoch": 0.25397413365068605, + "grad_norm": 2.3494632244110107, + "learning_rate": 9.936520372895137e-06, + "loss": 3.1068, + "step": 257800 + }, + { + "epoch": 0.2540233916285082, + "grad_norm": 2.307288885116577, + "learning_rate": 9.9364957895418e-06, + "loss": 3.1082, + "step": 257850 + }, + { + "epoch": 0.25407264960633025, + "grad_norm": 2.1457531452178955, + "learning_rate": 9.936471201459686e-06, + "loss": 3.0961, + "step": 257900 + }, + { + "epoch": 0.2541219075841523, + "grad_norm": 2.312784194946289, + "learning_rate": 9.93644660864882e-06, + "loss": 3.0106, + "step": 257950 + }, + { + "epoch": 0.25417116556197444, + "grad_norm": 2.2943084239959717, + "learning_rate": 9.936422011109224e-06, + "loss": 3.0397, + "step": 258000 + }, + { + "epoch": 0.2542204235397965, + "grad_norm": 2.3307204246520996, + "learning_rate": 9.936397408840922e-06, + "loss": 3.1069, + "step": 258050 + }, + { + "epoch": 0.2542696815176186, + "grad_norm": 2.537062168121338, + "learning_rate": 
9.936372801843938e-06, + "loss": 3.0722, + "step": 258100 + }, + { + "epoch": 0.25431893949544065, + "grad_norm": 2.456529140472412, + "learning_rate": 9.936348190118295e-06, + "loss": 3.0776, + "step": 258150 + }, + { + "epoch": 0.2543681974732628, + "grad_norm": 2.2354092597961426, + "learning_rate": 9.936323573664017e-06, + "loss": 3.0956, + "step": 258200 + }, + { + "epoch": 0.25441745545108485, + "grad_norm": 2.306375741958618, + "learning_rate": 9.936298952481129e-06, + "loss": 3.0752, + "step": 258250 + }, + { + "epoch": 0.2544667134289069, + "grad_norm": 2.3969156742095947, + "learning_rate": 9.936274326569652e-06, + "loss": 3.0308, + "step": 258300 + }, + { + "epoch": 0.25451597140672905, + "grad_norm": 2.2978198528289795, + "learning_rate": 9.936249695929612e-06, + "loss": 3.0466, + "step": 258350 + }, + { + "epoch": 0.2545652293845511, + "grad_norm": 2.1055655479431152, + "learning_rate": 9.93622506056103e-06, + "loss": 3.0605, + "step": 258400 + }, + { + "epoch": 0.2546144873623732, + "grad_norm": 2.231285333633423, + "learning_rate": 9.936200420463931e-06, + "loss": 3.0917, + "step": 258450 + }, + { + "epoch": 0.2546637453401953, + "grad_norm": 2.7285451889038086, + "learning_rate": 9.936175775638339e-06, + "loss": 3.1291, + "step": 258500 + }, + { + "epoch": 0.2547130033180174, + "grad_norm": 2.3237266540527344, + "learning_rate": 9.936151126084277e-06, + "loss": 3.0943, + "step": 258550 + }, + { + "epoch": 0.25476226129583945, + "grad_norm": 2.3082072734832764, + "learning_rate": 9.936126471801769e-06, + "loss": 3.0941, + "step": 258600 + }, + { + "epoch": 0.2548115192736616, + "grad_norm": 2.368072271347046, + "learning_rate": 9.936101812790836e-06, + "loss": 3.1081, + "step": 258650 + }, + { + "epoch": 0.25486077725148365, + "grad_norm": 2.303162097930908, + "learning_rate": 9.936077149051509e-06, + "loss": 3.0867, + "step": 258700 + }, + { + "epoch": 0.2549100352293057, + "grad_norm": 2.3232405185699463, + "learning_rate": 9.936052480583802e-06, + 
"loss": 3.0254, + "step": 258750 + }, + { + "epoch": 0.25495929320712785, + "grad_norm": 2.335277557373047, + "learning_rate": 9.936027807387745e-06, + "loss": 3.1087, + "step": 258800 + }, + { + "epoch": 0.2550085511849499, + "grad_norm": 2.686619997024536, + "learning_rate": 9.936003129463361e-06, + "loss": 3.111, + "step": 258850 + }, + { + "epoch": 0.255057809162772, + "grad_norm": 2.3179800510406494, + "learning_rate": 9.93597844681067e-06, + "loss": 3.0888, + "step": 258900 + }, + { + "epoch": 0.2551070671405941, + "grad_norm": 2.3299334049224854, + "learning_rate": 9.935953759429699e-06, + "loss": 3.0873, + "step": 258950 + }, + { + "epoch": 0.2551563251184162, + "grad_norm": 2.3747615814208984, + "learning_rate": 9.935929067320473e-06, + "loss": 3.1039, + "step": 259000 + }, + { + "epoch": 0.25520558309623825, + "grad_norm": 2.4137990474700928, + "learning_rate": 9.93590437048301e-06, + "loss": 3.1233, + "step": 259050 + }, + { + "epoch": 0.2552548410740604, + "grad_norm": 2.3562309741973877, + "learning_rate": 9.935879668917341e-06, + "loss": 3.0377, + "step": 259100 + }, + { + "epoch": 0.25530409905188245, + "grad_norm": 2.2881319522857666, + "learning_rate": 9.935854962623483e-06, + "loss": 3.0762, + "step": 259150 + }, + { + "epoch": 0.2553533570297045, + "grad_norm": 2.317577362060547, + "learning_rate": 9.935830251601463e-06, + "loss": 3.041, + "step": 259200 + }, + { + "epoch": 0.25540261500752665, + "grad_norm": 2.4774842262268066, + "learning_rate": 9.935805535851304e-06, + "loss": 3.1181, + "step": 259250 + }, + { + "epoch": 0.2554518729853487, + "grad_norm": 2.4339921474456787, + "learning_rate": 9.93578081537303e-06, + "loss": 3.095, + "step": 259300 + }, + { + "epoch": 0.2555011309631708, + "grad_norm": 2.331709623336792, + "learning_rate": 9.935756090166666e-06, + "loss": 3.1146, + "step": 259350 + }, + { + "epoch": 0.25555038894099286, + "grad_norm": 2.356633186340332, + "learning_rate": 9.935731360232232e-06, + "loss": 3.0883, + "step": 
259400 + }, + { + "epoch": 0.255599646918815, + "grad_norm": 2.58652400970459, + "learning_rate": 9.935706625569754e-06, + "loss": 3.0339, + "step": 259450 + }, + { + "epoch": 0.25564890489663705, + "grad_norm": 2.6274592876434326, + "learning_rate": 9.935681886179258e-06, + "loss": 3.0788, + "step": 259500 + }, + { + "epoch": 0.2556981628744591, + "grad_norm": 2.425046682357788, + "learning_rate": 9.935657142060762e-06, + "loss": 3.1278, + "step": 259550 + }, + { + "epoch": 0.25574742085228125, + "grad_norm": 2.570483922958374, + "learning_rate": 9.935632393214295e-06, + "loss": 3.0888, + "step": 259600 + }, + { + "epoch": 0.2557966788301033, + "grad_norm": 2.372954845428467, + "learning_rate": 9.935607639639877e-06, + "loss": 3.0642, + "step": 259650 + }, + { + "epoch": 0.2558459368079254, + "grad_norm": 2.1559035778045654, + "learning_rate": 9.935582881337535e-06, + "loss": 3.0936, + "step": 259700 + }, + { + "epoch": 0.2558951947857475, + "grad_norm": 2.422140121459961, + "learning_rate": 9.935558118307291e-06, + "loss": 3.061, + "step": 259750 + }, + { + "epoch": 0.2559444527635696, + "grad_norm": 2.3255815505981445, + "learning_rate": 9.935533350549168e-06, + "loss": 3.0689, + "step": 259800 + }, + { + "epoch": 0.25599371074139166, + "grad_norm": 2.442577362060547, + "learning_rate": 9.935508578063192e-06, + "loss": 3.0887, + "step": 259850 + }, + { + "epoch": 0.2560429687192138, + "grad_norm": 2.306621551513672, + "learning_rate": 9.935483800849384e-06, + "loss": 3.1181, + "step": 259900 + }, + { + "epoch": 0.25609222669703585, + "grad_norm": 2.4201009273529053, + "learning_rate": 9.935459018907769e-06, + "loss": 3.0226, + "step": 259950 + }, + { + "epoch": 0.2561414846748579, + "grad_norm": 2.2478420734405518, + "learning_rate": 9.93543423223837e-06, + "loss": 3.112, + "step": 260000 + }, + { + "epoch": 0.25619074265268005, + "grad_norm": 2.3587725162506104, + "learning_rate": 9.935409440841212e-06, + "loss": 3.0511, + "step": 260050 + }, + { + "epoch": 
0.2562400006305021, + "grad_norm": 2.2620270252227783, + "learning_rate": 9.93538464471632e-06, + "loss": 3.0882, + "step": 260100 + }, + { + "epoch": 0.2562892586083242, + "grad_norm": 2.373584032058716, + "learning_rate": 9.935359843863713e-06, + "loss": 3.0674, + "step": 260150 + }, + { + "epoch": 0.2563385165861463, + "grad_norm": 2.468580961227417, + "learning_rate": 9.93533503828342e-06, + "loss": 3.085, + "step": 260200 + }, + { + "epoch": 0.2563877745639684, + "grad_norm": 2.366079330444336, + "learning_rate": 9.93531022797546e-06, + "loss": 3.1251, + "step": 260250 + }, + { + "epoch": 0.25643703254179046, + "grad_norm": 2.385525703430176, + "learning_rate": 9.935285412939863e-06, + "loss": 3.0382, + "step": 260300 + }, + { + "epoch": 0.2564862905196126, + "grad_norm": 2.186537027359009, + "learning_rate": 9.935260593176646e-06, + "loss": 3.042, + "step": 260350 + }, + { + "epoch": 0.25653554849743465, + "grad_norm": 2.526230812072754, + "learning_rate": 9.935235768685839e-06, + "loss": 3.0707, + "step": 260400 + }, + { + "epoch": 0.2565848064752567, + "grad_norm": 2.3876867294311523, + "learning_rate": 9.935210939467459e-06, + "loss": 3.0912, + "step": 260450 + }, + { + "epoch": 0.25663406445307885, + "grad_norm": 2.1879594326019287, + "learning_rate": 9.935186105521534e-06, + "loss": 3.0342, + "step": 260500 + }, + { + "epoch": 0.2566833224309009, + "grad_norm": 2.351672887802124, + "learning_rate": 9.935161266848088e-06, + "loss": 3.0352, + "step": 260550 + }, + { + "epoch": 0.256732580408723, + "grad_norm": 2.398935079574585, + "learning_rate": 9.935136423447145e-06, + "loss": 3.1426, + "step": 260600 + }, + { + "epoch": 0.25678183838654506, + "grad_norm": 2.1015028953552246, + "learning_rate": 9.935111575318728e-06, + "loss": 3.0576, + "step": 260650 + }, + { + "epoch": 0.2568310963643672, + "grad_norm": 2.2587039470672607, + "learning_rate": 9.935086722462858e-06, + "loss": 3.0852, + "step": 260700 + }, + { + "epoch": 0.25688035434218925, + 
"grad_norm": 2.2581071853637695, + "learning_rate": 9.935061864879563e-06, + "loss": 3.079, + "step": 260750 + }, + { + "epoch": 0.2569296123200113, + "grad_norm": 2.4232730865478516, + "learning_rate": 9.935037002568865e-06, + "loss": 3.116, + "step": 260800 + }, + { + "epoch": 0.25697887029783345, + "grad_norm": 2.2190699577331543, + "learning_rate": 9.935012135530788e-06, + "loss": 3.0915, + "step": 260850 + }, + { + "epoch": 0.2570281282756555, + "grad_norm": 2.35693621635437, + "learning_rate": 9.934987263765355e-06, + "loss": 3.133, + "step": 260900 + }, + { + "epoch": 0.2570773862534776, + "grad_norm": 2.2636609077453613, + "learning_rate": 9.934962387272593e-06, + "loss": 3.0729, + "step": 260950 + }, + { + "epoch": 0.2571266442312997, + "grad_norm": 2.295818328857422, + "learning_rate": 9.93493750605252e-06, + "loss": 3.0473, + "step": 261000 + }, + { + "epoch": 0.2571759022091218, + "grad_norm": 2.446805477142334, + "learning_rate": 9.934912620105167e-06, + "loss": 3.0861, + "step": 261050 + }, + { + "epoch": 0.25722516018694386, + "grad_norm": 2.388827085494995, + "learning_rate": 9.93488772943055e-06, + "loss": 3.0732, + "step": 261100 + }, + { + "epoch": 0.257274418164766, + "grad_norm": 2.3711190223693848, + "learning_rate": 9.934862834028701e-06, + "loss": 3.1042, + "step": 261150 + }, + { + "epoch": 0.25732367614258805, + "grad_norm": 2.3979122638702393, + "learning_rate": 9.934837933899637e-06, + "loss": 3.1186, + "step": 261200 + }, + { + "epoch": 0.2573729341204101, + "grad_norm": 2.3179659843444824, + "learning_rate": 9.934813029043387e-06, + "loss": 3.0394, + "step": 261250 + }, + { + "epoch": 0.25742219209823225, + "grad_norm": 2.533970355987549, + "learning_rate": 9.934788119459971e-06, + "loss": 3.1065, + "step": 261300 + }, + { + "epoch": 0.2574714500760543, + "grad_norm": 3.6127326488494873, + "learning_rate": 9.934763205149414e-06, + "loss": 3.0721, + "step": 261350 + }, + { + "epoch": 0.2575207080538764, + "grad_norm": 
2.2646238803863525, + "learning_rate": 9.934738286111742e-06, + "loss": 3.0592, + "step": 261400 + }, + { + "epoch": 0.2575699660316985, + "grad_norm": 2.26383376121521, + "learning_rate": 9.934713362346975e-06, + "loss": 3.02, + "step": 261450 + }, + { + "epoch": 0.2576192240095206, + "grad_norm": 2.250631332397461, + "learning_rate": 9.934688433855141e-06, + "loss": 3.0955, + "step": 261500 + }, + { + "epoch": 0.25766848198734266, + "grad_norm": 2.442823886871338, + "learning_rate": 9.93466350063626e-06, + "loss": 3.036, + "step": 261550 + }, + { + "epoch": 0.2577177399651648, + "grad_norm": 2.4431490898132324, + "learning_rate": 9.93463856269036e-06, + "loss": 3.1021, + "step": 261600 + }, + { + "epoch": 0.25776699794298685, + "grad_norm": 2.3174731731414795, + "learning_rate": 9.934613620017462e-06, + "loss": 3.0641, + "step": 261650 + }, + { + "epoch": 0.2578162559208089, + "grad_norm": 2.353742837905884, + "learning_rate": 9.93458867261759e-06, + "loss": 3.0665, + "step": 261700 + }, + { + "epoch": 0.25786551389863105, + "grad_norm": 2.464116096496582, + "learning_rate": 9.934563720490769e-06, + "loss": 3.1115, + "step": 261750 + }, + { + "epoch": 0.2579147718764531, + "grad_norm": 2.7770769596099854, + "learning_rate": 9.934538763637021e-06, + "loss": 3.0913, + "step": 261800 + }, + { + "epoch": 0.2579640298542752, + "grad_norm": 2.283189535140991, + "learning_rate": 9.934513802056372e-06, + "loss": 3.0772, + "step": 261850 + }, + { + "epoch": 0.25801328783209726, + "grad_norm": 2.1706981658935547, + "learning_rate": 9.934488835748845e-06, + "loss": 3.1118, + "step": 261900 + }, + { + "epoch": 0.2580625458099194, + "grad_norm": 2.529280185699463, + "learning_rate": 9.934463864714465e-06, + "loss": 3.0571, + "step": 261950 + }, + { + "epoch": 0.25811180378774146, + "grad_norm": 2.722412347793579, + "learning_rate": 9.934438888953255e-06, + "loss": 3.0161, + "step": 262000 + }, + { + "epoch": 0.2581610617655635, + "grad_norm": 2.439823627471924, + 
"learning_rate": 9.934413908465238e-06, + "loss": 3.0327, + "step": 262050 + }, + { + "epoch": 0.25821031974338565, + "grad_norm": 2.273712635040283, + "learning_rate": 9.93438892325044e-06, + "loss": 3.1111, + "step": 262100 + }, + { + "epoch": 0.2582595777212077, + "grad_norm": 2.21352219581604, + "learning_rate": 9.934363933308882e-06, + "loss": 3.1152, + "step": 262150 + }, + { + "epoch": 0.2583088356990298, + "grad_norm": 2.3663134574890137, + "learning_rate": 9.93433893864059e-06, + "loss": 3.1107, + "step": 262200 + }, + { + "epoch": 0.2583580936768519, + "grad_norm": 2.292452335357666, + "learning_rate": 9.93431393924559e-06, + "loss": 3.0541, + "step": 262250 + }, + { + "epoch": 0.258407351654674, + "grad_norm": 2.2005016803741455, + "learning_rate": 9.9342889351239e-06, + "loss": 2.9605, + "step": 262300 + }, + { + "epoch": 0.25845660963249606, + "grad_norm": 2.263667583465576, + "learning_rate": 9.93426392627555e-06, + "loss": 3.1351, + "step": 262350 + }, + { + "epoch": 0.2585058676103182, + "grad_norm": 2.321561336517334, + "learning_rate": 9.934238912700562e-06, + "loss": 3.0939, + "step": 262400 + }, + { + "epoch": 0.25855512558814026, + "grad_norm": 2.230804681777954, + "learning_rate": 9.93421389439896e-06, + "loss": 3.0738, + "step": 262450 + }, + { + "epoch": 0.2586043835659623, + "grad_norm": 2.3438355922698975, + "learning_rate": 9.934188871370766e-06, + "loss": 3.0903, + "step": 262500 + }, + { + "epoch": 0.25865364154378445, + "grad_norm": 2.8147759437561035, + "learning_rate": 9.934163843616005e-06, + "loss": 3.077, + "step": 262550 + }, + { + "epoch": 0.2587028995216065, + "grad_norm": 2.487534761428833, + "learning_rate": 9.934138811134704e-06, + "loss": 3.1012, + "step": 262600 + }, + { + "epoch": 0.2587521574994286, + "grad_norm": 2.518249988555908, + "learning_rate": 9.934113773926881e-06, + "loss": 3.0756, + "step": 262650 + }, + { + "epoch": 0.2588014154772507, + "grad_norm": 2.2651939392089844, + "learning_rate": 
9.934088731992565e-06, + "loss": 3.0796, + "step": 262700 + }, + { + "epoch": 0.2588506734550728, + "grad_norm": 2.461376667022705, + "learning_rate": 9.934063685331778e-06, + "loss": 3.0791, + "step": 262750 + }, + { + "epoch": 0.25889993143289486, + "grad_norm": 2.3872203826904297, + "learning_rate": 9.934038633944544e-06, + "loss": 3.123, + "step": 262800 + }, + { + "epoch": 0.258949189410717, + "grad_norm": 2.391319990158081, + "learning_rate": 9.93401357783089e-06, + "loss": 3.0479, + "step": 262850 + }, + { + "epoch": 0.25899844738853905, + "grad_norm": 2.4777257442474365, + "learning_rate": 9.933988516990836e-06, + "loss": 2.9947, + "step": 262900 + }, + { + "epoch": 0.2590477053663611, + "grad_norm": 2.2939376831054688, + "learning_rate": 9.933963451424407e-06, + "loss": 3.0737, + "step": 262950 + }, + { + "epoch": 0.25909696334418325, + "grad_norm": 2.2882626056671143, + "learning_rate": 9.933938381131626e-06, + "loss": 3.146, + "step": 263000 + }, + { + "epoch": 0.2591462213220053, + "grad_norm": 2.247429370880127, + "learning_rate": 9.933913306112519e-06, + "loss": 3.0689, + "step": 263050 + }, + { + "epoch": 0.2591954792998274, + "grad_norm": 2.337207555770874, + "learning_rate": 9.93388822636711e-06, + "loss": 3.096, + "step": 263100 + }, + { + "epoch": 0.25924473727764946, + "grad_norm": 2.3328146934509277, + "learning_rate": 9.933863141895422e-06, + "loss": 3.0534, + "step": 263150 + }, + { + "epoch": 0.2592939952554716, + "grad_norm": 2.4184648990631104, + "learning_rate": 9.93383805269748e-06, + "loss": 3.0944, + "step": 263200 + }, + { + "epoch": 0.25934325323329366, + "grad_norm": 2.757802963256836, + "learning_rate": 9.933812958773307e-06, + "loss": 3.1401, + "step": 263250 + }, + { + "epoch": 0.25939251121111573, + "grad_norm": 2.634526014328003, + "learning_rate": 9.933787860122929e-06, + "loss": 3.1205, + "step": 263300 + }, + { + "epoch": 0.25944176918893785, + "grad_norm": 2.460357189178467, + "learning_rate": 9.933762756746366e-06, + 
"loss": 3.0786, + "step": 263350 + }, + { + "epoch": 0.2594910271667599, + "grad_norm": 2.6272666454315186, + "learning_rate": 9.933737648643647e-06, + "loss": 3.0671, + "step": 263400 + }, + { + "epoch": 0.259540285144582, + "grad_norm": 2.2090325355529785, + "learning_rate": 9.933712535814793e-06, + "loss": 3.1257, + "step": 263450 + }, + { + "epoch": 0.2595895431224041, + "grad_norm": 2.153813362121582, + "learning_rate": 9.933687418259827e-06, + "loss": 3.0344, + "step": 263500 + }, + { + "epoch": 0.2596388011002262, + "grad_norm": 2.376289129257202, + "learning_rate": 9.933662295978777e-06, + "loss": 3.0392, + "step": 263550 + }, + { + "epoch": 0.25968805907804826, + "grad_norm": 2.3167994022369385, + "learning_rate": 9.933637168971664e-06, + "loss": 3.075, + "step": 263600 + }, + { + "epoch": 0.2597373170558704, + "grad_norm": 2.3211567401885986, + "learning_rate": 9.933612037238513e-06, + "loss": 3.078, + "step": 263650 + }, + { + "epoch": 0.25978657503369246, + "grad_norm": 2.733569622039795, + "learning_rate": 9.933586900779349e-06, + "loss": 3.1042, + "step": 263700 + }, + { + "epoch": 0.2598358330115145, + "grad_norm": 2.2014262676239014, + "learning_rate": 9.933561759594194e-06, + "loss": 3.0498, + "step": 263750 + }, + { + "epoch": 0.25988509098933665, + "grad_norm": 2.3283870220184326, + "learning_rate": 9.933536613683072e-06, + "loss": 3.0603, + "step": 263800 + }, + { + "epoch": 0.2599343489671587, + "grad_norm": 2.323721408843994, + "learning_rate": 9.933511463046011e-06, + "loss": 3.1603, + "step": 263850 + }, + { + "epoch": 0.2599836069449808, + "grad_norm": 2.5439741611480713, + "learning_rate": 9.933486307683032e-06, + "loss": 3.056, + "step": 263900 + }, + { + "epoch": 0.2600328649228029, + "grad_norm": 2.3531341552734375, + "learning_rate": 9.933461147594157e-06, + "loss": 3.1219, + "step": 263950 + }, + { + "epoch": 0.260082122900625, + "grad_norm": 2.582170248031616, + "learning_rate": 9.933435982779414e-06, + "loss": 3.0702, + "step": 
264000 + }, + { + "epoch": 0.26013138087844706, + "grad_norm": 2.203338861465454, + "learning_rate": 9.933410813238826e-06, + "loss": 3.1252, + "step": 264050 + }, + { + "epoch": 0.2601806388562692, + "grad_norm": 2.2364983558654785, + "learning_rate": 9.933385638972415e-06, + "loss": 3.0388, + "step": 264100 + }, + { + "epoch": 0.26022989683409126, + "grad_norm": 2.5165584087371826, + "learning_rate": 9.933360459980208e-06, + "loss": 3.0387, + "step": 264150 + }, + { + "epoch": 0.2602791548119133, + "grad_norm": 2.4020442962646484, + "learning_rate": 9.933335276262228e-06, + "loss": 3.0208, + "step": 264200 + }, + { + "epoch": 0.26032841278973545, + "grad_norm": 2.365980625152588, + "learning_rate": 9.9333100878185e-06, + "loss": 3.0542, + "step": 264250 + }, + { + "epoch": 0.2603776707675575, + "grad_norm": 2.275216579437256, + "learning_rate": 9.933284894649046e-06, + "loss": 3.0926, + "step": 264300 + }, + { + "epoch": 0.2604269287453796, + "grad_norm": 2.225046396255493, + "learning_rate": 9.933259696753891e-06, + "loss": 3.167, + "step": 264350 + }, + { + "epoch": 0.26047618672320166, + "grad_norm": 2.237882375717163, + "learning_rate": 9.933234494133061e-06, + "loss": 3.1036, + "step": 264400 + }, + { + "epoch": 0.2605254447010238, + "grad_norm": 2.2599258422851562, + "learning_rate": 9.933209286786579e-06, + "loss": 3.0438, + "step": 264450 + }, + { + "epoch": 0.26057470267884586, + "grad_norm": 2.489572525024414, + "learning_rate": 9.933184074714469e-06, + "loss": 3.0697, + "step": 264500 + }, + { + "epoch": 0.26062396065666793, + "grad_norm": 2.3389358520507812, + "learning_rate": 9.933158857916753e-06, + "loss": 3.0055, + "step": 264550 + }, + { + "epoch": 0.26067321863449006, + "grad_norm": 2.131216526031494, + "learning_rate": 9.933133636393456e-06, + "loss": 3.0557, + "step": 264600 + }, + { + "epoch": 0.2607224766123121, + "grad_norm": 2.314331293106079, + "learning_rate": 9.933108410144607e-06, + "loss": 3.0307, + "step": 264650 + }, + { + "epoch": 
0.2607717345901342, + "grad_norm": 2.502854347229004, + "learning_rate": 9.933083179170225e-06, + "loss": 3.0467, + "step": 264700 + }, + { + "epoch": 0.2608209925679563, + "grad_norm": 2.3536572456359863, + "learning_rate": 9.933057943470334e-06, + "loss": 3.0191, + "step": 264750 + }, + { + "epoch": 0.2608702505457784, + "grad_norm": 2.1705939769744873, + "learning_rate": 9.933032703044963e-06, + "loss": 3.0696, + "step": 264800 + }, + { + "epoch": 0.26091950852360046, + "grad_norm": 2.4820799827575684, + "learning_rate": 9.933007457894131e-06, + "loss": 3.1286, + "step": 264850 + }, + { + "epoch": 0.2609687665014226, + "grad_norm": 2.2030234336853027, + "learning_rate": 9.932982208017863e-06, + "loss": 3.0509, + "step": 264900 + }, + { + "epoch": 0.26101802447924466, + "grad_norm": 2.47273588180542, + "learning_rate": 9.932956953416185e-06, + "loss": 3.1139, + "step": 264950 + }, + { + "epoch": 0.26106728245706673, + "grad_norm": 2.392012596130371, + "learning_rate": 9.93293169408912e-06, + "loss": 3.0688, + "step": 265000 + }, + { + "epoch": 0.26111654043488886, + "grad_norm": 2.3819375038146973, + "learning_rate": 9.932906430036695e-06, + "loss": 3.0052, + "step": 265050 + }, + { + "epoch": 0.2611657984127109, + "grad_norm": 2.3022165298461914, + "learning_rate": 9.932881161258931e-06, + "loss": 3.0962, + "step": 265100 + }, + { + "epoch": 0.261215056390533, + "grad_norm": 2.3350555896759033, + "learning_rate": 9.932855887755852e-06, + "loss": 3.0902, + "step": 265150 + }, + { + "epoch": 0.2612643143683551, + "grad_norm": 2.639866352081299, + "learning_rate": 9.932830609527483e-06, + "loss": 3.0366, + "step": 265200 + }, + { + "epoch": 0.2613135723461772, + "grad_norm": 2.1750435829162598, + "learning_rate": 9.93280532657385e-06, + "loss": 3.0234, + "step": 265250 + }, + { + "epoch": 0.26136283032399926, + "grad_norm": 2.2043814659118652, + "learning_rate": 9.932780038894977e-06, + "loss": 3.0747, + "step": 265300 + }, + { + "epoch": 0.2614120883018214, + 
"grad_norm": 2.2014036178588867, + "learning_rate": 9.932754746490885e-06, + "loss": 3.1524, + "step": 265350 + }, + { + "epoch": 0.26146134627964346, + "grad_norm": 2.2648870944976807, + "learning_rate": 9.9327294493616e-06, + "loss": 3.0915, + "step": 265400 + }, + { + "epoch": 0.26151060425746553, + "grad_norm": 2.376891613006592, + "learning_rate": 9.932704147507149e-06, + "loss": 3.0386, + "step": 265450 + }, + { + "epoch": 0.26155986223528765, + "grad_norm": 2.5101590156555176, + "learning_rate": 9.93267884092755e-06, + "loss": 3.0351, + "step": 265500 + }, + { + "epoch": 0.2616091202131097, + "grad_norm": 2.0977275371551514, + "learning_rate": 9.932653529622833e-06, + "loss": 3.087, + "step": 265550 + }, + { + "epoch": 0.2616583781909318, + "grad_norm": 2.2630743980407715, + "learning_rate": 9.93262821359302e-06, + "loss": 3.0467, + "step": 265600 + }, + { + "epoch": 0.26170763616875387, + "grad_norm": 2.4610393047332764, + "learning_rate": 9.932602892838135e-06, + "loss": 3.0657, + "step": 265650 + }, + { + "epoch": 0.261756894146576, + "grad_norm": 2.184044599533081, + "learning_rate": 9.932577567358203e-06, + "loss": 3.1085, + "step": 265700 + }, + { + "epoch": 0.26180615212439806, + "grad_norm": 2.263251304626465, + "learning_rate": 9.932552237153248e-06, + "loss": 3.0666, + "step": 265750 + }, + { + "epoch": 0.26185541010222013, + "grad_norm": 4.362361907958984, + "learning_rate": 9.932526902223295e-06, + "loss": 3.1286, + "step": 265800 + }, + { + "epoch": 0.26190466808004226, + "grad_norm": 2.264329433441162, + "learning_rate": 9.932501562568366e-06, + "loss": 3.1373, + "step": 265850 + }, + { + "epoch": 0.26195392605786433, + "grad_norm": 2.3149986267089844, + "learning_rate": 9.932476218188487e-06, + "loss": 3.0524, + "step": 265900 + }, + { + "epoch": 0.2620031840356864, + "grad_norm": 2.3355472087860107, + "learning_rate": 9.932450869083683e-06, + "loss": 3.0105, + "step": 265950 + }, + { + "epoch": 0.2620524420135085, + "grad_norm": 
2.240657329559326, + "learning_rate": 9.932425515253976e-06, + "loss": 3.0171, + "step": 266000 + }, + { + "epoch": 0.2621016999913306, + "grad_norm": 2.3302268981933594, + "learning_rate": 9.932400156699393e-06, + "loss": 3.0928, + "step": 266050 + }, + { + "epoch": 0.26215095796915266, + "grad_norm": 2.484650135040283, + "learning_rate": 9.932374793419957e-06, + "loss": 3.1762, + "step": 266100 + }, + { + "epoch": 0.2622002159469748, + "grad_norm": 2.251443386077881, + "learning_rate": 9.93234942541569e-06, + "loss": 3.0228, + "step": 266150 + }, + { + "epoch": 0.26224947392479686, + "grad_norm": 2.3148317337036133, + "learning_rate": 9.932324052686619e-06, + "loss": 3.0373, + "step": 266200 + }, + { + "epoch": 0.26229873190261893, + "grad_norm": 2.5024971961975098, + "learning_rate": 9.93229867523277e-06, + "loss": 3.0128, + "step": 266250 + }, + { + "epoch": 0.26234798988044106, + "grad_norm": 2.3264265060424805, + "learning_rate": 9.932273293054163e-06, + "loss": 3.0972, + "step": 266300 + }, + { + "epoch": 0.2623972478582631, + "grad_norm": 2.3888535499572754, + "learning_rate": 9.932247906150826e-06, + "loss": 3.1076, + "step": 266350 + }, + { + "epoch": 0.2624465058360852, + "grad_norm": 2.172363519668579, + "learning_rate": 9.93222251452278e-06, + "loss": 3.058, + "step": 266400 + }, + { + "epoch": 0.2624957638139073, + "grad_norm": 2.3558425903320312, + "learning_rate": 9.932197118170052e-06, + "loss": 3.0377, + "step": 266450 + }, + { + "epoch": 0.2625450217917294, + "grad_norm": 2.2067978382110596, + "learning_rate": 9.932171717092666e-06, + "loss": 3.0166, + "step": 266500 + }, + { + "epoch": 0.26259427976955146, + "grad_norm": 2.311547040939331, + "learning_rate": 9.932146311290644e-06, + "loss": 3.055, + "step": 266550 + }, + { + "epoch": 0.2626435377473736, + "grad_norm": 2.256028890609741, + "learning_rate": 9.932120900764014e-06, + "loss": 3.1205, + "step": 266600 + }, + { + "epoch": 0.26269279572519566, + "grad_norm": 2.339846611022949, + 
"learning_rate": 9.932095485512798e-06, + "loss": 3.0983, + "step": 266650 + }, + { + "epoch": 0.26274205370301773, + "grad_norm": 2.1908113956451416, + "learning_rate": 9.93207006553702e-06, + "loss": 3.1126, + "step": 266700 + }, + { + "epoch": 0.26279131168083986, + "grad_norm": 2.4471638202667236, + "learning_rate": 9.932044640836704e-06, + "loss": 3.0624, + "step": 266750 + }, + { + "epoch": 0.2628405696586619, + "grad_norm": 2.3092570304870605, + "learning_rate": 9.93201921141188e-06, + "loss": 3.1238, + "step": 266800 + }, + { + "epoch": 0.262889827636484, + "grad_norm": 2.2091095447540283, + "learning_rate": 9.931993777262562e-06, + "loss": 3.0867, + "step": 266850 + }, + { + "epoch": 0.26293908561430607, + "grad_norm": 2.350799560546875, + "learning_rate": 9.931968338388783e-06, + "loss": 3.0603, + "step": 266900 + }, + { + "epoch": 0.2629883435921282, + "grad_norm": 2.4707634449005127, + "learning_rate": 9.931942894790564e-06, + "loss": 3.0413, + "step": 266950 + }, + { + "epoch": 0.26303760156995026, + "grad_norm": 2.3779118061065674, + "learning_rate": 9.931917446467931e-06, + "loss": 3.0396, + "step": 267000 + }, + { + "epoch": 0.26308685954777233, + "grad_norm": 2.4660568237304688, + "learning_rate": 9.931891993420906e-06, + "loss": 3.0844, + "step": 267050 + }, + { + "epoch": 0.26313611752559446, + "grad_norm": 2.157242774963379, + "learning_rate": 9.931866535649515e-06, + "loss": 3.0511, + "step": 267100 + }, + { + "epoch": 0.26318537550341653, + "grad_norm": 2.20387864112854, + "learning_rate": 9.931841073153783e-06, + "loss": 3.0741, + "step": 267150 + }, + { + "epoch": 0.2632346334812386, + "grad_norm": 2.4419519901275635, + "learning_rate": 9.931815605933733e-06, + "loss": 3.088, + "step": 267200 + }, + { + "epoch": 0.2632838914590607, + "grad_norm": 2.4699862003326416, + "learning_rate": 9.931790133989389e-06, + "loss": 3.0822, + "step": 267250 + }, + { + "epoch": 0.2633331494368828, + "grad_norm": 2.392817258834839, + "learning_rate": 
9.931764657320778e-06, + "loss": 3.0149, + "step": 267300 + }, + { + "epoch": 0.26338240741470487, + "grad_norm": 2.3471341133117676, + "learning_rate": 9.93173917592792e-06, + "loss": 3.1146, + "step": 267350 + }, + { + "epoch": 0.263431665392527, + "grad_norm": 2.19142484664917, + "learning_rate": 9.931713689810843e-06, + "loss": 3.0603, + "step": 267400 + }, + { + "epoch": 0.26348092337034906, + "grad_norm": 2.2037861347198486, + "learning_rate": 9.931688198969571e-06, + "loss": 3.0948, + "step": 267450 + }, + { + "epoch": 0.26353018134817113, + "grad_norm": 2.2188687324523926, + "learning_rate": 9.931662703404128e-06, + "loss": 3.0214, + "step": 267500 + }, + { + "epoch": 0.26357943932599326, + "grad_norm": 2.434837579727173, + "learning_rate": 9.931637203114538e-06, + "loss": 2.9666, + "step": 267550 + }, + { + "epoch": 0.26362869730381533, + "grad_norm": 2.5141408443450928, + "learning_rate": 9.931611698100826e-06, + "loss": 3.0448, + "step": 267600 + }, + { + "epoch": 0.2636779552816374, + "grad_norm": 2.3356690406799316, + "learning_rate": 9.931586188363016e-06, + "loss": 3.0299, + "step": 267650 + }, + { + "epoch": 0.2637272132594595, + "grad_norm": 2.327857732772827, + "learning_rate": 9.931560673901133e-06, + "loss": 2.9855, + "step": 267700 + }, + { + "epoch": 0.2637764712372816, + "grad_norm": 2.3119494915008545, + "learning_rate": 9.9315351547152e-06, + "loss": 3.0294, + "step": 267750 + }, + { + "epoch": 0.26382572921510367, + "grad_norm": 2.3527722358703613, + "learning_rate": 9.931509630805244e-06, + "loss": 3.0841, + "step": 267800 + }, + { + "epoch": 0.2638749871929258, + "grad_norm": 2.3760812282562256, + "learning_rate": 9.931484102171287e-06, + "loss": 3.1242, + "step": 267850 + }, + { + "epoch": 0.26392424517074786, + "grad_norm": 2.3739471435546875, + "learning_rate": 9.931458568813354e-06, + "loss": 3.1044, + "step": 267900 + }, + { + "epoch": 0.26397350314856993, + "grad_norm": 2.1638565063476562, + "learning_rate": 9.93143303073147e-06, + 
"loss": 2.9749, + "step": 267950 + }, + { + "epoch": 0.26402276112639206, + "grad_norm": 2.283241033554077, + "learning_rate": 9.93140748792566e-06, + "loss": 3.1227, + "step": 268000 + }, + { + "epoch": 0.26407201910421413, + "grad_norm": 2.333972692489624, + "learning_rate": 9.931381940395949e-06, + "loss": 3.1183, + "step": 268050 + }, + { + "epoch": 0.2641212770820362, + "grad_norm": 2.2984774112701416, + "learning_rate": 9.931356388142358e-06, + "loss": 3.0598, + "step": 268100 + }, + { + "epoch": 0.26417053505985827, + "grad_norm": 2.1914758682250977, + "learning_rate": 9.931330831164913e-06, + "loss": 3.0551, + "step": 268150 + }, + { + "epoch": 0.2642197930376804, + "grad_norm": 2.3171284198760986, + "learning_rate": 9.931305269463642e-06, + "loss": 3.088, + "step": 268200 + }, + { + "epoch": 0.26426905101550247, + "grad_norm": 2.267775297164917, + "learning_rate": 9.931279703038566e-06, + "loss": 3.037, + "step": 268250 + }, + { + "epoch": 0.26431830899332454, + "grad_norm": 2.392009973526001, + "learning_rate": 9.93125413188971e-06, + "loss": 3.0467, + "step": 268300 + }, + { + "epoch": 0.26436756697114666, + "grad_norm": 2.2809956073760986, + "learning_rate": 9.931228556017098e-06, + "loss": 3.1192, + "step": 268350 + }, + { + "epoch": 0.26441682494896873, + "grad_norm": 2.3661491870880127, + "learning_rate": 9.931202975420754e-06, + "loss": 3.0382, + "step": 268400 + }, + { + "epoch": 0.2644660829267908, + "grad_norm": 2.132511615753174, + "learning_rate": 9.931177390100707e-06, + "loss": 3.0897, + "step": 268450 + }, + { + "epoch": 0.26451534090461293, + "grad_norm": 2.3445851802825928, + "learning_rate": 9.931151800056974e-06, + "loss": 3.0581, + "step": 268500 + }, + { + "epoch": 0.264564598882435, + "grad_norm": 2.1925172805786133, + "learning_rate": 9.931126205289587e-06, + "loss": 3.1101, + "step": 268550 + }, + { + "epoch": 0.26461385686025707, + "grad_norm": 2.3380303382873535, + "learning_rate": 9.931100605798567e-06, + "loss": 3.1113, + 
"step": 268600 + }, + { + "epoch": 0.2646631148380792, + "grad_norm": 2.3160200119018555, + "learning_rate": 9.931075001583939e-06, + "loss": 3.0643, + "step": 268650 + }, + { + "epoch": 0.26471237281590126, + "grad_norm": 2.8013155460357666, + "learning_rate": 9.931049392645725e-06, + "loss": 3.0196, + "step": 268700 + }, + { + "epoch": 0.26476163079372333, + "grad_norm": 2.285468101501465, + "learning_rate": 9.931023778983953e-06, + "loss": 3.0136, + "step": 268750 + }, + { + "epoch": 0.26481088877154546, + "grad_norm": 2.200941801071167, + "learning_rate": 9.930998160598648e-06, + "loss": 3.0095, + "step": 268800 + }, + { + "epoch": 0.26486014674936753, + "grad_norm": 2.4813740253448486, + "learning_rate": 9.93097253748983e-06, + "loss": 3.0315, + "step": 268850 + }, + { + "epoch": 0.2649094047271896, + "grad_norm": 2.2768115997314453, + "learning_rate": 9.930946909657528e-06, + "loss": 2.9886, + "step": 268900 + }, + { + "epoch": 0.2649586627050117, + "grad_norm": 2.332987070083618, + "learning_rate": 9.930921277101764e-06, + "loss": 3.0596, + "step": 268950 + }, + { + "epoch": 0.2650079206828338, + "grad_norm": 2.2726166248321533, + "learning_rate": 9.930895639822566e-06, + "loss": 3.1239, + "step": 269000 + }, + { + "epoch": 0.26505717866065587, + "grad_norm": 2.167916774749756, + "learning_rate": 9.930869997819954e-06, + "loss": 2.9781, + "step": 269050 + }, + { + "epoch": 0.265106436638478, + "grad_norm": 2.412663698196411, + "learning_rate": 9.930844351093956e-06, + "loss": 3.0431, + "step": 269100 + }, + { + "epoch": 0.26515569461630006, + "grad_norm": 2.2605154514312744, + "learning_rate": 9.930818699644594e-06, + "loss": 3.0331, + "step": 269150 + }, + { + "epoch": 0.26520495259412213, + "grad_norm": 2.4331347942352295, + "learning_rate": 9.930793043471893e-06, + "loss": 3.0291, + "step": 269200 + }, + { + "epoch": 0.2652542105719442, + "grad_norm": 2.2273125648498535, + "learning_rate": 9.930767382575881e-06, + "loss": 3.0805, + "step": 269250 + }, + { 
+ "epoch": 0.26530346854976633, + "grad_norm": 2.470411539077759, + "learning_rate": 9.930741716956578e-06, + "loss": 3.0566, + "step": 269300 + }, + { + "epoch": 0.2653527265275884, + "grad_norm": 2.1139142513275146, + "learning_rate": 9.93071604661401e-06, + "loss": 3.0558, + "step": 269350 + }, + { + "epoch": 0.26540198450541047, + "grad_norm": 2.672767162322998, + "learning_rate": 9.930690371548204e-06, + "loss": 3.1067, + "step": 269400 + }, + { + "epoch": 0.2654512424832326, + "grad_norm": 2.3223862648010254, + "learning_rate": 9.930664691759182e-06, + "loss": 3.0552, + "step": 269450 + }, + { + "epoch": 0.26550050046105467, + "grad_norm": 2.2276573181152344, + "learning_rate": 9.93063900724697e-06, + "loss": 3.1189, + "step": 269500 + }, + { + "epoch": 0.26554975843887674, + "grad_norm": 2.4031879901885986, + "learning_rate": 9.93061331801159e-06, + "loss": 3.0826, + "step": 269550 + }, + { + "epoch": 0.26559901641669886, + "grad_norm": 2.3633084297180176, + "learning_rate": 9.93058762405307e-06, + "loss": 3.0899, + "step": 269600 + }, + { + "epoch": 0.26564827439452093, + "grad_norm": 2.3614721298217773, + "learning_rate": 9.930561925371434e-06, + "loss": 3.0562, + "step": 269650 + }, + { + "epoch": 0.265697532372343, + "grad_norm": 2.260119915008545, + "learning_rate": 9.930536221966704e-06, + "loss": 3.064, + "step": 269700 + }, + { + "epoch": 0.26574679035016513, + "grad_norm": 2.4764583110809326, + "learning_rate": 9.930510513838907e-06, + "loss": 3.0648, + "step": 269750 + }, + { + "epoch": 0.2657960483279872, + "grad_norm": 2.397484064102173, + "learning_rate": 9.930484800988067e-06, + "loss": 3.1114, + "step": 269800 + }, + { + "epoch": 0.26584530630580927, + "grad_norm": 2.2097394466400146, + "learning_rate": 9.93045908341421e-06, + "loss": 3.0295, + "step": 269850 + }, + { + "epoch": 0.2658945642836314, + "grad_norm": 2.3395190238952637, + "learning_rate": 9.930433361117358e-06, + "loss": 3.0029, + "step": 269900 + }, + { + "epoch": 
0.26594382226145347, + "grad_norm": 2.3066799640655518, + "learning_rate": 9.930407634097536e-06, + "loss": 3.0202, + "step": 269950 + }, + { + "epoch": 0.26599308023927554, + "grad_norm": 2.244288444519043, + "learning_rate": 9.930381902354772e-06, + "loss": 3.0024, + "step": 270000 + }, + { + "epoch": 0.26604233821709766, + "grad_norm": 2.343271493911743, + "learning_rate": 9.930356165889086e-06, + "loss": 3.0317, + "step": 270050 + }, + { + "epoch": 0.26609159619491973, + "grad_norm": 2.3791308403015137, + "learning_rate": 9.930330424700505e-06, + "loss": 3.1241, + "step": 270100 + }, + { + "epoch": 0.2661408541727418, + "grad_norm": 2.274815797805786, + "learning_rate": 9.930304678789053e-06, + "loss": 3.0702, + "step": 270150 + }, + { + "epoch": 0.26619011215056393, + "grad_norm": 2.459787130355835, + "learning_rate": 9.930278928154757e-06, + "loss": 3.0791, + "step": 270200 + }, + { + "epoch": 0.266239370128386, + "grad_norm": 2.2481462955474854, + "learning_rate": 9.930253172797639e-06, + "loss": 3.1322, + "step": 270250 + }, + { + "epoch": 0.26628862810620807, + "grad_norm": 2.261033058166504, + "learning_rate": 9.930227412717726e-06, + "loss": 3.027, + "step": 270300 + }, + { + "epoch": 0.2663378860840302, + "grad_norm": 2.3554444313049316, + "learning_rate": 9.930201647915039e-06, + "loss": 3.1461, + "step": 270350 + }, + { + "epoch": 0.26638714406185227, + "grad_norm": 2.144186019897461, + "learning_rate": 9.930175878389606e-06, + "loss": 3.0161, + "step": 270400 + }, + { + "epoch": 0.26643640203967434, + "grad_norm": 2.2613089084625244, + "learning_rate": 9.930150104141449e-06, + "loss": 2.9733, + "step": 270450 + }, + { + "epoch": 0.2664856600174964, + "grad_norm": 2.3649911880493164, + "learning_rate": 9.930124325170598e-06, + "loss": 3.0628, + "step": 270500 + }, + { + "epoch": 0.26653491799531853, + "grad_norm": 2.3926491737365723, + "learning_rate": 9.93009854147707e-06, + "loss": 3.0291, + "step": 270550 + }, + { + "epoch": 0.2665841759731406, + 
"grad_norm": 2.2569563388824463, + "learning_rate": 9.930072753060895e-06, + "loss": 3.0413, + "step": 270600 + }, + { + "epoch": 0.2666334339509627, + "grad_norm": 2.189606189727783, + "learning_rate": 9.930046959922097e-06, + "loss": 3.0346, + "step": 270650 + }, + { + "epoch": 0.2666826919287848, + "grad_norm": 2.42091703414917, + "learning_rate": 9.9300211620607e-06, + "loss": 3.0232, + "step": 270700 + }, + { + "epoch": 0.26673194990660687, + "grad_norm": 2.3920516967773438, + "learning_rate": 9.929995359476728e-06, + "loss": 3.1093, + "step": 270750 + }, + { + "epoch": 0.26678120788442894, + "grad_norm": 2.4465250968933105, + "learning_rate": 9.929969552170207e-06, + "loss": 3.1086, + "step": 270800 + }, + { + "epoch": 0.26683046586225107, + "grad_norm": 2.2704055309295654, + "learning_rate": 9.929943740141161e-06, + "loss": 2.9978, + "step": 270850 + }, + { + "epoch": 0.26687972384007314, + "grad_norm": 2.15385103225708, + "learning_rate": 9.929917923389616e-06, + "loss": 3.0402, + "step": 270900 + }, + { + "epoch": 0.2669289818178952, + "grad_norm": 2.4136264324188232, + "learning_rate": 9.929892101915595e-06, + "loss": 3.0892, + "step": 270950 + }, + { + "epoch": 0.26697823979571733, + "grad_norm": 2.185389280319214, + "learning_rate": 9.929866275719123e-06, + "loss": 3.0925, + "step": 271000 + }, + { + "epoch": 0.2670274977735394, + "grad_norm": 2.221203088760376, + "learning_rate": 9.929840444800225e-06, + "loss": 3.0896, + "step": 271050 + }, + { + "epoch": 0.26707675575136147, + "grad_norm": 2.408501386642456, + "learning_rate": 9.929814609158927e-06, + "loss": 3.0498, + "step": 271100 + }, + { + "epoch": 0.2671260137291836, + "grad_norm": 2.5423808097839355, + "learning_rate": 9.929788768795251e-06, + "loss": 3.0229, + "step": 271150 + }, + { + "epoch": 0.26717527170700567, + "grad_norm": 2.1282596588134766, + "learning_rate": 9.929762923709225e-06, + "loss": 3.0489, + "step": 271200 + }, + { + "epoch": 0.26722452968482774, + "grad_norm": 
2.2854952812194824, + "learning_rate": 9.929737073900872e-06, + "loss": 3.0191, + "step": 271250 + }, + { + "epoch": 0.26727378766264986, + "grad_norm": 2.2662932872772217, + "learning_rate": 9.929711219370215e-06, + "loss": 3.0382, + "step": 271300 + }, + { + "epoch": 0.26732304564047193, + "grad_norm": 2.3308775424957275, + "learning_rate": 9.929685360117282e-06, + "loss": 3.0798, + "step": 271350 + }, + { + "epoch": 0.267372303618294, + "grad_norm": 2.184906244277954, + "learning_rate": 9.929659496142097e-06, + "loss": 3.0082, + "step": 271400 + }, + { + "epoch": 0.26742156159611613, + "grad_norm": 2.2457683086395264, + "learning_rate": 9.929633627444683e-06, + "loss": 3.0844, + "step": 271450 + }, + { + "epoch": 0.2674708195739382, + "grad_norm": 2.2955844402313232, + "learning_rate": 9.929607754025069e-06, + "loss": 3.1065, + "step": 271500 + }, + { + "epoch": 0.26752007755176027, + "grad_norm": 2.494981050491333, + "learning_rate": 9.929581875883273e-06, + "loss": 3.0446, + "step": 271550 + }, + { + "epoch": 0.2675693355295824, + "grad_norm": 2.127133846282959, + "learning_rate": 9.929555993019324e-06, + "loss": 3.0748, + "step": 271600 + }, + { + "epoch": 0.26761859350740447, + "grad_norm": 2.496340036392212, + "learning_rate": 9.929530105433248e-06, + "loss": 3.0314, + "step": 271650 + }, + { + "epoch": 0.26766785148522654, + "grad_norm": 2.3220291137695312, + "learning_rate": 9.929504213125068e-06, + "loss": 3.0969, + "step": 271700 + }, + { + "epoch": 0.2677171094630486, + "grad_norm": 2.497211217880249, + "learning_rate": 9.92947831609481e-06, + "loss": 3.0178, + "step": 271750 + }, + { + "epoch": 0.26776636744087073, + "grad_norm": 2.320970296859741, + "learning_rate": 9.929452414342496e-06, + "loss": 3.0464, + "step": 271800 + }, + { + "epoch": 0.2678156254186928, + "grad_norm": 2.276566505432129, + "learning_rate": 9.929426507868151e-06, + "loss": 3.0418, + "step": 271850 + }, + { + "epoch": 0.2678648833965149, + "grad_norm": 2.424804210662842, + 
"learning_rate": 9.929400596671804e-06, + "loss": 3.1204, + "step": 271900 + }, + { + "epoch": 0.267914141374337, + "grad_norm": 2.230098009109497, + "learning_rate": 9.929374680753477e-06, + "loss": 3.129, + "step": 271950 + }, + { + "epoch": 0.26796339935215907, + "grad_norm": 2.2621920108795166, + "learning_rate": 9.929348760113196e-06, + "loss": 3.0144, + "step": 272000 + }, + { + "epoch": 0.26801265732998114, + "grad_norm": 2.318976402282715, + "learning_rate": 9.929322834750983e-06, + "loss": 3.1055, + "step": 272050 + }, + { + "epoch": 0.26806191530780327, + "grad_norm": 2.242457866668701, + "learning_rate": 9.929296904666865e-06, + "loss": 3.0869, + "step": 272100 + }, + { + "epoch": 0.26811117328562534, + "grad_norm": 2.2995502948760986, + "learning_rate": 9.929270969860868e-06, + "loss": 3.0195, + "step": 272150 + }, + { + "epoch": 0.2681604312634474, + "grad_norm": 2.282332181930542, + "learning_rate": 9.929245030333012e-06, + "loss": 3.063, + "step": 272200 + }, + { + "epoch": 0.26820968924126953, + "grad_norm": 2.1786437034606934, + "learning_rate": 9.929219086083328e-06, + "loss": 3.0731, + "step": 272250 + }, + { + "epoch": 0.2682589472190916, + "grad_norm": 2.2779195308685303, + "learning_rate": 9.92919313711184e-06, + "loss": 3.0209, + "step": 272300 + }, + { + "epoch": 0.2683082051969137, + "grad_norm": 2.3118577003479004, + "learning_rate": 9.929167183418567e-06, + "loss": 3.0956, + "step": 272350 + }, + { + "epoch": 0.2683574631747358, + "grad_norm": 2.2368698120117188, + "learning_rate": 9.929141225003538e-06, + "loss": 3.0833, + "step": 272400 + }, + { + "epoch": 0.26840672115255787, + "grad_norm": 2.3733928203582764, + "learning_rate": 9.92911526186678e-06, + "loss": 2.9921, + "step": 272450 + }, + { + "epoch": 0.26845597913037994, + "grad_norm": 2.183927297592163, + "learning_rate": 9.929089294008315e-06, + "loss": 3.0316, + "step": 272500 + }, + { + "epoch": 0.26850523710820207, + "grad_norm": 2.3431320190429688, + "learning_rate": 
9.929063321428168e-06, + "loss": 3.0807, + "step": 272550 + }, + { + "epoch": 0.26855449508602414, + "grad_norm": 2.522692918777466, + "learning_rate": 9.929037344126365e-06, + "loss": 3.0794, + "step": 272600 + }, + { + "epoch": 0.2686037530638462, + "grad_norm": 2.2585055828094482, + "learning_rate": 9.929011362102929e-06, + "loss": 3.1138, + "step": 272650 + }, + { + "epoch": 0.26865301104166833, + "grad_norm": 2.2940473556518555, + "learning_rate": 9.928985375357887e-06, + "loss": 3.0833, + "step": 272700 + }, + { + "epoch": 0.2687022690194904, + "grad_norm": 2.331890821456909, + "learning_rate": 9.928959383891263e-06, + "loss": 3.0485, + "step": 272750 + }, + { + "epoch": 0.2687515269973125, + "grad_norm": 2.3480982780456543, + "learning_rate": 9.928933387703081e-06, + "loss": 3.035, + "step": 272800 + }, + { + "epoch": 0.2688007849751346, + "grad_norm": 2.298905611038208, + "learning_rate": 9.928907386793367e-06, + "loss": 3.0126, + "step": 272850 + }, + { + "epoch": 0.26885004295295667, + "grad_norm": 2.1908907890319824, + "learning_rate": 9.928881381162147e-06, + "loss": 3.0541, + "step": 272900 + }, + { + "epoch": 0.26889930093077874, + "grad_norm": 2.188222885131836, + "learning_rate": 9.928855370809443e-06, + "loss": 3.119, + "step": 272950 + }, + { + "epoch": 0.2689485589086008, + "grad_norm": 2.541836738586426, + "learning_rate": 9.928829355735283e-06, + "loss": 3.0236, + "step": 273000 + }, + { + "epoch": 0.26899781688642294, + "grad_norm": 2.7478833198547363, + "learning_rate": 9.92880333593969e-06, + "loss": 3.1098, + "step": 273050 + }, + { + "epoch": 0.269047074864245, + "grad_norm": 2.185455560684204, + "learning_rate": 9.928777311422689e-06, + "loss": 3.0006, + "step": 273100 + }, + { + "epoch": 0.2690963328420671, + "grad_norm": 2.260162830352783, + "learning_rate": 9.928751282184305e-06, + "loss": 3.0833, + "step": 273150 + }, + { + "epoch": 0.2691455908198892, + "grad_norm": 2.1929636001586914, + "learning_rate": 9.928725248224562e-06, + 
"loss": 3.0352, + "step": 273200 + }, + { + "epoch": 0.2691948487977113, + "grad_norm": 2.374878168106079, + "learning_rate": 9.928699209543489e-06, + "loss": 3.0867, + "step": 273250 + }, + { + "epoch": 0.26924410677553334, + "grad_norm": 2.516448497772217, + "learning_rate": 9.928673166141106e-06, + "loss": 2.986, + "step": 273300 + }, + { + "epoch": 0.26929336475335547, + "grad_norm": 2.273820161819458, + "learning_rate": 9.92864711801744e-06, + "loss": 3.0314, + "step": 273350 + }, + { + "epoch": 0.26934262273117754, + "grad_norm": 2.3430898189544678, + "learning_rate": 9.928621065172518e-06, + "loss": 3.1224, + "step": 273400 + }, + { + "epoch": 0.2693918807089996, + "grad_norm": 2.2021443843841553, + "learning_rate": 9.92859500760636e-06, + "loss": 3.0769, + "step": 273450 + }, + { + "epoch": 0.26944113868682174, + "grad_norm": 2.352940320968628, + "learning_rate": 9.928568945318997e-06, + "loss": 3.0709, + "step": 273500 + }, + { + "epoch": 0.2694903966646438, + "grad_norm": 2.5808804035186768, + "learning_rate": 9.92854287831045e-06, + "loss": 2.9843, + "step": 273550 + }, + { + "epoch": 0.2695396546424659, + "grad_norm": 2.486743688583374, + "learning_rate": 9.928516806580745e-06, + "loss": 3.0733, + "step": 273600 + }, + { + "epoch": 0.269588912620288, + "grad_norm": 2.575140953063965, + "learning_rate": 9.928490730129904e-06, + "loss": 3.0852, + "step": 273650 + }, + { + "epoch": 0.26963817059811007, + "grad_norm": 2.3352348804473877, + "learning_rate": 9.928464648957958e-06, + "loss": 3.0906, + "step": 273700 + }, + { + "epoch": 0.26968742857593214, + "grad_norm": 2.220576524734497, + "learning_rate": 9.928438563064928e-06, + "loss": 3.0577, + "step": 273750 + }, + { + "epoch": 0.26973668655375427, + "grad_norm": 2.4445624351501465, + "learning_rate": 9.928412472450842e-06, + "loss": 3.0408, + "step": 273800 + }, + { + "epoch": 0.26978594453157634, + "grad_norm": 2.130986213684082, + "learning_rate": 9.928386377115719e-06, + "loss": 3.0116, + "step": 
273850 + }, + { + "epoch": 0.2698352025093984, + "grad_norm": 2.1336708068847656, + "learning_rate": 9.92836027705959e-06, + "loss": 3.1137, + "step": 273900 + }, + { + "epoch": 0.26988446048722053, + "grad_norm": 2.2219667434692383, + "learning_rate": 9.928334172282477e-06, + "loss": 3.1239, + "step": 273950 + }, + { + "epoch": 0.2699337184650426, + "grad_norm": 2.501338005065918, + "learning_rate": 9.928308062784406e-06, + "loss": 3.0486, + "step": 274000 + }, + { + "epoch": 0.2699829764428647, + "grad_norm": 2.1796624660491943, + "learning_rate": 9.928281948565402e-06, + "loss": 3.1381, + "step": 274050 + }, + { + "epoch": 0.2700322344206868, + "grad_norm": 2.40246844291687, + "learning_rate": 9.928255829625488e-06, + "loss": 3.0751, + "step": 274100 + }, + { + "epoch": 0.27008149239850887, + "grad_norm": 2.3338255882263184, + "learning_rate": 9.928229705964692e-06, + "loss": 3.0246, + "step": 274150 + }, + { + "epoch": 0.27013075037633094, + "grad_norm": 2.2065982818603516, + "learning_rate": 9.928203577583038e-06, + "loss": 3.0701, + "step": 274200 + }, + { + "epoch": 0.270180008354153, + "grad_norm": 2.411497116088867, + "learning_rate": 9.92817744448055e-06, + "loss": 3.1053, + "step": 274250 + }, + { + "epoch": 0.27022926633197514, + "grad_norm": 2.2643861770629883, + "learning_rate": 9.928151306657257e-06, + "loss": 3.0841, + "step": 274300 + }, + { + "epoch": 0.2702785243097972, + "grad_norm": 2.32452392578125, + "learning_rate": 9.999550306114007e-06, + "loss": 3.0754, + "step": 274350 + }, + { + "epoch": 0.2703277822876193, + "grad_norm": 2.2244744300842285, + "learning_rate": 9.999548118816721e-06, + "loss": 3.0503, + "step": 274400 + }, + { + "epoch": 0.2703770402654414, + "grad_norm": 2.102048635482788, + "learning_rate": 9.9995459262131e-06, + "loss": 3.0734, + "step": 274450 + }, + { + "epoch": 0.2704262982432635, + "grad_norm": 2.422478675842285, + "learning_rate": 9.999543728303145e-06, + "loss": 3.0853, + "step": 274500 + }, + { + "epoch": 
0.27047555622108554, + "grad_norm": 2.229930877685547, + "learning_rate": 9.999541525086861e-06, + "loss": 3.0648, + "step": 274550 + }, + { + "epoch": 0.27052481419890767, + "grad_norm": 2.4857280254364014, + "learning_rate": 9.999539316564249e-06, + "loss": 3.0386, + "step": 274600 + }, + { + "epoch": 0.27057407217672974, + "grad_norm": 2.320030689239502, + "learning_rate": 9.999537102735309e-06, + "loss": 3.0781, + "step": 274650 + }, + { + "epoch": 0.2706233301545518, + "grad_norm": 2.167858362197876, + "learning_rate": 9.999534883600047e-06, + "loss": 3.1322, + "step": 274700 + }, + { + "epoch": 0.27067258813237394, + "grad_norm": 2.2518534660339355, + "learning_rate": 9.999532659158462e-06, + "loss": 3.0216, + "step": 274750 + }, + { + "epoch": 0.270721846110196, + "grad_norm": 2.3899853229522705, + "learning_rate": 9.99953042941056e-06, + "loss": 3.0256, + "step": 274800 + }, + { + "epoch": 0.2707711040880181, + "grad_norm": 2.4551777839660645, + "learning_rate": 9.999528194356338e-06, + "loss": 3.0872, + "step": 274850 + }, + { + "epoch": 0.2708203620658402, + "grad_norm": 2.323352098464966, + "learning_rate": 9.999525953995805e-06, + "loss": 3.0886, + "step": 274900 + }, + { + "epoch": 0.2708696200436623, + "grad_norm": 2.2773430347442627, + "learning_rate": 9.999523708328959e-06, + "loss": 3.085, + "step": 274950 + }, + { + "epoch": 0.27091887802148434, + "grad_norm": 2.1326823234558105, + "learning_rate": 9.999521457355803e-06, + "loss": 3.1329, + "step": 275000 + }, + { + "epoch": 0.27096813599930647, + "grad_norm": 2.406972646713257, + "learning_rate": 9.99951920107634e-06, + "loss": 3.047, + "step": 275050 + }, + { + "epoch": 0.27101739397712854, + "grad_norm": 2.329613208770752, + "learning_rate": 9.999516939490571e-06, + "loss": 3.0738, + "step": 275100 + }, + { + "epoch": 0.2710666519549506, + "grad_norm": 2.3396027088165283, + "learning_rate": 9.999514672598502e-06, + "loss": 3.0553, + "step": 275150 + }, + { + "epoch": 0.27111590993277274, + 
"grad_norm": 2.7780160903930664, + "learning_rate": 9.999512400400132e-06, + "loss": 3.0506, + "step": 275200 + }, + { + "epoch": 0.2711651679105948, + "grad_norm": 2.3778438568115234, + "learning_rate": 9.999510122895465e-06, + "loss": 3.041, + "step": 275250 + }, + { + "epoch": 0.2712144258884169, + "grad_norm": 2.399900436401367, + "learning_rate": 9.999507840084504e-06, + "loss": 3.1217, + "step": 275300 + }, + { + "epoch": 0.271263683866239, + "grad_norm": 2.2647435665130615, + "learning_rate": 9.999505551967248e-06, + "loss": 3.0607, + "step": 275350 + }, + { + "epoch": 0.2713129418440611, + "grad_norm": 2.193024158477783, + "learning_rate": 9.999503258543701e-06, + "loss": 3.0361, + "step": 275400 + }, + { + "epoch": 0.27136219982188314, + "grad_norm": 2.6412267684936523, + "learning_rate": 9.99950095981387e-06, + "loss": 3.0924, + "step": 275450 + }, + { + "epoch": 0.2714114577997052, + "grad_norm": 2.4857845306396484, + "learning_rate": 9.99949865577775e-06, + "loss": 3.0801, + "step": 275500 + }, + { + "epoch": 0.27146071577752734, + "grad_norm": 2.2095165252685547, + "learning_rate": 9.99949634643535e-06, + "loss": 2.9726, + "step": 275550 + }, + { + "epoch": 0.2715099737553494, + "grad_norm": 2.298539638519287, + "learning_rate": 9.999494031786666e-06, + "loss": 3.0459, + "step": 275600 + }, + { + "epoch": 0.2715592317331715, + "grad_norm": 2.4887797832489014, + "learning_rate": 9.999491711831708e-06, + "loss": 3.1113, + "step": 275650 + }, + { + "epoch": 0.2716084897109936, + "grad_norm": 2.295506000518799, + "learning_rate": 9.999489386570473e-06, + "loss": 2.996, + "step": 275700 + }, + { + "epoch": 0.2716577476888157, + "grad_norm": 2.3383877277374268, + "learning_rate": 9.999487056002965e-06, + "loss": 3.0084, + "step": 275750 + }, + { + "epoch": 0.27170700566663775, + "grad_norm": 2.214759588241577, + "learning_rate": 9.999484720129185e-06, + "loss": 2.9972, + "step": 275800 + }, + { + "epoch": 0.2717562636444599, + "grad_norm": 2.309497117996216, 
+ "learning_rate": 9.999482378949136e-06, + "loss": 3.0523, + "step": 275850 + }, + { + "epoch": 0.27180552162228194, + "grad_norm": 2.2757134437561035, + "learning_rate": 9.999480032462826e-06, + "loss": 3.0658, + "step": 275900 + }, + { + "epoch": 0.271854779600104, + "grad_norm": 2.424730062484741, + "learning_rate": 9.999477680670247e-06, + "loss": 3.0694, + "step": 275950 + }, + { + "epoch": 0.27190403757792614, + "grad_norm": 2.357182264328003, + "learning_rate": 9.999475323571412e-06, + "loss": 3.0474, + "step": 276000 + }, + { + "epoch": 0.2719532955557482, + "grad_norm": 2.403914213180542, + "learning_rate": 9.999472961166317e-06, + "loss": 3.0675, + "step": 276050 + }, + { + "epoch": 0.2720025535335703, + "grad_norm": 2.138929605484009, + "learning_rate": 9.999470593454965e-06, + "loss": 3.0148, + "step": 276100 + }, + { + "epoch": 0.2720518115113924, + "grad_norm": 2.279961347579956, + "learning_rate": 9.999468220437362e-06, + "loss": 3.0594, + "step": 276150 + }, + { + "epoch": 0.2721010694892145, + "grad_norm": 2.349294662475586, + "learning_rate": 9.999465842113508e-06, + "loss": 3.082, + "step": 276200 + }, + { + "epoch": 0.27215032746703655, + "grad_norm": 2.2538154125213623, + "learning_rate": 9.999463458483405e-06, + "loss": 3.0437, + "step": 276250 + }, + { + "epoch": 0.27219958544485867, + "grad_norm": 2.328101873397827, + "learning_rate": 9.999461069547058e-06, + "loss": 3.0793, + "step": 276300 + }, + { + "epoch": 0.27224884342268074, + "grad_norm": 2.2643866539001465, + "learning_rate": 9.999458675304468e-06, + "loss": 3.0896, + "step": 276350 + }, + { + "epoch": 0.2722981014005028, + "grad_norm": 2.397108554840088, + "learning_rate": 9.999456275755636e-06, + "loss": 2.9836, + "step": 276400 + }, + { + "epoch": 0.27234735937832494, + "grad_norm": 2.1680667400360107, + "learning_rate": 9.999453870900565e-06, + "loss": 3.007, + "step": 276450 + }, + { + "epoch": 0.272396617356147, + "grad_norm": 2.438023567199707, + "learning_rate": 
9.999451460739261e-06, + "loss": 3.0018, + "step": 276500 + }, + { + "epoch": 0.2724458753339691, + "grad_norm": 2.4156405925750732, + "learning_rate": 9.999449045271723e-06, + "loss": 3.0385, + "step": 276550 + }, + { + "epoch": 0.2724951333117912, + "grad_norm": 2.2574119567871094, + "learning_rate": 9.999446624497955e-06, + "loss": 3.0362, + "step": 276600 + }, + { + "epoch": 0.2725443912896133, + "grad_norm": 2.3206326961517334, + "learning_rate": 9.999444198417959e-06, + "loss": 3.0519, + "step": 276650 + }, + { + "epoch": 0.27259364926743535, + "grad_norm": 2.3654370307922363, + "learning_rate": 9.999441767031737e-06, + "loss": 3.0711, + "step": 276700 + }, + { + "epoch": 0.2726429072452574, + "grad_norm": 2.2183473110198975, + "learning_rate": 9.999439330339295e-06, + "loss": 3.0429, + "step": 276750 + }, + { + "epoch": 0.27269216522307954, + "grad_norm": 2.310246229171753, + "learning_rate": 9.999436888340631e-06, + "loss": 3.0839, + "step": 276800 + }, + { + "epoch": 0.2727414232009016, + "grad_norm": 2.2874252796173096, + "learning_rate": 9.99943444103575e-06, + "loss": 3.0924, + "step": 276850 + }, + { + "epoch": 0.2727906811787237, + "grad_norm": 2.253023624420166, + "learning_rate": 9.999431988424655e-06, + "loss": 3.08, + "step": 276900 + }, + { + "epoch": 0.2728399391565458, + "grad_norm": 2.5274083614349365, + "learning_rate": 9.999429530507347e-06, + "loss": 3.037, + "step": 276950 + }, + { + "epoch": 0.2728891971343679, + "grad_norm": 2.3758950233459473, + "learning_rate": 9.999427067283832e-06, + "loss": 3.0963, + "step": 277000 + }, + { + "epoch": 0.27293845511218995, + "grad_norm": 2.412848711013794, + "learning_rate": 9.999424598754106e-06, + "loss": 3.0997, + "step": 277050 + }, + { + "epoch": 0.2729877130900121, + "grad_norm": 2.5760467052459717, + "learning_rate": 9.999422124918177e-06, + "loss": 3.1185, + "step": 277100 + }, + { + "epoch": 0.27303697106783414, + "grad_norm": 2.035867929458618, + "learning_rate": 9.999419645776049e-06, + 
"loss": 2.9927, + "step": 277150 + }, + { + "epoch": 0.2730862290456562, + "grad_norm": 2.4078593254089355, + "learning_rate": 9.999417161327717e-06, + "loss": 3.0616, + "step": 277200 + }, + { + "epoch": 0.27313548702347834, + "grad_norm": 2.224599838256836, + "learning_rate": 9.999414671573194e-06, + "loss": 3.1021, + "step": 277250 + }, + { + "epoch": 0.2731847450013004, + "grad_norm": 2.329906702041626, + "learning_rate": 9.999412176512473e-06, + "loss": 3.0387, + "step": 277300 + }, + { + "epoch": 0.2732340029791225, + "grad_norm": 2.3796935081481934, + "learning_rate": 9.999409676145564e-06, + "loss": 3.0656, + "step": 277350 + }, + { + "epoch": 0.2732832609569446, + "grad_norm": 2.7732667922973633, + "learning_rate": 9.999407170472464e-06, + "loss": 3.0732, + "step": 277400 + }, + { + "epoch": 0.2733325189347667, + "grad_norm": 2.2295989990234375, + "learning_rate": 9.999404659493179e-06, + "loss": 3.058, + "step": 277450 + }, + { + "epoch": 0.27338177691258875, + "grad_norm": 2.2737255096435547, + "learning_rate": 9.999402143207711e-06, + "loss": 3.0743, + "step": 277500 + }, + { + "epoch": 0.2734310348904109, + "grad_norm": 2.4207143783569336, + "learning_rate": 9.999399621616062e-06, + "loss": 3.0996, + "step": 277550 + }, + { + "epoch": 0.27348029286823294, + "grad_norm": 2.2210240364074707, + "learning_rate": 9.999397094718236e-06, + "loss": 3.0456, + "step": 277600 + }, + { + "epoch": 0.273529550846055, + "grad_norm": 2.215590000152588, + "learning_rate": 9.999394562514234e-06, + "loss": 3.1106, + "step": 277650 + }, + { + "epoch": 0.27357880882387714, + "grad_norm": 2.325078248977661, + "learning_rate": 9.999392025004061e-06, + "loss": 3.0774, + "step": 277700 + }, + { + "epoch": 0.2736280668016992, + "grad_norm": 2.2615644931793213, + "learning_rate": 9.999389482187716e-06, + "loss": 2.971, + "step": 277750 + }, + { + "epoch": 0.2736773247795213, + "grad_norm": 2.4708690643310547, + "learning_rate": 9.999386934065206e-06, + "loss": 3.0943, + "step": 
277800 + }, + { + "epoch": 0.2737265827573434, + "grad_norm": 2.424464464187622, + "learning_rate": 9.999384380636531e-06, + "loss": 3.0831, + "step": 277850 + }, + { + "epoch": 0.2737758407351655, + "grad_norm": 2.24625563621521, + "learning_rate": 9.999381821901696e-06, + "loss": 2.9923, + "step": 277900 + }, + { + "epoch": 0.27382509871298755, + "grad_norm": 2.2028613090515137, + "learning_rate": 9.999379257860699e-06, + "loss": 3.0483, + "step": 277950 + }, + { + "epoch": 0.2738743566908096, + "grad_norm": 2.288938045501709, + "learning_rate": 9.999376688513546e-06, + "loss": 3.002, + "step": 278000 + }, + { + "epoch": 0.27392361466863174, + "grad_norm": 2.2996225357055664, + "learning_rate": 9.99937411386024e-06, + "loss": 3.081, + "step": 278050 + }, + { + "epoch": 0.2739728726464538, + "grad_norm": 2.2090985774993896, + "learning_rate": 9.999371533900786e-06, + "loss": 3.0787, + "step": 278100 + }, + { + "epoch": 0.2740221306242759, + "grad_norm": 2.265267848968506, + "learning_rate": 9.999368948635181e-06, + "loss": 3.1141, + "step": 278150 + }, + { + "epoch": 0.274071388602098, + "grad_norm": 2.383439302444458, + "learning_rate": 9.999366358063432e-06, + "loss": 3.0144, + "step": 278200 + }, + { + "epoch": 0.2741206465799201, + "grad_norm": 2.314011573791504, + "learning_rate": 9.99936376218554e-06, + "loss": 3.0932, + "step": 278250 + }, + { + "epoch": 0.27416990455774215, + "grad_norm": 2.345916509628296, + "learning_rate": 9.999361161001508e-06, + "loss": 3.0309, + "step": 278300 + }, + { + "epoch": 0.2742191625355643, + "grad_norm": 2.444455862045288, + "learning_rate": 9.99935855451134e-06, + "loss": 3.045, + "step": 278350 + }, + { + "epoch": 0.27426842051338635, + "grad_norm": 2.2926480770111084, + "learning_rate": 9.999355942715037e-06, + "loss": 3.0693, + "step": 278400 + }, + { + "epoch": 0.2743176784912084, + "grad_norm": 2.5589003562927246, + "learning_rate": 9.999353325612603e-06, + "loss": 3.0614, + "step": 278450 + }, + { + "epoch": 
0.27436693646903054, + "grad_norm": 2.4288032054901123, + "learning_rate": 9.99935070320404e-06, + "loss": 3.0479, + "step": 278500 + }, + { + "epoch": 0.2744161944468526, + "grad_norm": 2.4304988384246826, + "learning_rate": 9.99934807548935e-06, + "loss": 2.9636, + "step": 278550 + }, + { + "epoch": 0.2744654524246747, + "grad_norm": 2.3672192096710205, + "learning_rate": 9.99934544246854e-06, + "loss": 3.0841, + "step": 278600 + }, + { + "epoch": 0.2745147104024968, + "grad_norm": 2.1683273315429688, + "learning_rate": 9.999342804141607e-06, + "loss": 3.1062, + "step": 278650 + }, + { + "epoch": 0.2745639683803189, + "grad_norm": 2.4212872982025146, + "learning_rate": 9.999340160508556e-06, + "loss": 3.061, + "step": 278700 + }, + { + "epoch": 0.27461322635814095, + "grad_norm": 2.1729488372802734, + "learning_rate": 9.999337511569393e-06, + "loss": 3.089, + "step": 278750 + }, + { + "epoch": 0.2746624843359631, + "grad_norm": 2.3220813274383545, + "learning_rate": 9.999334857324117e-06, + "loss": 3.0259, + "step": 278800 + }, + { + "epoch": 0.27471174231378515, + "grad_norm": 2.301771402359009, + "learning_rate": 9.999332197772732e-06, + "loss": 3.0314, + "step": 278850 + }, + { + "epoch": 0.2747610002916072, + "grad_norm": 2.2659647464752197, + "learning_rate": 9.99932953291524e-06, + "loss": 3.07, + "step": 278900 + }, + { + "epoch": 0.27481025826942934, + "grad_norm": 2.3310420513153076, + "learning_rate": 9.999326862751645e-06, + "loss": 3.0517, + "step": 278950 + }, + { + "epoch": 0.2748595162472514, + "grad_norm": 2.45888614654541, + "learning_rate": 9.99932418728195e-06, + "loss": 3.0257, + "step": 279000 + }, + { + "epoch": 0.2749087742250735, + "grad_norm": 2.3410592079162598, + "learning_rate": 9.999321506506155e-06, + "loss": 3.0609, + "step": 279050 + }, + { + "epoch": 0.2749580322028956, + "grad_norm": 2.3784685134887695, + "learning_rate": 9.999318820424266e-06, + "loss": 3.0115, + "step": 279100 + }, + { + "epoch": 0.2750072901807177, + 
"grad_norm": 2.281674861907959, + "learning_rate": 9.999316129036286e-06, + "loss": 3.0028, + "step": 279150 + }, + { + "epoch": 0.27505654815853975, + "grad_norm": 2.2566380500793457, + "learning_rate": 9.999313432342216e-06, + "loss": 3.0574, + "step": 279200 + }, + { + "epoch": 0.2751058061363618, + "grad_norm": 2.232377529144287, + "learning_rate": 9.999310730342061e-06, + "loss": 3.0344, + "step": 279250 + }, + { + "epoch": 0.27515506411418394, + "grad_norm": 2.369981288909912, + "learning_rate": 9.999308023035821e-06, + "loss": 3.0882, + "step": 279300 + }, + { + "epoch": 0.275204322092006, + "grad_norm": 2.341582775115967, + "learning_rate": 9.999305310423502e-06, + "loss": 2.9885, + "step": 279350 + }, + { + "epoch": 0.2752535800698281, + "grad_norm": 2.3325612545013428, + "learning_rate": 9.999302592505105e-06, + "loss": 3.099, + "step": 279400 + }, + { + "epoch": 0.2753028380476502, + "grad_norm": 2.308741331100464, + "learning_rate": 9.99929986928063e-06, + "loss": 3.0759, + "step": 279450 + }, + { + "epoch": 0.2753520960254723, + "grad_norm": 2.2781736850738525, + "learning_rate": 9.999297140750087e-06, + "loss": 3.0622, + "step": 279500 + }, + { + "epoch": 0.27540135400329435, + "grad_norm": 2.5265614986419678, + "learning_rate": 9.999294406913472e-06, + "loss": 3.0741, + "step": 279550 + }, + { + "epoch": 0.2754506119811165, + "grad_norm": 2.3374273777008057, + "learning_rate": 9.999291667770794e-06, + "loss": 3.0682, + "step": 279600 + }, + { + "epoch": 0.27549986995893855, + "grad_norm": 2.3963499069213867, + "learning_rate": 9.99928892332205e-06, + "loss": 3.0536, + "step": 279650 + }, + { + "epoch": 0.2755491279367606, + "grad_norm": 2.218161106109619, + "learning_rate": 9.999286173567247e-06, + "loss": 3.0509, + "step": 279700 + }, + { + "epoch": 0.27559838591458274, + "grad_norm": 2.3588054180145264, + "learning_rate": 9.999283418506387e-06, + "loss": 3.0322, + "step": 279750 + }, + { + "epoch": 0.2756476438924048, + "grad_norm": 
2.259392023086548, + "learning_rate": 9.999280658139472e-06, + "loss": 3.0548, + "step": 279800 + }, + { + "epoch": 0.2756969018702269, + "grad_norm": 2.2124147415161133, + "learning_rate": 9.999277892466505e-06, + "loss": 3.1132, + "step": 279850 + }, + { + "epoch": 0.275746159848049, + "grad_norm": 2.6995434761047363, + "learning_rate": 9.999275121487489e-06, + "loss": 3.0042, + "step": 279900 + }, + { + "epoch": 0.2757954178258711, + "grad_norm": 2.2837610244750977, + "learning_rate": 9.999272345202427e-06, + "loss": 3.0604, + "step": 279950 + }, + { + "epoch": 0.27584467580369315, + "grad_norm": 2.3456192016601562, + "learning_rate": 9.999269563611325e-06, + "loss": 3.0495, + "step": 280000 + }, + { + "epoch": 0.2758939337815153, + "grad_norm": 2.2953476905822754, + "learning_rate": 9.99926677671418e-06, + "loss": 3.0511, + "step": 280050 + }, + { + "epoch": 0.27594319175933735, + "grad_norm": 2.3278610706329346, + "learning_rate": 9.999263984511e-06, + "loss": 3.0001, + "step": 280100 + }, + { + "epoch": 0.2759924497371594, + "grad_norm": 2.863896608352661, + "learning_rate": 9.999261187001786e-06, + "loss": 3.0405, + "step": 280150 + }, + { + "epoch": 0.27604170771498154, + "grad_norm": 2.3923697471618652, + "learning_rate": 9.999258384186543e-06, + "loss": 3.0992, + "step": 280200 + }, + { + "epoch": 0.2760909656928036, + "grad_norm": 2.3339266777038574, + "learning_rate": 9.999255576065269e-06, + "loss": 3.0458, + "step": 280250 + }, + { + "epoch": 0.2761402236706257, + "grad_norm": 2.28961181640625, + "learning_rate": 9.99925276263797e-06, + "loss": 3.1952, + "step": 280300 + }, + { + "epoch": 0.2761894816484478, + "grad_norm": 2.163423538208008, + "learning_rate": 9.999249943904651e-06, + "loss": 3.076, + "step": 280350 + }, + { + "epoch": 0.2762387396262699, + "grad_norm": 2.619786262512207, + "learning_rate": 9.999247119865314e-06, + "loss": 3.0659, + "step": 280400 + }, + { + "epoch": 0.27628799760409195, + "grad_norm": 2.1788508892059326, + 
"learning_rate": 9.999244290519957e-06, + "loss": 3.0832, + "step": 280450 + }, + { + "epoch": 0.276337255581914, + "grad_norm": 2.5952553749084473, + "learning_rate": 9.99924145586859e-06, + "loss": 3.0865, + "step": 280500 + }, + { + "epoch": 0.27638651355973615, + "grad_norm": 2.305605411529541, + "learning_rate": 9.999238615911213e-06, + "loss": 3.0527, + "step": 280550 + }, + { + "epoch": 0.2764357715375582, + "grad_norm": 2.2598345279693604, + "learning_rate": 9.999235770647828e-06, + "loss": 3.129, + "step": 280600 + }, + { + "epoch": 0.2764850295153803, + "grad_norm": 2.29292368888855, + "learning_rate": 9.99923292007844e-06, + "loss": 2.9794, + "step": 280650 + }, + { + "epoch": 0.2765342874932024, + "grad_norm": 2.3193204402923584, + "learning_rate": 9.99923006420305e-06, + "loss": 3.0118, + "step": 280700 + }, + { + "epoch": 0.2765835454710245, + "grad_norm": 2.672445774078369, + "learning_rate": 9.999227203021664e-06, + "loss": 3.0638, + "step": 280750 + }, + { + "epoch": 0.27663280344884655, + "grad_norm": 2.384770393371582, + "learning_rate": 9.99922433653428e-06, + "loss": 3.0788, + "step": 280800 + }, + { + "epoch": 0.2766820614266687, + "grad_norm": 2.2893855571746826, + "learning_rate": 9.999221464740906e-06, + "loss": 3.0239, + "step": 280850 + }, + { + "epoch": 0.27673131940449075, + "grad_norm": 2.4062585830688477, + "learning_rate": 9.999218587641545e-06, + "loss": 3.1151, + "step": 280900 + }, + { + "epoch": 0.2767805773823128, + "grad_norm": 2.216714382171631, + "learning_rate": 9.999215705236196e-06, + "loss": 3.0571, + "step": 280950 + }, + { + "epoch": 0.27682983536013495, + "grad_norm": 2.533191204071045, + "learning_rate": 9.999212817524866e-06, + "loss": 3.0038, + "step": 281000 + }, + { + "epoch": 0.276879093337957, + "grad_norm": 2.2471811771392822, + "learning_rate": 9.999209924507556e-06, + "loss": 3.0829, + "step": 281050 + }, + { + "epoch": 0.2769283513157791, + "grad_norm": 2.387820243835449, + "learning_rate": 
9.999207026184268e-06, + "loss": 2.9373, + "step": 281100 + }, + { + "epoch": 0.2769776092936012, + "grad_norm": 2.3220479488372803, + "learning_rate": 9.999204122555008e-06, + "loss": 3.1097, + "step": 281150 + }, + { + "epoch": 0.2770268672714233, + "grad_norm": 2.3009231090545654, + "learning_rate": 9.999201213619778e-06, + "loss": 3.0515, + "step": 281200 + }, + { + "epoch": 0.27707612524924535, + "grad_norm": 2.261983633041382, + "learning_rate": 9.999198299378581e-06, + "loss": 3.0796, + "step": 281250 + }, + { + "epoch": 0.2771253832270675, + "grad_norm": 2.4418768882751465, + "learning_rate": 9.999195379831419e-06, + "loss": 3.0667, + "step": 281300 + }, + { + "epoch": 0.27717464120488955, + "grad_norm": 2.669725179672241, + "learning_rate": 9.999192454978297e-06, + "loss": 3.0625, + "step": 281350 + }, + { + "epoch": 0.2772238991827116, + "grad_norm": 2.325237274169922, + "learning_rate": 9.999189524819216e-06, + "loss": 3.0592, + "step": 281400 + }, + { + "epoch": 0.27727315716053375, + "grad_norm": 2.2468607425689697, + "learning_rate": 9.99918658935418e-06, + "loss": 3.1522, + "step": 281450 + }, + { + "epoch": 0.2773224151383558, + "grad_norm": 2.2150721549987793, + "learning_rate": 9.999183648583193e-06, + "loss": 3.0317, + "step": 281500 + }, + { + "epoch": 0.2773716731161779, + "grad_norm": 2.246112585067749, + "learning_rate": 9.999180702506256e-06, + "loss": 3.0674, + "step": 281550 + }, + { + "epoch": 0.277420931094, + "grad_norm": 2.3870432376861572, + "learning_rate": 9.999177751123374e-06, + "loss": 3.0318, + "step": 281600 + }, + { + "epoch": 0.2774701890718221, + "grad_norm": 2.290712356567383, + "learning_rate": 9.99917479443455e-06, + "loss": 3.0748, + "step": 281650 + }, + { + "epoch": 0.27751944704964415, + "grad_norm": 2.2130281925201416, + "learning_rate": 9.999171832439787e-06, + "loss": 3.0302, + "step": 281700 + }, + { + "epoch": 0.2775687050274662, + "grad_norm": 2.1901941299438477, + "learning_rate": 9.99916886513909e-06, + 
"loss": 3.0167, + "step": 281750 + }, + { + "epoch": 0.27761796300528835, + "grad_norm": 2.1854476928710938, + "learning_rate": 9.999165892532457e-06, + "loss": 3.0492, + "step": 281800 + }, + { + "epoch": 0.2776672209831104, + "grad_norm": 2.3490681648254395, + "learning_rate": 9.999162914619895e-06, + "loss": 3.0485, + "step": 281850 + }, + { + "epoch": 0.2777164789609325, + "grad_norm": 2.260258913040161, + "learning_rate": 9.999159931401408e-06, + "loss": 3.0847, + "step": 281900 + }, + { + "epoch": 0.2777657369387546, + "grad_norm": 2.290295362472534, + "learning_rate": 9.999156942876994e-06, + "loss": 3.0546, + "step": 281950 + }, + { + "epoch": 0.2778149949165767, + "grad_norm": 2.217444658279419, + "learning_rate": 9.999153949046664e-06, + "loss": 3.0567, + "step": 282000 + }, + { + "epoch": 0.27786425289439876, + "grad_norm": 2.2332184314727783, + "learning_rate": 9.999150949910414e-06, + "loss": 3.088, + "step": 282050 + }, + { + "epoch": 0.2779135108722209, + "grad_norm": 2.3670341968536377, + "learning_rate": 9.999147945468251e-06, + "loss": 3.0415, + "step": 282100 + }, + { + "epoch": 0.27796276885004295, + "grad_norm": 2.2965822219848633, + "learning_rate": 9.999144935720178e-06, + "loss": 3.0574, + "step": 282150 + }, + { + "epoch": 0.278012026827865, + "grad_norm": 2.296645164489746, + "learning_rate": 9.999141920666197e-06, + "loss": 3.125, + "step": 282200 + }, + { + "epoch": 0.27806128480568715, + "grad_norm": 2.32525634765625, + "learning_rate": 9.999138900306311e-06, + "loss": 2.9826, + "step": 282250 + }, + { + "epoch": 0.2781105427835092, + "grad_norm": 2.3741369247436523, + "learning_rate": 9.999135874640525e-06, + "loss": 3.0473, + "step": 282300 + }, + { + "epoch": 0.2781598007613313, + "grad_norm": 2.440596103668213, + "learning_rate": 9.999132843668842e-06, + "loss": 3.0467, + "step": 282350 + }, + { + "epoch": 0.2782090587391534, + "grad_norm": 2.3162362575531006, + "learning_rate": 9.999129807391263e-06, + "loss": 3.0696, + "step": 
282400 + }, + { + "epoch": 0.2782583167169755, + "grad_norm": 2.7181031703948975, + "learning_rate": 9.999126765807792e-06, + "loss": 3.1181, + "step": 282450 + }, + { + "epoch": 0.27830757469479755, + "grad_norm": 2.3502416610717773, + "learning_rate": 9.999123718918433e-06, + "loss": 3.0978, + "step": 282500 + }, + { + "epoch": 0.2783568326726197, + "grad_norm": 2.2742552757263184, + "learning_rate": 9.99912066672319e-06, + "loss": 3.0828, + "step": 282550 + }, + { + "epoch": 0.27840609065044175, + "grad_norm": 2.249372720718384, + "learning_rate": 9.999117609222064e-06, + "loss": 3.098, + "step": 282600 + }, + { + "epoch": 0.2784553486282638, + "grad_norm": 2.2155802249908447, + "learning_rate": 9.99911454641506e-06, + "loss": 3.0289, + "step": 282650 + }, + { + "epoch": 0.27850460660608595, + "grad_norm": 2.2292087078094482, + "learning_rate": 9.99911147830218e-06, + "loss": 3.0769, + "step": 282700 + }, + { + "epoch": 0.278553864583908, + "grad_norm": 2.421933650970459, + "learning_rate": 9.99910840488343e-06, + "loss": 3.067, + "step": 282750 + }, + { + "epoch": 0.2786031225617301, + "grad_norm": 2.8418850898742676, + "learning_rate": 9.99910532615881e-06, + "loss": 3.067, + "step": 282800 + }, + { + "epoch": 0.2786523805395522, + "grad_norm": 2.1977171897888184, + "learning_rate": 9.999102242128325e-06, + "loss": 3.0485, + "step": 282850 + }, + { + "epoch": 0.2787016385173743, + "grad_norm": 2.5100700855255127, + "learning_rate": 9.999099152791978e-06, + "loss": 2.9979, + "step": 282900 + }, + { + "epoch": 0.27875089649519635, + "grad_norm": 2.441507339477539, + "learning_rate": 9.999096058149772e-06, + "loss": 3.0531, + "step": 282950 + }, + { + "epoch": 0.2788001544730184, + "grad_norm": 2.248713254928589, + "learning_rate": 9.99909295820171e-06, + "loss": 3.0602, + "step": 283000 + }, + { + "epoch": 0.27884941245084055, + "grad_norm": 2.4158966541290283, + "learning_rate": 9.999089852947796e-06, + "loss": 3.0717, + "step": 283050 + }, + { + "epoch": 
0.2788986704286626, + "grad_norm": 2.2319061756134033, + "learning_rate": 9.999086742388031e-06, + "loss": 3.078, + "step": 283100 + }, + { + "epoch": 0.2789479284064847, + "grad_norm": 2.2059195041656494, + "learning_rate": 9.999083626522425e-06, + "loss": 3.1234, + "step": 283150 + }, + { + "epoch": 0.2789971863843068, + "grad_norm": 2.0863349437713623, + "learning_rate": 9.999080505350972e-06, + "loss": 3.0232, + "step": 283200 + }, + { + "epoch": 0.2790464443621289, + "grad_norm": 2.1934447288513184, + "learning_rate": 9.999077378873681e-06, + "loss": 3.057, + "step": 283250 + }, + { + "epoch": 0.27909570233995096, + "grad_norm": 2.5078065395355225, + "learning_rate": 9.999074247090554e-06, + "loss": 2.9812, + "step": 283300 + }, + { + "epoch": 0.2791449603177731, + "grad_norm": 2.371345043182373, + "learning_rate": 9.999071110001597e-06, + "loss": 3.1486, + "step": 283350 + }, + { + "epoch": 0.27919421829559515, + "grad_norm": 2.221829652786255, + "learning_rate": 9.999067967606809e-06, + "loss": 3.0852, + "step": 283400 + }, + { + "epoch": 0.2792434762734172, + "grad_norm": 2.3172950744628906, + "learning_rate": 9.999064819906194e-06, + "loss": 3.1421, + "step": 283450 + }, + { + "epoch": 0.27929273425123935, + "grad_norm": 2.5547940731048584, + "learning_rate": 9.999061666899757e-06, + "loss": 3.0961, + "step": 283500 + }, + { + "epoch": 0.2793419922290614, + "grad_norm": 2.365513801574707, + "learning_rate": 9.999058508587502e-06, + "loss": 3.0703, + "step": 283550 + }, + { + "epoch": 0.2793912502068835, + "grad_norm": 2.114649534225464, + "learning_rate": 9.99905534496943e-06, + "loss": 3.0166, + "step": 283600 + }, + { + "epoch": 0.2794405081847056, + "grad_norm": 2.3697657585144043, + "learning_rate": 9.999052176045548e-06, + "loss": 3.035, + "step": 283650 + }, + { + "epoch": 0.2794897661625277, + "grad_norm": 2.3598742485046387, + "learning_rate": 9.999049001815855e-06, + "loss": 3.0013, + "step": 283700 + }, + { + "epoch": 0.27953902414034976, + 
"grad_norm": 2.351696014404297, + "learning_rate": 9.999045822280358e-06, + "loss": 2.9739, + "step": 283750 + }, + { + "epoch": 0.2795882821181719, + "grad_norm": 2.223191261291504, + "learning_rate": 9.999042637439056e-06, + "loss": 3.0387, + "step": 283800 + }, + { + "epoch": 0.27963754009599395, + "grad_norm": 2.472574472427368, + "learning_rate": 9.999039447291957e-06, + "loss": 3.0597, + "step": 283850 + }, + { + "epoch": 0.279686798073816, + "grad_norm": 2.383924722671509, + "learning_rate": 9.99903625183906e-06, + "loss": 3.0159, + "step": 283900 + }, + { + "epoch": 0.27973605605163815, + "grad_norm": 2.2692716121673584, + "learning_rate": 9.999033051080373e-06, + "loss": 3.1268, + "step": 283950 + }, + { + "epoch": 0.2797853140294602, + "grad_norm": 2.2256224155426025, + "learning_rate": 9.999029845015899e-06, + "loss": 3.0149, + "step": 284000 + }, + { + "epoch": 0.2798345720072823, + "grad_norm": 2.4322094917297363, + "learning_rate": 9.999026633645637e-06, + "loss": 3.0928, + "step": 284050 + }, + { + "epoch": 0.27988382998510436, + "grad_norm": 2.566685199737549, + "learning_rate": 9.999023416969595e-06, + "loss": 3.046, + "step": 284100 + }, + { + "epoch": 0.2799330879629265, + "grad_norm": 2.224459409713745, + "learning_rate": 9.999020194987772e-06, + "loss": 3.0461, + "step": 284150 + }, + { + "epoch": 0.27998234594074856, + "grad_norm": 2.3266563415527344, + "learning_rate": 9.999016967700176e-06, + "loss": 3.0313, + "step": 284200 + }, + { + "epoch": 0.2800316039185706, + "grad_norm": 2.110482931137085, + "learning_rate": 9.999013735106806e-06, + "loss": 3.0403, + "step": 284250 + }, + { + "epoch": 0.28008086189639275, + "grad_norm": 2.3589885234832764, + "learning_rate": 9.99901049720767e-06, + "loss": 3.0329, + "step": 284300 + }, + { + "epoch": 0.2801301198742148, + "grad_norm": 2.2958736419677734, + "learning_rate": 9.99900725400277e-06, + "loss": 3.0081, + "step": 284350 + }, + { + "epoch": 0.2801793778520369, + "grad_norm": 
2.3444833755493164, + "learning_rate": 9.999004005492105e-06, + "loss": 3.0006, + "step": 284400 + }, + { + "epoch": 0.280228635829859, + "grad_norm": 2.2705130577087402, + "learning_rate": 9.999000751675684e-06, + "loss": 3.0836, + "step": 284450 + }, + { + "epoch": 0.2802778938076811, + "grad_norm": 2.3260834217071533, + "learning_rate": 9.998997492553508e-06, + "loss": 3.0467, + "step": 284500 + }, + { + "epoch": 0.28032715178550316, + "grad_norm": 2.385972499847412, + "learning_rate": 9.998994228125582e-06, + "loss": 3.0741, + "step": 284550 + }, + { + "epoch": 0.2803764097633253, + "grad_norm": 2.2653753757476807, + "learning_rate": 9.99899095839191e-06, + "loss": 3.1045, + "step": 284600 + }, + { + "epoch": 0.28042566774114736, + "grad_norm": 2.2826013565063477, + "learning_rate": 9.998987683352492e-06, + "loss": 3.0287, + "step": 284650 + }, + { + "epoch": 0.2804749257189694, + "grad_norm": 2.385037660598755, + "learning_rate": 9.998984403007333e-06, + "loss": 3.0685, + "step": 284700 + }, + { + "epoch": 0.28052418369679155, + "grad_norm": 2.3030974864959717, + "learning_rate": 9.998981117356437e-06, + "loss": 3.1085, + "step": 284750 + }, + { + "epoch": 0.2805734416746136, + "grad_norm": 2.5274698734283447, + "learning_rate": 9.998977826399808e-06, + "loss": 3.0624, + "step": 284800 + }, + { + "epoch": 0.2806226996524357, + "grad_norm": 2.213287353515625, + "learning_rate": 9.998974530137448e-06, + "loss": 3.0693, + "step": 284850 + }, + { + "epoch": 0.2806719576302578, + "grad_norm": 2.2143609523773193, + "learning_rate": 9.998971228569362e-06, + "loss": 3.1078, + "step": 284900 + }, + { + "epoch": 0.2807212156080799, + "grad_norm": 2.7265713214874268, + "learning_rate": 9.998967921695552e-06, + "loss": 3.0705, + "step": 284950 + }, + { + "epoch": 0.28077047358590196, + "grad_norm": 2.5114731788635254, + "learning_rate": 9.998964609516025e-06, + "loss": 3.1559, + "step": 285000 + }, + { + "epoch": 0.2808197315637241, + "grad_norm": 2.204561233520508, + 
"learning_rate": 9.998961292030779e-06, + "loss": 3.0338, + "step": 285050 + }, + { + "epoch": 0.28086898954154615, + "grad_norm": 2.443343162536621, + "learning_rate": 9.998957969239821e-06, + "loss": 3.1315, + "step": 285100 + }, + { + "epoch": 0.2809182475193682, + "grad_norm": 2.4720842838287354, + "learning_rate": 9.998954641143154e-06, + "loss": 3.0679, + "step": 285150 + }, + { + "epoch": 0.28096750549719035, + "grad_norm": 2.213996171951294, + "learning_rate": 9.998951307740783e-06, + "loss": 3.0937, + "step": 285200 + }, + { + "epoch": 0.2810167634750124, + "grad_norm": 2.5104918479919434, + "learning_rate": 9.998947969032709e-06, + "loss": 3.0652, + "step": 285250 + }, + { + "epoch": 0.2810660214528345, + "grad_norm": 2.366490364074707, + "learning_rate": 9.998944625018936e-06, + "loss": 3.0132, + "step": 285300 + }, + { + "epoch": 0.28111527943065656, + "grad_norm": 2.5512473583221436, + "learning_rate": 9.998941275699469e-06, + "loss": 3.0053, + "step": 285350 + }, + { + "epoch": 0.2811645374084787, + "grad_norm": 2.2868926525115967, + "learning_rate": 9.99893792107431e-06, + "loss": 3.0323, + "step": 285400 + }, + { + "epoch": 0.28121379538630076, + "grad_norm": 2.399994134902954, + "learning_rate": 9.998934561143464e-06, + "loss": 3.0645, + "step": 285450 + }, + { + "epoch": 0.28126305336412283, + "grad_norm": 2.1074087619781494, + "learning_rate": 9.998931195906932e-06, + "loss": 3.1039, + "step": 285500 + }, + { + "epoch": 0.28131231134194495, + "grad_norm": 2.338667154312134, + "learning_rate": 9.998927825364722e-06, + "loss": 3.071, + "step": 285550 + }, + { + "epoch": 0.281361569319767, + "grad_norm": 2.2370150089263916, + "learning_rate": 9.998924449516832e-06, + "loss": 3.0482, + "step": 285600 + }, + { + "epoch": 0.2814108272975891, + "grad_norm": 2.323967933654785, + "learning_rate": 9.998921068363271e-06, + "loss": 3.1089, + "step": 285650 + }, + { + "epoch": 0.2814600852754112, + "grad_norm": 2.2752182483673096, + "learning_rate": 
9.998917681904038e-06, + "loss": 2.9928, + "step": 285700 + }, + { + "epoch": 0.2815093432532333, + "grad_norm": 2.3620493412017822, + "learning_rate": 9.99891429013914e-06, + "loss": 3.0495, + "step": 285750 + }, + { + "epoch": 0.28155860123105536, + "grad_norm": 2.251868724822998, + "learning_rate": 9.998910893068579e-06, + "loss": 3.0068, + "step": 285800 + }, + { + "epoch": 0.2816078592088775, + "grad_norm": 2.176079511642456, + "learning_rate": 9.998907490692359e-06, + "loss": 3.0275, + "step": 285850 + }, + { + "epoch": 0.28165711718669956, + "grad_norm": 2.4708309173583984, + "learning_rate": 9.998904083010483e-06, + "loss": 3.0241, + "step": 285900 + }, + { + "epoch": 0.2817063751645216, + "grad_norm": 2.4806573390960693, + "learning_rate": 9.998900670022956e-06, + "loss": 3.0124, + "step": 285950 + }, + { + "epoch": 0.28175563314234375, + "grad_norm": 2.2205982208251953, + "learning_rate": 9.998897251729781e-06, + "loss": 3.0123, + "step": 286000 + }, + { + "epoch": 0.2818048911201658, + "grad_norm": 2.8278918266296387, + "learning_rate": 9.99889382813096e-06, + "loss": 3.0056, + "step": 286050 + }, + { + "epoch": 0.2818541490979879, + "grad_norm": 3.671537399291992, + "learning_rate": 9.9988903992265e-06, + "loss": 3.0892, + "step": 286100 + }, + { + "epoch": 0.28190340707581, + "grad_norm": 2.279270648956299, + "learning_rate": 9.998886965016402e-06, + "loss": 3.0301, + "step": 286150 + }, + { + "epoch": 0.2819526650536321, + "grad_norm": 2.3384532928466797, + "learning_rate": 9.998883525500669e-06, + "loss": 3.1009, + "step": 286200 + }, + { + "epoch": 0.28200192303145416, + "grad_norm": 2.4001433849334717, + "learning_rate": 9.998880080679308e-06, + "loss": 3.0271, + "step": 286250 + }, + { + "epoch": 0.2820511810092763, + "grad_norm": 2.3616175651550293, + "learning_rate": 9.998876630552318e-06, + "loss": 2.9973, + "step": 286300 + }, + { + "epoch": 0.28210043898709836, + "grad_norm": 2.275803327560425, + "learning_rate": 9.998873175119708e-06, + 
"loss": 3.0043, + "step": 286350 + }, + { + "epoch": 0.2821496969649204, + "grad_norm": 2.025627613067627, + "learning_rate": 9.998869714381476e-06, + "loss": 2.9828, + "step": 286400 + }, + { + "epoch": 0.28219895494274255, + "grad_norm": 2.245349884033203, + "learning_rate": 9.998866248337631e-06, + "loss": 3.0455, + "step": 286450 + }, + { + "epoch": 0.2822482129205646, + "grad_norm": 2.2885589599609375, + "learning_rate": 9.998862776988174e-06, + "loss": 3.0832, + "step": 286500 + }, + { + "epoch": 0.2822974708983867, + "grad_norm": 2.3451578617095947, + "learning_rate": 9.998859300333109e-06, + "loss": 3.1079, + "step": 286550 + }, + { + "epoch": 0.28234672887620876, + "grad_norm": 2.4749865531921387, + "learning_rate": 9.99885581837244e-06, + "loss": 3.0313, + "step": 286600 + }, + { + "epoch": 0.2823959868540309, + "grad_norm": 2.249617338180542, + "learning_rate": 9.998852331106167e-06, + "loss": 3.072, + "step": 286650 + }, + { + "epoch": 0.28244524483185296, + "grad_norm": 2.603849172592163, + "learning_rate": 9.9988488385343e-06, + "loss": 2.9739, + "step": 286700 + }, + { + "epoch": 0.28249450280967503, + "grad_norm": 2.364607810974121, + "learning_rate": 9.998845340656839e-06, + "loss": 3.0469, + "step": 286750 + }, + { + "epoch": 0.28254376078749716, + "grad_norm": 2.3598315715789795, + "learning_rate": 9.99884183747379e-06, + "loss": 3.072, + "step": 286800 + }, + { + "epoch": 0.2825930187653192, + "grad_norm": 2.743598699569702, + "learning_rate": 9.998838328985154e-06, + "loss": 3.0913, + "step": 286850 + }, + { + "epoch": 0.2826422767431413, + "grad_norm": 2.375572443008423, + "learning_rate": 9.998834815190935e-06, + "loss": 3.0564, + "step": 286900 + }, + { + "epoch": 0.2826915347209634, + "grad_norm": 2.207986354827881, + "learning_rate": 9.998831296091139e-06, + "loss": 3.0552, + "step": 286950 + }, + { + "epoch": 0.2827407926987855, + "grad_norm": 2.2509355545043945, + "learning_rate": 9.998827771685767e-06, + "loss": 3.0795, + "step": 287000 
+ }, + { + "epoch": 0.28279005067660756, + "grad_norm": 2.2666332721710205, + "learning_rate": 9.998824241974825e-06, + "loss": 3.0698, + "step": 287050 + }, + { + "epoch": 0.2828393086544297, + "grad_norm": 2.177332639694214, + "learning_rate": 9.998820706958317e-06, + "loss": 3.0456, + "step": 287100 + }, + { + "epoch": 0.28288856663225176, + "grad_norm": 2.2303922176361084, + "learning_rate": 9.998817166636244e-06, + "loss": 2.9761, + "step": 287150 + }, + { + "epoch": 0.28293782461007383, + "grad_norm": 2.4240329265594482, + "learning_rate": 9.998813621008611e-06, + "loss": 3.0859, + "step": 287200 + }, + { + "epoch": 0.28298708258789596, + "grad_norm": 2.3608810901641846, + "learning_rate": 9.998810070075424e-06, + "loss": 3.0215, + "step": 287250 + }, + { + "epoch": 0.283036340565718, + "grad_norm": 2.373133420944214, + "learning_rate": 9.998806513836684e-06, + "loss": 3.0163, + "step": 287300 + }, + { + "epoch": 0.2830855985435401, + "grad_norm": 2.2016563415527344, + "learning_rate": 9.998802952292397e-06, + "loss": 3.003, + "step": 287350 + }, + { + "epoch": 0.2831348565213622, + "grad_norm": 2.2067315578460693, + "learning_rate": 9.998799385442563e-06, + "loss": 3.1107, + "step": 287400 + }, + { + "epoch": 0.2831841144991843, + "grad_norm": 2.4514243602752686, + "learning_rate": 9.99879581328719e-06, + "loss": 3.1229, + "step": 287450 + }, + { + "epoch": 0.28323337247700636, + "grad_norm": 2.3377537727355957, + "learning_rate": 9.99879223582628e-06, + "loss": 3.0575, + "step": 287500 + }, + { + "epoch": 0.2832826304548285, + "grad_norm": 2.2953643798828125, + "learning_rate": 9.998788653059835e-06, + "loss": 3.0249, + "step": 287550 + }, + { + "epoch": 0.28333188843265056, + "grad_norm": 2.4936139583587646, + "learning_rate": 9.998785064987863e-06, + "loss": 2.9752, + "step": 287600 + }, + { + "epoch": 0.28338114641047263, + "grad_norm": 2.254514694213867, + "learning_rate": 9.998781471610365e-06, + "loss": 3.0198, + "step": 287650 + }, + { + "epoch": 
0.28343040438829475, + "grad_norm": 2.395435094833374, + "learning_rate": 9.998777872927344e-06, + "loss": 2.9857, + "step": 287700 + }, + { + "epoch": 0.2834796623661168, + "grad_norm": 2.3736109733581543, + "learning_rate": 9.998774268938806e-06, + "loss": 3.0874, + "step": 287750 + }, + { + "epoch": 0.2835289203439389, + "grad_norm": 2.3006699085235596, + "learning_rate": 9.998770659644752e-06, + "loss": 2.987, + "step": 287800 + }, + { + "epoch": 0.28357817832176097, + "grad_norm": 2.2769694328308105, + "learning_rate": 9.99876704504519e-06, + "loss": 3.0428, + "step": 287850 + }, + { + "epoch": 0.2836274362995831, + "grad_norm": 2.286903142929077, + "learning_rate": 9.998763425140121e-06, + "loss": 3.0186, + "step": 287900 + }, + { + "epoch": 0.28367669427740516, + "grad_norm": 2.3983216285705566, + "learning_rate": 9.998759799929551e-06, + "loss": 3.053, + "step": 287950 + }, + { + "epoch": 0.28372595225522723, + "grad_norm": 2.532210350036621, + "learning_rate": 9.99875616941348e-06, + "loss": 3.0114, + "step": 288000 + }, + { + "epoch": 0.28377521023304936, + "grad_norm": 2.2508087158203125, + "learning_rate": 9.998752533591915e-06, + "loss": 3.0658, + "step": 288050 + }, + { + "epoch": 0.28382446821087143, + "grad_norm": 2.661543130874634, + "learning_rate": 9.99874889246486e-06, + "loss": 3.0781, + "step": 288100 + }, + { + "epoch": 0.2838737261886935, + "grad_norm": 2.1585402488708496, + "learning_rate": 9.998745246032315e-06, + "loss": 3.081, + "step": 288150 + }, + { + "epoch": 0.2839229841665156, + "grad_norm": 2.7667500972747803, + "learning_rate": 9.99874159429429e-06, + "loss": 3.017, + "step": 288200 + }, + { + "epoch": 0.2839722421443377, + "grad_norm": 2.31703519821167, + "learning_rate": 9.998737937250783e-06, + "loss": 3.0051, + "step": 288250 + }, + { + "epoch": 0.28402150012215976, + "grad_norm": 2.3286850452423096, + "learning_rate": 9.998734274901801e-06, + "loss": 3.0532, + "step": 288300 + }, + { + "epoch": 0.2840707580999819, + 
"grad_norm": 2.318650484085083, + "learning_rate": 9.998730607247349e-06, + "loss": 3.0587, + "step": 288350 + }, + { + "epoch": 0.28412001607780396, + "grad_norm": 2.330873966217041, + "learning_rate": 9.998726934287426e-06, + "loss": 2.9778, + "step": 288400 + }, + { + "epoch": 0.28416927405562603, + "grad_norm": 2.1771373748779297, + "learning_rate": 9.998723256022043e-06, + "loss": 3.1318, + "step": 288450 + }, + { + "epoch": 0.28421853203344816, + "grad_norm": 2.530815362930298, + "learning_rate": 9.998719572451198e-06, + "loss": 3.0885, + "step": 288500 + }, + { + "epoch": 0.2842677900112702, + "grad_norm": 2.3224003314971924, + "learning_rate": 9.998715883574898e-06, + "loss": 3.03, + "step": 288550 + }, + { + "epoch": 0.2843170479890923, + "grad_norm": 2.365121364593506, + "learning_rate": 9.998712189393144e-06, + "loss": 3.0801, + "step": 288600 + }, + { + "epoch": 0.2843663059669144, + "grad_norm": 2.2820188999176025, + "learning_rate": 9.998708489905944e-06, + "loss": 3.075, + "step": 288650 + }, + { + "epoch": 0.2844155639447365, + "grad_norm": 2.341099262237549, + "learning_rate": 9.998704785113298e-06, + "loss": 3.0954, + "step": 288700 + }, + { + "epoch": 0.28446482192255856, + "grad_norm": 2.417246103286743, + "learning_rate": 9.998701075015212e-06, + "loss": 3.0314, + "step": 288750 + }, + { + "epoch": 0.2845140799003807, + "grad_norm": 2.5416901111602783, + "learning_rate": 9.99869735961169e-06, + "loss": 3.0437, + "step": 288800 + }, + { + "epoch": 0.28456333787820276, + "grad_norm": 2.4126973152160645, + "learning_rate": 9.998693638902736e-06, + "loss": 3.0994, + "step": 288850 + }, + { + "epoch": 0.28461259585602483, + "grad_norm": 2.352830410003662, + "learning_rate": 9.998689912888354e-06, + "loss": 2.997, + "step": 288900 + }, + { + "epoch": 0.28466185383384696, + "grad_norm": 2.3088693618774414, + "learning_rate": 9.998686181568546e-06, + "loss": 3.0273, + "step": 288950 + }, + { + "epoch": 0.284711111811669, + "grad_norm": 
2.1958441734313965, + "learning_rate": 9.998682444943318e-06, + "loss": 3.0763, + "step": 289000 + }, + { + "epoch": 0.2847603697894911, + "grad_norm": 2.1652441024780273, + "learning_rate": 9.998678703012673e-06, + "loss": 3.1059, + "step": 289050 + }, + { + "epoch": 0.28480962776731317, + "grad_norm": 2.351027488708496, + "learning_rate": 9.998674955776614e-06, + "loss": 3.0375, + "step": 289100 + }, + { + "epoch": 0.2848588857451353, + "grad_norm": 2.4307076930999756, + "learning_rate": 9.99867120323515e-06, + "loss": 3.0258, + "step": 289150 + }, + { + "epoch": 0.28490814372295736, + "grad_norm": 2.421461820602417, + "learning_rate": 9.998667445388278e-06, + "loss": 3.1098, + "step": 289200 + }, + { + "epoch": 0.28495740170077943, + "grad_norm": 2.381002187728882, + "learning_rate": 9.998663682236006e-06, + "loss": 3.0194, + "step": 289250 + }, + { + "epoch": 0.28500665967860156, + "grad_norm": 2.201969861984253, + "learning_rate": 9.99865991377834e-06, + "loss": 3.1069, + "step": 289300 + }, + { + "epoch": 0.28505591765642363, + "grad_norm": 2.456117868423462, + "learning_rate": 9.998656140015279e-06, + "loss": 3.0396, + "step": 289350 + }, + { + "epoch": 0.2851051756342457, + "grad_norm": 2.3484225273132324, + "learning_rate": 9.99865236094683e-06, + "loss": 3.0222, + "step": 289400 + }, + { + "epoch": 0.2851544336120678, + "grad_norm": 2.276754856109619, + "learning_rate": 9.998648576572994e-06, + "loss": 3.0406, + "step": 289450 + }, + { + "epoch": 0.2852036915898899, + "grad_norm": 2.3536970615386963, + "learning_rate": 9.99864478689378e-06, + "loss": 2.9742, + "step": 289500 + }, + { + "epoch": 0.28525294956771197, + "grad_norm": 2.249814510345459, + "learning_rate": 9.998640991909189e-06, + "loss": 2.9907, + "step": 289550 + }, + { + "epoch": 0.2853022075455341, + "grad_norm": 2.3742475509643555, + "learning_rate": 9.998637191619223e-06, + "loss": 3.0442, + "step": 289600 + }, + { + "epoch": 0.28535146552335616, + "grad_norm": 2.253692150115967, + 
"learning_rate": 9.99863338602389e-06, + "loss": 3.0869, + "step": 289650 + }, + { + "epoch": 0.28540072350117823, + "grad_norm": 2.336601972579956, + "learning_rate": 9.998629575123193e-06, + "loss": 3.1209, + "step": 289700 + }, + { + "epoch": 0.28544998147900036, + "grad_norm": 2.5015978813171387, + "learning_rate": 9.998625758917134e-06, + "loss": 3.0614, + "step": 289750 + }, + { + "epoch": 0.28549923945682243, + "grad_norm": 2.152068614959717, + "learning_rate": 9.998621937405722e-06, + "loss": 3.0552, + "step": 289800 + }, + { + "epoch": 0.2855484974346445, + "grad_norm": 2.3531506061553955, + "learning_rate": 9.998618110588954e-06, + "loss": 3.0623, + "step": 289850 + }, + { + "epoch": 0.2855977554124666, + "grad_norm": 2.273463726043701, + "learning_rate": 9.998614278466838e-06, + "loss": 3.0507, + "step": 289900 + }, + { + "epoch": 0.2856470133902887, + "grad_norm": 2.2851500511169434, + "learning_rate": 9.99861044103938e-06, + "loss": 3.1008, + "step": 289950 + }, + { + "epoch": 0.28569627136811077, + "grad_norm": 2.275754690170288, + "learning_rate": 9.99860659830658e-06, + "loss": 2.9778, + "step": 290000 + }, + { + "epoch": 0.2857455293459329, + "grad_norm": 2.3485803604125977, + "learning_rate": 9.998602750268444e-06, + "loss": 2.9705, + "step": 290050 + }, + { + "epoch": 0.28579478732375496, + "grad_norm": 2.177736520767212, + "learning_rate": 9.998598896924976e-06, + "loss": 3.0499, + "step": 290100 + }, + { + "epoch": 0.28584404530157703, + "grad_norm": 2.284900188446045, + "learning_rate": 9.99859503827618e-06, + "loss": 3.0254, + "step": 290150 + }, + { + "epoch": 0.28589330327939916, + "grad_norm": 2.2090563774108887, + "learning_rate": 9.99859117432206e-06, + "loss": 2.9789, + "step": 290200 + }, + { + "epoch": 0.28594256125722123, + "grad_norm": 2.348712921142578, + "learning_rate": 9.998587305062621e-06, + "loss": 3.0074, + "step": 290250 + }, + { + "epoch": 0.2859918192350433, + "grad_norm": 2.168501615524292, + "learning_rate": 
9.998583430497866e-06, + "loss": 3.022, + "step": 290300 + }, + { + "epoch": 0.28604107721286537, + "grad_norm": 2.227161407470703, + "learning_rate": 9.9985795506278e-06, + "loss": 3.035, + "step": 290350 + }, + { + "epoch": 0.2860903351906875, + "grad_norm": 2.1885433197021484, + "learning_rate": 9.998575665452427e-06, + "loss": 2.9717, + "step": 290400 + }, + { + "epoch": 0.28613959316850957, + "grad_norm": 2.3073792457580566, + "learning_rate": 9.998571774971748e-06, + "loss": 3.1001, + "step": 290450 + }, + { + "epoch": 0.28618885114633164, + "grad_norm": 2.255302906036377, + "learning_rate": 9.998567879185772e-06, + "loss": 3.0153, + "step": 290500 + }, + { + "epoch": 0.28623810912415376, + "grad_norm": 2.36064076423645, + "learning_rate": 9.9985639780945e-06, + "loss": 2.9695, + "step": 290550 + }, + { + "epoch": 0.28628736710197583, + "grad_norm": 2.3862011432647705, + "learning_rate": 9.998560071697939e-06, + "loss": 3.0633, + "step": 290600 + }, + { + "epoch": 0.2863366250797979, + "grad_norm": 2.2919342517852783, + "learning_rate": 9.99855615999609e-06, + "loss": 3.0401, + "step": 290650 + }, + { + "epoch": 0.28638588305762, + "grad_norm": 2.4770870208740234, + "learning_rate": 9.998552242988958e-06, + "loss": 3.0774, + "step": 290700 + }, + { + "epoch": 0.2864351410354421, + "grad_norm": 2.1949996948242188, + "learning_rate": 9.998548320676547e-06, + "loss": 3.0379, + "step": 290750 + }, + { + "epoch": 0.28648439901326417, + "grad_norm": 2.3981010913848877, + "learning_rate": 9.998544393058863e-06, + "loss": 3.0624, + "step": 290800 + }, + { + "epoch": 0.2865336569910863, + "grad_norm": 2.358799457550049, + "learning_rate": 9.998540460135909e-06, + "loss": 3.0046, + "step": 290850 + }, + { + "epoch": 0.28658291496890836, + "grad_norm": 2.4417061805725098, + "learning_rate": 9.998536521907688e-06, + "loss": 3.0653, + "step": 290900 + }, + { + "epoch": 0.28663217294673043, + "grad_norm": 2.6389684677124023, + "learning_rate": 9.998532578374205e-06, + 
"loss": 3.0171, + "step": 290950 + }, + { + "epoch": 0.28668143092455256, + "grad_norm": 2.358948230743408, + "learning_rate": 9.998528629535467e-06, + "loss": 2.9944, + "step": 291000 + }, + { + "epoch": 0.28673068890237463, + "grad_norm": 2.3416569232940674, + "learning_rate": 9.998524675391473e-06, + "loss": 3.0654, + "step": 291050 + }, + { + "epoch": 0.2867799468801967, + "grad_norm": 2.2462575435638428, + "learning_rate": 9.998520715942231e-06, + "loss": 3.0886, + "step": 291100 + }, + { + "epoch": 0.2868292048580188, + "grad_norm": 2.235870122909546, + "learning_rate": 9.998516751187744e-06, + "loss": 3.1541, + "step": 291150 + }, + { + "epoch": 0.2868784628358409, + "grad_norm": 2.026491641998291, + "learning_rate": 9.998512781128016e-06, + "loss": 3.0491, + "step": 291200 + }, + { + "epoch": 0.28692772081366297, + "grad_norm": 2.756687641143799, + "learning_rate": 9.99850880576305e-06, + "loss": 2.989, + "step": 291250 + }, + { + "epoch": 0.2869769787914851, + "grad_norm": 2.439873456954956, + "learning_rate": 9.998504825092854e-06, + "loss": 3.129, + "step": 291300 + }, + { + "epoch": 0.28702623676930716, + "grad_norm": 2.2006893157958984, + "learning_rate": 9.998500839117429e-06, + "loss": 3.0293, + "step": 291350 + }, + { + "epoch": 0.28707549474712923, + "grad_norm": 2.4875404834747314, + "learning_rate": 9.99849684783678e-06, + "loss": 3.068, + "step": 291400 + }, + { + "epoch": 0.28712475272495136, + "grad_norm": 2.3245279788970947, + "learning_rate": 9.998492851250913e-06, + "loss": 3.0295, + "step": 291450 + }, + { + "epoch": 0.28717401070277343, + "grad_norm": 2.655789852142334, + "learning_rate": 9.998488849359828e-06, + "loss": 3.0825, + "step": 291500 + }, + { + "epoch": 0.2872232686805955, + "grad_norm": 2.3727099895477295, + "learning_rate": 9.998484842163533e-06, + "loss": 3.0384, + "step": 291550 + }, + { + "epoch": 0.28727252665841757, + "grad_norm": 2.3597002029418945, + "learning_rate": 9.998480829662032e-06, + "loss": 3.0726, + "step": 
291600 + }, + { + "epoch": 0.2873217846362397, + "grad_norm": 2.3375067710876465, + "learning_rate": 9.998476811855325e-06, + "loss": 2.9592, + "step": 291650 + }, + { + "epoch": 0.28737104261406177, + "grad_norm": 2.2842366695404053, + "learning_rate": 9.998472788743424e-06, + "loss": 3.0099, + "step": 291700 + }, + { + "epoch": 0.28742030059188384, + "grad_norm": 2.399832248687744, + "learning_rate": 9.998468760326326e-06, + "loss": 3.0611, + "step": 291750 + }, + { + "epoch": 0.28746955856970596, + "grad_norm": 2.417965888977051, + "learning_rate": 9.99846472660404e-06, + "loss": 3.0718, + "step": 291800 + }, + { + "epoch": 0.28751881654752803, + "grad_norm": 2.2810895442962646, + "learning_rate": 9.998460687576567e-06, + "loss": 3.094, + "step": 291850 + }, + { + "epoch": 0.2875680745253501, + "grad_norm": 2.367478370666504, + "learning_rate": 9.998456643243914e-06, + "loss": 3.0856, + "step": 291900 + }, + { + "epoch": 0.28761733250317223, + "grad_norm": 2.29252290725708, + "learning_rate": 9.998452593606083e-06, + "loss": 2.9614, + "step": 291950 + }, + { + "epoch": 0.2876665904809943, + "grad_norm": 2.439150810241699, + "learning_rate": 9.99844853866308e-06, + "loss": 3.0131, + "step": 292000 + }, + { + "epoch": 0.28771584845881637, + "grad_norm": 2.387671709060669, + "learning_rate": 9.998444478414908e-06, + "loss": 3.0589, + "step": 292050 + }, + { + "epoch": 0.2877651064366385, + "grad_norm": 2.462489128112793, + "learning_rate": 9.998440412861572e-06, + "loss": 3.0246, + "step": 292100 + }, + { + "epoch": 0.28781436441446057, + "grad_norm": 2.3577253818511963, + "learning_rate": 9.998436342003077e-06, + "loss": 3.0819, + "step": 292150 + }, + { + "epoch": 0.28786362239228264, + "grad_norm": 2.3059709072113037, + "learning_rate": 9.998432265839425e-06, + "loss": 3.0599, + "step": 292200 + }, + { + "epoch": 0.28791288037010476, + "grad_norm": 2.184478759765625, + "learning_rate": 9.998428184370623e-06, + "loss": 2.9718, + "step": 292250 + }, + { + "epoch": 
0.28796213834792683, + "grad_norm": 2.3547329902648926, + "learning_rate": 9.998424097596674e-06, + "loss": 3.0426, + "step": 292300 + }, + { + "epoch": 0.2880113963257489, + "grad_norm": 2.318939685821533, + "learning_rate": 9.998420005517583e-06, + "loss": 3.0042, + "step": 292350 + }, + { + "epoch": 0.28806065430357103, + "grad_norm": 2.2254958152770996, + "learning_rate": 9.998415908133354e-06, + "loss": 2.9885, + "step": 292400 + }, + { + "epoch": 0.2881099122813931, + "grad_norm": 2.692089557647705, + "learning_rate": 9.998411805443991e-06, + "loss": 3.0373, + "step": 292450 + }, + { + "epoch": 0.28815917025921517, + "grad_norm": 2.802527904510498, + "learning_rate": 9.998407697449498e-06, + "loss": 3.0506, + "step": 292500 + }, + { + "epoch": 0.2882084282370373, + "grad_norm": 2.3120856285095215, + "learning_rate": 9.998403584149881e-06, + "loss": 3.0394, + "step": 292550 + }, + { + "epoch": 0.28825768621485937, + "grad_norm": 2.6264266967773438, + "learning_rate": 9.998399465545144e-06, + "loss": 3.0542, + "step": 292600 + }, + { + "epoch": 0.28830694419268144, + "grad_norm": 2.3126935958862305, + "learning_rate": 9.998395341635288e-06, + "loss": 3.0138, + "step": 292650 + }, + { + "epoch": 0.28835620217050356, + "grad_norm": 2.3678417205810547, + "learning_rate": 9.998391212420323e-06, + "loss": 3.0341, + "step": 292700 + }, + { + "epoch": 0.28840546014832563, + "grad_norm": 2.287625789642334, + "learning_rate": 9.998387077900248e-06, + "loss": 3.0277, + "step": 292750 + }, + { + "epoch": 0.2884547181261477, + "grad_norm": 2.3791117668151855, + "learning_rate": 9.998382938075072e-06, + "loss": 3.0604, + "step": 292800 + }, + { + "epoch": 0.2885039761039698, + "grad_norm": 2.245936870574951, + "learning_rate": 9.998378792944795e-06, + "loss": 3.0577, + "step": 292850 + }, + { + "epoch": 0.2885532340817919, + "grad_norm": 2.310142993927002, + "learning_rate": 9.998374642509427e-06, + "loss": 3.0252, + "step": 292900 + }, + { + "epoch": 0.28860249205961397, + 
"grad_norm": 2.2812869548797607, + "learning_rate": 9.998370486768966e-06, + "loss": 3.0281, + "step": 292950 + }, + { + "epoch": 0.28865175003743604, + "grad_norm": 2.322847366333008, + "learning_rate": 9.99836632572342e-06, + "loss": 3.0176, + "step": 293000 + }, + { + "epoch": 0.28870100801525816, + "grad_norm": 2.3665616512298584, + "learning_rate": 9.998362159372795e-06, + "loss": 3.0327, + "step": 293050 + }, + { + "epoch": 0.28875026599308024, + "grad_norm": 2.3478260040283203, + "learning_rate": 9.998357987717091e-06, + "loss": 3.0172, + "step": 293100 + }, + { + "epoch": 0.2887995239709023, + "grad_norm": 2.3367061614990234, + "learning_rate": 9.998353810756316e-06, + "loss": 3.0149, + "step": 293150 + }, + { + "epoch": 0.28884878194872443, + "grad_norm": 2.296567916870117, + "learning_rate": 9.998349628490472e-06, + "loss": 2.987, + "step": 293200 + }, + { + "epoch": 0.2888980399265465, + "grad_norm": 2.200796604156494, + "learning_rate": 9.998345440919565e-06, + "loss": 3.0011, + "step": 293250 + }, + { + "epoch": 0.28894729790436857, + "grad_norm": 2.3108108043670654, + "learning_rate": 9.9983412480436e-06, + "loss": 3.0328, + "step": 293300 + }, + { + "epoch": 0.2889965558821907, + "grad_norm": 2.2771708965301514, + "learning_rate": 9.99833704986258e-06, + "loss": 2.968, + "step": 293350 + }, + { + "epoch": 0.28904581386001277, + "grad_norm": 2.2548904418945312, + "learning_rate": 9.99833284637651e-06, + "loss": 3.0709, + "step": 293400 + }, + { + "epoch": 0.28909507183783484, + "grad_norm": 2.316235065460205, + "learning_rate": 9.998328637585393e-06, + "loss": 3.0418, + "step": 293450 + }, + { + "epoch": 0.28914432981565696, + "grad_norm": 2.4283809661865234, + "learning_rate": 9.998324423489238e-06, + "loss": 2.9466, + "step": 293500 + }, + { + "epoch": 0.28919358779347903, + "grad_norm": 2.586810827255249, + "learning_rate": 9.998320204088044e-06, + "loss": 3.072, + "step": 293550 + }, + { + "epoch": 0.2892428457713011, + "grad_norm": 
2.4635560512542725, + "learning_rate": 9.998315979381818e-06, + "loss": 3.0213, + "step": 293600 + }, + { + "epoch": 0.28929210374912323, + "grad_norm": 3.2042336463928223, + "learning_rate": 9.998311749370564e-06, + "loss": 2.9962, + "step": 293650 + }, + { + "epoch": 0.2893413617269453, + "grad_norm": 2.270669937133789, + "learning_rate": 9.998307514054288e-06, + "loss": 2.9994, + "step": 293700 + }, + { + "epoch": 0.28939061970476737, + "grad_norm": 2.392421007156372, + "learning_rate": 9.998303273432993e-06, + "loss": 3.0089, + "step": 293750 + }, + { + "epoch": 0.2894398776825895, + "grad_norm": 2.2320640087127686, + "learning_rate": 9.998299027506683e-06, + "loss": 3.0224, + "step": 293800 + }, + { + "epoch": 0.28948913566041157, + "grad_norm": 2.44762921333313, + "learning_rate": 9.998294776275365e-06, + "loss": 3.0424, + "step": 293850 + }, + { + "epoch": 0.28953839363823364, + "grad_norm": 2.2852888107299805, + "learning_rate": 9.99829051973904e-06, + "loss": 3.0864, + "step": 293900 + }, + { + "epoch": 0.28958765161605576, + "grad_norm": 2.5310120582580566, + "learning_rate": 9.998286257897714e-06, + "loss": 3.0324, + "step": 293950 + }, + { + "epoch": 0.28963690959387783, + "grad_norm": 2.533622980117798, + "learning_rate": 9.998281990751394e-06, + "loss": 3.0093, + "step": 294000 + }, + { + "epoch": 0.2896861675716999, + "grad_norm": 2.4127769470214844, + "learning_rate": 9.998277718300081e-06, + "loss": 3.0144, + "step": 294050 + }, + { + "epoch": 0.289735425549522, + "grad_norm": 2.397904872894287, + "learning_rate": 9.99827344054378e-06, + "loss": 3.0353, + "step": 294100 + }, + { + "epoch": 0.2897846835273441, + "grad_norm": 3.5688552856445312, + "learning_rate": 9.998269157482499e-06, + "loss": 3.0347, + "step": 294150 + }, + { + "epoch": 0.28983394150516617, + "grad_norm": 2.3525664806365967, + "learning_rate": 9.99826486911624e-06, + "loss": 3.0278, + "step": 294200 + }, + { + "epoch": 0.28988319948298824, + "grad_norm": 2.272430658340454, + 
"learning_rate": 9.998260575445005e-06, + "loss": 2.9651, + "step": 294250 + }, + { + "epoch": 0.28993245746081037, + "grad_norm": 2.218174457550049, + "learning_rate": 9.998256276468803e-06, + "loss": 2.9642, + "step": 294300 + }, + { + "epoch": 0.28998171543863244, + "grad_norm": 2.1877808570861816, + "learning_rate": 9.998251972187637e-06, + "loss": 2.9469, + "step": 294350 + }, + { + "epoch": 0.2900309734164545, + "grad_norm": 2.243525743484497, + "learning_rate": 9.99824766260151e-06, + "loss": 3.0118, + "step": 294400 + }, + { + "epoch": 0.29008023139427663, + "grad_norm": 2.282008647918701, + "learning_rate": 9.998243347710428e-06, + "loss": 3.0374, + "step": 294450 + }, + { + "epoch": 0.2901294893720987, + "grad_norm": 2.290919780731201, + "learning_rate": 9.998239027514395e-06, + "loss": 3.0776, + "step": 294500 + }, + { + "epoch": 0.2901787473499208, + "grad_norm": 2.4914908409118652, + "learning_rate": 9.998234702013419e-06, + "loss": 3.0654, + "step": 294550 + }, + { + "epoch": 0.2902280053277429, + "grad_norm": 2.331376075744629, + "learning_rate": 9.998230371207499e-06, + "loss": 3.0268, + "step": 294600 + }, + { + "epoch": 0.29027726330556497, + "grad_norm": 2.17211651802063, + "learning_rate": 9.998226035096642e-06, + "loss": 3.0939, + "step": 294650 + }, + { + "epoch": 0.29032652128338704, + "grad_norm": 2.363924026489258, + "learning_rate": 9.998221693680854e-06, + "loss": 3.0348, + "step": 294700 + }, + { + "epoch": 0.29037577926120917, + "grad_norm": 2.2677977085113525, + "learning_rate": 9.998217346960137e-06, + "loss": 3.0658, + "step": 294750 + }, + { + "epoch": 0.29042503723903124, + "grad_norm": 2.41581392288208, + "learning_rate": 9.998212994934498e-06, + "loss": 3.0152, + "step": 294800 + }, + { + "epoch": 0.2904742952168533, + "grad_norm": 2.3087527751922607, + "learning_rate": 9.99820863760394e-06, + "loss": 3.096, + "step": 294850 + }, + { + "epoch": 0.29052355319467543, + "grad_norm": 2.3304736614227295, + "learning_rate": 
9.998204274968468e-06, + "loss": 3.0041, + "step": 294900 + }, + { + "epoch": 0.2905728111724975, + "grad_norm": 2.237182378768921, + "learning_rate": 9.998199907028088e-06, + "loss": 2.9745, + "step": 294950 + }, + { + "epoch": 0.2906220691503196, + "grad_norm": 2.222710371017456, + "learning_rate": 9.998195533782802e-06, + "loss": 3.06, + "step": 295000 + }, + { + "epoch": 0.2906713271281417, + "grad_norm": 2.2881669998168945, + "learning_rate": 9.998191155232617e-06, + "loss": 3.132, + "step": 295050 + }, + { + "epoch": 0.29072058510596377, + "grad_norm": 2.479964256286621, + "learning_rate": 9.998186771377538e-06, + "loss": 3.0139, + "step": 295100 + }, + { + "epoch": 0.29076984308378584, + "grad_norm": 2.5020108222961426, + "learning_rate": 9.998182382217567e-06, + "loss": 3.0226, + "step": 295150 + }, + { + "epoch": 0.29081910106160797, + "grad_norm": 2.3414788246154785, + "learning_rate": 9.998177987752712e-06, + "loss": 3.0322, + "step": 295200 + }, + { + "epoch": 0.29086835903943004, + "grad_norm": 2.2535030841827393, + "learning_rate": 9.998173587982974e-06, + "loss": 3.005, + "step": 295250 + }, + { + "epoch": 0.2909176170172521, + "grad_norm": 2.2909293174743652, + "learning_rate": 9.99816918290836e-06, + "loss": 3.0692, + "step": 295300 + }, + { + "epoch": 0.2909668749950742, + "grad_norm": 2.2951035499572754, + "learning_rate": 9.998164772528873e-06, + "loss": 3.0997, + "step": 295350 + }, + { + "epoch": 0.2910161329728963, + "grad_norm": 2.302123785018921, + "learning_rate": 9.99816035684452e-06, + "loss": 3.0218, + "step": 295400 + }, + { + "epoch": 0.2910653909507184, + "grad_norm": 2.2479300498962402, + "learning_rate": 9.998155935855304e-06, + "loss": 3.0235, + "step": 295450 + }, + { + "epoch": 0.29111464892854044, + "grad_norm": 2.2750980854034424, + "learning_rate": 9.998151509561231e-06, + "loss": 3.0321, + "step": 295500 + }, + { + "epoch": 0.29116390690636257, + "grad_norm": 2.3885955810546875, + "learning_rate": 9.998147077962305e-06, + 
"loss": 3.0197, + "step": 295550 + }, + { + "epoch": 0.29121316488418464, + "grad_norm": 2.306899309158325, + "learning_rate": 9.99814264105853e-06, + "loss": 2.9751, + "step": 295600 + }, + { + "epoch": 0.2912624228620067, + "grad_norm": 2.3755881786346436, + "learning_rate": 9.998138198849911e-06, + "loss": 2.9904, + "step": 295650 + }, + { + "epoch": 0.29131168083982883, + "grad_norm": 2.2048189640045166, + "learning_rate": 9.998133751336453e-06, + "loss": 3.0756, + "step": 295700 + }, + { + "epoch": 0.2913609388176509, + "grad_norm": 2.330815553665161, + "learning_rate": 9.99812929851816e-06, + "loss": 3.0836, + "step": 295750 + }, + { + "epoch": 0.291410196795473, + "grad_norm": 2.3507096767425537, + "learning_rate": 9.99812484039504e-06, + "loss": 2.9933, + "step": 295800 + }, + { + "epoch": 0.2914594547732951, + "grad_norm": 2.127321481704712, + "learning_rate": 9.998120376967093e-06, + "loss": 3.0235, + "step": 295850 + }, + { + "epoch": 0.29150871275111717, + "grad_norm": 2.209505081176758, + "learning_rate": 9.998115908234326e-06, + "loss": 2.932, + "step": 295900 + }, + { + "epoch": 0.29155797072893924, + "grad_norm": 2.355177640914917, + "learning_rate": 9.998111434196744e-06, + "loss": 3.0603, + "step": 295950 + }, + { + "epoch": 0.29160722870676137, + "grad_norm": 2.3396031856536865, + "learning_rate": 9.998106954854353e-06, + "loss": 3.0326, + "step": 296000 + }, + { + "epoch": 0.29165648668458344, + "grad_norm": 2.326965808868408, + "learning_rate": 9.998102470207155e-06, + "loss": 2.9948, + "step": 296050 + }, + { + "epoch": 0.2917057446624055, + "grad_norm": 2.3114209175109863, + "learning_rate": 9.998097980255155e-06, + "loss": 3.2299, + "step": 296100 + }, + { + "epoch": 0.29175500264022763, + "grad_norm": 2.303318500518799, + "learning_rate": 9.998093484998361e-06, + "loss": 3.0625, + "step": 296150 + }, + { + "epoch": 0.2918042606180497, + "grad_norm": 2.2999136447906494, + "learning_rate": 9.998088984436772e-06, + "loss": 3.0399, + "step": 
296200 + }, + { + "epoch": 0.2918535185958718, + "grad_norm": 2.250926971435547, + "learning_rate": 9.9980844785704e-06, + "loss": 3.0057, + "step": 296250 + }, + { + "epoch": 0.2919027765736939, + "grad_norm": 2.2653074264526367, + "learning_rate": 9.998079967399244e-06, + "loss": 3.0406, + "step": 296300 + }, + { + "epoch": 0.29195203455151597, + "grad_norm": 2.1483070850372314, + "learning_rate": 9.99807545092331e-06, + "loss": 3.0727, + "step": 296350 + }, + { + "epoch": 0.29200129252933804, + "grad_norm": 2.2660086154937744, + "learning_rate": 9.998070929142606e-06, + "loss": 3.0068, + "step": 296400 + }, + { + "epoch": 0.29205055050716017, + "grad_norm": 2.323580265045166, + "learning_rate": 9.998066402057133e-06, + "loss": 3.0377, + "step": 296450 + }, + { + "epoch": 0.29209980848498224, + "grad_norm": 2.492438316345215, + "learning_rate": 9.998061869666898e-06, + "loss": 3.043, + "step": 296500 + }, + { + "epoch": 0.2921490664628043, + "grad_norm": 2.421921491622925, + "learning_rate": 9.998057331971903e-06, + "loss": 3.0073, + "step": 296550 + }, + { + "epoch": 0.2921983244406264, + "grad_norm": 2.3024890422821045, + "learning_rate": 9.998052788972158e-06, + "loss": 2.9749, + "step": 296600 + }, + { + "epoch": 0.2922475824184485, + "grad_norm": 2.280564069747925, + "learning_rate": 9.998048240667662e-06, + "loss": 3.039, + "step": 296650 + }, + { + "epoch": 0.2922968403962706, + "grad_norm": 2.519284725189209, + "learning_rate": 9.998043687058423e-06, + "loss": 2.9565, + "step": 296700 + }, + { + "epoch": 0.29234609837409264, + "grad_norm": 2.4160988330841064, + "learning_rate": 9.998039128144447e-06, + "loss": 3.0407, + "step": 296750 + }, + { + "epoch": 0.29239535635191477, + "grad_norm": 2.5507562160491943, + "learning_rate": 9.998034563925737e-06, + "loss": 3.0224, + "step": 296800 + }, + { + "epoch": 0.29244461432973684, + "grad_norm": 2.036102294921875, + "learning_rate": 9.998029994402297e-06, + "loss": 2.9972, + "step": 296850 + }, + { + "epoch": 
0.2924938723075589, + "grad_norm": 2.2396790981292725, + "learning_rate": 9.998025419574131e-06, + "loss": 3.0048, + "step": 296900 + }, + { + "epoch": 0.29254313028538104, + "grad_norm": 2.4221136569976807, + "learning_rate": 9.998020839441246e-06, + "loss": 3.0073, + "step": 296950 + }, + { + "epoch": 0.2925923882632031, + "grad_norm": 2.4926257133483887, + "learning_rate": 9.99801625400365e-06, + "loss": 3.036, + "step": 297000 + }, + { + "epoch": 0.2926416462410252, + "grad_norm": 2.058344841003418, + "learning_rate": 9.998011663261343e-06, + "loss": 3.0355, + "step": 297050 + }, + { + "epoch": 0.2926909042188473, + "grad_norm": 2.4081218242645264, + "learning_rate": 9.99800706721433e-06, + "loss": 3.065, + "step": 297100 + }, + { + "epoch": 0.2927401621966694, + "grad_norm": 2.359699249267578, + "learning_rate": 9.998002465862619e-06, + "loss": 2.9612, + "step": 297150 + }, + { + "epoch": 0.29278942017449144, + "grad_norm": 2.3137311935424805, + "learning_rate": 9.997997859206212e-06, + "loss": 2.9654, + "step": 297200 + }, + { + "epoch": 0.29283867815231357, + "grad_norm": 2.3155648708343506, + "learning_rate": 9.997993247245115e-06, + "loss": 3.0773, + "step": 297250 + }, + { + "epoch": 0.29288793613013564, + "grad_norm": 2.27374529838562, + "learning_rate": 9.997988629979334e-06, + "loss": 2.9492, + "step": 297300 + }, + { + "epoch": 0.2929371941079577, + "grad_norm": 2.234924554824829, + "learning_rate": 9.997984007408871e-06, + "loss": 3.0349, + "step": 297350 + }, + { + "epoch": 0.29298645208577984, + "grad_norm": 2.404151439666748, + "learning_rate": 9.997979379533733e-06, + "loss": 3.0389, + "step": 297400 + }, + { + "epoch": 0.2930357100636019, + "grad_norm": 2.3502793312072754, + "learning_rate": 9.997974746353925e-06, + "loss": 3.0308, + "step": 297450 + }, + { + "epoch": 0.293084968041424, + "grad_norm": 2.1905019283294678, + "learning_rate": 9.997970107869452e-06, + "loss": 3.0136, + "step": 297500 + }, + { + "epoch": 0.2931342260192461, + 
"grad_norm": 2.6126620769500732, + "learning_rate": 9.997965464080318e-06, + "loss": 2.9793, + "step": 297550 + }, + { + "epoch": 0.2931834839970682, + "grad_norm": 2.20261549949646, + "learning_rate": 9.997960814986528e-06, + "loss": 2.9788, + "step": 297600 + }, + { + "epoch": 0.29323274197489024, + "grad_norm": 2.3823049068450928, + "learning_rate": 9.997956160588088e-06, + "loss": 2.9969, + "step": 297650 + }, + { + "epoch": 0.29328199995271237, + "grad_norm": 2.961604595184326, + "learning_rate": 9.997951500885e-06, + "loss": 2.9895, + "step": 297700 + }, + { + "epoch": 0.29333125793053444, + "grad_norm": 2.2704825401306152, + "learning_rate": 9.997946835877275e-06, + "loss": 3.0208, + "step": 297750 + }, + { + "epoch": 0.2933805159083565, + "grad_norm": 2.2912776470184326, + "learning_rate": 9.99794216556491e-06, + "loss": 2.97, + "step": 297800 + }, + { + "epoch": 0.2934297738861786, + "grad_norm": 2.3277502059936523, + "learning_rate": 9.997937489947917e-06, + "loss": 3.0822, + "step": 297850 + }, + { + "epoch": 0.2934790318640007, + "grad_norm": 2.2956442832946777, + "learning_rate": 9.997932809026298e-06, + "loss": 3.0024, + "step": 297900 + }, + { + "epoch": 0.2935282898418228, + "grad_norm": 2.7489371299743652, + "learning_rate": 9.997928122800056e-06, + "loss": 3.0273, + "step": 297950 + }, + { + "epoch": 0.29357754781964485, + "grad_norm": 2.4630348682403564, + "learning_rate": 9.9979234312692e-06, + "loss": 2.9668, + "step": 298000 + }, + { + "epoch": 0.29362680579746697, + "grad_norm": 2.318434953689575, + "learning_rate": 9.997918734433733e-06, + "loss": 3.0377, + "step": 298050 + }, + { + "epoch": 0.29367606377528904, + "grad_norm": 2.313265085220337, + "learning_rate": 9.99791403229366e-06, + "loss": 3.0475, + "step": 298100 + }, + { + "epoch": 0.2937253217531111, + "grad_norm": 2.316800832748413, + "learning_rate": 9.997909324848983e-06, + "loss": 3.0248, + "step": 298150 + }, + { + "epoch": 0.29377457973093324, + "grad_norm": 2.141268014907837, 
+ "learning_rate": 9.997904612099712e-06, + "loss": 3.0027, + "step": 298200 + }, + { + "epoch": 0.2938238377087553, + "grad_norm": 2.2197372913360596, + "learning_rate": 9.997899894045851e-06, + "loss": 3.0501, + "step": 298250 + }, + { + "epoch": 0.2938730956865774, + "grad_norm": 2.395688533782959, + "learning_rate": 9.997895170687402e-06, + "loss": 3.0357, + "step": 298300 + }, + { + "epoch": 0.2939223536643995, + "grad_norm": 2.3097569942474365, + "learning_rate": 9.997890442024375e-06, + "loss": 3.1222, + "step": 298350 + }, + { + "epoch": 0.2939716116422216, + "grad_norm": 2.4284706115722656, + "learning_rate": 9.997885708056769e-06, + "loss": 3.0151, + "step": 298400 + }, + { + "epoch": 0.29402086962004365, + "grad_norm": 2.2458603382110596, + "learning_rate": 9.997880968784592e-06, + "loss": 3.0383, + "step": 298450 + }, + { + "epoch": 0.29407012759786577, + "grad_norm": 2.279085159301758, + "learning_rate": 9.997876224207851e-06, + "loss": 3.0116, + "step": 298500 + }, + { + "epoch": 0.29411938557568784, + "grad_norm": 2.117119073867798, + "learning_rate": 9.997871474326548e-06, + "loss": 3.0286, + "step": 298550 + }, + { + "epoch": 0.2941686435535099, + "grad_norm": 2.2228713035583496, + "learning_rate": 9.99786671914069e-06, + "loss": 3.067, + "step": 298600 + }, + { + "epoch": 0.29421790153133204, + "grad_norm": 2.3333425521850586, + "learning_rate": 9.99786195865028e-06, + "loss": 3.0201, + "step": 298650 + }, + { + "epoch": 0.2942671595091541, + "grad_norm": 2.3131484985351562, + "learning_rate": 9.997857192855325e-06, + "loss": 2.988, + "step": 298700 + }, + { + "epoch": 0.2943164174869762, + "grad_norm": 2.408461809158325, + "learning_rate": 9.997852421755828e-06, + "loss": 3.0499, + "step": 298750 + }, + { + "epoch": 0.2943656754647983, + "grad_norm": 2.2897918224334717, + "learning_rate": 9.997847645351795e-06, + "loss": 2.9665, + "step": 298800 + }, + { + "epoch": 0.2944149334426204, + "grad_norm": 2.4277658462524414, + "learning_rate": 
9.997842863643232e-06, + "loss": 3.0504, + "step": 298850 + }, + { + "epoch": 0.29446419142044244, + "grad_norm": 2.4914588928222656, + "learning_rate": 9.997838076630146e-06, + "loss": 2.999, + "step": 298900 + }, + { + "epoch": 0.2945134493982645, + "grad_norm": 2.178740978240967, + "learning_rate": 9.997833284312536e-06, + "loss": 2.9704, + "step": 298950 + }, + { + "epoch": 0.29456270737608664, + "grad_norm": 2.179828405380249, + "learning_rate": 9.997828486690413e-06, + "loss": 3.0652, + "step": 299000 + }, + { + "epoch": 0.2946119653539087, + "grad_norm": 2.3757736682891846, + "learning_rate": 9.997823683763778e-06, + "loss": 3.089, + "step": 299050 + }, + { + "epoch": 0.2946612233317308, + "grad_norm": 2.786357879638672, + "learning_rate": 9.99781887553264e-06, + "loss": 3.0685, + "step": 299100 + }, + { + "epoch": 0.2947104813095529, + "grad_norm": 2.277616262435913, + "learning_rate": 9.997814061996999e-06, + "loss": 3.0256, + "step": 299150 + }, + { + "epoch": 0.294759739287375, + "grad_norm": 2.2985942363739014, + "learning_rate": 9.997809243156865e-06, + "loss": 3.053, + "step": 299200 + }, + { + "epoch": 0.29480899726519705, + "grad_norm": 2.1226439476013184, + "learning_rate": 9.99780441901224e-06, + "loss": 3.0519, + "step": 299250 + }, + { + "epoch": 0.2948582552430192, + "grad_norm": 2.2674336433410645, + "learning_rate": 9.997799589563132e-06, + "loss": 3.0319, + "step": 299300 + }, + { + "epoch": 0.29490751322084124, + "grad_norm": 2.2621166706085205, + "learning_rate": 9.997794754809543e-06, + "loss": 2.9876, + "step": 299350 + }, + { + "epoch": 0.2949567711986633, + "grad_norm": 2.2113723754882812, + "learning_rate": 9.99778991475148e-06, + "loss": 2.9922, + "step": 299400 + }, + { + "epoch": 0.29500602917648544, + "grad_norm": 2.239797353744507, + "learning_rate": 9.997785069388948e-06, + "loss": 3.0752, + "step": 299450 + }, + { + "epoch": 0.2950552871543075, + "grad_norm": 2.2194085121154785, + "learning_rate": 9.997780218721952e-06, + 
"loss": 3.1058, + "step": 299500 + }, + { + "epoch": 0.2951045451321296, + "grad_norm": 2.343980312347412, + "learning_rate": 9.997775362750497e-06, + "loss": 3.0814, + "step": 299550 + }, + { + "epoch": 0.2951538031099517, + "grad_norm": 2.3087575435638428, + "learning_rate": 9.997770501474588e-06, + "loss": 3.0418, + "step": 299600 + }, + { + "epoch": 0.2952030610877738, + "grad_norm": 2.37811279296875, + "learning_rate": 9.99776563489423e-06, + "loss": 3.0077, + "step": 299650 + }, + { + "epoch": 0.29525231906559585, + "grad_norm": 2.32179594039917, + "learning_rate": 9.997760763009427e-06, + "loss": 2.9622, + "step": 299700 + }, + { + "epoch": 0.295301577043418, + "grad_norm": 2.497282028198242, + "learning_rate": 9.997755885820188e-06, + "loss": 3.0178, + "step": 299750 + }, + { + "epoch": 0.29535083502124004, + "grad_norm": 2.214566707611084, + "learning_rate": 9.997751003326516e-06, + "loss": 2.9933, + "step": 299800 + }, + { + "epoch": 0.2954000929990621, + "grad_norm": 2.4989564418792725, + "learning_rate": 9.997746115528414e-06, + "loss": 3.0463, + "step": 299850 + }, + { + "epoch": 0.29544935097688424, + "grad_norm": 2.462315320968628, + "learning_rate": 9.99774122242589e-06, + "loss": 3.022, + "step": 299900 + }, + { + "epoch": 0.2954986089547063, + "grad_norm": 2.3170173168182373, + "learning_rate": 9.997736324018948e-06, + "loss": 3.0488, + "step": 299950 + }, + { + "epoch": 0.2955478669325284, + "grad_norm": 2.4481375217437744, + "learning_rate": 9.997731420307593e-06, + "loss": 3.0538, + "step": 300000 + }, + { + "epoch": 0.2955971249103505, + "grad_norm": 2.29868483543396, + "learning_rate": 9.997726511291832e-06, + "loss": 3.0856, + "step": 300050 + }, + { + "epoch": 0.2956463828881726, + "grad_norm": 2.501415252685547, + "learning_rate": 9.997721596971669e-06, + "loss": 3.0964, + "step": 300100 + }, + { + "epoch": 0.29569564086599465, + "grad_norm": 2.195765733718872, + "learning_rate": 9.99771667734711e-06, + "loss": 3.0413, + "step": 300150 + 
}, + { + "epoch": 0.2957448988438167, + "grad_norm": 2.2101051807403564, + "learning_rate": 9.997711752418157e-06, + "loss": 3.0551, + "step": 300200 + }, + { + "epoch": 0.29579415682163884, + "grad_norm": 2.2774300575256348, + "learning_rate": 9.99770682218482e-06, + "loss": 2.9889, + "step": 300250 + }, + { + "epoch": 0.2958434147994609, + "grad_norm": 2.167348623275757, + "learning_rate": 9.9977018866471e-06, + "loss": 2.9904, + "step": 300300 + }, + { + "epoch": 0.295892672777283, + "grad_norm": 2.172355890274048, + "learning_rate": 9.997696945805005e-06, + "loss": 2.9973, + "step": 300350 + }, + { + "epoch": 0.2959419307551051, + "grad_norm": 2.1638593673706055, + "learning_rate": 9.997691999658538e-06, + "loss": 3.0649, + "step": 300400 + }, + { + "epoch": 0.2959911887329272, + "grad_norm": 2.2954022884368896, + "learning_rate": 9.997687048207708e-06, + "loss": 3.0624, + "step": 300450 + }, + { + "epoch": 0.29604044671074925, + "grad_norm": 2.4686570167541504, + "learning_rate": 9.997682091452516e-06, + "loss": 3.0161, + "step": 300500 + }, + { + "epoch": 0.2960897046885714, + "grad_norm": 2.515688896179199, + "learning_rate": 9.99767712939297e-06, + "loss": 2.9986, + "step": 300550 + }, + { + "epoch": 0.29613896266639345, + "grad_norm": 2.2803728580474854, + "learning_rate": 9.997672162029074e-06, + "loss": 2.9414, + "step": 300600 + }, + { + "epoch": 0.2961882206442155, + "grad_norm": 2.281238555908203, + "learning_rate": 9.997667189360833e-06, + "loss": 3.0179, + "step": 300650 + }, + { + "epoch": 0.29623747862203764, + "grad_norm": 2.2644777297973633, + "learning_rate": 9.997662211388253e-06, + "loss": 3.0369, + "step": 300700 + }, + { + "epoch": 0.2962867365998597, + "grad_norm": 2.4472367763519287, + "learning_rate": 9.997657228111341e-06, + "loss": 3.0525, + "step": 300750 + }, + { + "epoch": 0.2963359945776818, + "grad_norm": 2.2760369777679443, + "learning_rate": 9.997652239530099e-06, + "loss": 2.9871, + "step": 300800 + }, + { + "epoch": 
0.2963852525555039, + "grad_norm": 2.277613639831543, + "learning_rate": 9.997647245644534e-06, + "loss": 3.0682, + "step": 300850 + }, + { + "epoch": 0.296434510533326, + "grad_norm": 2.2597336769104004, + "learning_rate": 9.997642246454652e-06, + "loss": 3.0561, + "step": 300900 + }, + { + "epoch": 0.29648376851114805, + "grad_norm": 2.398033857345581, + "learning_rate": 9.997637241960457e-06, + "loss": 2.9909, + "step": 300950 + }, + { + "epoch": 0.2965330264889702, + "grad_norm": 2.448765277862549, + "learning_rate": 9.997632232161954e-06, + "loss": 2.9499, + "step": 301000 + }, + { + "epoch": 0.29658228446679225, + "grad_norm": 2.232977867126465, + "learning_rate": 9.99762721705915e-06, + "loss": 3.0373, + "step": 301050 + }, + { + "epoch": 0.2966315424446143, + "grad_norm": 2.3032796382904053, + "learning_rate": 9.997622196652047e-06, + "loss": 3.0336, + "step": 301100 + }, + { + "epoch": 0.29668080042243644, + "grad_norm": 2.2576305866241455, + "learning_rate": 9.997617170940655e-06, + "loss": 3.1268, + "step": 301150 + }, + { + "epoch": 0.2967300584002585, + "grad_norm": 2.4217112064361572, + "learning_rate": 9.997612139924978e-06, + "loss": 3.0473, + "step": 301200 + }, + { + "epoch": 0.2967793163780806, + "grad_norm": 2.3375320434570312, + "learning_rate": 9.99760710360502e-06, + "loss": 3.044, + "step": 301250 + }, + { + "epoch": 0.2968285743559027, + "grad_norm": 2.3312928676605225, + "learning_rate": 9.997602061980783e-06, + "loss": 3.0174, + "step": 301300 + }, + { + "epoch": 0.2968778323337248, + "grad_norm": 2.4094583988189697, + "learning_rate": 9.99759701505228e-06, + "loss": 3.0034, + "step": 301350 + }, + { + "epoch": 0.29692709031154685, + "grad_norm": 2.443190813064575, + "learning_rate": 9.997591962819511e-06, + "loss": 3.1342, + "step": 301400 + }, + { + "epoch": 0.2969763482893689, + "grad_norm": 2.2841074466705322, + "learning_rate": 9.997586905282482e-06, + "loss": 3.0267, + "step": 301450 + }, + { + "epoch": 0.29702560626719104, + 
"grad_norm": 2.1871328353881836, + "learning_rate": 9.9975818424412e-06, + "loss": 3.0401, + "step": 301500 + }, + { + "epoch": 0.2970748642450131, + "grad_norm": 2.2660679817199707, + "learning_rate": 9.99757677429567e-06, + "loss": 3.0046, + "step": 301550 + }, + { + "epoch": 0.2971241222228352, + "grad_norm": 2.409891366958618, + "learning_rate": 9.997571700845897e-06, + "loss": 3.0434, + "step": 301600 + }, + { + "epoch": 0.2971733802006573, + "grad_norm": 2.324989080429077, + "learning_rate": 9.997566622091885e-06, + "loss": 3.0596, + "step": 301650 + }, + { + "epoch": 0.2972226381784794, + "grad_norm": 2.4707205295562744, + "learning_rate": 9.997561538033641e-06, + "loss": 3.0022, + "step": 301700 + }, + { + "epoch": 0.29727189615630145, + "grad_norm": 2.341087579727173, + "learning_rate": 9.997556448671172e-06, + "loss": 3.0307, + "step": 301750 + }, + { + "epoch": 0.2973211541341236, + "grad_norm": 2.1764256954193115, + "learning_rate": 9.99755135400448e-06, + "loss": 3.0554, + "step": 301800 + }, + { + "epoch": 0.29737041211194565, + "grad_norm": 2.3018980026245117, + "learning_rate": 9.997546254033571e-06, + "loss": 3.0157, + "step": 301850 + }, + { + "epoch": 0.2974196700897677, + "grad_norm": 2.4313461780548096, + "learning_rate": 9.997541148758453e-06, + "loss": 3.039, + "step": 301900 + }, + { + "epoch": 0.29746892806758984, + "grad_norm": 2.457601547241211, + "learning_rate": 9.997536038179126e-06, + "loss": 3.0171, + "step": 301950 + }, + { + "epoch": 0.2975181860454119, + "grad_norm": 2.3218934535980225, + "learning_rate": 9.997530922295603e-06, + "loss": 3.0604, + "step": 302000 + }, + { + "epoch": 0.297567444023234, + "grad_norm": 2.5782957077026367, + "learning_rate": 9.997525801107884e-06, + "loss": 3.0283, + "step": 302050 + }, + { + "epoch": 0.2976167020010561, + "grad_norm": 2.3540420532226562, + "learning_rate": 9.997520674615977e-06, + "loss": 3.0335, + "step": 302100 + }, + { + "epoch": 0.2976659599788782, + "grad_norm": 
2.3962111473083496, + "learning_rate": 9.997515542819887e-06, + "loss": 3.0302, + "step": 302150 + }, + { + "epoch": 0.29771521795670025, + "grad_norm": null, + "learning_rate": 9.997510405719618e-06, + "loss": 2.99, + "step": 302200 + }, + { + "epoch": 0.2977644759345224, + "grad_norm": 2.158447027206421, + "learning_rate": 9.997505263315175e-06, + "loss": 3.0011, + "step": 302250 + }, + { + "epoch": 0.29781373391234445, + "grad_norm": 2.385204553604126, + "learning_rate": 9.997500115606565e-06, + "loss": 3.002, + "step": 302300 + }, + { + "epoch": 0.2978629918901665, + "grad_norm": 2.5355417728424072, + "learning_rate": 9.997494962593795e-06, + "loss": 3.0532, + "step": 302350 + }, + { + "epoch": 0.29791224986798864, + "grad_norm": 2.195981025695801, + "learning_rate": 9.997489804276867e-06, + "loss": 3.019, + "step": 302400 + }, + { + "epoch": 0.2979615078458107, + "grad_norm": 2.195563316345215, + "learning_rate": 9.997484640655789e-06, + "loss": 3.0426, + "step": 302450 + }, + { + "epoch": 0.2980107658236328, + "grad_norm": 2.2413201332092285, + "learning_rate": 9.997479471730568e-06, + "loss": 2.9683, + "step": 302500 + }, + { + "epoch": 0.2980600238014549, + "grad_norm": 2.1960880756378174, + "learning_rate": 9.997474297501204e-06, + "loss": 3.0467, + "step": 302550 + }, + { + "epoch": 0.298109281779277, + "grad_norm": 2.2786059379577637, + "learning_rate": 9.997469117967706e-06, + "loss": 3.0152, + "step": 302600 + }, + { + "epoch": 0.29815853975709905, + "grad_norm": 2.312403440475464, + "learning_rate": 9.997463933130081e-06, + "loss": 3.0685, + "step": 302650 + }, + { + "epoch": 0.2982077977349211, + "grad_norm": 2.3801369667053223, + "learning_rate": 9.99745874298833e-06, + "loss": 2.9402, + "step": 302700 + }, + { + "epoch": 0.29825705571274325, + "grad_norm": 2.3448798656463623, + "learning_rate": 9.997453547542463e-06, + "loss": 3.0788, + "step": 302750 + }, + { + "epoch": 0.2983063136905653, + "grad_norm": 2.1296544075012207, + "learning_rate": 
9.997448346792483e-06, + "loss": 3.0594, + "step": 302800 + }, + { + "epoch": 0.2983555716683874, + "grad_norm": 2.3189103603363037, + "learning_rate": 9.997443140738398e-06, + "loss": 3.0242, + "step": 302850 + }, + { + "epoch": 0.2984048296462095, + "grad_norm": 2.3028197288513184, + "learning_rate": 9.99743792938021e-06, + "loss": 3.0274, + "step": 302900 + }, + { + "epoch": 0.2984540876240316, + "grad_norm": 3.1368179321289062, + "learning_rate": 9.997432712717926e-06, + "loss": 3.0559, + "step": 302950 + }, + { + "epoch": 0.29850334560185365, + "grad_norm": 2.34022855758667, + "learning_rate": 9.997427490751553e-06, + "loss": 3.0227, + "step": 303000 + }, + { + "epoch": 0.2985526035796758, + "grad_norm": 2.2709569931030273, + "learning_rate": 9.997422263481093e-06, + "loss": 3.0003, + "step": 303050 + }, + { + "epoch": 0.29860186155749785, + "grad_norm": 2.143012523651123, + "learning_rate": 9.997417030906557e-06, + "loss": 3.0219, + "step": 303100 + }, + { + "epoch": 0.2986511195353199, + "grad_norm": 2.2246391773223877, + "learning_rate": 9.997411793027945e-06, + "loss": 3.0016, + "step": 303150 + }, + { + "epoch": 0.29870037751314205, + "grad_norm": 2.131103038787842, + "learning_rate": 9.997406549845266e-06, + "loss": 3.0385, + "step": 303200 + }, + { + "epoch": 0.2987496354909641, + "grad_norm": 2.262812614440918, + "learning_rate": 9.997401301358525e-06, + "loss": 3.0711, + "step": 303250 + }, + { + "epoch": 0.2987988934687862, + "grad_norm": 2.2424211502075195, + "learning_rate": 9.997396047567726e-06, + "loss": 2.989, + "step": 303300 + }, + { + "epoch": 0.2988481514466083, + "grad_norm": 2.405256509780884, + "learning_rate": 9.997390788472877e-06, + "loss": 3.0722, + "step": 303350 + }, + { + "epoch": 0.2988974094244304, + "grad_norm": 2.139882802963257, + "learning_rate": 9.997385524073981e-06, + "loss": 3.0068, + "step": 303400 + }, + { + "epoch": 0.29894666740225245, + "grad_norm": 2.2563867568969727, + "learning_rate": 9.997380254371047e-06, + 
"loss": 3.0265, + "step": 303450 + }, + { + "epoch": 0.2989959253800746, + "grad_norm": 2.5227653980255127, + "learning_rate": 9.997374979364077e-06, + "loss": 3.0312, + "step": 303500 + }, + { + "epoch": 0.29904518335789665, + "grad_norm": 2.6186342239379883, + "learning_rate": 9.997369699053078e-06, + "loss": 3.0304, + "step": 303550 + }, + { + "epoch": 0.2990944413357187, + "grad_norm": 2.1977999210357666, + "learning_rate": 9.997364413438054e-06, + "loss": 2.9144, + "step": 303600 + }, + { + "epoch": 0.29914369931354085, + "grad_norm": 2.3784339427948, + "learning_rate": 9.997359122519014e-06, + "loss": 2.9665, + "step": 303650 + }, + { + "epoch": 0.2991929572913629, + "grad_norm": 2.453524589538574, + "learning_rate": 9.997353826295963e-06, + "loss": 2.9995, + "step": 303700 + }, + { + "epoch": 0.299242215269185, + "grad_norm": 2.345092535018921, + "learning_rate": 9.997348524768905e-06, + "loss": 2.9761, + "step": 303750 + }, + { + "epoch": 0.2992914732470071, + "grad_norm": 2.3803884983062744, + "learning_rate": 9.997343217937845e-06, + "loss": 3.0774, + "step": 303800 + }, + { + "epoch": 0.2993407312248292, + "grad_norm": 2.3345894813537598, + "learning_rate": 9.99733790580279e-06, + "loss": 3.053, + "step": 303850 + }, + { + "epoch": 0.29938998920265125, + "grad_norm": 2.235635757446289, + "learning_rate": 9.997332588363746e-06, + "loss": 3.0229, + "step": 303900 + }, + { + "epoch": 0.2994392471804733, + "grad_norm": 2.2956478595733643, + "learning_rate": 9.997327265620719e-06, + "loss": 3.085, + "step": 303950 + }, + { + "epoch": 0.29948850515829545, + "grad_norm": 2.322538375854492, + "learning_rate": 9.997321937573713e-06, + "loss": 2.9686, + "step": 304000 + }, + { + "epoch": 0.2995377631361175, + "grad_norm": 2.379397392272949, + "learning_rate": 9.997316604222734e-06, + "loss": 3.0073, + "step": 304050 + }, + { + "epoch": 0.2995870211139396, + "grad_norm": 2.220425844192505, + "learning_rate": 9.997311265567787e-06, + "loss": 3.0213, + "step": 304100 
+ }, + { + "epoch": 0.2996362790917617, + "grad_norm": 2.5949504375457764, + "learning_rate": 9.997305921608881e-06, + "loss": 3.0337, + "step": 304150 + }, + { + "epoch": 0.2996855370695838, + "grad_norm": 2.1956796646118164, + "learning_rate": 9.997300572346019e-06, + "loss": 3.0389, + "step": 304200 + }, + { + "epoch": 0.29973479504740586, + "grad_norm": 2.2598912715911865, + "learning_rate": 9.997295217779204e-06, + "loss": 3.024, + "step": 304250 + }, + { + "epoch": 0.299784053025228, + "grad_norm": 2.486750364303589, + "learning_rate": 9.997289857908447e-06, + "loss": 3.0262, + "step": 304300 + }, + { + "epoch": 0.29983331100305005, + "grad_norm": 2.367497682571411, + "learning_rate": 9.997284492733753e-06, + "loss": 3.0634, + "step": 304350 + }, + { + "epoch": 0.2998825689808721, + "grad_norm": 2.3332293033599854, + "learning_rate": 9.997279122255124e-06, + "loss": 2.9679, + "step": 304400 + }, + { + "epoch": 0.29993182695869425, + "grad_norm": 2.4721522331237793, + "learning_rate": 9.997273746472567e-06, + "loss": 3.0169, + "step": 304450 + }, + { + "epoch": 0.2999810849365163, + "grad_norm": 2.169637441635132, + "learning_rate": 9.99726836538609e-06, + "loss": 3.0456, + "step": 304500 + }, + { + "epoch": 0.3000303429143384, + "grad_norm": 2.381326675415039, + "learning_rate": 9.997262978995698e-06, + "loss": 3.0523, + "step": 304550 + }, + { + "epoch": 0.3000796008921605, + "grad_norm": 2.2650561332702637, + "learning_rate": 9.997257587301394e-06, + "loss": 3.0069, + "step": 304600 + }, + { + "epoch": 0.3001288588699826, + "grad_norm": 2.2628531455993652, + "learning_rate": 9.997252190303187e-06, + "loss": 3.0396, + "step": 304650 + }, + { + "epoch": 0.30017811684780465, + "grad_norm": 2.180102825164795, + "learning_rate": 9.997246788001082e-06, + "loss": 3.0319, + "step": 304700 + }, + { + "epoch": 0.3002273748256268, + "grad_norm": 2.3927290439605713, + "learning_rate": 9.997241380395082e-06, + "loss": 3.0116, + "step": 304750 + }, + { + "epoch": 
0.30027663280344885, + "grad_norm": 2.2854747772216797, + "learning_rate": 9.997235967485197e-06, + "loss": 3.0758, + "step": 304800 + }, + { + "epoch": 0.3003258907812709, + "grad_norm": 2.2865076065063477, + "learning_rate": 9.997230549271429e-06, + "loss": 3.039, + "step": 304850 + }, + { + "epoch": 0.30037514875909305, + "grad_norm": 2.3569352626800537, + "learning_rate": 9.997225125753787e-06, + "loss": 3.0039, + "step": 304900 + }, + { + "epoch": 0.3004244067369151, + "grad_norm": 2.228971242904663, + "learning_rate": 9.997219696932273e-06, + "loss": 2.9715, + "step": 304950 + }, + { + "epoch": 0.3004736647147372, + "grad_norm": 2.445197820663452, + "learning_rate": 9.997214262806896e-06, + "loss": 2.928, + "step": 305000 + }, + { + "epoch": 0.3005229226925593, + "grad_norm": 2.448639392852783, + "learning_rate": 9.997208823377662e-06, + "loss": 3.0986, + "step": 305050 + }, + { + "epoch": 0.3005721806703814, + "grad_norm": 2.1995644569396973, + "learning_rate": 9.997203378644574e-06, + "loss": 3.0189, + "step": 305100 + }, + { + "epoch": 0.30062143864820345, + "grad_norm": 2.227952480316162, + "learning_rate": 9.99719792860764e-06, + "loss": 3.0393, + "step": 305150 + }, + { + "epoch": 0.3006706966260255, + "grad_norm": 2.2867395877838135, + "learning_rate": 9.997192473266864e-06, + "loss": 2.9946, + "step": 305200 + }, + { + "epoch": 0.30071995460384765, + "grad_norm": 2.5173959732055664, + "learning_rate": 9.99718701262225e-06, + "loss": 3.0137, + "step": 305250 + }, + { + "epoch": 0.3007692125816697, + "grad_norm": 2.2430453300476074, + "learning_rate": 9.997181546673812e-06, + "loss": 3.0535, + "step": 305300 + }, + { + "epoch": 0.3008184705594918, + "grad_norm": 2.380549430847168, + "learning_rate": 9.997176075421547e-06, + "loss": 3.0731, + "step": 305350 + }, + { + "epoch": 0.3008677285373139, + "grad_norm": 2.1974780559539795, + "learning_rate": 9.997170598865464e-06, + "loss": 2.938, + "step": 305400 + }, + { + "epoch": 0.300916986515136, + 
"grad_norm": 2.898636817932129, + "learning_rate": 9.997165117005573e-06, + "loss": 3.0589, + "step": 305450 + }, + { + "epoch": 0.30096624449295806, + "grad_norm": 2.457097053527832, + "learning_rate": 9.997159629841871e-06, + "loss": 3.0604, + "step": 305500 + }, + { + "epoch": 0.3010155024707802, + "grad_norm": 2.1827590465545654, + "learning_rate": 9.997154137374372e-06, + "loss": 3.0812, + "step": 305550 + }, + { + "epoch": 0.30106476044860225, + "grad_norm": 2.3152217864990234, + "learning_rate": 9.997148639603078e-06, + "loss": 3.0249, + "step": 305600 + }, + { + "epoch": 0.3011140184264243, + "grad_norm": 2.355703115463257, + "learning_rate": 9.997143136527995e-06, + "loss": 3.0395, + "step": 305650 + }, + { + "epoch": 0.30116327640424645, + "grad_norm": 2.1953670978546143, + "learning_rate": 9.997137628149126e-06, + "loss": 2.9921, + "step": 305700 + }, + { + "epoch": 0.3012125343820685, + "grad_norm": 2.4789483547210693, + "learning_rate": 9.997132114466483e-06, + "loss": 2.9684, + "step": 305750 + }, + { + "epoch": 0.3012617923598906, + "grad_norm": 2.213282346725464, + "learning_rate": 9.99712659548007e-06, + "loss": 3.0086, + "step": 305800 + }, + { + "epoch": 0.3013110503377127, + "grad_norm": 2.323824882507324, + "learning_rate": 9.99712107118989e-06, + "loss": 3.0564, + "step": 305850 + }, + { + "epoch": 0.3013603083155348, + "grad_norm": 2.210498809814453, + "learning_rate": 9.99711554159595e-06, + "loss": 2.9832, + "step": 305900 + }, + { + "epoch": 0.30140956629335686, + "grad_norm": 2.251371145248413, + "learning_rate": 9.997110006698257e-06, + "loss": 3.0268, + "step": 305950 + }, + { + "epoch": 0.301458824271179, + "grad_norm": 2.3328423500061035, + "learning_rate": 9.997104466496816e-06, + "loss": 3.0216, + "step": 306000 + }, + { + "epoch": 0.30150808224900105, + "grad_norm": 3.0025675296783447, + "learning_rate": 9.997098920991635e-06, + "loss": 3.0037, + "step": 306050 + }, + { + "epoch": 0.3015573402268231, + "grad_norm": 
2.229220390319824, + "learning_rate": 9.997093370182716e-06, + "loss": 3.067, + "step": 306100 + }, + { + "epoch": 0.30160659820464525, + "grad_norm": 2.289638042449951, + "learning_rate": 9.997087814070068e-06, + "loss": 2.9948, + "step": 306150 + }, + { + "epoch": 0.3016558561824673, + "grad_norm": 2.301940441131592, + "learning_rate": 9.997082252653694e-06, + "loss": 3.0275, + "step": 306200 + }, + { + "epoch": 0.3017051141602894, + "grad_norm": 2.6628968715667725, + "learning_rate": 9.997076685933605e-06, + "loss": 2.9845, + "step": 306250 + }, + { + "epoch": 0.3017543721381115, + "grad_norm": 2.418945789337158, + "learning_rate": 9.9970711139098e-06, + "loss": 3.0209, + "step": 306300 + }, + { + "epoch": 0.3018036301159336, + "grad_norm": 2.275040864944458, + "learning_rate": 9.997065536582293e-06, + "loss": 3.088, + "step": 306350 + }, + { + "epoch": 0.30185288809375566, + "grad_norm": 2.288247585296631, + "learning_rate": 9.997059953951083e-06, + "loss": 3.0621, + "step": 306400 + }, + { + "epoch": 0.3019021460715777, + "grad_norm": 2.1334221363067627, + "learning_rate": 9.997054366016178e-06, + "loss": 3.0332, + "step": 306450 + }, + { + "epoch": 0.30195140404939985, + "grad_norm": 2.259039878845215, + "learning_rate": 9.997048772777584e-06, + "loss": 2.9927, + "step": 306500 + }, + { + "epoch": 0.3020006620272219, + "grad_norm": 2.311420202255249, + "learning_rate": 9.997043174235308e-06, + "loss": 3.0285, + "step": 306550 + }, + { + "epoch": 0.302049920005044, + "grad_norm": 2.3487062454223633, + "learning_rate": 9.997037570389356e-06, + "loss": 3.0428, + "step": 306600 + }, + { + "epoch": 0.3020991779828661, + "grad_norm": 2.259214162826538, + "learning_rate": 9.997031961239734e-06, + "loss": 3.0648, + "step": 306650 + }, + { + "epoch": 0.3021484359606882, + "grad_norm": 2.426786422729492, + "learning_rate": 9.997026346786444e-06, + "loss": 2.9985, + "step": 306700 + }, + { + "epoch": 0.30219769393851026, + "grad_norm": 2.0850160121917725, + 
"learning_rate": 9.997020727029498e-06, + "loss": 3.0229, + "step": 306750 + }, + { + "epoch": 0.3022469519163324, + "grad_norm": 2.3047666549682617, + "learning_rate": 9.997015101968898e-06, + "loss": 3.0257, + "step": 306800 + }, + { + "epoch": 0.30229620989415446, + "grad_norm": 2.293976306915283, + "learning_rate": 9.997009471604649e-06, + "loss": 3.0741, + "step": 306850 + }, + { + "epoch": 0.3023454678719765, + "grad_norm": 2.3507769107818604, + "learning_rate": 9.997003835936762e-06, + "loss": 3.076, + "step": 306900 + }, + { + "epoch": 0.30239472584979865, + "grad_norm": 2.404139995574951, + "learning_rate": 9.99699819496524e-06, + "loss": 3.0424, + "step": 306950 + }, + { + "epoch": 0.3024439838276207, + "grad_norm": 2.33311128616333, + "learning_rate": 9.996992548690087e-06, + "loss": 2.9499, + "step": 307000 + }, + { + "epoch": 0.3024932418054428, + "grad_norm": 2.1716532707214355, + "learning_rate": 9.996986897111312e-06, + "loss": 3.0405, + "step": 307050 + }, + { + "epoch": 0.3025424997832649, + "grad_norm": 2.359008550643921, + "learning_rate": 9.99698124022892e-06, + "loss": 3.0739, + "step": 307100 + }, + { + "epoch": 0.302591757761087, + "grad_norm": 2.294940948486328, + "learning_rate": 9.996975578042916e-06, + "loss": 2.9145, + "step": 307150 + }, + { + "epoch": 0.30264101573890906, + "grad_norm": 2.402827501296997, + "learning_rate": 9.996969910553308e-06, + "loss": 3.093, + "step": 307200 + }, + { + "epoch": 0.3026902737167312, + "grad_norm": 2.377148151397705, + "learning_rate": 9.996964237760101e-06, + "loss": 3.0259, + "step": 307250 + }, + { + "epoch": 0.30273953169455325, + "grad_norm": 2.3695480823516846, + "learning_rate": 9.9969585596633e-06, + "loss": 3.0744, + "step": 307300 + }, + { + "epoch": 0.3027887896723753, + "grad_norm": 2.151103973388672, + "learning_rate": 9.996952876262914e-06, + "loss": 3.0261, + "step": 307350 + }, + { + "epoch": 0.30283804765019745, + "grad_norm": 2.3178980350494385, + "learning_rate": 
9.996947187558947e-06, + "loss": 2.9997, + "step": 307400 + }, + { + "epoch": 0.3028873056280195, + "grad_norm": 2.3100035190582275, + "learning_rate": 9.996941493551404e-06, + "loss": 3.0042, + "step": 307450 + }, + { + "epoch": 0.3029365636058416, + "grad_norm": 2.260178327560425, + "learning_rate": 9.99693579424029e-06, + "loss": 3.0386, + "step": 307500 + }, + { + "epoch": 0.3029858215836637, + "grad_norm": 2.2575743198394775, + "learning_rate": 9.996930089625618e-06, + "loss": 3.0735, + "step": 307550 + }, + { + "epoch": 0.3030350795614858, + "grad_norm": 2.504180669784546, + "learning_rate": 9.996924379707386e-06, + "loss": 3.0823, + "step": 307600 + }, + { + "epoch": 0.30308433753930786, + "grad_norm": 2.2986879348754883, + "learning_rate": 9.996918664485604e-06, + "loss": 3.0707, + "step": 307650 + }, + { + "epoch": 0.30313359551712993, + "grad_norm": 2.3497161865234375, + "learning_rate": 9.996912943960278e-06, + "loss": 3.0328, + "step": 307700 + }, + { + "epoch": 0.30318285349495205, + "grad_norm": 2.2806992530822754, + "learning_rate": 9.996907218131413e-06, + "loss": 2.9764, + "step": 307750 + }, + { + "epoch": 0.3032321114727741, + "grad_norm": 2.3173491954803467, + "learning_rate": 9.996901486999015e-06, + "loss": 2.9841, + "step": 307800 + }, + { + "epoch": 0.3032813694505962, + "grad_norm": 2.175394296646118, + "learning_rate": 9.996895750563091e-06, + "loss": 3.0262, + "step": 307850 + }, + { + "epoch": 0.3033306274284183, + "grad_norm": 2.350114583969116, + "learning_rate": 9.996890008823646e-06, + "loss": 3.0565, + "step": 307900 + }, + { + "epoch": 0.3033798854062404, + "grad_norm": 2.230470895767212, + "learning_rate": 9.996884261780688e-06, + "loss": 3.0323, + "step": 307950 + }, + { + "epoch": 0.30342914338406246, + "grad_norm": 2.285947561264038, + "learning_rate": 9.996878509434223e-06, + "loss": 3.0385, + "step": 308000 + }, + { + "epoch": 0.3034784013618846, + "grad_norm": 2.3150689601898193, + "learning_rate": 9.996872751784254e-06, + 
"loss": 3.0597, + "step": 308050 + }, + { + "epoch": 0.30352765933970666, + "grad_norm": 2.342055320739746, + "learning_rate": 9.996866988830789e-06, + "loss": 2.9783, + "step": 308100 + }, + { + "epoch": 0.3035769173175287, + "grad_norm": 2.3872690200805664, + "learning_rate": 9.996861220573835e-06, + "loss": 3.0042, + "step": 308150 + }, + { + "epoch": 0.30362617529535085, + "grad_norm": 2.1792943477630615, + "learning_rate": 9.996855447013397e-06, + "loss": 3.0008, + "step": 308200 + }, + { + "epoch": 0.3036754332731729, + "grad_norm": 2.285071849822998, + "learning_rate": 9.996849668149482e-06, + "loss": 3.0516, + "step": 308250 + }, + { + "epoch": 0.303724691250995, + "grad_norm": 2.253110647201538, + "learning_rate": 9.996843883982096e-06, + "loss": 3.0555, + "step": 308300 + }, + { + "epoch": 0.3037739492288171, + "grad_norm": 2.46266770362854, + "learning_rate": 9.996838094511245e-06, + "loss": 3.0264, + "step": 308350 + }, + { + "epoch": 0.3038232072066392, + "grad_norm": 2.5955288410186768, + "learning_rate": 9.996832299736931e-06, + "loss": 3.0014, + "step": 308400 + }, + { + "epoch": 0.30387246518446126, + "grad_norm": 2.2424843311309814, + "learning_rate": 9.996826499659169e-06, + "loss": 3.052, + "step": 308450 + }, + { + "epoch": 0.3039217231622834, + "grad_norm": 2.156332492828369, + "learning_rate": 9.996820694277957e-06, + "loss": 2.9698, + "step": 308500 + }, + { + "epoch": 0.30397098114010546, + "grad_norm": 2.354210376739502, + "learning_rate": 9.996814883593307e-06, + "loss": 3.0519, + "step": 308550 + }, + { + "epoch": 0.3040202391179275, + "grad_norm": 2.365471839904785, + "learning_rate": 9.996809067605222e-06, + "loss": 2.9409, + "step": 308600 + }, + { + "epoch": 0.30406949709574965, + "grad_norm": 2.6839261054992676, + "learning_rate": 9.996803246313707e-06, + "loss": 3.0125, + "step": 308650 + }, + { + "epoch": 0.3041187550735717, + "grad_norm": 2.503632068634033, + "learning_rate": 9.99679741971877e-06, + "loss": 2.9781, + "step": 
308700 + }, + { + "epoch": 0.3041680130513938, + "grad_norm": 2.2489304542541504, + "learning_rate": 9.996791587820419e-06, + "loss": 2.9685, + "step": 308750 + }, + { + "epoch": 0.3042172710292159, + "grad_norm": 2.4574649333953857, + "learning_rate": 9.996785750618656e-06, + "loss": 2.9733, + "step": 308800 + }, + { + "epoch": 0.304266529007038, + "grad_norm": 2.730006217956543, + "learning_rate": 9.996779908113492e-06, + "loss": 3.015, + "step": 308850 + }, + { + "epoch": 0.30431578698486006, + "grad_norm": 2.692553997039795, + "learning_rate": 9.996774060304929e-06, + "loss": 2.9729, + "step": 308900 + }, + { + "epoch": 0.30436504496268213, + "grad_norm": 2.324758768081665, + "learning_rate": 9.996768207192975e-06, + "loss": 3.0601, + "step": 308950 + }, + { + "epoch": 0.30441430294050426, + "grad_norm": 2.582911491394043, + "learning_rate": 9.996762348777635e-06, + "loss": 2.9935, + "step": 309000 + }, + { + "epoch": 0.3044635609183263, + "grad_norm": 2.258347272872925, + "learning_rate": 9.99675648505892e-06, + "loss": 2.9806, + "step": 309050 + }, + { + "epoch": 0.3045128188961484, + "grad_norm": 2.48697829246521, + "learning_rate": 9.99675061603683e-06, + "loss": 3.0326, + "step": 309100 + }, + { + "epoch": 0.3045620768739705, + "grad_norm": 2.434758186340332, + "learning_rate": 9.996744741711374e-06, + "loss": 3.013, + "step": 309150 + }, + { + "epoch": 0.3046113348517926, + "grad_norm": 2.227440118789673, + "learning_rate": 9.996738862082557e-06, + "loss": 3.0657, + "step": 309200 + }, + { + "epoch": 0.30466059282961466, + "grad_norm": 2.225358486175537, + "learning_rate": 9.996732977150387e-06, + "loss": 2.9734, + "step": 309250 + }, + { + "epoch": 0.3047098508074368, + "grad_norm": 2.424178123474121, + "learning_rate": 9.99672708691487e-06, + "loss": 2.9443, + "step": 309300 + }, + { + "epoch": 0.30475910878525886, + "grad_norm": 2.5271685123443604, + "learning_rate": 9.996721191376011e-06, + "loss": 2.9516, + "step": 309350 + }, + { + "epoch": 
0.30480836676308093, + "grad_norm": 2.3428306579589844, + "learning_rate": 9.996715290533817e-06, + "loss": 3.1211, + "step": 309400 + }, + { + "epoch": 0.30485762474090305, + "grad_norm": 2.671780824661255, + "learning_rate": 9.996709384388295e-06, + "loss": 3.0473, + "step": 309450 + }, + { + "epoch": 0.3049068827187251, + "grad_norm": 2.3350250720977783, + "learning_rate": 9.99670347293945e-06, + "loss": 2.9777, + "step": 309500 + }, + { + "epoch": 0.3049561406965472, + "grad_norm": 2.3178908824920654, + "learning_rate": 9.996697556187289e-06, + "loss": 2.9643, + "step": 309550 + }, + { + "epoch": 0.3050053986743693, + "grad_norm": 2.3414082527160645, + "learning_rate": 9.996691634131816e-06, + "loss": 3.0202, + "step": 309600 + }, + { + "epoch": 0.3050546566521914, + "grad_norm": 2.3130717277526855, + "learning_rate": 9.996685706773042e-06, + "loss": 3.0475, + "step": 309650 + }, + { + "epoch": 0.30510391463001346, + "grad_norm": 2.318556547164917, + "learning_rate": 9.99667977411097e-06, + "loss": 2.9556, + "step": 309700 + }, + { + "epoch": 0.3051531726078356, + "grad_norm": 2.3123040199279785, + "learning_rate": 9.996673836145607e-06, + "loss": 2.9607, + "step": 309750 + }, + { + "epoch": 0.30520243058565766, + "grad_norm": 2.2077224254608154, + "learning_rate": 9.996667892876959e-06, + "loss": 3.0973, + "step": 309800 + }, + { + "epoch": 0.30525168856347973, + "grad_norm": 2.268465042114258, + "learning_rate": 9.996661944305032e-06, + "loss": 2.9428, + "step": 309850 + }, + { + "epoch": 0.30530094654130185, + "grad_norm": 2.3785126209259033, + "learning_rate": 9.996655990429834e-06, + "loss": 2.968, + "step": 309900 + }, + { + "epoch": 0.3053502045191239, + "grad_norm": 2.592348337173462, + "learning_rate": 9.99665003125137e-06, + "loss": 3.022, + "step": 309950 + }, + { + "epoch": 0.305399462496946, + "grad_norm": 2.1196916103363037, + "learning_rate": 9.996644066769646e-06, + "loss": 3.0511, + "step": 310000 + }, + { + "epoch": 0.3054487204747681, + 
"grad_norm": 2.401073694229126, + "learning_rate": 9.99663809698467e-06, + "loss": 3.0028, + "step": 310050 + }, + { + "epoch": 0.3054979784525902, + "grad_norm": 2.4942500591278076, + "learning_rate": 9.996632121896446e-06, + "loss": 3.076, + "step": 310100 + }, + { + "epoch": 0.30554723643041226, + "grad_norm": 2.3009376525878906, + "learning_rate": 9.996626141504982e-06, + "loss": 3.0589, + "step": 310150 + }, + { + "epoch": 0.30559649440823433, + "grad_norm": 2.46113657951355, + "learning_rate": 9.996620155810282e-06, + "loss": 3.026, + "step": 310200 + }, + { + "epoch": 0.30564575238605646, + "grad_norm": 2.284552812576294, + "learning_rate": 9.996614164812357e-06, + "loss": 3.023, + "step": 310250 + }, + { + "epoch": 0.3056950103638785, + "grad_norm": 2.466646194458008, + "learning_rate": 9.996608168511208e-06, + "loss": 2.9531, + "step": 310300 + }, + { + "epoch": 0.3057442683417006, + "grad_norm": 2.287456512451172, + "learning_rate": 9.996602166906847e-06, + "loss": 3.0496, + "step": 310350 + }, + { + "epoch": 0.3057935263195227, + "grad_norm": 2.4064130783081055, + "learning_rate": 9.996596159999276e-06, + "loss": 2.8812, + "step": 310400 + }, + { + "epoch": 0.3058427842973448, + "grad_norm": 2.314246892929077, + "learning_rate": 9.9965901477885e-06, + "loss": 3.0286, + "step": 310450 + }, + { + "epoch": 0.30589204227516686, + "grad_norm": 2.4322733879089355, + "learning_rate": 9.996584130274531e-06, + "loss": 3.095, + "step": 310500 + }, + { + "epoch": 0.305941300252989, + "grad_norm": 2.4454193115234375, + "learning_rate": 9.996578107457373e-06, + "loss": 2.9616, + "step": 310550 + }, + { + "epoch": 0.30599055823081106, + "grad_norm": 2.106123208999634, + "learning_rate": 9.996572079337031e-06, + "loss": 3.031, + "step": 310600 + }, + { + "epoch": 0.30603981620863313, + "grad_norm": 2.356388807296753, + "learning_rate": 9.996566045913511e-06, + "loss": 3.0367, + "step": 310650 + }, + { + "epoch": 0.30608907418645526, + "grad_norm": 2.699507236480713, + 
"learning_rate": 9.996560007186821e-06, + "loss": 3.0133, + "step": 310700 + }, + { + "epoch": 0.3061383321642773, + "grad_norm": 2.3565239906311035, + "learning_rate": 9.996553963156968e-06, + "loss": 3.0959, + "step": 310750 + }, + { + "epoch": 0.3061875901420994, + "grad_norm": 2.350118398666382, + "learning_rate": 9.996547913823957e-06, + "loss": 2.9957, + "step": 310800 + }, + { + "epoch": 0.3062368481199215, + "grad_norm": 2.19903564453125, + "learning_rate": 9.996541859187796e-06, + "loss": 3.0337, + "step": 310850 + }, + { + "epoch": 0.3062861060977436, + "grad_norm": 2.132171392440796, + "learning_rate": 9.996535799248489e-06, + "loss": 3.0084, + "step": 310900 + }, + { + "epoch": 0.30633536407556566, + "grad_norm": 2.3159096240997314, + "learning_rate": 9.996529734006044e-06, + "loss": 3.0251, + "step": 310950 + }, + { + "epoch": 0.3063846220533878, + "grad_norm": 2.138986587524414, + "learning_rate": 9.996523663460466e-06, + "loss": 3.1059, + "step": 311000 + }, + { + "epoch": 0.30643388003120986, + "grad_norm": 2.599769353866577, + "learning_rate": 9.996517587611766e-06, + "loss": 3.0324, + "step": 311050 + }, + { + "epoch": 0.30648313800903193, + "grad_norm": 2.2664358615875244, + "learning_rate": 9.996511506459944e-06, + "loss": 2.976, + "step": 311100 + }, + { + "epoch": 0.30653239598685406, + "grad_norm": 2.405214548110962, + "learning_rate": 9.99650542000501e-06, + "loss": 2.9534, + "step": 311150 + }, + { + "epoch": 0.3065816539646761, + "grad_norm": 2.411836624145508, + "learning_rate": 9.99649932824697e-06, + "loss": 3.0028, + "step": 311200 + }, + { + "epoch": 0.3066309119424982, + "grad_norm": 2.4898228645324707, + "learning_rate": 9.996493231185831e-06, + "loss": 3.0254, + "step": 311250 + }, + { + "epoch": 0.3066801699203203, + "grad_norm": 2.412057399749756, + "learning_rate": 9.9964871288216e-06, + "loss": 3.0769, + "step": 311300 + }, + { + "epoch": 0.3067294278981424, + "grad_norm": 2.5325610637664795, + "learning_rate": 
9.996481021154282e-06, + "loss": 3.0989, + "step": 311350 + }, + { + "epoch": 0.30677868587596446, + "grad_norm": 2.1537461280822754, + "learning_rate": 9.996474908183883e-06, + "loss": 2.9702, + "step": 311400 + }, + { + "epoch": 0.30682794385378653, + "grad_norm": 2.4412622451782227, + "learning_rate": 9.996468789910411e-06, + "loss": 2.9731, + "step": 311450 + }, + { + "epoch": 0.30687720183160866, + "grad_norm": 2.3032448291778564, + "learning_rate": 9.996462666333871e-06, + "loss": 2.9668, + "step": 311500 + }, + { + "epoch": 0.30692645980943073, + "grad_norm": 2.5099568367004395, + "learning_rate": 9.996456537454272e-06, + "loss": 2.9738, + "step": 311550 + }, + { + "epoch": 0.3069757177872528, + "grad_norm": 2.254779815673828, + "learning_rate": 9.996450403271617e-06, + "loss": 2.9675, + "step": 311600 + }, + { + "epoch": 0.3070249757650749, + "grad_norm": 2.2271130084991455, + "learning_rate": 9.996444263785914e-06, + "loss": 3.0024, + "step": 311650 + }, + { + "epoch": 0.307074233742897, + "grad_norm": 2.1509151458740234, + "learning_rate": 9.996438118997172e-06, + "loss": 3.0283, + "step": 311700 + }, + { + "epoch": 0.30712349172071907, + "grad_norm": 2.706993341445923, + "learning_rate": 9.996431968905394e-06, + "loss": 2.9996, + "step": 311750 + }, + { + "epoch": 0.3071727496985412, + "grad_norm": 2.38284969329834, + "learning_rate": 9.99642581351059e-06, + "loss": 3.0083, + "step": 311800 + }, + { + "epoch": 0.30722200767636326, + "grad_norm": 2.423383951187134, + "learning_rate": 9.996419652812761e-06, + "loss": 3.0453, + "step": 311850 + }, + { + "epoch": 0.30727126565418533, + "grad_norm": 2.354200839996338, + "learning_rate": 9.996413486811919e-06, + "loss": 2.9777, + "step": 311900 + }, + { + "epoch": 0.30732052363200746, + "grad_norm": 2.4652881622314453, + "learning_rate": 9.996407315508069e-06, + "loss": 3.0205, + "step": 311950 + }, + { + "epoch": 0.30736978160982953, + "grad_norm": 2.443474769592285, + "learning_rate": 9.996401138901217e-06, 
+ "loss": 3.0216, + "step": 312000 + }, + { + "epoch": 0.3074190395876516, + "grad_norm": 2.311983108520508, + "learning_rate": 9.99639495699137e-06, + "loss": 3.0313, + "step": 312050 + }, + { + "epoch": 0.3074682975654737, + "grad_norm": 2.1913750171661377, + "learning_rate": 9.996388769778533e-06, + "loss": 3.0571, + "step": 312100 + }, + { + "epoch": 0.3075175555432958, + "grad_norm": 2.2754499912261963, + "learning_rate": 9.996382577262715e-06, + "loss": 3.0225, + "step": 312150 + }, + { + "epoch": 0.30756681352111787, + "grad_norm": 2.28769850730896, + "learning_rate": 9.99637637944392e-06, + "loss": 3.0175, + "step": 312200 + }, + { + "epoch": 0.30761607149894, + "grad_norm": 2.184319257736206, + "learning_rate": 9.996370176322157e-06, + "loss": 3.0385, + "step": 312250 + }, + { + "epoch": 0.30766532947676206, + "grad_norm": 2.3818931579589844, + "learning_rate": 9.99636396789743e-06, + "loss": 3.0349, + "step": 312300 + }, + { + "epoch": 0.30771458745458413, + "grad_norm": 2.262977361679077, + "learning_rate": 9.99635775416975e-06, + "loss": 2.9937, + "step": 312350 + }, + { + "epoch": 0.30776384543240626, + "grad_norm": 2.317479133605957, + "learning_rate": 9.996351535139118e-06, + "loss": 3.0471, + "step": 312400 + }, + { + "epoch": 0.30781310341022833, + "grad_norm": 2.4597971439361572, + "learning_rate": 9.996345310805544e-06, + "loss": 3.0236, + "step": 312450 + }, + { + "epoch": 0.3078623613880504, + "grad_norm": 2.1973822116851807, + "learning_rate": 9.996339081169036e-06, + "loss": 3.0292, + "step": 312500 + }, + { + "epoch": 0.3079116193658725, + "grad_norm": 2.278440475463867, + "learning_rate": 9.996332846229596e-06, + "loss": 2.9371, + "step": 312550 + }, + { + "epoch": 0.3079608773436946, + "grad_norm": 2.3865201473236084, + "learning_rate": 9.996326605987234e-06, + "loss": 2.9635, + "step": 312600 + }, + { + "epoch": 0.30801013532151666, + "grad_norm": 2.779576063156128, + "learning_rate": 9.996320360441956e-06, + "loss": 3.0836, + "step": 
312650 + }, + { + "epoch": 0.30805939329933874, + "grad_norm": 2.180945634841919, + "learning_rate": 9.996314109593769e-06, + "loss": 3.0043, + "step": 312700 + }, + { + "epoch": 0.30810865127716086, + "grad_norm": 2.250586986541748, + "learning_rate": 9.996307853442678e-06, + "loss": 3.0129, + "step": 312750 + }, + { + "epoch": 0.30815790925498293, + "grad_norm": 2.2202727794647217, + "learning_rate": 9.99630159198869e-06, + "loss": 3.0258, + "step": 312800 + }, + { + "epoch": 0.308207167232805, + "grad_norm": 2.4258713722229004, + "learning_rate": 9.996295325231814e-06, + "loss": 2.9999, + "step": 312850 + }, + { + "epoch": 0.3082564252106271, + "grad_norm": 2.327939748764038, + "learning_rate": 9.996289053172054e-06, + "loss": 3.0463, + "step": 312900 + }, + { + "epoch": 0.3083056831884492, + "grad_norm": 2.2418766021728516, + "learning_rate": 9.996282775809418e-06, + "loss": 3.0356, + "step": 312950 + }, + { + "epoch": 0.30835494116627127, + "grad_norm": 2.2699077129364014, + "learning_rate": 9.996276493143912e-06, + "loss": 2.9843, + "step": 313000 + }, + { + "epoch": 0.3084041991440934, + "grad_norm": 2.306989908218384, + "learning_rate": 9.996270205175543e-06, + "loss": 3.0128, + "step": 313050 + }, + { + "epoch": 0.30845345712191546, + "grad_norm": 2.2682242393493652, + "learning_rate": 9.996263911904317e-06, + "loss": 3.0425, + "step": 313100 + }, + { + "epoch": 0.30850271509973753, + "grad_norm": 2.287558078765869, + "learning_rate": 9.996257613330244e-06, + "loss": 2.936, + "step": 313150 + }, + { + "epoch": 0.30855197307755966, + "grad_norm": 2.1980414390563965, + "learning_rate": 9.996251309453325e-06, + "loss": 2.9992, + "step": 313200 + }, + { + "epoch": 0.30860123105538173, + "grad_norm": 2.264953136444092, + "learning_rate": 9.996245000273572e-06, + "loss": 3.0522, + "step": 313250 + }, + { + "epoch": 0.3086504890332038, + "grad_norm": 2.4297373294830322, + "learning_rate": 9.996238685790988e-06, + "loss": 2.9921, + "step": 313300 + }, + { + 
"epoch": 0.3086997470110259, + "grad_norm": 2.0938663482666016, + "learning_rate": 9.996232366005581e-06, + "loss": 2.984, + "step": 313350 + }, + { + "epoch": 0.308749004988848, + "grad_norm": 2.3178112506866455, + "learning_rate": 9.996226040917358e-06, + "loss": 3.0335, + "step": 313400 + }, + { + "epoch": 0.30879826296667007, + "grad_norm": 2.244946241378784, + "learning_rate": 9.996219710526325e-06, + "loss": 2.9682, + "step": 313450 + }, + { + "epoch": 0.3088475209444922, + "grad_norm": 2.359304189682007, + "learning_rate": 9.99621337483249e-06, + "loss": 3.0522, + "step": 313500 + }, + { + "epoch": 0.30889677892231426, + "grad_norm": 2.2672181129455566, + "learning_rate": 9.99620703383586e-06, + "loss": 2.9686, + "step": 313550 + }, + { + "epoch": 0.30894603690013633, + "grad_norm": 2.4422049522399902, + "learning_rate": 9.99620068753644e-06, + "loss": 3.0565, + "step": 313600 + }, + { + "epoch": 0.30899529487795846, + "grad_norm": 2.1991586685180664, + "learning_rate": 9.996194335934235e-06, + "loss": 3.0782, + "step": 313650 + }, + { + "epoch": 0.30904455285578053, + "grad_norm": 2.3003005981445312, + "learning_rate": 9.996187979029256e-06, + "loss": 3.0834, + "step": 313700 + }, + { + "epoch": 0.3090938108336026, + "grad_norm": 2.208747625350952, + "learning_rate": 9.99618161682151e-06, + "loss": 3.0066, + "step": 313750 + }, + { + "epoch": 0.3091430688114247, + "grad_norm": 2.2679004669189453, + "learning_rate": 9.996175249311e-06, + "loss": 2.9955, + "step": 313800 + }, + { + "epoch": 0.3091923267892468, + "grad_norm": 2.1517398357391357, + "learning_rate": 9.996168876497734e-06, + "loss": 2.9432, + "step": 313850 + }, + { + "epoch": 0.30924158476706887, + "grad_norm": 2.1666457653045654, + "learning_rate": 9.99616249838172e-06, + "loss": 3.0211, + "step": 313900 + }, + { + "epoch": 0.30929084274489094, + "grad_norm": 2.2385599613189697, + "learning_rate": 9.996156114962963e-06, + "loss": 3.0453, + "step": 313950 + }, + { + "epoch": 0.30934010072271306, 
+ "grad_norm": 2.192492961883545, + "learning_rate": 9.996149726241471e-06, + "loss": 3.0573, + "step": 314000 + }, + { + "epoch": 0.30938935870053513, + "grad_norm": 2.2824437618255615, + "learning_rate": 9.99614333221725e-06, + "loss": 3.0015, + "step": 314050 + }, + { + "epoch": 0.3094386166783572, + "grad_norm": 2.2464230060577393, + "learning_rate": 9.996136932890308e-06, + "loss": 3.0042, + "step": 314100 + }, + { + "epoch": 0.30948787465617933, + "grad_norm": 2.2020175457000732, + "learning_rate": 9.996130528260652e-06, + "loss": 2.9732, + "step": 314150 + }, + { + "epoch": 0.3095371326340014, + "grad_norm": 2.2911062240600586, + "learning_rate": 9.996124118328287e-06, + "loss": 3.0953, + "step": 314200 + }, + { + "epoch": 0.30958639061182347, + "grad_norm": 2.2962448596954346, + "learning_rate": 9.99611770309322e-06, + "loss": 3.0144, + "step": 314250 + }, + { + "epoch": 0.3096356485896456, + "grad_norm": 2.3657443523406982, + "learning_rate": 9.99611128255546e-06, + "loss": 3.0084, + "step": 314300 + }, + { + "epoch": 0.30968490656746767, + "grad_norm": 2.2462432384490967, + "learning_rate": 9.996104856715012e-06, + "loss": 2.9702, + "step": 314350 + }, + { + "epoch": 0.30973416454528974, + "grad_norm": 2.4032435417175293, + "learning_rate": 9.99609842557188e-06, + "loss": 3.0407, + "step": 314400 + }, + { + "epoch": 0.30978342252311186, + "grad_norm": 2.3730244636535645, + "learning_rate": 9.996091989126078e-06, + "loss": 2.9775, + "step": 314450 + }, + { + "epoch": 0.30983268050093393, + "grad_norm": 2.1583681106567383, + "learning_rate": 9.996085547377608e-06, + "loss": 3.0215, + "step": 314500 + }, + { + "epoch": 0.309881938478756, + "grad_norm": 2.6782078742980957, + "learning_rate": 9.996079100326477e-06, + "loss": 3.0175, + "step": 314550 + }, + { + "epoch": 0.30993119645657813, + "grad_norm": 2.002676010131836, + "learning_rate": 9.996072647972692e-06, + "loss": 3.0156, + "step": 314600 + }, + { + "epoch": 0.3099804544344002, + "grad_norm": 
2.3512744903564453, + "learning_rate": 9.99606619031626e-06, + "loss": 3.0528, + "step": 314650 + }, + { + "epoch": 0.31002971241222227, + "grad_norm": 2.251086473464966, + "learning_rate": 9.996059727357189e-06, + "loss": 3.0309, + "step": 314700 + }, + { + "epoch": 0.3100789703900444, + "grad_norm": 2.2585465908050537, + "learning_rate": 9.996053259095485e-06, + "loss": 3.0315, + "step": 314750 + }, + { + "epoch": 0.31012822836786647, + "grad_norm": 2.2087645530700684, + "learning_rate": 9.996046785531157e-06, + "loss": 3.0499, + "step": 314800 + }, + { + "epoch": 0.31017748634568854, + "grad_norm": 2.345472812652588, + "learning_rate": 9.996040306664206e-06, + "loss": 3.0331, + "step": 314850 + }, + { + "epoch": 0.31022674432351066, + "grad_norm": 2.2506842613220215, + "learning_rate": 9.996033822494644e-06, + "loss": 3.0033, + "step": 314900 + }, + { + "epoch": 0.31027600230133273, + "grad_norm": 2.4905457496643066, + "learning_rate": 9.996027333022478e-06, + "loss": 2.9611, + "step": 314950 + }, + { + "epoch": 0.3103252602791548, + "grad_norm": 2.3499624729156494, + "learning_rate": 9.996020838247711e-06, + "loss": 3.0723, + "step": 315000 + }, + { + "epoch": 0.3103745182569769, + "grad_norm": 2.2633299827575684, + "learning_rate": 9.996014338170354e-06, + "loss": 3.0467, + "step": 315050 + }, + { + "epoch": 0.310423776234799, + "grad_norm": 2.244985818862915, + "learning_rate": 9.996007832790413e-06, + "loss": 2.9855, + "step": 315100 + }, + { + "epoch": 0.31047303421262107, + "grad_norm": 2.754910945892334, + "learning_rate": 9.996001322107892e-06, + "loss": 3.0338, + "step": 315150 + }, + { + "epoch": 0.31052229219044314, + "grad_norm": 2.358602523803711, + "learning_rate": 9.995994806122802e-06, + "loss": 2.9972, + "step": 315200 + }, + { + "epoch": 0.31057155016826526, + "grad_norm": 2.564812183380127, + "learning_rate": 9.995988284835146e-06, + "loss": 3.0275, + "step": 315250 + }, + { + "epoch": 0.31062080814608733, + "grad_norm": 2.3292109966278076, + 
"learning_rate": 9.995981758244935e-06, + "loss": 2.9447, + "step": 315300 + }, + { + "epoch": 0.3106700661239094, + "grad_norm": 2.4661121368408203, + "learning_rate": 9.995975226352171e-06, + "loss": 3.0452, + "step": 315350 + }, + { + "epoch": 0.31071932410173153, + "grad_norm": 2.216602087020874, + "learning_rate": 9.995968689156866e-06, + "loss": 2.9841, + "step": 315400 + }, + { + "epoch": 0.3107685820795536, + "grad_norm": 2.1377151012420654, + "learning_rate": 9.995962146659023e-06, + "loss": 3.0245, + "step": 315450 + }, + { + "epoch": 0.31081784005737567, + "grad_norm": 2.236062526702881, + "learning_rate": 9.995955598858652e-06, + "loss": 2.9695, + "step": 315500 + }, + { + "epoch": 0.3108670980351978, + "grad_norm": 2.3402137756347656, + "learning_rate": 9.995949045755759e-06, + "loss": 3.0175, + "step": 315550 + }, + { + "epoch": 0.31091635601301987, + "grad_norm": 2.406885862350464, + "learning_rate": 9.99594248735035e-06, + "loss": 2.9877, + "step": 315600 + }, + { + "epoch": 0.31096561399084194, + "grad_norm": 2.3934450149536133, + "learning_rate": 9.995935923642432e-06, + "loss": 2.9834, + "step": 315650 + }, + { + "epoch": 0.31101487196866406, + "grad_norm": 2.196258306503296, + "learning_rate": 9.995929354632013e-06, + "loss": 3.0542, + "step": 315700 + }, + { + "epoch": 0.31106412994648613, + "grad_norm": 2.127100944519043, + "learning_rate": 9.9959227803191e-06, + "loss": 2.929, + "step": 315750 + }, + { + "epoch": 0.3111133879243082, + "grad_norm": 2.441338539123535, + "learning_rate": 9.995916200703697e-06, + "loss": 3.0144, + "step": 315800 + }, + { + "epoch": 0.31116264590213033, + "grad_norm": 2.4567346572875977, + "learning_rate": 9.995909615785814e-06, + "loss": 3.0162, + "step": 315850 + }, + { + "epoch": 0.3112119038799524, + "grad_norm": 2.27217960357666, + "learning_rate": 9.995903025565459e-06, + "loss": 2.9754, + "step": 315900 + }, + { + "epoch": 0.31126116185777447, + "grad_norm": 2.2861671447753906, + "learning_rate": 
9.995896430042637e-06, + "loss": 3.0349, + "step": 315950 + }, + { + "epoch": 0.3113104198355966, + "grad_norm": 2.234182834625244, + "learning_rate": 9.995889829217354e-06, + "loss": 3.0451, + "step": 316000 + }, + { + "epoch": 0.31135967781341867, + "grad_norm": 2.1110923290252686, + "learning_rate": 9.995883223089619e-06, + "loss": 3.0071, + "step": 316050 + }, + { + "epoch": 0.31140893579124074, + "grad_norm": 2.4154622554779053, + "learning_rate": 9.99587661165944e-06, + "loss": 3.054, + "step": 316100 + }, + { + "epoch": 0.31145819376906286, + "grad_norm": 2.633284091949463, + "learning_rate": 9.99586999492682e-06, + "loss": 3.0022, + "step": 316150 + }, + { + "epoch": 0.31150745174688493, + "grad_norm": 2.7270987033843994, + "learning_rate": 9.99586337289177e-06, + "loss": 2.9218, + "step": 316200 + }, + { + "epoch": 0.311556709724707, + "grad_norm": 2.2962069511413574, + "learning_rate": 9.995856745554294e-06, + "loss": 2.9438, + "step": 316250 + }, + { + "epoch": 0.3116059677025291, + "grad_norm": 2.396324634552002, + "learning_rate": 9.995850112914401e-06, + "loss": 3.0248, + "step": 316300 + }, + { + "epoch": 0.3116552256803512, + "grad_norm": 2.4210610389709473, + "learning_rate": 9.995843474972098e-06, + "loss": 3.0113, + "step": 316350 + }, + { + "epoch": 0.31170448365817327, + "grad_norm": 2.3294830322265625, + "learning_rate": 9.995836831727392e-06, + "loss": 3.0268, + "step": 316400 + }, + { + "epoch": 0.31175374163599534, + "grad_norm": 2.487356424331665, + "learning_rate": 9.99583018318029e-06, + "loss": 2.9862, + "step": 316450 + }, + { + "epoch": 0.31180299961381747, + "grad_norm": 2.4091708660125732, + "learning_rate": 9.995823529330797e-06, + "loss": 3.0267, + "step": 316500 + }, + { + "epoch": 0.31185225759163954, + "grad_norm": 2.2486772537231445, + "learning_rate": 9.995816870178923e-06, + "loss": 3.0764, + "step": 316550 + }, + { + "epoch": 0.3119015155694616, + "grad_norm": 2.421083688735962, + "learning_rate": 9.995810205724673e-06, + 
"loss": 3.0502, + "step": 316600 + }, + { + "epoch": 0.31195077354728373, + "grad_norm": 2.528093099594116, + "learning_rate": 9.995803535968054e-06, + "loss": 3.0208, + "step": 316650 + }, + { + "epoch": 0.3120000315251058, + "grad_norm": 2.3876194953918457, + "learning_rate": 9.995796860909076e-06, + "loss": 3.0479, + "step": 316700 + }, + { + "epoch": 0.3120492895029279, + "grad_norm": 2.363154411315918, + "learning_rate": 9.995790180547742e-06, + "loss": 3.0245, + "step": 316750 + }, + { + "epoch": 0.31209854748075, + "grad_norm": 2.1681394577026367, + "learning_rate": 9.995783494884063e-06, + "loss": 2.9921, + "step": 316800 + }, + { + "epoch": 0.31214780545857207, + "grad_norm": 2.349562406539917, + "learning_rate": 9.995776803918043e-06, + "loss": 3.0644, + "step": 316850 + }, + { + "epoch": 0.31219706343639414, + "grad_norm": 2.398301839828491, + "learning_rate": 9.995770107649691e-06, + "loss": 2.9774, + "step": 316900 + }, + { + "epoch": 0.31224632141421627, + "grad_norm": 2.3575966358184814, + "learning_rate": 9.995763406079013e-06, + "loss": 2.993, + "step": 316950 + }, + { + "epoch": 0.31229557939203834, + "grad_norm": 2.2219886779785156, + "learning_rate": 9.995756699206018e-06, + "loss": 2.9969, + "step": 317000 + }, + { + "epoch": 0.3123448373698604, + "grad_norm": 2.3304412364959717, + "learning_rate": 9.995749987030711e-06, + "loss": 2.9615, + "step": 317050 + }, + { + "epoch": 0.31239409534768253, + "grad_norm": 2.2850918769836426, + "learning_rate": 9.995743269553098e-06, + "loss": 3.0284, + "step": 317100 + }, + { + "epoch": 0.3124433533255046, + "grad_norm": 2.4318318367004395, + "learning_rate": 9.99573654677319e-06, + "loss": 2.9997, + "step": 317150 + }, + { + "epoch": 0.3124926113033267, + "grad_norm": 2.356834650039673, + "learning_rate": 9.99572981869099e-06, + "loss": 3.02, + "step": 317200 + }, + { + "epoch": 0.3125418692811488, + "grad_norm": 2.3789877891540527, + "learning_rate": 9.995723085306509e-06, + "loss": 3.0594, + "step": 
317250 + }, + { + "epoch": 0.31259112725897087, + "grad_norm": 2.407999277114868, + "learning_rate": 9.995716346619752e-06, + "loss": 2.9978, + "step": 317300 + }, + { + "epoch": 0.31264038523679294, + "grad_norm": 2.4253487586975098, + "learning_rate": 9.995709602630724e-06, + "loss": 2.9876, + "step": 317350 + }, + { + "epoch": 0.31268964321461507, + "grad_norm": 2.380905866622925, + "learning_rate": 9.995702853339438e-06, + "loss": 2.9729, + "step": 317400 + }, + { + "epoch": 0.31273890119243714, + "grad_norm": 2.5474369525909424, + "learning_rate": 9.995696098745897e-06, + "loss": 3.0358, + "step": 317450 + }, + { + "epoch": 0.3127881591702592, + "grad_norm": 2.7295117378234863, + "learning_rate": 9.99568933885011e-06, + "loss": 2.9899, + "step": 317500 + }, + { + "epoch": 0.3128374171480813, + "grad_norm": 2.3716928958892822, + "learning_rate": 9.995682573652082e-06, + "loss": 2.9854, + "step": 317550 + }, + { + "epoch": 0.3128866751259034, + "grad_norm": 2.4168429374694824, + "learning_rate": 9.99567580315182e-06, + "loss": 3.0011, + "step": 317600 + }, + { + "epoch": 0.31293593310372547, + "grad_norm": 2.353118896484375, + "learning_rate": 9.995669027349332e-06, + "loss": 3.119, + "step": 317650 + }, + { + "epoch": 0.31298519108154754, + "grad_norm": 2.571105718612671, + "learning_rate": 9.995662246244629e-06, + "loss": 2.9504, + "step": 317700 + }, + { + "epoch": 0.31303444905936967, + "grad_norm": 2.723294734954834, + "learning_rate": 9.995655459837714e-06, + "loss": 3.0455, + "step": 317750 + }, + { + "epoch": 0.31308370703719174, + "grad_norm": 2.4551429748535156, + "learning_rate": 9.995648668128594e-06, + "loss": 2.9658, + "step": 317800 + }, + { + "epoch": 0.3131329650150138, + "grad_norm": 2.3365747928619385, + "learning_rate": 9.995641871117277e-06, + "loss": 2.958, + "step": 317850 + }, + { + "epoch": 0.31318222299283593, + "grad_norm": 2.144131898880005, + "learning_rate": 9.995635068803772e-06, + "loss": 3.0209, + "step": 317900 + }, + { + 
"epoch": 0.313231480970658, + "grad_norm": 2.3323097229003906, + "learning_rate": 9.995628261188084e-06, + "loss": 3.0051, + "step": 317950 + }, + { + "epoch": 0.3132807389484801, + "grad_norm": 2.1606509685516357, + "learning_rate": 9.99562144827022e-06, + "loss": 3.0537, + "step": 318000 + }, + { + "epoch": 0.3133299969263022, + "grad_norm": 2.247515916824341, + "learning_rate": 9.995614630050189e-06, + "loss": 2.9701, + "step": 318050 + }, + { + "epoch": 0.31337925490412427, + "grad_norm": 2.201364755630493, + "learning_rate": 9.995607806527998e-06, + "loss": 3.05, + "step": 318100 + }, + { + "epoch": 0.31342851288194634, + "grad_norm": 2.3642578125, + "learning_rate": 9.995600977703654e-06, + "loss": 2.9976, + "step": 318150 + }, + { + "epoch": 0.31347777085976847, + "grad_norm": 2.337104558944702, + "learning_rate": 9.995594143577163e-06, + "loss": 3.0535, + "step": 318200 + }, + { + "epoch": 0.31352702883759054, + "grad_norm": 2.352006435394287, + "learning_rate": 9.995587304148534e-06, + "loss": 3.0028, + "step": 318250 + }, + { + "epoch": 0.3135762868154126, + "grad_norm": 2.178617000579834, + "learning_rate": 9.995580459417772e-06, + "loss": 2.9971, + "step": 318300 + }, + { + "epoch": 0.31362554479323473, + "grad_norm": 2.3368468284606934, + "learning_rate": 9.995573609384888e-06, + "loss": 3.016, + "step": 318350 + }, + { + "epoch": 0.3136748027710568, + "grad_norm": 2.3799073696136475, + "learning_rate": 9.995566754049885e-06, + "loss": 2.9897, + "step": 318400 + }, + { + "epoch": 0.3137240607488789, + "grad_norm": 2.2801501750946045, + "learning_rate": 9.995559893412774e-06, + "loss": 3.0281, + "step": 318450 + }, + { + "epoch": 0.313773318726701, + "grad_norm": 2.289668083190918, + "learning_rate": 9.995553027473559e-06, + "loss": 3.0158, + "step": 318500 + }, + { + "epoch": 0.31382257670452307, + "grad_norm": 2.1270036697387695, + "learning_rate": 9.99554615623225e-06, + "loss": 2.9879, + "step": 318550 + }, + { + "epoch": 0.31387183468234514, + 
"grad_norm": 2.274127960205078, + "learning_rate": 9.995539279688852e-06, + "loss": 3.0791, + "step": 318600 + }, + { + "epoch": 0.31392109266016727, + "grad_norm": 2.2569169998168945, + "learning_rate": 9.995532397843373e-06, + "loss": 2.9817, + "step": 318650 + }, + { + "epoch": 0.31397035063798934, + "grad_norm": 2.4550509452819824, + "learning_rate": 9.995525510695821e-06, + "loss": 2.9885, + "step": 318700 + }, + { + "epoch": 0.3140196086158114, + "grad_norm": 2.195941925048828, + "learning_rate": 9.995518618246204e-06, + "loss": 2.9544, + "step": 318750 + }, + { + "epoch": 0.3140688665936335, + "grad_norm": 2.394542932510376, + "learning_rate": 9.995511720494529e-06, + "loss": 3.0316, + "step": 318800 + }, + { + "epoch": 0.3141181245714556, + "grad_norm": 2.3514695167541504, + "learning_rate": 9.995504817440801e-06, + "loss": 3.0383, + "step": 318850 + }, + { + "epoch": 0.3141673825492777, + "grad_norm": 2.2090342044830322, + "learning_rate": 9.99549790908503e-06, + "loss": 2.9955, + "step": 318900 + }, + { + "epoch": 0.31421664052709974, + "grad_norm": 2.5291476249694824, + "learning_rate": 9.995490995427221e-06, + "loss": 2.9626, + "step": 318950 + }, + { + "epoch": 0.31426589850492187, + "grad_norm": 2.275815486907959, + "learning_rate": 9.995484076467386e-06, + "loss": 2.9422, + "step": 319000 + }, + { + "epoch": 0.31431515648274394, + "grad_norm": 2.208284854888916, + "learning_rate": 9.995477152205525e-06, + "loss": 3.0076, + "step": 319050 + }, + { + "epoch": 0.314364414460566, + "grad_norm": 4.213912010192871, + "learning_rate": 9.995470222641652e-06, + "loss": 3.0265, + "step": 319100 + }, + { + "epoch": 0.31441367243838814, + "grad_norm": 2.2220160961151123, + "learning_rate": 9.995463287775771e-06, + "loss": 3.0089, + "step": 319150 + }, + { + "epoch": 0.3144629304162102, + "grad_norm": 2.312147378921509, + "learning_rate": 9.995456347607889e-06, + "loss": 2.9944, + "step": 319200 + }, + { + "epoch": 0.3145121883940323, + "grad_norm": 
2.451925754547119, + "learning_rate": 9.995449402138016e-06, + "loss": 2.9944, + "step": 319250 + }, + { + "epoch": 0.3145614463718544, + "grad_norm": 2.4694437980651855, + "learning_rate": 9.995442451366157e-06, + "loss": 3.0126, + "step": 319300 + }, + { + "epoch": 0.3146107043496765, + "grad_norm": 2.281668186187744, + "learning_rate": 9.995435495292321e-06, + "loss": 3.0263, + "step": 319350 + }, + { + "epoch": 0.31465996232749854, + "grad_norm": 2.482325315475464, + "learning_rate": 9.995428533916514e-06, + "loss": 3.017, + "step": 319400 + }, + { + "epoch": 0.31470922030532067, + "grad_norm": 2.202923536300659, + "learning_rate": 9.995421567238744e-06, + "loss": 3.0441, + "step": 319450 + }, + { + "epoch": 0.31475847828314274, + "grad_norm": 2.2867085933685303, + "learning_rate": 9.995414595259018e-06, + "loss": 3.0196, + "step": 319500 + }, + { + "epoch": 0.3148077362609648, + "grad_norm": 2.4636571407318115, + "learning_rate": 9.995407617977344e-06, + "loss": 3.0766, + "step": 319550 + }, + { + "epoch": 0.31485699423878694, + "grad_norm": 2.9409708976745605, + "learning_rate": 9.99540063539373e-06, + "loss": 3.0228, + "step": 319600 + }, + { + "epoch": 0.314906252216609, + "grad_norm": 2.1873981952667236, + "learning_rate": 9.99539364750818e-06, + "loss": 3.0106, + "step": 319650 + }, + { + "epoch": 0.3149555101944311, + "grad_norm": 2.3958446979522705, + "learning_rate": 9.995386654320706e-06, + "loss": 3.0242, + "step": 319700 + }, + { + "epoch": 0.3150047681722532, + "grad_norm": 2.2478067874908447, + "learning_rate": 9.995379655831312e-06, + "loss": 2.9735, + "step": 319750 + }, + { + "epoch": 0.3150540261500753, + "grad_norm": 2.314419746398926, + "learning_rate": 9.99537265204001e-06, + "loss": 3.003, + "step": 319800 + }, + { + "epoch": 0.31510328412789734, + "grad_norm": 2.2649142742156982, + "learning_rate": 9.995365642946803e-06, + "loss": 3.0239, + "step": 319850 + }, + { + "epoch": 0.31515254210571947, + "grad_norm": 2.521195411682129, + 
"learning_rate": 9.995358628551699e-06, + "loss": 3.0075, + "step": 319900 + }, + { + "epoch": 0.31520180008354154, + "grad_norm": 2.329045534133911, + "learning_rate": 9.995351608854706e-06, + "loss": 3.0193, + "step": 319950 + }, + { + "epoch": 0.3152510580613636, + "grad_norm": 2.2379143238067627, + "learning_rate": 9.995344583855832e-06, + "loss": 2.9969, + "step": 320000 + }, + { + "epoch": 0.3153003160391857, + "grad_norm": 2.5926640033721924, + "learning_rate": 9.995337553555083e-06, + "loss": 2.9851, + "step": 320050 + }, + { + "epoch": 0.3153495740170078, + "grad_norm": 2.437302350997925, + "learning_rate": 9.99533051795247e-06, + "loss": 2.9829, + "step": 320100 + }, + { + "epoch": 0.3153988319948299, + "grad_norm": 2.139824151992798, + "learning_rate": 9.995323477047994e-06, + "loss": 2.9494, + "step": 320150 + }, + { + "epoch": 0.31544808997265195, + "grad_norm": 2.424853563308716, + "learning_rate": 9.99531643084167e-06, + "loss": 3.0112, + "step": 320200 + }, + { + "epoch": 0.31549734795047407, + "grad_norm": 2.3947649002075195, + "learning_rate": 9.9953093793335e-06, + "loss": 3.0941, + "step": 320250 + }, + { + "epoch": 0.31554660592829614, + "grad_norm": 2.379643201828003, + "learning_rate": 9.995302322523495e-06, + "loss": 3.0199, + "step": 320300 + }, + { + "epoch": 0.3155958639061182, + "grad_norm": 2.3067097663879395, + "learning_rate": 9.99529526041166e-06, + "loss": 3.1057, + "step": 320350 + }, + { + "epoch": 0.31564512188394034, + "grad_norm": 2.2447855472564697, + "learning_rate": 9.995288192998005e-06, + "loss": 3.0004, + "step": 320400 + }, + { + "epoch": 0.3156943798617624, + "grad_norm": 2.2591230869293213, + "learning_rate": 9.995281120282535e-06, + "loss": 2.9962, + "step": 320450 + }, + { + "epoch": 0.3157436378395845, + "grad_norm": 2.517709493637085, + "learning_rate": 9.995274042265256e-06, + "loss": 2.9756, + "step": 320500 + }, + { + "epoch": 0.3157928958174066, + "grad_norm": 2.4261748790740967, + "learning_rate": 
9.99526695894618e-06, + "loss": 2.9947, + "step": 320550 + }, + { + "epoch": 0.3158421537952287, + "grad_norm": 2.3525118827819824, + "learning_rate": 9.995259870325313e-06, + "loss": 2.9925, + "step": 320600 + }, + { + "epoch": 0.31589141177305075, + "grad_norm": 2.4597692489624023, + "learning_rate": 9.995252776402662e-06, + "loss": 3.0298, + "step": 320650 + }, + { + "epoch": 0.31594066975087287, + "grad_norm": 2.3268790245056152, + "learning_rate": 9.995245677178234e-06, + "loss": 3.0111, + "step": 320700 + }, + { + "epoch": 0.31598992772869494, + "grad_norm": 2.135472059249878, + "learning_rate": 9.995238572652036e-06, + "loss": 3.0403, + "step": 320750 + }, + { + "epoch": 0.316039185706517, + "grad_norm": 2.268904447555542, + "learning_rate": 9.995231462824078e-06, + "loss": 3.0308, + "step": 320800 + }, + { + "epoch": 0.31608844368433914, + "grad_norm": 2.2873711585998535, + "learning_rate": 9.995224347694365e-06, + "loss": 3.0155, + "step": 320850 + }, + { + "epoch": 0.3161377016621612, + "grad_norm": 2.263529062271118, + "learning_rate": 9.995217227262906e-06, + "loss": 2.9999, + "step": 320900 + }, + { + "epoch": 0.3161869596399833, + "grad_norm": 2.3661818504333496, + "learning_rate": 9.995210101529709e-06, + "loss": 3.0459, + "step": 320950 + }, + { + "epoch": 0.3162362176178054, + "grad_norm": 2.3859496116638184, + "learning_rate": 9.995202970494782e-06, + "loss": 2.9802, + "step": 321000 + }, + { + "epoch": 0.3162854755956275, + "grad_norm": 2.3529341220855713, + "learning_rate": 9.995195834158128e-06, + "loss": 2.989, + "step": 321050 + }, + { + "epoch": 0.31633473357344954, + "grad_norm": 2.301541328430176, + "learning_rate": 9.99518869251976e-06, + "loss": 2.9636, + "step": 321100 + }, + { + "epoch": 0.31638399155127167, + "grad_norm": 2.297841787338257, + "learning_rate": 9.995181545579683e-06, + "loss": 2.9542, + "step": 321150 + }, + { + "epoch": 0.31643324952909374, + "grad_norm": 2.3120968341827393, + "learning_rate": 9.995174393337905e-06, + 
"loss": 2.963, + "step": 321200 + }, + { + "epoch": 0.3164825075069158, + "grad_norm": 2.342954635620117, + "learning_rate": 9.995167235794433e-06, + "loss": 2.9842, + "step": 321250 + }, + { + "epoch": 0.3165317654847379, + "grad_norm": 2.352522611618042, + "learning_rate": 9.995160072949277e-06, + "loss": 2.9765, + "step": 321300 + }, + { + "epoch": 0.31658102346256, + "grad_norm": 2.2417845726013184, + "learning_rate": 9.995152904802443e-06, + "loss": 3.0048, + "step": 321350 + }, + { + "epoch": 0.3166302814403821, + "grad_norm": 2.1343228816986084, + "learning_rate": 9.995145731353937e-06, + "loss": 2.9975, + "step": 321400 + }, + { + "epoch": 0.31667953941820415, + "grad_norm": 2.7334096431732178, + "learning_rate": 9.995138552603769e-06, + "loss": 2.9545, + "step": 321450 + }, + { + "epoch": 0.3167287973960263, + "grad_norm": 2.294802188873291, + "learning_rate": 9.995131368551947e-06, + "loss": 2.9884, + "step": 321500 + }, + { + "epoch": 0.31677805537384834, + "grad_norm": 2.2811436653137207, + "learning_rate": 9.995124179198476e-06, + "loss": 2.967, + "step": 321550 + }, + { + "epoch": 0.3168273133516704, + "grad_norm": 2.640312910079956, + "learning_rate": 9.995116984543365e-06, + "loss": 3.04, + "step": 321600 + }, + { + "epoch": 0.31687657132949254, + "grad_norm": 2.3102548122406006, + "learning_rate": 9.99510978458662e-06, + "loss": 2.9984, + "step": 321650 + }, + { + "epoch": 0.3169258293073146, + "grad_norm": 2.1529226303100586, + "learning_rate": 9.995102579328253e-06, + "loss": 2.9703, + "step": 321700 + }, + { + "epoch": 0.3169750872851367, + "grad_norm": 2.3430707454681396, + "learning_rate": 9.995095368768267e-06, + "loss": 3.0283, + "step": 321750 + }, + { + "epoch": 0.3170243452629588, + "grad_norm": 2.2311439514160156, + "learning_rate": 9.995088152906673e-06, + "loss": 3.0448, + "step": 321800 + }, + { + "epoch": 0.3170736032407809, + "grad_norm": 2.359165906906128, + "learning_rate": 9.995080931743478e-06, + "loss": 3.0147, + "step": 321850 
+ }, + { + "epoch": 0.31712286121860295, + "grad_norm": 2.383981704711914, + "learning_rate": 9.995073705278687e-06, + "loss": 2.9731, + "step": 321900 + }, + { + "epoch": 0.3171721191964251, + "grad_norm": 2.0704076290130615, + "learning_rate": 9.99506647351231e-06, + "loss": 3.0084, + "step": 321950 + }, + { + "epoch": 0.31722137717424714, + "grad_norm": 2.310384750366211, + "learning_rate": 9.995059236444353e-06, + "loss": 2.9468, + "step": 322000 + }, + { + "epoch": 0.3172706351520692, + "grad_norm": 2.28782320022583, + "learning_rate": 9.995051994074826e-06, + "loss": 2.9596, + "step": 322050 + }, + { + "epoch": 0.31731989312989134, + "grad_norm": 2.2441258430480957, + "learning_rate": 9.995044746403738e-06, + "loss": 3.0287, + "step": 322100 + }, + { + "epoch": 0.3173691511077134, + "grad_norm": 2.2825212478637695, + "learning_rate": 9.995037493431093e-06, + "loss": 3.0142, + "step": 322150 + }, + { + "epoch": 0.3174184090855355, + "grad_norm": 2.242141008377075, + "learning_rate": 9.995030235156897e-06, + "loss": 3.0395, + "step": 322200 + }, + { + "epoch": 0.3174676670633576, + "grad_norm": 2.3794503211975098, + "learning_rate": 9.995022971581163e-06, + "loss": 2.963, + "step": 322250 + }, + { + "epoch": 0.3175169250411797, + "grad_norm": 2.3367199897766113, + "learning_rate": 9.995015702703898e-06, + "loss": 2.9844, + "step": 322300 + }, + { + "epoch": 0.31756618301900175, + "grad_norm": 2.353916645050049, + "learning_rate": 9.995008428525105e-06, + "loss": 3.0147, + "step": 322350 + }, + { + "epoch": 0.3176154409968239, + "grad_norm": 2.3322248458862305, + "learning_rate": 9.995001149044798e-06, + "loss": 3.018, + "step": 322400 + }, + { + "epoch": 0.31766469897464594, + "grad_norm": 2.142261505126953, + "learning_rate": 9.994993864262979e-06, + "loss": 3.0281, + "step": 322450 + }, + { + "epoch": 0.317713956952468, + "grad_norm": 2.2486965656280518, + "learning_rate": 9.994986574179659e-06, + "loss": 2.9248, + "step": 322500 + }, + { + "epoch": 
0.3177632149302901, + "grad_norm": 2.243344306945801, + "learning_rate": 9.994979278794844e-06, + "loss": 3.008, + "step": 322550 + }, + { + "epoch": 0.3178124729081122, + "grad_norm": 2.05446457862854, + "learning_rate": 9.994971978108545e-06, + "loss": 2.9652, + "step": 322600 + }, + { + "epoch": 0.3178617308859343, + "grad_norm": 2.305025100708008, + "learning_rate": 9.994964672120767e-06, + "loss": 3.0106, + "step": 322650 + }, + { + "epoch": 0.31791098886375635, + "grad_norm": 2.3543810844421387, + "learning_rate": 9.994957360831518e-06, + "loss": 3.0716, + "step": 322700 + }, + { + "epoch": 0.3179602468415785, + "grad_norm": 2.408717155456543, + "learning_rate": 9.994950044240805e-06, + "loss": 3.0546, + "step": 322750 + }, + { + "epoch": 0.31800950481940055, + "grad_norm": 2.444916248321533, + "learning_rate": 9.994942722348638e-06, + "loss": 2.9957, + "step": 322800 + }, + { + "epoch": 0.3180587627972226, + "grad_norm": 2.5337023735046387, + "learning_rate": 9.994935395155024e-06, + "loss": 3.01, + "step": 322850 + }, + { + "epoch": 0.31810802077504474, + "grad_norm": 2.414316177368164, + "learning_rate": 9.994928062659968e-06, + "loss": 2.9439, + "step": 322900 + }, + { + "epoch": 0.3181572787528668, + "grad_norm": 2.371917247772217, + "learning_rate": 9.994920724863484e-06, + "loss": 3.006, + "step": 322950 + }, + { + "epoch": 0.3182065367306889, + "grad_norm": 2.2066924571990967, + "learning_rate": 9.994913381765573e-06, + "loss": 2.9917, + "step": 323000 + }, + { + "epoch": 0.318255794708511, + "grad_norm": 2.157283306121826, + "learning_rate": 9.994906033366249e-06, + "loss": 3.0114, + "step": 323050 + }, + { + "epoch": 0.3183050526863331, + "grad_norm": 2.162775754928589, + "learning_rate": 9.994898679665513e-06, + "loss": 2.9932, + "step": 323100 + }, + { + "epoch": 0.31835431066415515, + "grad_norm": 2.223466634750366, + "learning_rate": 9.994891320663377e-06, + "loss": 2.9806, + "step": 323150 + }, + { + "epoch": 0.3184035686419773, + "grad_norm": 
2.2770650386810303, + "learning_rate": 9.994883956359849e-06, + "loss": 2.9578, + "step": 323200 + }, + { + "epoch": 0.31845282661979935, + "grad_norm": 2.8201406002044678, + "learning_rate": 9.994876586754934e-06, + "loss": 2.9771, + "step": 323250 + }, + { + "epoch": 0.3185020845976214, + "grad_norm": 2.286628007888794, + "learning_rate": 9.994869211848644e-06, + "loss": 3.0722, + "step": 323300 + }, + { + "epoch": 0.31855134257544354, + "grad_norm": 2.3103508949279785, + "learning_rate": 9.994861831640983e-06, + "loss": 2.9665, + "step": 323350 + }, + { + "epoch": 0.3186006005532656, + "grad_norm": 2.4238879680633545, + "learning_rate": 9.994854446131963e-06, + "loss": 2.9776, + "step": 323400 + }, + { + "epoch": 0.3186498585310877, + "grad_norm": 2.266263484954834, + "learning_rate": 9.994847055321587e-06, + "loss": 2.9405, + "step": 323450 + }, + { + "epoch": 0.3186991165089098, + "grad_norm": 2.386289596557617, + "learning_rate": 9.994839659209867e-06, + "loss": 3.0086, + "step": 323500 + }, + { + "epoch": 0.3187483744867319, + "grad_norm": 2.2134878635406494, + "learning_rate": 9.994832257796807e-06, + "loss": 3.0269, + "step": 323550 + }, + { + "epoch": 0.31879763246455395, + "grad_norm": 2.228789806365967, + "learning_rate": 9.994824851082419e-06, + "loss": 3.0477, + "step": 323600 + }, + { + "epoch": 0.3188468904423761, + "grad_norm": 2.457951545715332, + "learning_rate": 9.994817439066707e-06, + "loss": 3.0154, + "step": 323650 + }, + { + "epoch": 0.31889614842019814, + "grad_norm": 2.3145902156829834, + "learning_rate": 9.994810021749679e-06, + "loss": 3.0082, + "step": 323700 + }, + { + "epoch": 0.3189454063980202, + "grad_norm": 2.356865406036377, + "learning_rate": 9.994802599131347e-06, + "loss": 3.045, + "step": 323750 + }, + { + "epoch": 0.3189946643758423, + "grad_norm": 2.2982535362243652, + "learning_rate": 9.994795171211716e-06, + "loss": 3.0441, + "step": 323800 + }, + { + "epoch": 0.3190439223536644, + "grad_norm": 2.2874183654785156, + 
"learning_rate": 9.994787737990795e-06, + "loss": 3.0277, + "step": 323850 + }, + { + "epoch": 0.3190931803314865, + "grad_norm": 2.1828739643096924, + "learning_rate": 9.994780299468588e-06, + "loss": 2.9924, + "step": 323900 + }, + { + "epoch": 0.31914243830930855, + "grad_norm": 2.1970157623291016, + "learning_rate": 9.994772855645109e-06, + "loss": 3.0073, + "step": 323950 + }, + { + "epoch": 0.3191916962871307, + "grad_norm": 2.3162572383880615, + "learning_rate": 9.994765406520362e-06, + "loss": 2.9827, + "step": 324000 + }, + { + "epoch": 0.31924095426495275, + "grad_norm": 2.37149977684021, + "learning_rate": 9.994757952094356e-06, + "loss": 3.0302, + "step": 324050 + }, + { + "epoch": 0.3192902122427748, + "grad_norm": 2.2363533973693848, + "learning_rate": 9.994750492367097e-06, + "loss": 2.986, + "step": 324100 + }, + { + "epoch": 0.31933947022059694, + "grad_norm": 2.352264642715454, + "learning_rate": 9.994743027338595e-06, + "loss": 2.9581, + "step": 324150 + }, + { + "epoch": 0.319388728198419, + "grad_norm": 2.160536289215088, + "learning_rate": 9.99473555700886e-06, + "loss": 3.0578, + "step": 324200 + }, + { + "epoch": 0.3194379861762411, + "grad_norm": 2.2979745864868164, + "learning_rate": 9.994728081377897e-06, + "loss": 3.043, + "step": 324250 + }, + { + "epoch": 0.3194872441540632, + "grad_norm": 2.311696767807007, + "learning_rate": 9.994720600445713e-06, + "loss": 2.97, + "step": 324300 + }, + { + "epoch": 0.3195365021318853, + "grad_norm": 2.3789725303649902, + "learning_rate": 9.994713114212317e-06, + "loss": 2.9619, + "step": 324350 + }, + { + "epoch": 0.31958576010970735, + "grad_norm": 2.232785701751709, + "learning_rate": 9.994705622677717e-06, + "loss": 3.0087, + "step": 324400 + }, + { + "epoch": 0.3196350180875295, + "grad_norm": 2.22043514251709, + "learning_rate": 9.994698125841924e-06, + "loss": 2.9973, + "step": 324450 + }, + { + "epoch": 0.31968427606535155, + "grad_norm": 2.3424551486968994, + "learning_rate": 
9.99469062370494e-06, + "loss": 3.0119, + "step": 324500 + }, + { + "epoch": 0.3197335340431736, + "grad_norm": 2.381495475769043, + "learning_rate": 9.994683116266778e-06, + "loss": 3.0328, + "step": 324550 + }, + { + "epoch": 0.31978279202099574, + "grad_norm": 2.4834647178649902, + "learning_rate": 9.994675603527444e-06, + "loss": 3.0523, + "step": 324600 + }, + { + "epoch": 0.3198320499988178, + "grad_norm": 2.2913899421691895, + "learning_rate": 9.994668085486946e-06, + "loss": 2.9658, + "step": 324650 + }, + { + "epoch": 0.3198813079766399, + "grad_norm": 2.4016404151916504, + "learning_rate": 9.994660562145292e-06, + "loss": 2.9555, + "step": 324700 + }, + { + "epoch": 0.319930565954462, + "grad_norm": 2.1837029457092285, + "learning_rate": 9.99465303350249e-06, + "loss": 2.9312, + "step": 324750 + }, + { + "epoch": 0.3199798239322841, + "grad_norm": 2.323606252670288, + "learning_rate": 9.994645499558547e-06, + "loss": 2.9735, + "step": 324800 + }, + { + "epoch": 0.32002908191010615, + "grad_norm": 2.3567721843719482, + "learning_rate": 9.994637960313474e-06, + "loss": 3.011, + "step": 324850 + }, + { + "epoch": 0.3200783398879283, + "grad_norm": 2.3108835220336914, + "learning_rate": 9.994630415767277e-06, + "loss": 3.0423, + "step": 324900 + }, + { + "epoch": 0.32012759786575035, + "grad_norm": 2.3268239498138428, + "learning_rate": 9.994622865919962e-06, + "loss": 2.997, + "step": 324950 + }, + { + "epoch": 0.3201768558435724, + "grad_norm": 2.180349826812744, + "learning_rate": 9.99461531077154e-06, + "loss": 2.973, + "step": 325000 + }, + { + "epoch": 0.3202261138213945, + "grad_norm": 2.347105026245117, + "learning_rate": 9.994607750322019e-06, + "loss": 2.9955, + "step": 325050 + }, + { + "epoch": 0.3202753717992166, + "grad_norm": 2.4370474815368652, + "learning_rate": 9.994600184571406e-06, + "loss": 3.0519, + "step": 325100 + }, + { + "epoch": 0.3203246297770387, + "grad_norm": 2.381134033203125, + "learning_rate": 9.994592613519708e-06, + "loss": 
3.0755, + "step": 325150 + }, + { + "epoch": 0.32037388775486075, + "grad_norm": 2.1476330757141113, + "learning_rate": 9.994585037166935e-06, + "loss": 2.9089, + "step": 325200 + }, + { + "epoch": 0.3204231457326829, + "grad_norm": 2.3896710872650146, + "learning_rate": 9.994577455513094e-06, + "loss": 3.0204, + "step": 325250 + }, + { + "epoch": 0.32047240371050495, + "grad_norm": 2.370788335800171, + "learning_rate": 9.994569868558193e-06, + "loss": 2.9453, + "step": 325300 + }, + { + "epoch": 0.320521661688327, + "grad_norm": 2.2425074577331543, + "learning_rate": 9.99456227630224e-06, + "loss": 2.9871, + "step": 325350 + }, + { + "epoch": 0.32057091966614915, + "grad_norm": 2.576815605163574, + "learning_rate": 9.994554678745244e-06, + "loss": 3.0331, + "step": 325400 + }, + { + "epoch": 0.3206201776439712, + "grad_norm": 2.1372947692871094, + "learning_rate": 9.994547075887214e-06, + "loss": 2.9531, + "step": 325450 + }, + { + "epoch": 0.3206694356217933, + "grad_norm": 2.3413589000701904, + "learning_rate": 9.994539467728155e-06, + "loss": 2.9451, + "step": 325500 + }, + { + "epoch": 0.3207186935996154, + "grad_norm": 2.2747602462768555, + "learning_rate": 9.994531854268078e-06, + "loss": 3.0007, + "step": 325550 + }, + { + "epoch": 0.3207679515774375, + "grad_norm": 2.32114315032959, + "learning_rate": 9.994524235506987e-06, + "loss": 2.9767, + "step": 325600 + }, + { + "epoch": 0.32081720955525955, + "grad_norm": 2.346999406814575, + "learning_rate": 9.994516611444893e-06, + "loss": 3.0036, + "step": 325650 + }, + { + "epoch": 0.3208664675330817, + "grad_norm": 2.3094329833984375, + "learning_rate": 9.994508982081806e-06, + "loss": 2.911, + "step": 325700 + }, + { + "epoch": 0.32091572551090375, + "grad_norm": 2.427722215652466, + "learning_rate": 9.99450134741773e-06, + "loss": 3.0851, + "step": 325750 + }, + { + "epoch": 0.3209649834887258, + "grad_norm": 2.242171049118042, + "learning_rate": 9.994493707452676e-06, + "loss": 3.0294, + "step": 325800 + }, 
+ { + "epoch": 0.32101424146654794, + "grad_norm": 2.292050361633301, + "learning_rate": 9.99448606218665e-06, + "loss": 2.9837, + "step": 325850 + }, + { + "epoch": 0.32106349944437, + "grad_norm": 2.536051034927368, + "learning_rate": 9.994478411619664e-06, + "loss": 2.9535, + "step": 325900 + }, + { + "epoch": 0.3211127574221921, + "grad_norm": 2.266040325164795, + "learning_rate": 9.994470755751722e-06, + "loss": 3.1034, + "step": 325950 + }, + { + "epoch": 0.3211620154000142, + "grad_norm": 2.4025213718414307, + "learning_rate": 9.994463094582834e-06, + "loss": 3.0084, + "step": 326000 + }, + { + "epoch": 0.3212112733778363, + "grad_norm": 2.29390025138855, + "learning_rate": 9.994455428113005e-06, + "loss": 2.9867, + "step": 326050 + }, + { + "epoch": 0.32126053135565835, + "grad_norm": 2.29315185546875, + "learning_rate": 9.994447756342248e-06, + "loss": 2.956, + "step": 326100 + }, + { + "epoch": 0.3213097893334805, + "grad_norm": 2.208200216293335, + "learning_rate": 9.994440079270568e-06, + "loss": 3.0173, + "step": 326150 + }, + { + "epoch": 0.32135904731130255, + "grad_norm": 2.4082369804382324, + "learning_rate": 9.994432396897975e-06, + "loss": 2.9984, + "step": 326200 + }, + { + "epoch": 0.3214083052891246, + "grad_norm": 2.4058754444122314, + "learning_rate": 9.994424709224477e-06, + "loss": 2.9854, + "step": 326250 + }, + { + "epoch": 0.3214575632669467, + "grad_norm": 2.2343225479125977, + "learning_rate": 9.99441701625008e-06, + "loss": 3.0, + "step": 326300 + }, + { + "epoch": 0.3215068212447688, + "grad_norm": 2.204411029815674, + "learning_rate": 9.994409317974795e-06, + "loss": 3.0461, + "step": 326350 + }, + { + "epoch": 0.3215560792225909, + "grad_norm": 2.4737579822540283, + "learning_rate": 9.994401614398626e-06, + "loss": 2.9469, + "step": 326400 + }, + { + "epoch": 0.32160533720041296, + "grad_norm": 2.3459532260894775, + "learning_rate": 9.994393905521587e-06, + "loss": 3.0485, + "step": 326450 + }, + { + "epoch": 0.3216545951782351, + 
"grad_norm": 2.244621992111206, + "learning_rate": 9.994386191343681e-06, + "loss": 2.9854, + "step": 326500 + }, + { + "epoch": 0.32170385315605715, + "grad_norm": 2.1877098083496094, + "learning_rate": 9.99437847186492e-06, + "loss": 3.0816, + "step": 326550 + }, + { + "epoch": 0.3217531111338792, + "grad_norm": 2.503509759902954, + "learning_rate": 9.994370747085307e-06, + "loss": 2.9971, + "step": 326600 + }, + { + "epoch": 0.32180236911170135, + "grad_norm": 2.2942562103271484, + "learning_rate": 9.994363017004858e-06, + "loss": 2.9995, + "step": 326650 + }, + { + "epoch": 0.3218516270895234, + "grad_norm": 2.210972309112549, + "learning_rate": 9.994355281623573e-06, + "loss": 3.0095, + "step": 326700 + }, + { + "epoch": 0.3219008850673455, + "grad_norm": 2.2744314670562744, + "learning_rate": 9.994347540941468e-06, + "loss": 2.975, + "step": 326750 + }, + { + "epoch": 0.3219501430451676, + "grad_norm": 2.2309682369232178, + "learning_rate": 9.994339794958544e-06, + "loss": 2.9718, + "step": 326800 + }, + { + "epoch": 0.3219994010229897, + "grad_norm": 2.408513069152832, + "learning_rate": 9.994332043674816e-06, + "loss": 3.0469, + "step": 326850 + }, + { + "epoch": 0.32204865900081175, + "grad_norm": 2.2341365814208984, + "learning_rate": 9.994324287090285e-06, + "loss": 2.967, + "step": 326900 + }, + { + "epoch": 0.3220979169786339, + "grad_norm": 2.3177425861358643, + "learning_rate": 9.994316525204965e-06, + "loss": 3.0808, + "step": 326950 + }, + { + "epoch": 0.32214717495645595, + "grad_norm": 2.3534317016601562, + "learning_rate": 9.994308758018862e-06, + "loss": 2.9999, + "step": 327000 + }, + { + "epoch": 0.322196432934278, + "grad_norm": 2.245447874069214, + "learning_rate": 9.994300985531984e-06, + "loss": 3.0175, + "step": 327050 + }, + { + "epoch": 0.32224569091210015, + "grad_norm": 2.216843843460083, + "learning_rate": 9.994293207744338e-06, + "loss": 2.9541, + "step": 327100 + }, + { + "epoch": 0.3222949488899222, + "grad_norm": 
2.202080249786377, + "learning_rate": 9.994285424655939e-06, + "loss": 2.9572, + "step": 327150 + }, + { + "epoch": 0.3223442068677443, + "grad_norm": 2.3912792205810547, + "learning_rate": 9.994277636266785e-06, + "loss": 2.9878, + "step": 327200 + }, + { + "epoch": 0.3223934648455664, + "grad_norm": 2.3706064224243164, + "learning_rate": 9.994269842576892e-06, + "loss": 2.9844, + "step": 327250 + }, + { + "epoch": 0.3224427228233885, + "grad_norm": 2.697843313217163, + "learning_rate": 9.994262043586267e-06, + "loss": 3.0056, + "step": 327300 + }, + { + "epoch": 0.32249198080121055, + "grad_norm": 2.0959384441375732, + "learning_rate": 9.994254239294915e-06, + "loss": 3.0166, + "step": 327350 + }, + { + "epoch": 0.3225412387790327, + "grad_norm": 2.400178909301758, + "learning_rate": 9.994246429702846e-06, + "loss": 2.8883, + "step": 327400 + }, + { + "epoch": 0.32259049675685475, + "grad_norm": 2.2338175773620605, + "learning_rate": 9.99423861481007e-06, + "loss": 3.0509, + "step": 327450 + }, + { + "epoch": 0.3226397547346768, + "grad_norm": 2.32521653175354, + "learning_rate": 9.994230794616592e-06, + "loss": 2.9591, + "step": 327500 + }, + { + "epoch": 0.3226890127124989, + "grad_norm": 2.2035601139068604, + "learning_rate": 9.994222969122425e-06, + "loss": 3.0502, + "step": 327550 + }, + { + "epoch": 0.322738270690321, + "grad_norm": 2.258697032928467, + "learning_rate": 9.994215138327573e-06, + "loss": 2.9917, + "step": 327600 + }, + { + "epoch": 0.3227875286681431, + "grad_norm": 2.2851362228393555, + "learning_rate": 9.994207302232044e-06, + "loss": 3.0219, + "step": 327650 + }, + { + "epoch": 0.32283678664596516, + "grad_norm": 2.271369457244873, + "learning_rate": 9.994199460835851e-06, + "loss": 2.9621, + "step": 327700 + }, + { + "epoch": 0.3228860446237873, + "grad_norm": 2.2642946243286133, + "learning_rate": 9.994191614139e-06, + "loss": 3.0231, + "step": 327750 + }, + { + "epoch": 0.32293530260160935, + "grad_norm": 2.332162618637085, + 
"learning_rate": 9.994183762141496e-06, + "loss": 3.0123, + "step": 327800 + }, + { + "epoch": 0.3229845605794314, + "grad_norm": 2.2626237869262695, + "learning_rate": 9.99417590484335e-06, + "loss": 3.011, + "step": 327850 + }, + { + "epoch": 0.32303381855725355, + "grad_norm": 2.1220316886901855, + "learning_rate": 9.994168042244573e-06, + "loss": 3.0021, + "step": 327900 + }, + { + "epoch": 0.3230830765350756, + "grad_norm": 2.387272596359253, + "learning_rate": 9.99416017434517e-06, + "loss": 3.0009, + "step": 327950 + }, + { + "epoch": 0.3231323345128977, + "grad_norm": 2.328430652618408, + "learning_rate": 9.99415230114515e-06, + "loss": 2.9337, + "step": 328000 + }, + { + "epoch": 0.3231815924907198, + "grad_norm": 2.342526435852051, + "learning_rate": 9.994144422644521e-06, + "loss": 3.0253, + "step": 328050 + }, + { + "epoch": 0.3232308504685419, + "grad_norm": 2.1986277103424072, + "learning_rate": 9.994136538843292e-06, + "loss": 2.9378, + "step": 328100 + }, + { + "epoch": 0.32328010844636396, + "grad_norm": 2.2318062782287598, + "learning_rate": 9.994128649741472e-06, + "loss": 3.0628, + "step": 328150 + }, + { + "epoch": 0.3233293664241861, + "grad_norm": 2.562345504760742, + "learning_rate": 9.994120755339067e-06, + "loss": 3.0457, + "step": 328200 + }, + { + "epoch": 0.32337862440200815, + "grad_norm": 2.267371416091919, + "learning_rate": 9.994112855636089e-06, + "loss": 2.9136, + "step": 328250 + }, + { + "epoch": 0.3234278823798302, + "grad_norm": 2.4698474407196045, + "learning_rate": 9.994104950632543e-06, + "loss": 2.9654, + "step": 328300 + }, + { + "epoch": 0.32347714035765235, + "grad_norm": 2.4881293773651123, + "learning_rate": 9.99409704032844e-06, + "loss": 3.0255, + "step": 328350 + }, + { + "epoch": 0.3235263983354744, + "grad_norm": 2.2579379081726074, + "learning_rate": 9.994089124723787e-06, + "loss": 2.9906, + "step": 328400 + }, + { + "epoch": 0.3235756563132965, + "grad_norm": 2.533039093017578, + "learning_rate": 
9.994081203818592e-06, + "loss": 3.0264, + "step": 328450 + }, + { + "epoch": 0.3236249142911186, + "grad_norm": 2.626873731613159, + "learning_rate": 9.994073277612862e-06, + "loss": 3.0112, + "step": 328500 + }, + { + "epoch": 0.3236741722689407, + "grad_norm": 2.190354824066162, + "learning_rate": 9.994065346106611e-06, + "loss": 2.9717, + "step": 328550 + }, + { + "epoch": 0.32372343024676276, + "grad_norm": 2.3194122314453125, + "learning_rate": 9.994057409299843e-06, + "loss": 3.009, + "step": 328600 + }, + { + "epoch": 0.3237726882245849, + "grad_norm": 2.4748988151550293, + "learning_rate": 9.994049467192566e-06, + "loss": 3.0041, + "step": 328650 + }, + { + "epoch": 0.32382194620240695, + "grad_norm": 2.3168745040893555, + "learning_rate": 9.994041519784789e-06, + "loss": 2.9442, + "step": 328700 + }, + { + "epoch": 0.323871204180229, + "grad_norm": 2.9261860847473145, + "learning_rate": 9.994033567076522e-06, + "loss": 3.0194, + "step": 328750 + }, + { + "epoch": 0.3239204621580511, + "grad_norm": 2.301459312438965, + "learning_rate": 9.994025609067773e-06, + "loss": 3.0219, + "step": 328800 + }, + { + "epoch": 0.3239697201358732, + "grad_norm": 2.138746500015259, + "learning_rate": 9.99401764575855e-06, + "loss": 3.0181, + "step": 328850 + }, + { + "epoch": 0.3240189781136953, + "grad_norm": 2.104782819747925, + "learning_rate": 9.99400967714886e-06, + "loss": 3.0074, + "step": 328900 + }, + { + "epoch": 0.32406823609151736, + "grad_norm": 2.301400899887085, + "learning_rate": 9.994001703238715e-06, + "loss": 2.9688, + "step": 328950 + }, + { + "epoch": 0.3241174940693395, + "grad_norm": 2.5314807891845703, + "learning_rate": 9.99399372402812e-06, + "loss": 2.9774, + "step": 329000 + }, + { + "epoch": 0.32416675204716155, + "grad_norm": 2.420837163925171, + "learning_rate": 9.993985739517087e-06, + "loss": 2.9931, + "step": 329050 + }, + { + "epoch": 0.3242160100249836, + "grad_norm": 2.3666465282440186, + "learning_rate": 9.993977749705619e-06, + 
"loss": 2.9487, + "step": 329100 + }, + { + "epoch": 0.32426526800280575, + "grad_norm": 2.3122732639312744, + "learning_rate": 9.99396975459373e-06, + "loss": 2.9995, + "step": 329150 + }, + { + "epoch": 0.3243145259806278, + "grad_norm": 2.2921929359436035, + "learning_rate": 9.993961754181425e-06, + "loss": 3.0405, + "step": 329200 + }, + { + "epoch": 0.3243637839584499, + "grad_norm": 2.3132262229919434, + "learning_rate": 9.993953748468715e-06, + "loss": 3.0025, + "step": 329250 + }, + { + "epoch": 0.324413041936272, + "grad_norm": 2.2428135871887207, + "learning_rate": 9.993945737455608e-06, + "loss": 3.0351, + "step": 329300 + }, + { + "epoch": 0.3244622999140941, + "grad_norm": 2.5213520526885986, + "learning_rate": 9.99393772114211e-06, + "loss": 2.99, + "step": 329350 + }, + { + "epoch": 0.32451155789191616, + "grad_norm": 2.218778371810913, + "learning_rate": 9.993929699528231e-06, + "loss": 2.956, + "step": 329400 + }, + { + "epoch": 0.3245608158697383, + "grad_norm": 2.276928186416626, + "learning_rate": 9.99392167261398e-06, + "loss": 2.9863, + "step": 329450 + }, + { + "epoch": 0.32461007384756035, + "grad_norm": 2.3644790649414062, + "learning_rate": 9.993913640399365e-06, + "loss": 3.0067, + "step": 329500 + }, + { + "epoch": 0.3246593318253824, + "grad_norm": 2.350954294204712, + "learning_rate": 9.993905602884397e-06, + "loss": 2.943, + "step": 329550 + }, + { + "epoch": 0.32470858980320455, + "grad_norm": 2.3388078212738037, + "learning_rate": 9.99389756006908e-06, + "loss": 3.0474, + "step": 329600 + }, + { + "epoch": 0.3247578477810266, + "grad_norm": 2.451012372970581, + "learning_rate": 9.993889511953426e-06, + "loss": 2.9841, + "step": 329650 + }, + { + "epoch": 0.3248071057588487, + "grad_norm": 2.3123908042907715, + "learning_rate": 9.993881458537443e-06, + "loss": 2.9834, + "step": 329700 + }, + { + "epoch": 0.3248563637366708, + "grad_norm": 2.227527141571045, + "learning_rate": 9.993873399821137e-06, + "loss": 3.0144, + "step": 329750 
+ }, + { + "epoch": 0.3249056217144929, + "grad_norm": 2.359307289123535, + "learning_rate": 9.993865335804522e-06, + "loss": 3.0508, + "step": 329800 + }, + { + "epoch": 0.32495487969231496, + "grad_norm": 2.3206188678741455, + "learning_rate": 9.993857266487599e-06, + "loss": 3.0415, + "step": 329850 + }, + { + "epoch": 0.325004137670137, + "grad_norm": 2.269857406616211, + "learning_rate": 9.993849191870382e-06, + "loss": 2.9873, + "step": 329900 + }, + { + "epoch": 0.32505339564795915, + "grad_norm": 2.2724006175994873, + "learning_rate": 9.99384111195288e-06, + "loss": 3.0562, + "step": 329950 + }, + { + "epoch": 0.3251026536257812, + "grad_norm": 2.170011043548584, + "learning_rate": 9.993833026735098e-06, + "loss": 2.985, + "step": 330000 + }, + { + "epoch": 0.3251519116036033, + "grad_norm": 2.2785377502441406, + "learning_rate": 9.993824936217048e-06, + "loss": 2.9524, + "step": 330050 + }, + { + "epoch": 0.3252011695814254, + "grad_norm": 2.688688039779663, + "learning_rate": 9.993816840398736e-06, + "loss": 3.0086, + "step": 330100 + }, + { + "epoch": 0.3252504275592475, + "grad_norm": 2.4050838947296143, + "learning_rate": 9.993808739280171e-06, + "loss": 2.9078, + "step": 330150 + }, + { + "epoch": 0.32529968553706956, + "grad_norm": 2.536747455596924, + "learning_rate": 9.993800632861364e-06, + "loss": 2.9702, + "step": 330200 + }, + { + "epoch": 0.3253489435148917, + "grad_norm": 2.2469213008880615, + "learning_rate": 9.993792521142319e-06, + "loss": 3.0417, + "step": 330250 + }, + { + "epoch": 0.32539820149271376, + "grad_norm": 2.398123264312744, + "learning_rate": 9.993784404123048e-06, + "loss": 3.0302, + "step": 330300 + }, + { + "epoch": 0.3254474594705358, + "grad_norm": 2.3504843711853027, + "learning_rate": 9.993776281803559e-06, + "loss": 2.9986, + "step": 330350 + }, + { + "epoch": 0.32549671744835795, + "grad_norm": 2.286226987838745, + "learning_rate": 9.993768154183864e-06, + "loss": 2.9825, + "step": 330400 + }, + { + "epoch": 
0.32554597542618, + "grad_norm": 2.2099859714508057, + "learning_rate": 9.993760021263964e-06, + "loss": 2.9895, + "step": 330450 + }, + { + "epoch": 0.3255952334040021, + "grad_norm": 2.3747777938842773, + "learning_rate": 9.993751883043874e-06, + "loss": 3.043, + "step": 330500 + }, + { + "epoch": 0.3256444913818242, + "grad_norm": 2.2541325092315674, + "learning_rate": 9.9937437395236e-06, + "loss": 2.9894, + "step": 330550 + }, + { + "epoch": 0.3256937493596463, + "grad_norm": 2.2578492164611816, + "learning_rate": 9.993735590703152e-06, + "loss": 2.9132, + "step": 330600 + }, + { + "epoch": 0.32574300733746836, + "grad_norm": 2.53883695602417, + "learning_rate": 9.993727436582536e-06, + "loss": 3.005, + "step": 330650 + }, + { + "epoch": 0.3257922653152905, + "grad_norm": 2.362668514251709, + "learning_rate": 9.993719277161763e-06, + "loss": 3.0294, + "step": 330700 + }, + { + "epoch": 0.32584152329311256, + "grad_norm": 2.2556464672088623, + "learning_rate": 9.993711112440843e-06, + "loss": 3.0243, + "step": 330750 + }, + { + "epoch": 0.3258907812709346, + "grad_norm": 2.4334092140197754, + "learning_rate": 9.99370294241978e-06, + "loss": 3.0062, + "step": 330800 + }, + { + "epoch": 0.32594003924875675, + "grad_norm": 2.1844818592071533, + "learning_rate": 9.993694767098588e-06, + "loss": 2.9643, + "step": 330850 + }, + { + "epoch": 0.3259892972265788, + "grad_norm": 2.3118185997009277, + "learning_rate": 9.993686586477272e-06, + "loss": 2.9467, + "step": 330900 + }, + { + "epoch": 0.3260385552044009, + "grad_norm": 2.7873620986938477, + "learning_rate": 9.993678400555842e-06, + "loss": 2.9624, + "step": 330950 + }, + { + "epoch": 0.326087813182223, + "grad_norm": 2.245900869369507, + "learning_rate": 9.993670209334305e-06, + "loss": 2.915, + "step": 331000 + }, + { + "epoch": 0.3261370711600451, + "grad_norm": 2.3242104053497314, + "learning_rate": 9.993662012812674e-06, + "loss": 3.0562, + "step": 331050 + }, + { + "epoch": 0.32618632913786716, + 
"grad_norm": 2.4527854919433594, + "learning_rate": 9.993653810990952e-06, + "loss": 3.0276, + "step": 331100 + }, + { + "epoch": 0.32623558711568923, + "grad_norm": 2.298565149307251, + "learning_rate": 9.993645603869151e-06, + "loss": 3.0112, + "step": 331150 + }, + { + "epoch": 0.32628484509351136, + "grad_norm": 2.499424934387207, + "learning_rate": 9.993637391447281e-06, + "loss": 2.954, + "step": 331200 + }, + { + "epoch": 0.3263341030713334, + "grad_norm": 2.1946020126342773, + "learning_rate": 9.993629173725348e-06, + "loss": 3.0103, + "step": 331250 + }, + { + "epoch": 0.3263833610491555, + "grad_norm": 2.367494583129883, + "learning_rate": 9.993620950703363e-06, + "loss": 3.0019, + "step": 331300 + }, + { + "epoch": 0.3264326190269776, + "grad_norm": 2.3858416080474854, + "learning_rate": 9.993612722381332e-06, + "loss": 3.0011, + "step": 331350 + }, + { + "epoch": 0.3264818770047997, + "grad_norm": 2.383852005004883, + "learning_rate": 9.993604488759266e-06, + "loss": 2.9786, + "step": 331400 + }, + { + "epoch": 0.32653113498262176, + "grad_norm": 2.4190244674682617, + "learning_rate": 9.993596249837173e-06, + "loss": 2.9784, + "step": 331450 + }, + { + "epoch": 0.3265803929604439, + "grad_norm": 2.48496675491333, + "learning_rate": 9.993588005615062e-06, + "loss": 3.0069, + "step": 331500 + }, + { + "epoch": 0.32662965093826596, + "grad_norm": 2.1755332946777344, + "learning_rate": 9.99357975609294e-06, + "loss": 3.0153, + "step": 331550 + }, + { + "epoch": 0.32667890891608803, + "grad_norm": 2.2785990238189697, + "learning_rate": 9.993571501270818e-06, + "loss": 2.9896, + "step": 331600 + }, + { + "epoch": 0.32672816689391015, + "grad_norm": 2.293120861053467, + "learning_rate": 9.993563241148704e-06, + "loss": 3.0331, + "step": 331650 + }, + { + "epoch": 0.3267774248717322, + "grad_norm": 2.425532102584839, + "learning_rate": 9.993554975726608e-06, + "loss": 3.0257, + "step": 331700 + }, + { + "epoch": 0.3268266828495543, + "grad_norm": 
2.5319900512695312, + "learning_rate": 9.993546705004535e-06, + "loss": 2.9868, + "step": 331750 + }, + { + "epoch": 0.3268759408273764, + "grad_norm": 2.21345853805542, + "learning_rate": 9.993538428982497e-06, + "loss": 2.9728, + "step": 331800 + }, + { + "epoch": 0.3269251988051985, + "grad_norm": 2.329190731048584, + "learning_rate": 9.993530147660503e-06, + "loss": 2.9663, + "step": 331850 + }, + { + "epoch": 0.32697445678302056, + "grad_norm": 2.462146520614624, + "learning_rate": 9.99352186103856e-06, + "loss": 3.0214, + "step": 331900 + }, + { + "epoch": 0.3270237147608427, + "grad_norm": 2.3075993061065674, + "learning_rate": 9.993513569116678e-06, + "loss": 2.9472, + "step": 331950 + }, + { + "epoch": 0.32707297273866476, + "grad_norm": 2.2807559967041016, + "learning_rate": 9.993505271894864e-06, + "loss": 3.0052, + "step": 332000 + }, + { + "epoch": 0.32712223071648683, + "grad_norm": 2.3136658668518066, + "learning_rate": 9.993496969373132e-06, + "loss": 3.055, + "step": 332050 + }, + { + "epoch": 0.32717148869430895, + "grad_norm": 2.2581095695495605, + "learning_rate": 9.993488661551484e-06, + "loss": 2.9854, + "step": 332100 + }, + { + "epoch": 0.327220746672131, + "grad_norm": 2.321136951446533, + "learning_rate": 9.993480348429932e-06, + "loss": 2.9882, + "step": 332150 + }, + { + "epoch": 0.3272700046499531, + "grad_norm": 2.277092695236206, + "learning_rate": 9.993472030008487e-06, + "loss": 2.9918, + "step": 332200 + }, + { + "epoch": 0.3273192626277752, + "grad_norm": 2.2012381553649902, + "learning_rate": 9.993463706287153e-06, + "loss": 2.9087, + "step": 332250 + }, + { + "epoch": 0.3273685206055973, + "grad_norm": 2.537729501724243, + "learning_rate": 9.993455377265942e-06, + "loss": 2.9835, + "step": 332300 + }, + { + "epoch": 0.32741777858341936, + "grad_norm": 2.158907413482666, + "learning_rate": 9.993447042944864e-06, + "loss": 2.9736, + "step": 332350 + }, + { + "epoch": 0.32746703656124143, + "grad_norm": 4.529309272766113, + 
"learning_rate": 9.993438703323925e-06, + "loss": 2.9802, + "step": 332400 + }, + { + "epoch": 0.32751629453906356, + "grad_norm": 2.182084798812866, + "learning_rate": 9.993430358403134e-06, + "loss": 2.9914, + "step": 332450 + }, + { + "epoch": 0.3275655525168856, + "grad_norm": 2.4692330360412598, + "learning_rate": 9.993422008182501e-06, + "loss": 2.8843, + "step": 332500 + }, + { + "epoch": 0.3276148104947077, + "grad_norm": 2.5865678787231445, + "learning_rate": 9.993413652662034e-06, + "loss": 2.9429, + "step": 332550 + }, + { + "epoch": 0.3276640684725298, + "grad_norm": 2.6665000915527344, + "learning_rate": 9.993405291841743e-06, + "loss": 2.9283, + "step": 332600 + }, + { + "epoch": 0.3277133264503519, + "grad_norm": 2.3242952823638916, + "learning_rate": 9.993396925721638e-06, + "loss": 3.0044, + "step": 332650 + }, + { + "epoch": 0.32776258442817396, + "grad_norm": 2.2576026916503906, + "learning_rate": 9.993388554301726e-06, + "loss": 3.0406, + "step": 332700 + }, + { + "epoch": 0.3278118424059961, + "grad_norm": 2.295717716217041, + "learning_rate": 9.993380177582014e-06, + "loss": 3.0268, + "step": 332750 + }, + { + "epoch": 0.32786110038381816, + "grad_norm": 2.3236746788024902, + "learning_rate": 9.993371795562515e-06, + "loss": 2.9486, + "step": 332800 + }, + { + "epoch": 0.32791035836164023, + "grad_norm": 2.175859212875366, + "learning_rate": 9.993363408243235e-06, + "loss": 3.0295, + "step": 332850 + }, + { + "epoch": 0.32795961633946236, + "grad_norm": 2.1589910984039307, + "learning_rate": 9.993355015624184e-06, + "loss": 3.0078, + "step": 332900 + }, + { + "epoch": 0.3280088743172844, + "grad_norm": 2.3910140991210938, + "learning_rate": 9.99334661770537e-06, + "loss": 2.9863, + "step": 332950 + }, + { + "epoch": 0.3280581322951065, + "grad_norm": 2.3442163467407227, + "learning_rate": 9.993338214486803e-06, + "loss": 2.9962, + "step": 333000 + }, + { + "epoch": 0.3281073902729286, + "grad_norm": 2.379298686981201, + "learning_rate": 
9.993329805968493e-06, + "loss": 2.9801, + "step": 333050 + }, + { + "epoch": 0.3281566482507507, + "grad_norm": 2.2941434383392334, + "learning_rate": 9.993321392150444e-06, + "loss": 2.9357, + "step": 333100 + }, + { + "epoch": 0.32820590622857276, + "grad_norm": 2.4453113079071045, + "learning_rate": 9.993312973032672e-06, + "loss": 3.0427, + "step": 333150 + }, + { + "epoch": 0.3282551642063949, + "grad_norm": 2.39605975151062, + "learning_rate": 9.993304548615182e-06, + "loss": 3.0097, + "step": 333200 + }, + { + "epoch": 0.32830442218421696, + "grad_norm": 2.7117645740509033, + "learning_rate": 9.993296118897981e-06, + "loss": 3.0343, + "step": 333250 + }, + { + "epoch": 0.32835368016203903, + "grad_norm": 2.379652976989746, + "learning_rate": 9.993287683881082e-06, + "loss": 2.9877, + "step": 333300 + }, + { + "epoch": 0.32840293813986116, + "grad_norm": 2.298321008682251, + "learning_rate": 9.993279243564492e-06, + "loss": 3.0388, + "step": 333350 + }, + { + "epoch": 0.3284521961176832, + "grad_norm": 2.3403446674346924, + "learning_rate": 9.99327079794822e-06, + "loss": 2.9447, + "step": 333400 + }, + { + "epoch": 0.3285014540955053, + "grad_norm": 2.15140700340271, + "learning_rate": 9.993262347032276e-06, + "loss": 2.937, + "step": 333450 + }, + { + "epoch": 0.3285507120733274, + "grad_norm": 2.1999704837799072, + "learning_rate": 9.993253890816666e-06, + "loss": 3.0101, + "step": 333500 + }, + { + "epoch": 0.3285999700511495, + "grad_norm": 2.4294421672821045, + "learning_rate": 9.993245429301403e-06, + "loss": 2.9623, + "step": 333550 + }, + { + "epoch": 0.32864922802897156, + "grad_norm": 2.753049373626709, + "learning_rate": 9.993236962486494e-06, + "loss": 2.9858, + "step": 333600 + }, + { + "epoch": 0.32869848600679363, + "grad_norm": 2.5858705043792725, + "learning_rate": 9.993228490371947e-06, + "loss": 2.9057, + "step": 333650 + }, + { + "epoch": 0.32874774398461576, + "grad_norm": 2.4165945053100586, + "learning_rate": 9.993220012957772e-06, + 
"loss": 3.0272, + "step": 333700 + }, + { + "epoch": 0.32879700196243783, + "grad_norm": 2.4367964267730713, + "learning_rate": 9.993211530243978e-06, + "loss": 2.9823, + "step": 333750 + }, + { + "epoch": 0.3288462599402599, + "grad_norm": 2.544750213623047, + "learning_rate": 9.993203042230575e-06, + "loss": 3.0554, + "step": 333800 + }, + { + "epoch": 0.328895517918082, + "grad_norm": 2.3746838569641113, + "learning_rate": 9.99319454891757e-06, + "loss": 3.0241, + "step": 333850 + }, + { + "epoch": 0.3289447758959041, + "grad_norm": 2.3621301651000977, + "learning_rate": 9.993186050304975e-06, + "loss": 2.9568, + "step": 333900 + }, + { + "epoch": 0.32899403387372617, + "grad_norm": 2.2982468605041504, + "learning_rate": 9.993177546392795e-06, + "loss": 2.951, + "step": 333950 + }, + { + "epoch": 0.3290432918515483, + "grad_norm": 2.4338607788085938, + "learning_rate": 9.993169037181041e-06, + "loss": 3.0279, + "step": 334000 + }, + { + "epoch": 0.32909254982937036, + "grad_norm": 2.1829757690429688, + "learning_rate": 9.993160522669724e-06, + "loss": 2.9909, + "step": 334050 + }, + { + "epoch": 0.32914180780719243, + "grad_norm": 2.2956037521362305, + "learning_rate": 9.993152002858848e-06, + "loss": 2.9786, + "step": 334100 + }, + { + "epoch": 0.32919106578501456, + "grad_norm": 2.537494421005249, + "learning_rate": 9.993143477748427e-06, + "loss": 3.0373, + "step": 334150 + }, + { + "epoch": 0.32924032376283663, + "grad_norm": 2.2307798862457275, + "learning_rate": 9.99313494733847e-06, + "loss": 2.9364, + "step": 334200 + }, + { + "epoch": 0.3292895817406587, + "grad_norm": 2.4112865924835205, + "learning_rate": 9.993126411628983e-06, + "loss": 3.0184, + "step": 334250 + }, + { + "epoch": 0.3293388397184808, + "grad_norm": 2.1872291564941406, + "learning_rate": 9.993117870619975e-06, + "loss": 3.0146, + "step": 334300 + }, + { + "epoch": 0.3293880976963029, + "grad_norm": 2.160231351852417, + "learning_rate": 9.993109324311457e-06, + "loss": 2.9874, + 
"step": 334350 + }, + { + "epoch": 0.32943735567412497, + "grad_norm": 2.2773516178131104, + "learning_rate": 9.993100772703439e-06, + "loss": 2.9393, + "step": 334400 + }, + { + "epoch": 0.3294866136519471, + "grad_norm": 2.1903815269470215, + "learning_rate": 9.993092215795928e-06, + "loss": 2.9746, + "step": 334450 + }, + { + "epoch": 0.32953587162976916, + "grad_norm": 2.2268595695495605, + "learning_rate": 9.993083653588933e-06, + "loss": 3.0054, + "step": 334500 + }, + { + "epoch": 0.32958512960759123, + "grad_norm": 2.305537223815918, + "learning_rate": 9.993075086082463e-06, + "loss": 3.0616, + "step": 334550 + }, + { + "epoch": 0.32963438758541336, + "grad_norm": 2.455787181854248, + "learning_rate": 9.99306651327653e-06, + "loss": 3.0117, + "step": 334600 + }, + { + "epoch": 0.32968364556323543, + "grad_norm": 2.1642446517944336, + "learning_rate": 9.99305793517114e-06, + "loss": 3.0309, + "step": 334650 + }, + { + "epoch": 0.3297329035410575, + "grad_norm": 2.3401710987091064, + "learning_rate": 9.993049351766303e-06, + "loss": 2.9131, + "step": 334700 + }, + { + "epoch": 0.3297821615188796, + "grad_norm": 2.343400239944458, + "learning_rate": 9.993040763062028e-06, + "loss": 3.0278, + "step": 334750 + }, + { + "epoch": 0.3298314194967017, + "grad_norm": 2.202497959136963, + "learning_rate": 9.993032169058325e-06, + "loss": 2.9572, + "step": 334800 + }, + { + "epoch": 0.32988067747452376, + "grad_norm": 2.4046151638031006, + "learning_rate": 9.9930235697552e-06, + "loss": 2.9749, + "step": 334850 + }, + { + "epoch": 0.32992993545234583, + "grad_norm": 2.2465648651123047, + "learning_rate": 9.993014965152667e-06, + "loss": 2.9361, + "step": 334900 + }, + { + "epoch": 0.32997919343016796, + "grad_norm": 2.385225296020508, + "learning_rate": 9.993006355250731e-06, + "loss": 3.0206, + "step": 334950 + }, + { + "epoch": 0.33002845140799003, + "grad_norm": 2.2685365676879883, + "learning_rate": 9.992997740049404e-06, + "loss": 2.9295, + "step": 335000 + }, + { 
+ "epoch": 0.3300777093858121, + "grad_norm": 2.42248797416687, + "learning_rate": 9.992989119548693e-06, + "loss": 2.9187, + "step": 335050 + }, + { + "epoch": 0.3301269673636342, + "grad_norm": 2.1209042072296143, + "learning_rate": 9.99298049374861e-06, + "loss": 2.9867, + "step": 335100 + }, + { + "epoch": 0.3301762253414563, + "grad_norm": 2.285369634628296, + "learning_rate": 9.99297186264916e-06, + "loss": 2.9813, + "step": 335150 + }, + { + "epoch": 0.33022548331927837, + "grad_norm": 2.674556016921997, + "learning_rate": 9.992963226250355e-06, + "loss": 2.9998, + "step": 335200 + }, + { + "epoch": 0.3302747412971005, + "grad_norm": 2.219677209854126, + "learning_rate": 9.992954584552204e-06, + "loss": 2.964, + "step": 335250 + }, + { + "epoch": 0.33032399927492256, + "grad_norm": 2.4326205253601074, + "learning_rate": 9.992945937554715e-06, + "loss": 2.9898, + "step": 335300 + }, + { + "epoch": 0.33037325725274463, + "grad_norm": 2.233905792236328, + "learning_rate": 9.9929372852579e-06, + "loss": 2.9781, + "step": 335350 + }, + { + "epoch": 0.33042251523056676, + "grad_norm": 2.197079658508301, + "learning_rate": 9.992928627661763e-06, + "loss": 3.0171, + "step": 335400 + }, + { + "epoch": 0.33047177320838883, + "grad_norm": 2.5884103775024414, + "learning_rate": 9.992919964766318e-06, + "loss": 2.9444, + "step": 335450 + }, + { + "epoch": 0.3305210311862109, + "grad_norm": 2.2713305950164795, + "learning_rate": 9.992911296571572e-06, + "loss": 3.0155, + "step": 335500 + }, + { + "epoch": 0.330570289164033, + "grad_norm": 2.3652918338775635, + "learning_rate": 9.992902623077535e-06, + "loss": 3.0359, + "step": 335550 + }, + { + "epoch": 0.3306195471418551, + "grad_norm": 2.208735704421997, + "learning_rate": 9.992893944284217e-06, + "loss": 2.9373, + "step": 335600 + }, + { + "epoch": 0.33066880511967717, + "grad_norm": 2.757791519165039, + "learning_rate": 9.992885260191624e-06, + "loss": 2.9903, + "step": 335650 + }, + { + "epoch": 0.3307180630974993, + 
"grad_norm": 2.39674973487854, + "learning_rate": 9.992876570799769e-06, + "loss": 3.0387, + "step": 335700 + }, + { + "epoch": 0.33076732107532136, + "grad_norm": 2.2821710109710693, + "learning_rate": 9.99286787610866e-06, + "loss": 2.9802, + "step": 335750 + }, + { + "epoch": 0.33081657905314343, + "grad_norm": 2.258312463760376, + "learning_rate": 9.992859176118304e-06, + "loss": 3.0403, + "step": 335800 + }, + { + "epoch": 0.33086583703096556, + "grad_norm": 2.3026468753814697, + "learning_rate": 9.992850470828712e-06, + "loss": 2.9715, + "step": 335850 + }, + { + "epoch": 0.33091509500878763, + "grad_norm": 2.3452036380767822, + "learning_rate": 9.992841760239894e-06, + "loss": 2.983, + "step": 335900 + }, + { + "epoch": 0.3309643529866097, + "grad_norm": 2.1054975986480713, + "learning_rate": 9.992833044351859e-06, + "loss": 2.9429, + "step": 335950 + }, + { + "epoch": 0.3310136109644318, + "grad_norm": 2.4448728561401367, + "learning_rate": 9.992824323164615e-06, + "loss": 2.9888, + "step": 336000 + }, + { + "epoch": 0.3310628689422539, + "grad_norm": 2.36635422706604, + "learning_rate": 9.992815596678172e-06, + "loss": 3.0021, + "step": 336050 + }, + { + "epoch": 0.33111212692007597, + "grad_norm": 2.2863500118255615, + "learning_rate": 9.992806864892539e-06, + "loss": 2.9728, + "step": 336100 + }, + { + "epoch": 0.33116138489789804, + "grad_norm": 2.253915309906006, + "learning_rate": 9.992798127807727e-06, + "loss": 3.01, + "step": 336150 + }, + { + "epoch": 0.33121064287572016, + "grad_norm": 2.2907164096832275, + "learning_rate": 9.99278938542374e-06, + "loss": 3.0121, + "step": 336200 + }, + { + "epoch": 0.33125990085354223, + "grad_norm": 2.4763290882110596, + "learning_rate": 9.992780637740595e-06, + "loss": 2.9711, + "step": 336250 + }, + { + "epoch": 0.3313091588313643, + "grad_norm": 2.458893060684204, + "learning_rate": 9.992771884758296e-06, + "loss": 2.9681, + "step": 336300 + }, + { + "epoch": 0.33135841680918643, + "grad_norm": 
2.313654899597168, + "learning_rate": 9.992763126476855e-06, + "loss": 3.0046, + "step": 336350 + }, + { + "epoch": 0.3314076747870085, + "grad_norm": 2.5787501335144043, + "learning_rate": 9.992754362896278e-06, + "loss": 3.0679, + "step": 336400 + }, + { + "epoch": 0.33145693276483057, + "grad_norm": 2.196828842163086, + "learning_rate": 9.992745594016577e-06, + "loss": 2.9476, + "step": 336450 + }, + { + "epoch": 0.3315061907426527, + "grad_norm": 2.227865219116211, + "learning_rate": 9.992736819837759e-06, + "loss": 2.9764, + "step": 336500 + }, + { + "epoch": 0.33155544872047477, + "grad_norm": 2.3986427783966064, + "learning_rate": 9.992728040359838e-06, + "loss": 3.0366, + "step": 336550 + }, + { + "epoch": 0.33160470669829684, + "grad_norm": 2.258951187133789, + "learning_rate": 9.992719255582817e-06, + "loss": 2.9206, + "step": 336600 + }, + { + "epoch": 0.33165396467611896, + "grad_norm": 2.221583843231201, + "learning_rate": 9.992710465506711e-06, + "loss": 2.9891, + "step": 336650 + }, + { + "epoch": 0.33170322265394103, + "grad_norm": 2.284433126449585, + "learning_rate": 9.992701670131525e-06, + "loss": 3.0005, + "step": 336700 + }, + { + "epoch": 0.3317524806317631, + "grad_norm": 2.5204062461853027, + "learning_rate": 9.992692869457272e-06, + "loss": 2.9252, + "step": 336750 + }, + { + "epoch": 0.33180173860958523, + "grad_norm": 2.286022663116455, + "learning_rate": 9.992684063483959e-06, + "loss": 3.0085, + "step": 336800 + }, + { + "epoch": 0.3318509965874073, + "grad_norm": 2.324477434158325, + "learning_rate": 9.992675252211595e-06, + "loss": 2.9513, + "step": 336850 + }, + { + "epoch": 0.33190025456522937, + "grad_norm": 2.9260926246643066, + "learning_rate": 9.99266643564019e-06, + "loss": 3.0207, + "step": 336900 + }, + { + "epoch": 0.3319495125430515, + "grad_norm": 2.2520790100097656, + "learning_rate": 9.992657613769756e-06, + "loss": 2.9518, + "step": 336950 + }, + { + "epoch": 0.33199877052087357, + "grad_norm": 2.1295948028564453, + 
"learning_rate": 9.992648786600297e-06, + "loss": 3.0661, + "step": 337000 + }, + { + "epoch": 0.33204802849869564, + "grad_norm": 2.4285457134246826, + "learning_rate": 9.992639954131827e-06, + "loss": 3.046, + "step": 337050 + }, + { + "epoch": 0.33209728647651776, + "grad_norm": 2.2319424152374268, + "learning_rate": 9.992631116364353e-06, + "loss": 2.9649, + "step": 337100 + }, + { + "epoch": 0.33214654445433983, + "grad_norm": 2.3819668292999268, + "learning_rate": 9.992622273297886e-06, + "loss": 2.9942, + "step": 337150 + }, + { + "epoch": 0.3321958024321619, + "grad_norm": 2.414590835571289, + "learning_rate": 9.992613424932432e-06, + "loss": 2.9575, + "step": 337200 + }, + { + "epoch": 0.332245060409984, + "grad_norm": 2.330918312072754, + "learning_rate": 9.992604571268004e-06, + "loss": 2.9352, + "step": 337250 + }, + { + "epoch": 0.3322943183878061, + "grad_norm": 2.272796154022217, + "learning_rate": 9.992595712304611e-06, + "loss": 3.0256, + "step": 337300 + }, + { + "epoch": 0.33234357636562817, + "grad_norm": 2.175579071044922, + "learning_rate": 9.992586848042261e-06, + "loss": 3.0496, + "step": 337350 + }, + { + "epoch": 0.33239283434345024, + "grad_norm": 2.2883782386779785, + "learning_rate": 9.992577978480965e-06, + "loss": 2.9649, + "step": 337400 + }, + { + "epoch": 0.33244209232127236, + "grad_norm": 2.390902042388916, + "learning_rate": 9.992569103620732e-06, + "loss": 3.0285, + "step": 337450 + }, + { + "epoch": 0.33249135029909443, + "grad_norm": 2.321103811264038, + "learning_rate": 9.992560223461568e-06, + "loss": 3.0092, + "step": 337500 + }, + { + "epoch": 0.3325406082769165, + "grad_norm": 2.2887067794799805, + "learning_rate": 9.992551338003487e-06, + "loss": 2.9408, + "step": 337550 + }, + { + "epoch": 0.33258986625473863, + "grad_norm": 2.3040037155151367, + "learning_rate": 9.992542447246498e-06, + "loss": 2.921, + "step": 337600 + }, + { + "epoch": 0.3326391242325607, + "grad_norm": 2.2887332439422607, + "learning_rate": 
9.992533551190607e-06, + "loss": 3.0094, + "step": 337650 + }, + { + "epoch": 0.33268838221038277, + "grad_norm": 2.275895357131958, + "learning_rate": 9.992524649835827e-06, + "loss": 3.0255, + "step": 337700 + }, + { + "epoch": 0.3327376401882049, + "grad_norm": 2.317335605621338, + "learning_rate": 9.992515743182167e-06, + "loss": 3.0139, + "step": 337750 + }, + { + "epoch": 0.33278689816602697, + "grad_norm": 2.305271625518799, + "learning_rate": 9.992506831229633e-06, + "loss": 2.9861, + "step": 337800 + }, + { + "epoch": 0.33283615614384904, + "grad_norm": 2.385145664215088, + "learning_rate": 9.992497913978238e-06, + "loss": 2.9699, + "step": 337850 + }, + { + "epoch": 0.33288541412167116, + "grad_norm": 2.1932711601257324, + "learning_rate": 9.992488991427989e-06, + "loss": 2.9783, + "step": 337900 + }, + { + "epoch": 0.33293467209949323, + "grad_norm": 2.3467745780944824, + "learning_rate": 9.992480063578898e-06, + "loss": 2.9499, + "step": 337950 + }, + { + "epoch": 0.3329839300773153, + "grad_norm": 2.316917896270752, + "learning_rate": 9.992471130430975e-06, + "loss": 3.0035, + "step": 338000 + }, + { + "epoch": 0.33303318805513743, + "grad_norm": 2.439969062805176, + "learning_rate": 9.992462191984226e-06, + "loss": 2.9711, + "step": 338050 + }, + { + "epoch": 0.3330824460329595, + "grad_norm": 2.2748117446899414, + "learning_rate": 9.992453248238663e-06, + "loss": 2.9137, + "step": 338100 + }, + { + "epoch": 0.33313170401078157, + "grad_norm": 2.2765555381774902, + "learning_rate": 9.992444299194295e-06, + "loss": 2.9763, + "step": 338150 + }, + { + "epoch": 0.3331809619886037, + "grad_norm": 2.1811654567718506, + "learning_rate": 9.992435344851131e-06, + "loss": 3.0055, + "step": 338200 + }, + { + "epoch": 0.33323021996642577, + "grad_norm": 2.1943769454956055, + "learning_rate": 9.99242638520918e-06, + "loss": 2.9652, + "step": 338250 + }, + { + "epoch": 0.33327947794424784, + "grad_norm": 2.296527862548828, + "learning_rate": 9.992417420268454e-06, 
+ "loss": 3.0374, + "step": 338300 + }, + { + "epoch": 0.33332873592206996, + "grad_norm": 2.221212387084961, + "learning_rate": 9.99240845002896e-06, + "loss": 2.9582, + "step": 338350 + }, + { + "epoch": 0.33337799389989203, + "grad_norm": 2.460963249206543, + "learning_rate": 9.99239947449071e-06, + "loss": 3.0013, + "step": 338400 + }, + { + "epoch": 0.3334272518777141, + "grad_norm": 2.313725233078003, + "learning_rate": 9.992390493653709e-06, + "loss": 2.9468, + "step": 338450 + }, + { + "epoch": 0.33347650985553623, + "grad_norm": 2.2442595958709717, + "learning_rate": 9.992381507517972e-06, + "loss": 3.017, + "step": 338500 + }, + { + "epoch": 0.3335257678333583, + "grad_norm": 2.3330509662628174, + "learning_rate": 9.992372516083505e-06, + "loss": 3.0032, + "step": 338550 + }, + { + "epoch": 0.33357502581118037, + "grad_norm": 2.2215068340301514, + "learning_rate": 9.992363519350318e-06, + "loss": 2.9773, + "step": 338600 + }, + { + "epoch": 0.33362428378900244, + "grad_norm": 2.300535202026367, + "learning_rate": 9.992354517318422e-06, + "loss": 2.9411, + "step": 338650 + }, + { + "epoch": 0.33367354176682457, + "grad_norm": 2.1731655597686768, + "learning_rate": 9.992345509987825e-06, + "loss": 3.0383, + "step": 338700 + }, + { + "epoch": 0.33372279974464664, + "grad_norm": 2.399803400039673, + "learning_rate": 9.992336497358537e-06, + "loss": 2.9773, + "step": 338750 + }, + { + "epoch": 0.3337720577224687, + "grad_norm": 2.5427157878875732, + "learning_rate": 9.992327479430567e-06, + "loss": 2.959, + "step": 338800 + }, + { + "epoch": 0.33382131570029083, + "grad_norm": 2.1830692291259766, + "learning_rate": 9.992318456203927e-06, + "loss": 3.0222, + "step": 338850 + }, + { + "epoch": 0.3338705736781129, + "grad_norm": 2.279541015625, + "learning_rate": 9.992309427678625e-06, + "loss": 3.0038, + "step": 338900 + }, + { + "epoch": 0.333919831655935, + "grad_norm": 2.3429744243621826, + "learning_rate": 9.99230039385467e-06, + "loss": 3.0199, + "step": 
338950 + }, + { + "epoch": 0.3339690896337571, + "grad_norm": 2.258209705352783, + "learning_rate": 9.992291354732072e-06, + "loss": 2.9547, + "step": 339000 + }, + { + "epoch": 0.33401834761157917, + "grad_norm": 2.2841849327087402, + "learning_rate": 9.99228231031084e-06, + "loss": 2.9849, + "step": 339050 + }, + { + "epoch": 0.33406760558940124, + "grad_norm": 2.160092830657959, + "learning_rate": 9.992273260590984e-06, + "loss": 3.0307, + "step": 339100 + }, + { + "epoch": 0.33411686356722337, + "grad_norm": 2.0771827697753906, + "learning_rate": 9.992264205572515e-06, + "loss": 3.0561, + "step": 339150 + }, + { + "epoch": 0.33416612154504544, + "grad_norm": 2.414069414138794, + "learning_rate": 9.992255145255442e-06, + "loss": 2.9715, + "step": 339200 + }, + { + "epoch": 0.3342153795228675, + "grad_norm": 2.228687047958374, + "learning_rate": 9.992246079639772e-06, + "loss": 2.9653, + "step": 339250 + }, + { + "epoch": 0.33426463750068963, + "grad_norm": 2.508601188659668, + "learning_rate": 9.992237008725517e-06, + "loss": 2.9967, + "step": 339300 + }, + { + "epoch": 0.3343138954785117, + "grad_norm": 2.3654191493988037, + "learning_rate": 9.992227932512688e-06, + "loss": 2.9933, + "step": 339350 + }, + { + "epoch": 0.3343631534563338, + "grad_norm": 2.3782734870910645, + "learning_rate": 9.992218851001293e-06, + "loss": 3.0212, + "step": 339400 + }, + { + "epoch": 0.3344124114341559, + "grad_norm": 2.282581090927124, + "learning_rate": 9.992209764191341e-06, + "loss": 3.0301, + "step": 339450 + }, + { + "epoch": 0.33446166941197797, + "grad_norm": 2.1989800930023193, + "learning_rate": 9.992200672082841e-06, + "loss": 3.0548, + "step": 339500 + }, + { + "epoch": 0.33451092738980004, + "grad_norm": 2.2526304721832275, + "learning_rate": 9.992191574675806e-06, + "loss": 2.9066, + "step": 339550 + }, + { + "epoch": 0.33456018536762216, + "grad_norm": 2.2149620056152344, + "learning_rate": 9.992182471970243e-06, + "loss": 2.9809, + "step": 339600 + }, + { + 
"epoch": 0.33460944334544424, + "grad_norm": 2.231394052505493, + "learning_rate": 9.992173363966161e-06, + "loss": 2.9904, + "step": 339650 + }, + { + "epoch": 0.3346587013232663, + "grad_norm": 2.4639899730682373, + "learning_rate": 9.992164250663574e-06, + "loss": 2.9557, + "step": 339700 + }, + { + "epoch": 0.33470795930108843, + "grad_norm": 2.4430172443389893, + "learning_rate": 9.992155132062485e-06, + "loss": 2.9861, + "step": 339750 + }, + { + "epoch": 0.3347572172789105, + "grad_norm": 2.40785813331604, + "learning_rate": 9.992146008162909e-06, + "loss": 2.9839, + "step": 339800 + }, + { + "epoch": 0.33480647525673257, + "grad_norm": 2.2311995029449463, + "learning_rate": 9.992136878964853e-06, + "loss": 2.9751, + "step": 339850 + }, + { + "epoch": 0.33485573323455464, + "grad_norm": 2.3457515239715576, + "learning_rate": 9.992127744468329e-06, + "loss": 3.0134, + "step": 339900 + }, + { + "epoch": 0.33490499121237677, + "grad_norm": 2.1657660007476807, + "learning_rate": 9.992118604673343e-06, + "loss": 3.028, + "step": 339950 + }, + { + "epoch": 0.33495424919019884, + "grad_norm": 2.1902859210968018, + "learning_rate": 9.99210945957991e-06, + "loss": 3.0152, + "step": 340000 + }, + { + "epoch": 0.3350035071680209, + "grad_norm": 2.5864670276641846, + "learning_rate": 9.992100309188034e-06, + "loss": 2.9547, + "step": 340050 + }, + { + "epoch": 0.33505276514584303, + "grad_norm": 2.47171688079834, + "learning_rate": 9.99209115349773e-06, + "loss": 2.9517, + "step": 340100 + }, + { + "epoch": 0.3351020231236651, + "grad_norm": 2.3506343364715576, + "learning_rate": 9.992081992509003e-06, + "loss": 3.0074, + "step": 340150 + }, + { + "epoch": 0.3351512811014872, + "grad_norm": 2.48714280128479, + "learning_rate": 9.992072826221865e-06, + "loss": 3.0341, + "step": 340200 + }, + { + "epoch": 0.3352005390793093, + "grad_norm": 2.6056833267211914, + "learning_rate": 9.992063654636328e-06, + "loss": 3.0036, + "step": 340250 + }, + { + "epoch": 
0.33524979705713137, + "grad_norm": 2.325521469116211, + "learning_rate": 9.992054477752396e-06, + "loss": 2.8992, + "step": 340300 + }, + { + "epoch": 0.33529905503495344, + "grad_norm": 2.306194305419922, + "learning_rate": 9.992045295570084e-06, + "loss": 2.9489, + "step": 340350 + }, + { + "epoch": 0.33534831301277557, + "grad_norm": 2.2836101055145264, + "learning_rate": 9.992036108089399e-06, + "loss": 2.9678, + "step": 340400 + }, + { + "epoch": 0.33539757099059764, + "grad_norm": 2.2939021587371826, + "learning_rate": 9.992026915310353e-06, + "loss": 3.0101, + "step": 340450 + }, + { + "epoch": 0.3354468289684197, + "grad_norm": 2.3505148887634277, + "learning_rate": 9.992017717232953e-06, + "loss": 2.9949, + "step": 340500 + }, + { + "epoch": 0.33549608694624183, + "grad_norm": 2.2477991580963135, + "learning_rate": 9.99200851385721e-06, + "loss": 3.0493, + "step": 340550 + }, + { + "epoch": 0.3355453449240639, + "grad_norm": 2.4496474266052246, + "learning_rate": 9.991999305183136e-06, + "loss": 2.9898, + "step": 340600 + }, + { + "epoch": 0.335594602901886, + "grad_norm": 2.573664665222168, + "learning_rate": 9.991990091210737e-06, + "loss": 2.9632, + "step": 340650 + }, + { + "epoch": 0.3356438608797081, + "grad_norm": 2.534057855606079, + "learning_rate": 9.991980871940024e-06, + "loss": 3.0137, + "step": 340700 + }, + { + "epoch": 0.33569311885753017, + "grad_norm": 2.2583706378936768, + "learning_rate": 9.991971647371008e-06, + "loss": 2.9227, + "step": 340750 + }, + { + "epoch": 0.33574237683535224, + "grad_norm": 2.1019530296325684, + "learning_rate": 9.991962417503698e-06, + "loss": 3.0158, + "step": 340800 + }, + { + "epoch": 0.33579163481317437, + "grad_norm": 2.041778564453125, + "learning_rate": 9.991953182338103e-06, + "loss": 2.9953, + "step": 340850 + }, + { + "epoch": 0.33584089279099644, + "grad_norm": 2.1527388095855713, + "learning_rate": 9.991943941874235e-06, + "loss": 2.9835, + "step": 340900 + }, + { + "epoch": 0.3358901507688185, + 
"grad_norm": 2.728191614151001, + "learning_rate": 9.9919346961121e-06, + "loss": 3.038, + "step": 340950 + }, + { + "epoch": 0.33593940874664063, + "grad_norm": 2.285217523574829, + "learning_rate": 9.991925445051712e-06, + "loss": 3.0423, + "step": 341000 + }, + { + "epoch": 0.3359886667244627, + "grad_norm": 2.379448652267456, + "learning_rate": 9.991916188693078e-06, + "loss": 2.938, + "step": 341050 + }, + { + "epoch": 0.3360379247022848, + "grad_norm": 2.3929026126861572, + "learning_rate": 9.99190692703621e-06, + "loss": 2.9996, + "step": 341100 + }, + { + "epoch": 0.33608718268010684, + "grad_norm": 2.437197685241699, + "learning_rate": 9.991897660081117e-06, + "loss": 2.9674, + "step": 341150 + }, + { + "epoch": 0.33613644065792897, + "grad_norm": 2.381213426589966, + "learning_rate": 9.991888387827808e-06, + "loss": 2.9947, + "step": 341200 + }, + { + "epoch": 0.33618569863575104, + "grad_norm": 2.200547933578491, + "learning_rate": 9.991879110276293e-06, + "loss": 2.9466, + "step": 341250 + }, + { + "epoch": 0.3362349566135731, + "grad_norm": 2.6876845359802246, + "learning_rate": 9.991869827426583e-06, + "loss": 2.9949, + "step": 341300 + }, + { + "epoch": 0.33628421459139524, + "grad_norm": 2.347085952758789, + "learning_rate": 9.991860539278685e-06, + "loss": 2.9573, + "step": 341350 + }, + { + "epoch": 0.3363334725692173, + "grad_norm": 2.265566349029541, + "learning_rate": 9.991851245832613e-06, + "loss": 2.9714, + "step": 341400 + }, + { + "epoch": 0.3363827305470394, + "grad_norm": 2.13972544670105, + "learning_rate": 9.991841947088372e-06, + "loss": 2.896, + "step": 341450 + }, + { + "epoch": 0.3364319885248615, + "grad_norm": 2.360980749130249, + "learning_rate": 9.991832643045978e-06, + "loss": 2.9817, + "step": 341500 + }, + { + "epoch": 0.3364812465026836, + "grad_norm": 2.1735458374023438, + "learning_rate": 9.991823333705437e-06, + "loss": 2.907, + "step": 341550 + }, + { + "epoch": 0.33653050448050564, + "grad_norm": 2.348052740097046, + 
"learning_rate": 9.991814019066759e-06, + "loss": 3.0406, + "step": 341600 + }, + { + "epoch": 0.33657976245832777, + "grad_norm": 2.6593024730682373, + "learning_rate": 9.991804699129953e-06, + "loss": 3.0218, + "step": 341650 + }, + { + "epoch": 0.33662902043614984, + "grad_norm": 2.3228237628936768, + "learning_rate": 9.991795373895032e-06, + "loss": 3.0031, + "step": 341700 + }, + { + "epoch": 0.3366782784139719, + "grad_norm": 2.2855095863342285, + "learning_rate": 9.991786043362003e-06, + "loss": 2.9777, + "step": 341750 + }, + { + "epoch": 0.33672753639179404, + "grad_norm": 2.4082634449005127, + "learning_rate": 9.991776707530877e-06, + "loss": 2.94, + "step": 341800 + }, + { + "epoch": 0.3367767943696161, + "grad_norm": 2.3405725955963135, + "learning_rate": 9.991767366401665e-06, + "loss": 2.9253, + "step": 341850 + }, + { + "epoch": 0.3368260523474382, + "grad_norm": 2.2966246604919434, + "learning_rate": 9.991758019974375e-06, + "loss": 3.0294, + "step": 341900 + }, + { + "epoch": 0.3368753103252603, + "grad_norm": 2.2345314025878906, + "learning_rate": 9.991748668249016e-06, + "loss": 2.9859, + "step": 341950 + }, + { + "epoch": 0.3369245683030824, + "grad_norm": 2.359309196472168, + "learning_rate": 9.991739311225602e-06, + "loss": 3.0152, + "step": 342000 + }, + { + "epoch": 0.33697382628090444, + "grad_norm": 2.2795088291168213, + "learning_rate": 9.99172994890414e-06, + "loss": 3.0463, + "step": 342050 + }, + { + "epoch": 0.33702308425872657, + "grad_norm": 2.2686636447906494, + "learning_rate": 9.99172058128464e-06, + "loss": 3.0019, + "step": 342100 + }, + { + "epoch": 0.33707234223654864, + "grad_norm": 2.2810771465301514, + "learning_rate": 9.991711208367113e-06, + "loss": 2.9185, + "step": 342150 + }, + { + "epoch": 0.3371216002143707, + "grad_norm": 2.1937739849090576, + "learning_rate": 9.99170183015157e-06, + "loss": 2.9691, + "step": 342200 + }, + { + "epoch": 0.33717085819219283, + "grad_norm": 2.1052417755126953, + "learning_rate": 
9.991692446638016e-06, + "loss": 2.9574, + "step": 342250 + }, + { + "epoch": 0.3372201161700149, + "grad_norm": 2.460070848464966, + "learning_rate": 9.991683057826466e-06, + "loss": 2.984, + "step": 342300 + }, + { + "epoch": 0.337269374147837, + "grad_norm": 2.656853675842285, + "learning_rate": 9.991673663716929e-06, + "loss": 2.9973, + "step": 342350 + }, + { + "epoch": 0.33731863212565905, + "grad_norm": 2.3854920864105225, + "learning_rate": 9.991664264309414e-06, + "loss": 3.05, + "step": 342400 + }, + { + "epoch": 0.33736789010348117, + "grad_norm": 2.235658884048462, + "learning_rate": 9.991654859603931e-06, + "loss": 2.9794, + "step": 342450 + }, + { + "epoch": 0.33741714808130324, + "grad_norm": 2.260119676589966, + "learning_rate": 9.99164544960049e-06, + "loss": 3.028, + "step": 342500 + }, + { + "epoch": 0.3374664060591253, + "grad_norm": 2.274001121520996, + "learning_rate": 9.991636034299101e-06, + "loss": 3.0098, + "step": 342550 + }, + { + "epoch": 0.33751566403694744, + "grad_norm": 2.380709171295166, + "learning_rate": 9.991626613699774e-06, + "loss": 2.9799, + "step": 342600 + }, + { + "epoch": 0.3375649220147695, + "grad_norm": 2.1756722927093506, + "learning_rate": 9.991617187802521e-06, + "loss": 3.0065, + "step": 342650 + }, + { + "epoch": 0.3376141799925916, + "grad_norm": 2.6460258960723877, + "learning_rate": 9.991607756607349e-06, + "loss": 2.9689, + "step": 342700 + }, + { + "epoch": 0.3376634379704137, + "grad_norm": 2.381195068359375, + "learning_rate": 9.991598320114268e-06, + "loss": 3.0193, + "step": 342750 + }, + { + "epoch": 0.3377126959482358, + "grad_norm": 2.544562339782715, + "learning_rate": 9.991588878323291e-06, + "loss": 3.0257, + "step": 342800 + }, + { + "epoch": 0.33776195392605785, + "grad_norm": 2.401815414428711, + "learning_rate": 9.991579431234426e-06, + "loss": 2.9748, + "step": 342850 + }, + { + "epoch": 0.33781121190387997, + "grad_norm": 2.247227907180786, + "learning_rate": 9.991569978847682e-06, + "loss": 
2.9835, + "step": 342900 + }, + { + "epoch": 0.33786046988170204, + "grad_norm": 2.6301238536834717, + "learning_rate": 9.991560521163072e-06, + "loss": 3.0296, + "step": 342950 + }, + { + "epoch": 0.3379097278595241, + "grad_norm": 2.393620729446411, + "learning_rate": 9.991551058180603e-06, + "loss": 2.9969, + "step": 343000 + }, + { + "epoch": 0.33795898583734624, + "grad_norm": 2.2301530838012695, + "learning_rate": 9.991541589900287e-06, + "loss": 3.0286, + "step": 343050 + }, + { + "epoch": 0.3380082438151683, + "grad_norm": 2.2680397033691406, + "learning_rate": 9.991532116322133e-06, + "loss": 2.9995, + "step": 343100 + }, + { + "epoch": 0.3380575017929904, + "grad_norm": 2.36830472946167, + "learning_rate": 9.991522637446152e-06, + "loss": 2.9533, + "step": 343150 + }, + { + "epoch": 0.3381067597708125, + "grad_norm": 2.1988606452941895, + "learning_rate": 9.991513153272355e-06, + "loss": 3.0933, + "step": 343200 + }, + { + "epoch": 0.3381560177486346, + "grad_norm": 2.226529836654663, + "learning_rate": 9.991503663800748e-06, + "loss": 3.0916, + "step": 343250 + }, + { + "epoch": 0.33820527572645664, + "grad_norm": 2.073350191116333, + "learning_rate": 9.991494169031345e-06, + "loss": 2.9887, + "step": 343300 + }, + { + "epoch": 0.33825453370427877, + "grad_norm": 2.2130415439605713, + "learning_rate": 9.991484668964154e-06, + "loss": 2.972, + "step": 343350 + }, + { + "epoch": 0.33830379168210084, + "grad_norm": 2.3737876415252686, + "learning_rate": 9.991475163599185e-06, + "loss": 2.9696, + "step": 343400 + }, + { + "epoch": 0.3383530496599229, + "grad_norm": 2.4101626873016357, + "learning_rate": 9.991465652936452e-06, + "loss": 2.986, + "step": 343450 + }, + { + "epoch": 0.33840230763774504, + "grad_norm": 2.3866686820983887, + "learning_rate": 9.991456136975962e-06, + "loss": 2.9786, + "step": 343500 + }, + { + "epoch": 0.3384515656155671, + "grad_norm": 2.165193557739258, + "learning_rate": 9.99144661571772e-06, + "loss": 3.0427, + "step": 343550 + 
}, + { + "epoch": 0.3385008235933892, + "grad_norm": 2.1200273036956787, + "learning_rate": 9.991437089161746e-06, + "loss": 2.9087, + "step": 343600 + }, + { + "epoch": 0.33855008157121125, + "grad_norm": 2.2995500564575195, + "learning_rate": 9.991427557308044e-06, + "loss": 3.0072, + "step": 343650 + }, + { + "epoch": 0.3385993395490334, + "grad_norm": 2.128190517425537, + "learning_rate": 9.991418020156626e-06, + "loss": 2.9251, + "step": 343700 + }, + { + "epoch": 0.33864859752685544, + "grad_norm": 2.2819156646728516, + "learning_rate": 9.991408477707501e-06, + "loss": 2.9678, + "step": 343750 + }, + { + "epoch": 0.3386978555046775, + "grad_norm": 2.254160165786743, + "learning_rate": 9.99139892996068e-06, + "loss": 2.9411, + "step": 343800 + }, + { + "epoch": 0.33874711348249964, + "grad_norm": 2.727442741394043, + "learning_rate": 9.991389376916174e-06, + "loss": 3.0159, + "step": 343850 + }, + { + "epoch": 0.3387963714603217, + "grad_norm": 2.2122743129730225, + "learning_rate": 9.99137981857399e-06, + "loss": 2.9543, + "step": 343900 + }, + { + "epoch": 0.3388456294381438, + "grad_norm": 2.5268361568450928, + "learning_rate": 9.99137025493414e-06, + "loss": 2.9789, + "step": 343950 + }, + { + "epoch": 0.3388948874159659, + "grad_norm": 2.298715829849243, + "learning_rate": 9.991360685996635e-06, + "loss": 2.9972, + "step": 344000 + }, + { + "epoch": 0.338944145393788, + "grad_norm": 3.052751302719116, + "learning_rate": 9.991351111761486e-06, + "loss": 2.9869, + "step": 344050 + }, + { + "epoch": 0.33899340337161005, + "grad_norm": 2.4139404296875, + "learning_rate": 9.991341532228701e-06, + "loss": 2.9462, + "step": 344100 + }, + { + "epoch": 0.3390426613494322, + "grad_norm": 2.348548412322998, + "learning_rate": 9.99133194739829e-06, + "loss": 3.0253, + "step": 344150 + }, + { + "epoch": 0.33909191932725424, + "grad_norm": 2.281170606613159, + "learning_rate": 9.991322357270263e-06, + "loss": 2.9611, + "step": 344200 + }, + { + "epoch": 
0.3391411773050763, + "grad_norm": 2.292881727218628, + "learning_rate": 9.991312761844633e-06, + "loss": 3.021, + "step": 344250 + }, + { + "epoch": 0.33919043528289844, + "grad_norm": 2.2033727169036865, + "learning_rate": 9.991303161121406e-06, + "loss": 2.9998, + "step": 344300 + }, + { + "epoch": 0.3392396932607205, + "grad_norm": 2.2920420169830322, + "learning_rate": 9.991293555100597e-06, + "loss": 2.9707, + "step": 344350 + }, + { + "epoch": 0.3392889512385426, + "grad_norm": 2.1283485889434814, + "learning_rate": 9.991283943782213e-06, + "loss": 2.9976, + "step": 344400 + }, + { + "epoch": 0.3393382092163647, + "grad_norm": 2.3616037368774414, + "learning_rate": 9.991274327166263e-06, + "loss": 3.0106, + "step": 344450 + }, + { + "epoch": 0.3393874671941868, + "grad_norm": 2.487152338027954, + "learning_rate": 9.991264705252761e-06, + "loss": 2.9713, + "step": 344500 + }, + { + "epoch": 0.33943672517200885, + "grad_norm": 2.0717523097991943, + "learning_rate": 9.991255078041715e-06, + "loss": 2.9369, + "step": 344550 + }, + { + "epoch": 0.33948598314983097, + "grad_norm": 2.4102656841278076, + "learning_rate": 9.991245445533137e-06, + "loss": 3.0063, + "step": 344600 + }, + { + "epoch": 0.33953524112765304, + "grad_norm": 2.2053208351135254, + "learning_rate": 9.991235807727033e-06, + "loss": 3.012, + "step": 344650 + }, + { + "epoch": 0.3395844991054751, + "grad_norm": 2.2586183547973633, + "learning_rate": 9.991226164623418e-06, + "loss": 2.9085, + "step": 344700 + }, + { + "epoch": 0.33963375708329724, + "grad_norm": 2.250734567642212, + "learning_rate": 9.991216516222299e-06, + "loss": 3.0182, + "step": 344750 + }, + { + "epoch": 0.3396830150611193, + "grad_norm": 2.422473430633545, + "learning_rate": 9.991206862523687e-06, + "loss": 3.0291, + "step": 344800 + }, + { + "epoch": 0.3397322730389414, + "grad_norm": 2.2623679637908936, + "learning_rate": 9.991197203527594e-06, + "loss": 2.9889, + "step": 344850 + }, + { + "epoch": 0.33978153101676345, + 
"grad_norm": 2.159167528152466, + "learning_rate": 9.99118753923403e-06, + "loss": 3.0104, + "step": 344900 + }, + { + "epoch": 0.3398307889945856, + "grad_norm": 2.2729225158691406, + "learning_rate": 9.991177869643002e-06, + "loss": 2.9349, + "step": 344950 + }, + { + "epoch": 0.33988004697240765, + "grad_norm": 2.531769037246704, + "learning_rate": 9.991168194754522e-06, + "loss": 3.0387, + "step": 345000 + }, + { + "epoch": 0.3399293049502297, + "grad_norm": 2.2669620513916016, + "learning_rate": 9.991158514568604e-06, + "loss": 2.9609, + "step": 345050 + }, + { + "epoch": 0.33997856292805184, + "grad_norm": 2.2080652713775635, + "learning_rate": 9.991148829085252e-06, + "loss": 2.8926, + "step": 345100 + }, + { + "epoch": 0.3400278209058739, + "grad_norm": 2.1584978103637695, + "learning_rate": 9.99113913830448e-06, + "loss": 2.9824, + "step": 345150 + }, + { + "epoch": 0.340077078883696, + "grad_norm": 2.4344892501831055, + "learning_rate": 9.991129442226299e-06, + "loss": 2.924, + "step": 345200 + }, + { + "epoch": 0.3401263368615181, + "grad_norm": 2.0717360973358154, + "learning_rate": 9.991119740850717e-06, + "loss": 2.9767, + "step": 345250 + }, + { + "epoch": 0.3401755948393402, + "grad_norm": 2.165419340133667, + "learning_rate": 9.991110034177744e-06, + "loss": 3.0201, + "step": 345300 + }, + { + "epoch": 0.34022485281716225, + "grad_norm": 2.2159440517425537, + "learning_rate": 9.991100322207393e-06, + "loss": 2.9541, + "step": 345350 + }, + { + "epoch": 0.3402741107949844, + "grad_norm": 2.2055206298828125, + "learning_rate": 9.991090604939673e-06, + "loss": 2.9292, + "step": 345400 + }, + { + "epoch": 0.34032336877280644, + "grad_norm": 2.1135871410369873, + "learning_rate": 9.991080882374594e-06, + "loss": 2.9486, + "step": 345450 + }, + { + "epoch": 0.3403726267506285, + "grad_norm": 2.3916714191436768, + "learning_rate": 9.991071154512165e-06, + "loss": 2.9154, + "step": 345500 + }, + { + "epoch": 0.34042188472845064, + "grad_norm": 
2.391805410385132, + "learning_rate": 9.9910614213524e-06, + "loss": 3.0383, + "step": 345550 + }, + { + "epoch": 0.3404711427062727, + "grad_norm": 2.3449625968933105, + "learning_rate": 9.991051682895303e-06, + "loss": 3.0472, + "step": 345600 + }, + { + "epoch": 0.3405204006840948, + "grad_norm": 2.4201056957244873, + "learning_rate": 9.991041939140892e-06, + "loss": 2.9282, + "step": 345650 + }, + { + "epoch": 0.3405696586619169, + "grad_norm": 2.3704781532287598, + "learning_rate": 9.991032190089173e-06, + "loss": 2.9662, + "step": 345700 + }, + { + "epoch": 0.340618916639739, + "grad_norm": 2.3098537921905518, + "learning_rate": 9.991022435740158e-06, + "loss": 2.9859, + "step": 345750 + }, + { + "epoch": 0.34066817461756105, + "grad_norm": 2.3515007495880127, + "learning_rate": 9.991012676093854e-06, + "loss": 2.9755, + "step": 345800 + }, + { + "epoch": 0.3407174325953832, + "grad_norm": 2.5803990364074707, + "learning_rate": 9.991002911150277e-06, + "loss": 2.9669, + "step": 345850 + }, + { + "epoch": 0.34076669057320524, + "grad_norm": 2.27885365486145, + "learning_rate": 9.990993140909432e-06, + "loss": 2.9829, + "step": 345900 + }, + { + "epoch": 0.3408159485510273, + "grad_norm": 2.536454677581787, + "learning_rate": 9.990983365371331e-06, + "loss": 3.0492, + "step": 345950 + }, + { + "epoch": 0.3408652065288494, + "grad_norm": 2.3045566082000732, + "learning_rate": 9.990973584535988e-06, + "loss": 2.9135, + "step": 346000 + }, + { + "epoch": 0.3409144645066715, + "grad_norm": 2.2855710983276367, + "learning_rate": 9.990963798403407e-06, + "loss": 2.9932, + "step": 346050 + }, + { + "epoch": 0.3409637224844936, + "grad_norm": 2.2748939990997314, + "learning_rate": 9.990954006973602e-06, + "loss": 3.0366, + "step": 346100 + }, + { + "epoch": 0.34101298046231565, + "grad_norm": 2.356858968734741, + "learning_rate": 9.990944210246584e-06, + "loss": 2.9417, + "step": 346150 + }, + { + "epoch": 0.3410622384401378, + "grad_norm": 2.4118850231170654, + 
"learning_rate": 9.990934408222363e-06, + "loss": 2.9645, + "step": 346200 + }, + { + "epoch": 0.34111149641795985, + "grad_norm": 2.181513786315918, + "learning_rate": 9.990924600900948e-06, + "loss": 2.9969, + "step": 346250 + }, + { + "epoch": 0.3411607543957819, + "grad_norm": 2.583486318588257, + "learning_rate": 9.99091478828235e-06, + "loss": 2.9521, + "step": 346300 + }, + { + "epoch": 0.34121001237360404, + "grad_norm": 2.0625128746032715, + "learning_rate": 9.990904970366581e-06, + "loss": 2.9324, + "step": 346350 + }, + { + "epoch": 0.3412592703514261, + "grad_norm": 2.168313980102539, + "learning_rate": 9.99089514715365e-06, + "loss": 2.9682, + "step": 346400 + }, + { + "epoch": 0.3413085283292482, + "grad_norm": 2.3183281421661377, + "learning_rate": 9.990885318643568e-06, + "loss": 2.9616, + "step": 346450 + }, + { + "epoch": 0.3413577863070703, + "grad_norm": 2.278223991394043, + "learning_rate": 9.990875484836342e-06, + "loss": 2.9376, + "step": 346500 + }, + { + "epoch": 0.3414070442848924, + "grad_norm": 2.4263968467712402, + "learning_rate": 9.990865645731988e-06, + "loss": 2.9985, + "step": 346550 + }, + { + "epoch": 0.34145630226271445, + "grad_norm": 2.1062047481536865, + "learning_rate": 9.990855801330514e-06, + "loss": 2.9926, + "step": 346600 + }, + { + "epoch": 0.3415055602405366, + "grad_norm": 2.567962169647217, + "learning_rate": 9.99084595163193e-06, + "loss": 2.917, + "step": 346650 + }, + { + "epoch": 0.34155481821835865, + "grad_norm": 2.2755391597747803, + "learning_rate": 9.990836096636247e-06, + "loss": 2.9384, + "step": 346700 + }, + { + "epoch": 0.3416040761961807, + "grad_norm": 2.205108404159546, + "learning_rate": 9.990826236343474e-06, + "loss": 2.9385, + "step": 346750 + }, + { + "epoch": 0.34165333417400284, + "grad_norm": 2.5039007663726807, + "learning_rate": 9.990816370753622e-06, + "loss": 2.975, + "step": 346800 + }, + { + "epoch": 0.3417025921518249, + "grad_norm": 2.2418606281280518, + "learning_rate": 
9.990806499866705e-06, + "loss": 2.9599, + "step": 346850 + }, + { + "epoch": 0.341751850129647, + "grad_norm": 2.300130605697632, + "learning_rate": 9.990796623682729e-06, + "loss": 2.9688, + "step": 346900 + }, + { + "epoch": 0.3418011081074691, + "grad_norm": 2.713865041732788, + "learning_rate": 9.990786742201706e-06, + "loss": 2.9462, + "step": 346950 + }, + { + "epoch": 0.3418503660852912, + "grad_norm": 2.27046275138855, + "learning_rate": 9.990776855423646e-06, + "loss": 2.9429, + "step": 347000 + }, + { + "epoch": 0.34189962406311325, + "grad_norm": 2.3259341716766357, + "learning_rate": 9.990766963348562e-06, + "loss": 2.996, + "step": 347050 + }, + { + "epoch": 0.3419488820409354, + "grad_norm": 2.387178897857666, + "learning_rate": 9.990757065976462e-06, + "loss": 2.9538, + "step": 347100 + }, + { + "epoch": 0.34199814001875745, + "grad_norm": 2.416694164276123, + "learning_rate": 9.990747163307355e-06, + "loss": 2.9741, + "step": 347150 + }, + { + "epoch": 0.3420473979965795, + "grad_norm": 2.373018980026245, + "learning_rate": 9.990737255341257e-06, + "loss": 2.969, + "step": 347200 + }, + { + "epoch": 0.3420966559744016, + "grad_norm": 2.6071646213531494, + "learning_rate": 9.990727342078174e-06, + "loss": 2.9791, + "step": 347250 + }, + { + "epoch": 0.3421459139522237, + "grad_norm": 2.2662038803100586, + "learning_rate": 9.990717423518116e-06, + "loss": 2.9652, + "step": 347300 + }, + { + "epoch": 0.3421951719300458, + "grad_norm": 2.372051954269409, + "learning_rate": 9.990707499661097e-06, + "loss": 2.9571, + "step": 347350 + }, + { + "epoch": 0.34224442990786785, + "grad_norm": 2.258258104324341, + "learning_rate": 9.990697570507125e-06, + "loss": 2.9576, + "step": 347400 + }, + { + "epoch": 0.34229368788569, + "grad_norm": 2.3307583332061768, + "learning_rate": 9.99068763605621e-06, + "loss": 2.9759, + "step": 347450 + }, + { + "epoch": 0.34234294586351205, + "grad_norm": 2.2210052013397217, + "learning_rate": 9.990677696308365e-06, + "loss": 
2.9892, + "step": 347500 + }, + { + "epoch": 0.3423922038413341, + "grad_norm": 2.3302032947540283, + "learning_rate": 9.990667751263601e-06, + "loss": 2.9552, + "step": 347550 + }, + { + "epoch": 0.34244146181915625, + "grad_norm": 2.4955410957336426, + "learning_rate": 9.990657800921925e-06, + "loss": 3.0297, + "step": 347600 + }, + { + "epoch": 0.3424907197969783, + "grad_norm": 2.330674409866333, + "learning_rate": 9.990647845283349e-06, + "loss": 2.9302, + "step": 347650 + }, + { + "epoch": 0.3425399777748004, + "grad_norm": 2.3072350025177, + "learning_rate": 9.990637884347886e-06, + "loss": 2.9392, + "step": 347700 + }, + { + "epoch": 0.3425892357526225, + "grad_norm": 2.4104738235473633, + "learning_rate": 9.990627918115543e-06, + "loss": 2.9468, + "step": 347750 + }, + { + "epoch": 0.3426384937304446, + "grad_norm": 2.2761361598968506, + "learning_rate": 9.990617946586333e-06, + "loss": 2.9145, + "step": 347800 + }, + { + "epoch": 0.34268775170826665, + "grad_norm": 2.0653271675109863, + "learning_rate": 9.990607969760266e-06, + "loss": 3.0178, + "step": 347850 + }, + { + "epoch": 0.3427370096860888, + "grad_norm": 2.2013192176818848, + "learning_rate": 9.990597987637351e-06, + "loss": 2.9825, + "step": 347900 + }, + { + "epoch": 0.34278626766391085, + "grad_norm": 2.375685691833496, + "learning_rate": 9.9905880002176e-06, + "loss": 2.9935, + "step": 347950 + }, + { + "epoch": 0.3428355256417329, + "grad_norm": 2.183912515640259, + "learning_rate": 9.990578007501026e-06, + "loss": 2.9071, + "step": 348000 + }, + { + "epoch": 0.34288478361955504, + "grad_norm": 2.5142245292663574, + "learning_rate": 9.990568009487634e-06, + "loss": 2.9355, + "step": 348050 + }, + { + "epoch": 0.3429340415973771, + "grad_norm": 2.176879405975342, + "learning_rate": 9.990558006177439e-06, + "loss": 3.0129, + "step": 348100 + }, + { + "epoch": 0.3429832995751992, + "grad_norm": 2.2777137756347656, + "learning_rate": 9.99054799757045e-06, + "loss": 2.9364, + "step": 348150 + }, 
+ { + "epoch": 0.3430325575530213, + "grad_norm": 2.2690072059631348, + "learning_rate": 9.99053798366668e-06, + "loss": 2.922, + "step": 348200 + }, + { + "epoch": 0.3430818155308434, + "grad_norm": 2.2960798740386963, + "learning_rate": 9.990527964466136e-06, + "loss": 2.9948, + "step": 348250 + }, + { + "epoch": 0.34313107350866545, + "grad_norm": 2.2886600494384766, + "learning_rate": 9.99051793996883e-06, + "loss": 2.9709, + "step": 348300 + }, + { + "epoch": 0.3431803314864876, + "grad_norm": 2.4524452686309814, + "learning_rate": 9.990507910174774e-06, + "loss": 3.0488, + "step": 348350 + }, + { + "epoch": 0.34322958946430965, + "grad_norm": 2.2020339965820312, + "learning_rate": 9.990497875083977e-06, + "loss": 2.9439, + "step": 348400 + }, + { + "epoch": 0.3432788474421317, + "grad_norm": 2.388333797454834, + "learning_rate": 9.990487834696449e-06, + "loss": 2.9929, + "step": 348450 + }, + { + "epoch": 0.3433281054199538, + "grad_norm": 2.8616886138916016, + "learning_rate": 9.990477789012204e-06, + "loss": 2.9613, + "step": 348500 + }, + { + "epoch": 0.3433773633977759, + "grad_norm": 2.3279268741607666, + "learning_rate": 9.990467738031249e-06, + "loss": 2.9514, + "step": 348550 + }, + { + "epoch": 0.343426621375598, + "grad_norm": 2.2998714447021484, + "learning_rate": 9.990457681753598e-06, + "loss": 2.947, + "step": 348600 + }, + { + "epoch": 0.34347587935342005, + "grad_norm": 2.3863558769226074, + "learning_rate": 9.990447620179259e-06, + "loss": 2.9124, + "step": 348650 + }, + { + "epoch": 0.3435251373312422, + "grad_norm": 2.3414969444274902, + "learning_rate": 9.990437553308241e-06, + "loss": 2.9197, + "step": 348700 + }, + { + "epoch": 0.34357439530906425, + "grad_norm": 2.135282039642334, + "learning_rate": 9.990427481140561e-06, + "loss": 2.947, + "step": 348750 + }, + { + "epoch": 0.3436236532868863, + "grad_norm": 2.227498769760132, + "learning_rate": 9.990417403676224e-06, + "loss": 2.9427, + "step": 348800 + }, + { + "epoch": 
0.34367291126470845, + "grad_norm": 2.262646436691284, + "learning_rate": 9.990407320915243e-06, + "loss": 2.9491, + "step": 348850 + }, + { + "epoch": 0.3437221692425305, + "grad_norm": 2.4980483055114746, + "learning_rate": 9.990397232857628e-06, + "loss": 2.9484, + "step": 348900 + }, + { + "epoch": 0.3437714272203526, + "grad_norm": 2.377901315689087, + "learning_rate": 9.99038713950339e-06, + "loss": 2.9863, + "step": 348950 + }, + { + "epoch": 0.3438206851981747, + "grad_norm": 2.206224203109741, + "learning_rate": 9.99037704085254e-06, + "loss": 2.971, + "step": 349000 + }, + { + "epoch": 0.3438699431759968, + "grad_norm": 2.311054229736328, + "learning_rate": 9.99036693690509e-06, + "loss": 2.9825, + "step": 349050 + }, + { + "epoch": 0.34391920115381885, + "grad_norm": 2.1996545791625977, + "learning_rate": 9.990356827661046e-06, + "loss": 2.9799, + "step": 349100 + }, + { + "epoch": 0.343968459131641, + "grad_norm": 2.248378276824951, + "learning_rate": 9.990346713120425e-06, + "loss": 3.0003, + "step": 349150 + }, + { + "epoch": 0.34401771710946305, + "grad_norm": 2.123173475265503, + "learning_rate": 9.990336593283233e-06, + "loss": 2.9852, + "step": 349200 + }, + { + "epoch": 0.3440669750872851, + "grad_norm": 2.492102861404419, + "learning_rate": 9.990326468149483e-06, + "loss": 3.0217, + "step": 349250 + }, + { + "epoch": 0.34411623306510725, + "grad_norm": 2.264329671859741, + "learning_rate": 9.990316337719184e-06, + "loss": 2.999, + "step": 349300 + }, + { + "epoch": 0.3441654910429293, + "grad_norm": 2.2310173511505127, + "learning_rate": 9.990306201992348e-06, + "loss": 2.9608, + "step": 349350 + }, + { + "epoch": 0.3442147490207514, + "grad_norm": 2.173473596572876, + "learning_rate": 9.990296060968988e-06, + "loss": 3.0081, + "step": 349400 + }, + { + "epoch": 0.3442640069985735, + "grad_norm": 2.2602334022521973, + "learning_rate": 9.99028591464911e-06, + "loss": 2.9541, + "step": 349450 + }, + { + "epoch": 0.3443132649763956, + "grad_norm": 
2.287379741668701, + "learning_rate": 9.990275763032727e-06, + "loss": 2.9274, + "step": 349500 + }, + { + "epoch": 0.34436252295421765, + "grad_norm": 2.1116716861724854, + "learning_rate": 9.99026560611985e-06, + "loss": 2.9572, + "step": 349550 + }, + { + "epoch": 0.3444117809320398, + "grad_norm": 2.2709805965423584, + "learning_rate": 9.990255443910493e-06, + "loss": 2.9859, + "step": 349600 + }, + { + "epoch": 0.34446103890986185, + "grad_norm": 2.3558762073516846, + "learning_rate": 9.99024527640466e-06, + "loss": 2.9577, + "step": 349650 + }, + { + "epoch": 0.3445102968876839, + "grad_norm": 2.6220345497131348, + "learning_rate": 9.990235103602367e-06, + "loss": 3.0072, + "step": 349700 + }, + { + "epoch": 0.344559554865506, + "grad_norm": 2.1461992263793945, + "learning_rate": 9.990224925503623e-06, + "loss": 2.8849, + "step": 349750 + }, + { + "epoch": 0.3446088128433281, + "grad_norm": 2.4202895164489746, + "learning_rate": 9.990214742108438e-06, + "loss": 3.01, + "step": 349800 + }, + { + "epoch": 0.3446580708211502, + "grad_norm": 2.3817427158355713, + "learning_rate": 9.990204553416824e-06, + "loss": 2.9602, + "step": 349850 + }, + { + "epoch": 0.34470732879897226, + "grad_norm": 2.3389711380004883, + "learning_rate": 9.990194359428792e-06, + "loss": 2.9574, + "step": 349900 + }, + { + "epoch": 0.3447565867767944, + "grad_norm": 2.449887990951538, + "learning_rate": 9.990184160144351e-06, + "loss": 2.8807, + "step": 349950 + }, + { + "epoch": 0.34480584475461645, + "grad_norm": 2.046319007873535, + "learning_rate": 9.990173955563515e-06, + "loss": 2.9828, + "step": 350000 + }, + { + "epoch": 0.3448551027324385, + "grad_norm": 2.2042977809906006, + "learning_rate": 9.990163745686292e-06, + "loss": 2.9986, + "step": 350050 + }, + { + "epoch": 0.34490436071026065, + "grad_norm": 2.2430944442749023, + "learning_rate": 9.990153530512694e-06, + "loss": 3.0311, + "step": 350100 + }, + { + "epoch": 0.3449536186880827, + "grad_norm": 2.3518428802490234, + 
"learning_rate": 9.990143310042731e-06, + "loss": 2.9379, + "step": 350150 + }, + { + "epoch": 0.3450028766659048, + "grad_norm": 2.335559606552124, + "learning_rate": 9.990133084276415e-06, + "loss": 2.9948, + "step": 350200 + }, + { + "epoch": 0.3450521346437269, + "grad_norm": 2.346264600753784, + "learning_rate": 9.990122853213758e-06, + "loss": 2.9984, + "step": 350250 + }, + { + "epoch": 0.345101392621549, + "grad_norm": 2.236358880996704, + "learning_rate": 9.990112616854767e-06, + "loss": 2.9606, + "step": 350300 + }, + { + "epoch": 0.34515065059937106, + "grad_norm": 2.4653396606445312, + "learning_rate": 9.990102375199456e-06, + "loss": 2.9754, + "step": 350350 + }, + { + "epoch": 0.3451999085771932, + "grad_norm": 2.4637696743011475, + "learning_rate": 9.990092128247836e-06, + "loss": 3.0131, + "step": 350400 + }, + { + "epoch": 0.34524916655501525, + "grad_norm": 2.2879128456115723, + "learning_rate": 9.990081875999915e-06, + "loss": 2.9637, + "step": 350450 + }, + { + "epoch": 0.3452984245328373, + "grad_norm": 2.128631114959717, + "learning_rate": 9.990071618455708e-06, + "loss": 2.994, + "step": 350500 + }, + { + "epoch": 0.34534768251065945, + "grad_norm": 2.239274501800537, + "learning_rate": 9.99006135561522e-06, + "loss": 3.0033, + "step": 350550 + }, + { + "epoch": 0.3453969404884815, + "grad_norm": 2.1282689571380615, + "learning_rate": 9.990051087478468e-06, + "loss": 3.0316, + "step": 350600 + }, + { + "epoch": 0.3454461984663036, + "grad_norm": 2.2384164333343506, + "learning_rate": 9.99004081404546e-06, + "loss": 2.9616, + "step": 350650 + }, + { + "epoch": 0.3454954564441257, + "grad_norm": 2.3246188163757324, + "learning_rate": 9.990030535316208e-06, + "loss": 2.98, + "step": 350700 + }, + { + "epoch": 0.3455447144219478, + "grad_norm": 2.3451755046844482, + "learning_rate": 9.990020251290722e-06, + "loss": 2.9336, + "step": 350750 + }, + { + "epoch": 0.34559397239976986, + "grad_norm": 2.212925910949707, + "learning_rate": 
9.990009961969013e-06, + "loss": 2.9396, + "step": 350800 + }, + { + "epoch": 0.345643230377592, + "grad_norm": 2.2294952869415283, + "learning_rate": 9.98999966735109e-06, + "loss": 2.9371, + "step": 350850 + }, + { + "epoch": 0.34569248835541405, + "grad_norm": 2.126357316970825, + "learning_rate": 9.98998936743697e-06, + "loss": 2.9553, + "step": 350900 + }, + { + "epoch": 0.3457417463332361, + "grad_norm": 2.472942590713501, + "learning_rate": 9.989979062226657e-06, + "loss": 2.9545, + "step": 350950 + }, + { + "epoch": 0.3457910043110582, + "grad_norm": 2.397287607192993, + "learning_rate": 9.989968751720165e-06, + "loss": 3.0573, + "step": 351000 + }, + { + "epoch": 0.3458402622888803, + "grad_norm": 2.3889689445495605, + "learning_rate": 9.989958435917506e-06, + "loss": 2.9325, + "step": 351050 + }, + { + "epoch": 0.3458895202667024, + "grad_norm": 2.3354578018188477, + "learning_rate": 9.989948114818688e-06, + "loss": 2.9935, + "step": 351100 + }, + { + "epoch": 0.34593877824452446, + "grad_norm": 2.4032914638519287, + "learning_rate": 9.989937788423726e-06, + "loss": 3.0069, + "step": 351150 + }, + { + "epoch": 0.3459880362223466, + "grad_norm": 2.4838826656341553, + "learning_rate": 9.989927456732628e-06, + "loss": 2.9375, + "step": 351200 + }, + { + "epoch": 0.34603729420016865, + "grad_norm": 2.5062577724456787, + "learning_rate": 9.989917119745404e-06, + "loss": 2.9435, + "step": 351250 + }, + { + "epoch": 0.3460865521779907, + "grad_norm": 3.2255961894989014, + "learning_rate": 9.989906777462068e-06, + "loss": 2.8883, + "step": 351300 + }, + { + "epoch": 0.34613581015581285, + "grad_norm": 2.119917154312134, + "learning_rate": 9.98989642988263e-06, + "loss": 2.9855, + "step": 351350 + }, + { + "epoch": 0.3461850681336349, + "grad_norm": 2.2643604278564453, + "learning_rate": 9.9898860770071e-06, + "loss": 3.0105, + "step": 351400 + }, + { + "epoch": 0.346234326111457, + "grad_norm": 2.2403066158294678, + "learning_rate": 9.989875718835489e-06, + 
"loss": 2.9812, + "step": 351450 + }, + { + "epoch": 0.3462835840892791, + "grad_norm": 2.1916775703430176, + "learning_rate": 9.98986535536781e-06, + "loss": 3.0031, + "step": 351500 + }, + { + "epoch": 0.3463328420671012, + "grad_norm": 2.522244930267334, + "learning_rate": 9.98985498660407e-06, + "loss": 2.9959, + "step": 351550 + }, + { + "epoch": 0.34638210004492326, + "grad_norm": 2.2052228450775146, + "learning_rate": 9.989844612544284e-06, + "loss": 2.9928, + "step": 351600 + }, + { + "epoch": 0.3464313580227454, + "grad_norm": 2.498222827911377, + "learning_rate": 9.989834233188462e-06, + "loss": 2.9789, + "step": 351650 + }, + { + "epoch": 0.34648061600056745, + "grad_norm": 2.411092758178711, + "learning_rate": 9.989823848536615e-06, + "loss": 2.9887, + "step": 351700 + }, + { + "epoch": 0.3465298739783895, + "grad_norm": 2.2537975311279297, + "learning_rate": 9.989813458588753e-06, + "loss": 2.9318, + "step": 351750 + }, + { + "epoch": 0.34657913195621165, + "grad_norm": 2.305896282196045, + "learning_rate": 9.989803063344887e-06, + "loss": 3.0033, + "step": 351800 + }, + { + "epoch": 0.3466283899340337, + "grad_norm": 2.3288862705230713, + "learning_rate": 9.98979266280503e-06, + "loss": 2.9588, + "step": 351850 + }, + { + "epoch": 0.3466776479118558, + "grad_norm": 2.464510679244995, + "learning_rate": 9.989782256969189e-06, + "loss": 3.028, + "step": 351900 + }, + { + "epoch": 0.3467269058896779, + "grad_norm": 2.442789316177368, + "learning_rate": 9.98977184583738e-06, + "loss": 3.0335, + "step": 351950 + }, + { + "epoch": 0.3467761638675, + "grad_norm": 2.246234893798828, + "learning_rate": 9.989761429409613e-06, + "loss": 2.9772, + "step": 352000 + }, + { + "epoch": 0.34682542184532206, + "grad_norm": 2.212339401245117, + "learning_rate": 9.989751007685895e-06, + "loss": 2.9783, + "step": 352050 + }, + { + "epoch": 0.3468746798231442, + "grad_norm": 2.2476232051849365, + "learning_rate": 9.989740580666242e-06, + "loss": 2.9176, + "step": 352100 + 
}, + { + "epoch": 0.34692393780096625, + "grad_norm": 2.1859371662139893, + "learning_rate": 9.98973014835066e-06, + "loss": 3.0212, + "step": 352150 + }, + { + "epoch": 0.3469731957787883, + "grad_norm": 2.4295313358306885, + "learning_rate": 9.989719710739167e-06, + "loss": 2.9607, + "step": 352200 + }, + { + "epoch": 0.3470224537566104, + "grad_norm": 2.2728710174560547, + "learning_rate": 9.989709267831767e-06, + "loss": 2.9783, + "step": 352250 + }, + { + "epoch": 0.3470717117344325, + "grad_norm": 2.8162448406219482, + "learning_rate": 9.989698819628477e-06, + "loss": 2.9824, + "step": 352300 + }, + { + "epoch": 0.3471209697122546, + "grad_norm": 2.3433547019958496, + "learning_rate": 9.989688366129303e-06, + "loss": 2.9871, + "step": 352350 + }, + { + "epoch": 0.34717022769007666, + "grad_norm": 2.3773014545440674, + "learning_rate": 9.98967790733426e-06, + "loss": 2.9912, + "step": 352400 + }, + { + "epoch": 0.3472194856678988, + "grad_norm": 2.2639942169189453, + "learning_rate": 9.989667443243356e-06, + "loss": 2.9114, + "step": 352450 + }, + { + "epoch": 0.34726874364572086, + "grad_norm": 2.3164124488830566, + "learning_rate": 9.989656973856603e-06, + "loss": 2.941, + "step": 352500 + }, + { + "epoch": 0.3473180016235429, + "grad_norm": 2.096545696258545, + "learning_rate": 9.989646499174015e-06, + "loss": 2.9609, + "step": 352550 + }, + { + "epoch": 0.34736725960136505, + "grad_norm": 2.305152177810669, + "learning_rate": 9.9896360191956e-06, + "loss": 2.9586, + "step": 352600 + }, + { + "epoch": 0.3474165175791871, + "grad_norm": 2.227459669113159, + "learning_rate": 9.98962553392137e-06, + "loss": 2.9828, + "step": 352650 + }, + { + "epoch": 0.3474657755570092, + "grad_norm": 2.441070556640625, + "learning_rate": 9.989615043351335e-06, + "loss": 2.9343, + "step": 352700 + }, + { + "epoch": 0.3475150335348313, + "grad_norm": 2.367798089981079, + "learning_rate": 9.989604547485508e-06, + "loss": 3.0377, + "step": 352750 + }, + { + "epoch": 
0.3475642915126534, + "grad_norm": 2.2145838737487793, + "learning_rate": 9.9895940463239e-06, + "loss": 2.9151, + "step": 352800 + }, + { + "epoch": 0.34761354949047546, + "grad_norm": 2.217722177505493, + "learning_rate": 9.98958353986652e-06, + "loss": 2.9814, + "step": 352850 + }, + { + "epoch": 0.3476628074682976, + "grad_norm": 2.535430669784546, + "learning_rate": 9.989573028113382e-06, + "loss": 3.0156, + "step": 352900 + }, + { + "epoch": 0.34771206544611966, + "grad_norm": 2.456815719604492, + "learning_rate": 9.989562511064496e-06, + "loss": 3.0341, + "step": 352950 + }, + { + "epoch": 0.3477613234239417, + "grad_norm": 2.612229585647583, + "learning_rate": 9.989551988719873e-06, + "loss": 2.9221, + "step": 353000 + }, + { + "epoch": 0.34781058140176385, + "grad_norm": 2.1536998748779297, + "learning_rate": 9.989541461079522e-06, + "loss": 3.0046, + "step": 353050 + }, + { + "epoch": 0.3478598393795859, + "grad_norm": 2.5180563926696777, + "learning_rate": 9.989530928143457e-06, + "loss": 2.9562, + "step": 353100 + }, + { + "epoch": 0.347909097357408, + "grad_norm": 2.20945405960083, + "learning_rate": 9.989520389911691e-06, + "loss": 3.0499, + "step": 353150 + }, + { + "epoch": 0.3479583553352301, + "grad_norm": 2.201202392578125, + "learning_rate": 9.989509846384231e-06, + "loss": 2.9096, + "step": 353200 + }, + { + "epoch": 0.3480076133130522, + "grad_norm": 2.3982272148132324, + "learning_rate": 9.989499297561089e-06, + "loss": 2.9474, + "step": 353250 + }, + { + "epoch": 0.34805687129087426, + "grad_norm": 2.297053098678589, + "learning_rate": 9.989488743442277e-06, + "loss": 2.9797, + "step": 353300 + }, + { + "epoch": 0.3481061292686964, + "grad_norm": 2.201094627380371, + "learning_rate": 9.98947818402781e-06, + "loss": 2.9741, + "step": 353350 + }, + { + "epoch": 0.34815538724651846, + "grad_norm": 2.219196081161499, + "learning_rate": 9.989467619317692e-06, + "loss": 2.9874, + "step": 353400 + }, + { + "epoch": 0.3482046452243405, + 
"grad_norm": 2.271169424057007, + "learning_rate": 9.989457049311938e-06, + "loss": 2.9629, + "step": 353450 + }, + { + "epoch": 0.3482539032021626, + "grad_norm": 2.208867073059082, + "learning_rate": 9.98944647401056e-06, + "loss": 2.9172, + "step": 353500 + }, + { + "epoch": 0.3483031611799847, + "grad_norm": 2.2258501052856445, + "learning_rate": 9.989435893413565e-06, + "loss": 2.9294, + "step": 353550 + }, + { + "epoch": 0.3483524191578068, + "grad_norm": 2.352414608001709, + "learning_rate": 9.989425307520971e-06, + "loss": 3.03, + "step": 353600 + }, + { + "epoch": 0.34840167713562886, + "grad_norm": 2.137986660003662, + "learning_rate": 9.989414716332783e-06, + "loss": 2.9929, + "step": 353650 + }, + { + "epoch": 0.348450935113451, + "grad_norm": 2.227226495742798, + "learning_rate": 9.989404119849018e-06, + "loss": 3.0565, + "step": 353700 + }, + { + "epoch": 0.34850019309127306, + "grad_norm": 2.1841230392456055, + "learning_rate": 9.989393518069683e-06, + "loss": 2.9638, + "step": 353750 + }, + { + "epoch": 0.34854945106909513, + "grad_norm": 2.903904676437378, + "learning_rate": 9.989382910994789e-06, + "loss": 3.0011, + "step": 353800 + }, + { + "epoch": 0.34859870904691725, + "grad_norm": 2.297473907470703, + "learning_rate": 9.989372298624349e-06, + "loss": 2.902, + "step": 353850 + }, + { + "epoch": 0.3486479670247393, + "grad_norm": 2.259995460510254, + "learning_rate": 9.989361680958373e-06, + "loss": 2.9633, + "step": 353900 + }, + { + "epoch": 0.3486972250025614, + "grad_norm": 2.30344557762146, + "learning_rate": 9.989351057996875e-06, + "loss": 2.9547, + "step": 353950 + }, + { + "epoch": 0.3487464829803835, + "grad_norm": 2.433990716934204, + "learning_rate": 9.989340429739864e-06, + "loss": 2.8804, + "step": 354000 + }, + { + "epoch": 0.3487957409582056, + "grad_norm": 2.1678948402404785, + "learning_rate": 9.98932979618735e-06, + "loss": 2.967, + "step": 354050 + }, + { + "epoch": 0.34884499893602766, + "grad_norm": 2.4016711711883545, + 
"learning_rate": 9.989319157339348e-06, + "loss": 2.944, + "step": 354100 + }, + { + "epoch": 0.3488942569138498, + "grad_norm": 2.3355093002319336, + "learning_rate": 9.989308513195866e-06, + "loss": 3.0276, + "step": 354150 + }, + { + "epoch": 0.34894351489167186, + "grad_norm": 2.378403425216675, + "learning_rate": 9.989297863756916e-06, + "loss": 2.9824, + "step": 354200 + }, + { + "epoch": 0.34899277286949393, + "grad_norm": 2.3439412117004395, + "learning_rate": 9.989287209022512e-06, + "loss": 2.9846, + "step": 354250 + }, + { + "epoch": 0.34904203084731605, + "grad_norm": 2.409548282623291, + "learning_rate": 9.989276548992662e-06, + "loss": 2.9595, + "step": 354300 + }, + { + "epoch": 0.3490912888251381, + "grad_norm": 2.2143380641937256, + "learning_rate": 9.989265883667378e-06, + "loss": 2.9634, + "step": 354350 + }, + { + "epoch": 0.3491405468029602, + "grad_norm": 2.348940849304199, + "learning_rate": 9.989255213046673e-06, + "loss": 2.9346, + "step": 354400 + }, + { + "epoch": 0.3491898047807823, + "grad_norm": 2.1382863521575928, + "learning_rate": 9.989244537130558e-06, + "loss": 2.9854, + "step": 354450 + }, + { + "epoch": 0.3492390627586044, + "grad_norm": 2.2843260765075684, + "learning_rate": 9.98923385591904e-06, + "loss": 3.0107, + "step": 354500 + }, + { + "epoch": 0.34928832073642646, + "grad_norm": 2.2672765254974365, + "learning_rate": 9.989223169412136e-06, + "loss": 2.9257, + "step": 354550 + }, + { + "epoch": 0.3493375787142486, + "grad_norm": 2.379582166671753, + "learning_rate": 9.989212477609855e-06, + "loss": 2.9586, + "step": 354600 + }, + { + "epoch": 0.34938683669207066, + "grad_norm": 2.3744139671325684, + "learning_rate": 9.98920178051221e-06, + "loss": 2.9649, + "step": 354650 + }, + { + "epoch": 0.3494360946698927, + "grad_norm": 2.415107250213623, + "learning_rate": 9.989191078119209e-06, + "loss": 2.929, + "step": 354700 + }, + { + "epoch": 0.3494853526477148, + "grad_norm": 2.24250864982605, + "learning_rate": 
9.989180370430863e-06, + "loss": 2.9899, + "step": 354750 + }, + { + "epoch": 0.3495346106255369, + "grad_norm": 2.2148826122283936, + "learning_rate": 9.98916965744719e-06, + "loss": 2.9633, + "step": 354800 + }, + { + "epoch": 0.349583868603359, + "grad_norm": 2.344635009765625, + "learning_rate": 9.989158939168195e-06, + "loss": 2.9389, + "step": 354850 + }, + { + "epoch": 0.34963312658118106, + "grad_norm": 2.3568108081817627, + "learning_rate": 9.98914821559389e-06, + "loss": 2.9867, + "step": 354900 + }, + { + "epoch": 0.3496823845590032, + "grad_norm": 2.244147300720215, + "learning_rate": 9.989137486724288e-06, + "loss": 3.0213, + "step": 354950 + }, + { + "epoch": 0.34973164253682526, + "grad_norm": 2.2120232582092285, + "learning_rate": 9.989126752559404e-06, + "loss": 2.9431, + "step": 355000 + }, + { + "epoch": 0.34978090051464733, + "grad_norm": 2.498013973236084, + "learning_rate": 9.98911601309924e-06, + "loss": 2.9326, + "step": 355050 + }, + { + "epoch": 0.34983015849246946, + "grad_norm": 2.6632590293884277, + "learning_rate": 9.989105268343816e-06, + "loss": 2.9297, + "step": 355100 + }, + { + "epoch": 0.3498794164702915, + "grad_norm": 2.4480175971984863, + "learning_rate": 9.989094518293139e-06, + "loss": 2.9617, + "step": 355150 + }, + { + "epoch": 0.3499286744481136, + "grad_norm": 2.4922454357147217, + "learning_rate": 9.989083762947223e-06, + "loss": 2.969, + "step": 355200 + }, + { + "epoch": 0.3499779324259357, + "grad_norm": 2.4732539653778076, + "learning_rate": 9.989073002306078e-06, + "loss": 2.9374, + "step": 355250 + }, + { + "epoch": 0.3500271904037578, + "grad_norm": 2.362722396850586, + "learning_rate": 9.989062236369714e-06, + "loss": 2.9908, + "step": 355300 + }, + { + "epoch": 0.35007644838157986, + "grad_norm": 2.4490747451782227, + "learning_rate": 9.989051465138145e-06, + "loss": 2.9625, + "step": 355350 + }, + { + "epoch": 0.350125706359402, + "grad_norm": 2.2461891174316406, + "learning_rate": 9.989040688611382e-06, + 
"loss": 2.9439, + "step": 355400 + }, + { + "epoch": 0.35017496433722406, + "grad_norm": 2.2784056663513184, + "learning_rate": 9.989029906789434e-06, + "loss": 2.9879, + "step": 355450 + }, + { + "epoch": 0.35022422231504613, + "grad_norm": 2.343109607696533, + "learning_rate": 9.989019119672315e-06, + "loss": 2.9904, + "step": 355500 + }, + { + "epoch": 0.35027348029286826, + "grad_norm": 2.2368667125701904, + "learning_rate": 9.989008327260036e-06, + "loss": 2.9599, + "step": 355550 + }, + { + "epoch": 0.3503227382706903, + "grad_norm": 2.131196975708008, + "learning_rate": 9.988997529552609e-06, + "loss": 2.9368, + "step": 355600 + }, + { + "epoch": 0.3503719962485124, + "grad_norm": 2.273326873779297, + "learning_rate": 9.988986726550044e-06, + "loss": 2.9755, + "step": 355650 + }, + { + "epoch": 0.3504212542263345, + "grad_norm": 2.1986782550811768, + "learning_rate": 9.988975918252354e-06, + "loss": 2.9604, + "step": 355700 + }, + { + "epoch": 0.3504705122041566, + "grad_norm": 2.2057065963745117, + "learning_rate": 9.988965104659548e-06, + "loss": 2.8619, + "step": 355750 + }, + { + "epoch": 0.35051977018197866, + "grad_norm": 2.3890700340270996, + "learning_rate": 9.988954285771638e-06, + "loss": 2.8533, + "step": 355800 + }, + { + "epoch": 0.3505690281598008, + "grad_norm": 2.234123468399048, + "learning_rate": 9.988943461588639e-06, + "loss": 3.046, + "step": 355850 + }, + { + "epoch": 0.35061828613762286, + "grad_norm": 2.371110200881958, + "learning_rate": 9.988932632110559e-06, + "loss": 2.9889, + "step": 355900 + }, + { + "epoch": 0.35066754411544493, + "grad_norm": 2.7943058013916016, + "learning_rate": 9.988921797337412e-06, + "loss": 2.9495, + "step": 355950 + }, + { + "epoch": 0.350716802093267, + "grad_norm": 2.357105016708374, + "learning_rate": 9.988910957269206e-06, + "loss": 2.9976, + "step": 356000 + }, + { + "epoch": 0.3507660600710891, + "grad_norm": 2.2993555068969727, + "learning_rate": 9.988900111905955e-06, + "loss": 3.0133, + "step": 
356050 + }, + { + "epoch": 0.3508153180489112, + "grad_norm": 2.297830820083618, + "learning_rate": 9.988889261247668e-06, + "loss": 3.0257, + "step": 356100 + }, + { + "epoch": 0.35086457602673327, + "grad_norm": 2.292912721633911, + "learning_rate": 9.988878405294362e-06, + "loss": 2.9236, + "step": 356150 + }, + { + "epoch": 0.3509138340045554, + "grad_norm": 2.2850868701934814, + "learning_rate": 9.988867544046043e-06, + "loss": 3.0578, + "step": 356200 + }, + { + "epoch": 0.35096309198237746, + "grad_norm": 2.3444504737854004, + "learning_rate": 9.988856677502726e-06, + "loss": 2.8876, + "step": 356250 + }, + { + "epoch": 0.35101234996019953, + "grad_norm": 2.772386312484741, + "learning_rate": 9.988845805664422e-06, + "loss": 2.9507, + "step": 356300 + }, + { + "epoch": 0.35106160793802166, + "grad_norm": 2.4222049713134766, + "learning_rate": 9.988834928531138e-06, + "loss": 2.9493, + "step": 356350 + }, + { + "epoch": 0.35111086591584373, + "grad_norm": 2.3395941257476807, + "learning_rate": 9.988824046102893e-06, + "loss": 2.9923, + "step": 356400 + }, + { + "epoch": 0.3511601238936658, + "grad_norm": 2.233515739440918, + "learning_rate": 9.988813158379691e-06, + "loss": 2.9678, + "step": 356450 + }, + { + "epoch": 0.3512093818714879, + "grad_norm": 2.153210401535034, + "learning_rate": 9.98880226536155e-06, + "loss": 2.9764, + "step": 356500 + }, + { + "epoch": 0.35125863984931, + "grad_norm": 2.2453973293304443, + "learning_rate": 9.988791367048477e-06, + "loss": 2.9744, + "step": 356550 + }, + { + "epoch": 0.35130789782713207, + "grad_norm": 2.2487072944641113, + "learning_rate": 9.98878046344049e-06, + "loss": 2.954, + "step": 356600 + }, + { + "epoch": 0.3513571558049542, + "grad_norm": 2.2866883277893066, + "learning_rate": 9.988769554537592e-06, + "loss": 2.9819, + "step": 356650 + }, + { + "epoch": 0.35140641378277626, + "grad_norm": 2.303689956665039, + "learning_rate": 9.988758640339798e-06, + "loss": 3.0144, + "step": 356700 + }, + { + "epoch": 
0.35145567176059833, + "grad_norm": 2.1545472145080566, + "learning_rate": 9.98874772084712e-06, + "loss": 2.9374, + "step": 356750 + }, + { + "epoch": 0.35150492973842046, + "grad_norm": 2.2671215534210205, + "learning_rate": 9.988736796059573e-06, + "loss": 2.9738, + "step": 356800 + }, + { + "epoch": 0.3515541877162425, + "grad_norm": 2.3107852935791016, + "learning_rate": 9.988725865977164e-06, + "loss": 2.9539, + "step": 356850 + }, + { + "epoch": 0.3516034456940646, + "grad_norm": 2.239840507507324, + "learning_rate": 9.988714930599906e-06, + "loss": 2.9699, + "step": 356900 + }, + { + "epoch": 0.3516527036718867, + "grad_norm": 2.3705084323883057, + "learning_rate": 9.988703989927808e-06, + "loss": 3.0575, + "step": 356950 + }, + { + "epoch": 0.3517019616497088, + "grad_norm": 2.132145881652832, + "learning_rate": 9.988693043960889e-06, + "loss": 2.9612, + "step": 357000 + }, + { + "epoch": 0.35175121962753086, + "grad_norm": 2.151716947555542, + "learning_rate": 9.98868209269915e-06, + "loss": 3.0733, + "step": 357050 + }, + { + "epoch": 0.351800477605353, + "grad_norm": 2.3170366287231445, + "learning_rate": 9.988671136142612e-06, + "loss": 3.0176, + "step": 357100 + }, + { + "epoch": 0.35184973558317506, + "grad_norm": 2.314544677734375, + "learning_rate": 9.988660174291285e-06, + "loss": 2.9312, + "step": 357150 + }, + { + "epoch": 0.35189899356099713, + "grad_norm": 2.4334261417388916, + "learning_rate": 9.988649207145174e-06, + "loss": 2.9695, + "step": 357200 + }, + { + "epoch": 0.3519482515388192, + "grad_norm": 2.3540995121002197, + "learning_rate": 9.988638234704299e-06, + "loss": 2.9904, + "step": 357250 + }, + { + "epoch": 0.3519975095166413, + "grad_norm": 2.376408100128174, + "learning_rate": 9.988627256968664e-06, + "loss": 3.0294, + "step": 357300 + }, + { + "epoch": 0.3520467674944634, + "grad_norm": 2.3114094734191895, + "learning_rate": 9.988616273938288e-06, + "loss": 2.9612, + "step": 357350 + }, + { + "epoch": 0.35209602547228547, + 
"grad_norm": 2.1653671264648438, + "learning_rate": 9.988605285613179e-06, + "loss": 2.965, + "step": 357400 + }, + { + "epoch": 0.3521452834501076, + "grad_norm": 2.359914541244507, + "learning_rate": 9.988594291993348e-06, + "loss": 2.9944, + "step": 357450 + }, + { + "epoch": 0.35219454142792966, + "grad_norm": 2.2301807403564453, + "learning_rate": 9.988583293078809e-06, + "loss": 3.0185, + "step": 357500 + }, + { + "epoch": 0.35224379940575173, + "grad_norm": 2.339677333831787, + "learning_rate": 9.98857228886957e-06, + "loss": 2.9666, + "step": 357550 + }, + { + "epoch": 0.35229305738357386, + "grad_norm": 2.388258457183838, + "learning_rate": 9.988561279365647e-06, + "loss": 2.9407, + "step": 357600 + }, + { + "epoch": 0.35234231536139593, + "grad_norm": 2.3422458171844482, + "learning_rate": 9.988550264567049e-06, + "loss": 2.937, + "step": 357650 + }, + { + "epoch": 0.352391573339218, + "grad_norm": 2.264251708984375, + "learning_rate": 9.988539244473788e-06, + "loss": 2.9296, + "step": 357700 + }, + { + "epoch": 0.3524408313170401, + "grad_norm": 2.200772762298584, + "learning_rate": 9.988528219085876e-06, + "loss": 2.9293, + "step": 357750 + }, + { + "epoch": 0.3524900892948622, + "grad_norm": 2.2455132007598877, + "learning_rate": 9.988517188403325e-06, + "loss": 2.9781, + "step": 357800 + }, + { + "epoch": 0.35253934727268427, + "grad_norm": 2.0888431072235107, + "learning_rate": 9.988506152426148e-06, + "loss": 2.9484, + "step": 357850 + }, + { + "epoch": 0.3525886052505064, + "grad_norm": 2.261477470397949, + "learning_rate": 9.988495111154352e-06, + "loss": 2.9471, + "step": 357900 + }, + { + "epoch": 0.35263786322832846, + "grad_norm": 2.394035577774048, + "learning_rate": 9.988484064587953e-06, + "loss": 2.9385, + "step": 357950 + }, + { + "epoch": 0.35268712120615053, + "grad_norm": 2.3250510692596436, + "learning_rate": 9.988473012726964e-06, + "loss": 2.9749, + "step": 358000 + }, + { + "epoch": 0.35273637918397266, + "grad_norm": 
2.2200255393981934, + "learning_rate": 9.988461955571391e-06, + "loss": 2.9754, + "step": 358050 + }, + { + "epoch": 0.35278563716179473, + "grad_norm": 2.394873857498169, + "learning_rate": 9.988450893121251e-06, + "loss": 3.0553, + "step": 358100 + }, + { + "epoch": 0.3528348951396168, + "grad_norm": 2.2738566398620605, + "learning_rate": 9.988439825376552e-06, + "loss": 3.0531, + "step": 358150 + }, + { + "epoch": 0.3528841531174389, + "grad_norm": 2.412632703781128, + "learning_rate": 9.988428752337309e-06, + "loss": 2.9567, + "step": 358200 + }, + { + "epoch": 0.352933411095261, + "grad_norm": 2.41582989692688, + "learning_rate": 9.988417674003532e-06, + "loss": 2.966, + "step": 358250 + }, + { + "epoch": 0.35298266907308307, + "grad_norm": 2.322500705718994, + "learning_rate": 9.988406590375234e-06, + "loss": 2.9727, + "step": 358300 + }, + { + "epoch": 0.3530319270509052, + "grad_norm": 2.422396659851074, + "learning_rate": 9.988395501452424e-06, + "loss": 2.9461, + "step": 358350 + }, + { + "epoch": 0.35308118502872726, + "grad_norm": 2.250699996948242, + "learning_rate": 9.988384407235117e-06, + "loss": 2.9876, + "step": 358400 + }, + { + "epoch": 0.35313044300654933, + "grad_norm": 2.2414398193359375, + "learning_rate": 9.988373307723324e-06, + "loss": 2.9498, + "step": 358450 + }, + { + "epoch": 0.3531797009843714, + "grad_norm": 2.470309257507324, + "learning_rate": 9.988362202917054e-06, + "loss": 2.9702, + "step": 358500 + }, + { + "epoch": 0.35322895896219353, + "grad_norm": 2.403179168701172, + "learning_rate": 9.988351092816324e-06, + "loss": 2.9003, + "step": 358550 + }, + { + "epoch": 0.3532782169400156, + "grad_norm": 2.502044677734375, + "learning_rate": 9.988339977421139e-06, + "loss": 2.9964, + "step": 358600 + }, + { + "epoch": 0.35332747491783767, + "grad_norm": 2.3590304851531982, + "learning_rate": 9.988328856731517e-06, + "loss": 2.9218, + "step": 358650 + }, + { + "epoch": 0.3533767328956598, + "grad_norm": 2.409125328063965, + 
"learning_rate": 9.988317730747467e-06, + "loss": 2.9444, + "step": 358700 + }, + { + "epoch": 0.35342599087348187, + "grad_norm": 2.5448644161224365, + "learning_rate": 9.988306599469e-06, + "loss": 2.984, + "step": 358750 + }, + { + "epoch": 0.35347524885130394, + "grad_norm": 2.252824068069458, + "learning_rate": 9.98829546289613e-06, + "loss": 2.9029, + "step": 358800 + }, + { + "epoch": 0.35352450682912606, + "grad_norm": 2.3183107376098633, + "learning_rate": 9.988284321028868e-06, + "loss": 2.9424, + "step": 358850 + }, + { + "epoch": 0.35357376480694813, + "grad_norm": 2.322129964828491, + "learning_rate": 9.988273173867224e-06, + "loss": 2.9279, + "step": 358900 + }, + { + "epoch": 0.3536230227847702, + "grad_norm": 2.255025863647461, + "learning_rate": 9.988262021411212e-06, + "loss": 2.9145, + "step": 358950 + }, + { + "epoch": 0.35367228076259233, + "grad_norm": 2.1860194206237793, + "learning_rate": 9.988250863660845e-06, + "loss": 2.9387, + "step": 359000 + }, + { + "epoch": 0.3537215387404144, + "grad_norm": 2.0853891372680664, + "learning_rate": 9.988239700616132e-06, + "loss": 2.9281, + "step": 359050 + }, + { + "epoch": 0.35377079671823647, + "grad_norm": 2.384831428527832, + "learning_rate": 9.988228532277087e-06, + "loss": 2.9531, + "step": 359100 + }, + { + "epoch": 0.3538200546960586, + "grad_norm": 2.2465662956237793, + "learning_rate": 9.988217358643718e-06, + "loss": 2.9637, + "step": 359150 + }, + { + "epoch": 0.35386931267388066, + "grad_norm": 2.1904618740081787, + "learning_rate": 9.988206179716043e-06, + "loss": 2.9842, + "step": 359200 + }, + { + "epoch": 0.35391857065170274, + "grad_norm": 2.2384650707244873, + "learning_rate": 9.98819499549407e-06, + "loss": 2.9332, + "step": 359250 + }, + { + "epoch": 0.35396782862952486, + "grad_norm": 2.1186344623565674, + "learning_rate": 9.98818380597781e-06, + "loss": 2.9131, + "step": 359300 + }, + { + "epoch": 0.35401708660734693, + "grad_norm": 2.356910228729248, + "learning_rate": 
9.988172611167274e-06, + "loss": 3.0061, + "step": 359350 + }, + { + "epoch": 0.354066344585169, + "grad_norm": 2.4279441833496094, + "learning_rate": 9.988161411062479e-06, + "loss": 2.938, + "step": 359400 + }, + { + "epoch": 0.3541156025629911, + "grad_norm": 2.365097761154175, + "learning_rate": 9.988150205663434e-06, + "loss": 2.9729, + "step": 359450 + }, + { + "epoch": 0.3541648605408132, + "grad_norm": 2.3526840209960938, + "learning_rate": 9.98813899497015e-06, + "loss": 2.9465, + "step": 359500 + }, + { + "epoch": 0.35421411851863527, + "grad_norm": 2.284461498260498, + "learning_rate": 9.988127778982643e-06, + "loss": 3.0423, + "step": 359550 + }, + { + "epoch": 0.3542633764964574, + "grad_norm": 2.309664487838745, + "learning_rate": 9.988116557700918e-06, + "loss": 3.0386, + "step": 359600 + }, + { + "epoch": 0.35431263447427946, + "grad_norm": 2.3674206733703613, + "learning_rate": 9.988105331124993e-06, + "loss": 2.9665, + "step": 359650 + }, + { + "epoch": 0.35436189245210153, + "grad_norm": 2.3047034740448, + "learning_rate": 9.988094099254877e-06, + "loss": 2.979, + "step": 359700 + }, + { + "epoch": 0.3544111504299236, + "grad_norm": 2.397012948989868, + "learning_rate": 9.988082862090582e-06, + "loss": 2.9424, + "step": 359750 + }, + { + "epoch": 0.35446040840774573, + "grad_norm": 2.3555734157562256, + "learning_rate": 9.98807161963212e-06, + "loss": 2.9627, + "step": 359800 + }, + { + "epoch": 0.3545096663855678, + "grad_norm": 2.236656904220581, + "learning_rate": 9.988060371879504e-06, + "loss": 2.9614, + "step": 359850 + }, + { + "epoch": 0.35455892436338987, + "grad_norm": 2.461535930633545, + "learning_rate": 9.988049118832745e-06, + "loss": 2.8731, + "step": 359900 + }, + { + "epoch": 0.354608182341212, + "grad_norm": 2.2792508602142334, + "learning_rate": 9.988037860491855e-06, + "loss": 3.0164, + "step": 359950 + }, + { + "epoch": 0.35465744031903407, + "grad_norm": 2.491628646850586, + "learning_rate": 9.988026596856848e-06, + "loss": 
2.9098, + "step": 360000 + }, + { + "epoch": 0.35470669829685614, + "grad_norm": 2.393871307373047, + "learning_rate": 9.988015327927732e-06, + "loss": 2.9714, + "step": 360050 + }, + { + "epoch": 0.35475595627467826, + "grad_norm": 2.2619409561157227, + "learning_rate": 9.98800405370452e-06, + "loss": 2.8985, + "step": 360100 + }, + { + "epoch": 0.35480521425250033, + "grad_norm": 2.492304801940918, + "learning_rate": 9.98799277418723e-06, + "loss": 2.9599, + "step": 360150 + }, + { + "epoch": 0.3548544722303224, + "grad_norm": 2.1492764949798584, + "learning_rate": 9.987981489375865e-06, + "loss": 2.9358, + "step": 360200 + }, + { + "epoch": 0.35490373020814453, + "grad_norm": 2.3312129974365234, + "learning_rate": 9.987970199270441e-06, + "loss": 2.969, + "step": 360250 + }, + { + "epoch": 0.3549529881859666, + "grad_norm": 2.2014849185943604, + "learning_rate": 9.987958903870972e-06, + "loss": 2.9999, + "step": 360300 + }, + { + "epoch": 0.35500224616378867, + "grad_norm": 2.3313918113708496, + "learning_rate": 9.987947603177466e-06, + "loss": 2.9053, + "step": 360350 + }, + { + "epoch": 0.3550515041416108, + "grad_norm": 2.3025901317596436, + "learning_rate": 9.987936297189939e-06, + "loss": 2.9352, + "step": 360400 + }, + { + "epoch": 0.35510076211943287, + "grad_norm": 2.3685007095336914, + "learning_rate": 9.987924985908399e-06, + "loss": 2.9073, + "step": 360450 + }, + { + "epoch": 0.35515002009725494, + "grad_norm": 2.3598215579986572, + "learning_rate": 9.987913669332862e-06, + "loss": 2.9633, + "step": 360500 + }, + { + "epoch": 0.35519927807507706, + "grad_norm": 2.3687245845794678, + "learning_rate": 9.987902347463337e-06, + "loss": 2.9756, + "step": 360550 + }, + { + "epoch": 0.35524853605289913, + "grad_norm": 2.204298734664917, + "learning_rate": 9.987891020299836e-06, + "loss": 2.9631, + "step": 360600 + }, + { + "epoch": 0.3552977940307212, + "grad_norm": 2.3189995288848877, + "learning_rate": 9.987879687842374e-06, + "loss": 2.9543, + "step": 
360650 + }, + { + "epoch": 0.35534705200854333, + "grad_norm": 2.2477996349334717, + "learning_rate": 9.987868350090959e-06, + "loss": 3.0321, + "step": 360700 + }, + { + "epoch": 0.3553963099863654, + "grad_norm": 2.292478322982788, + "learning_rate": 9.987857007045607e-06, + "loss": 2.9709, + "step": 360750 + }, + { + "epoch": 0.35544556796418747, + "grad_norm": 2.332951545715332, + "learning_rate": 9.987845658706326e-06, + "loss": 2.9043, + "step": 360800 + }, + { + "epoch": 0.35549482594200954, + "grad_norm": 2.3388519287109375, + "learning_rate": 9.987834305073133e-06, + "loss": 2.9777, + "step": 360850 + }, + { + "epoch": 0.35554408391983167, + "grad_norm": 2.2666866779327393, + "learning_rate": 9.987822946146036e-06, + "loss": 2.9716, + "step": 360900 + }, + { + "epoch": 0.35559334189765374, + "grad_norm": 2.3690900802612305, + "learning_rate": 9.987811581925048e-06, + "loss": 2.9342, + "step": 360950 + }, + { + "epoch": 0.3556425998754758, + "grad_norm": 2.228529214859009, + "learning_rate": 9.98780021241018e-06, + "loss": 3.0428, + "step": 361000 + }, + { + "epoch": 0.35569185785329793, + "grad_norm": 2.387155771255493, + "learning_rate": 9.987788837601447e-06, + "loss": 3.023, + "step": 361050 + }, + { + "epoch": 0.35574111583112, + "grad_norm": 2.4653706550598145, + "learning_rate": 9.98777745749886e-06, + "loss": 2.9577, + "step": 361100 + }, + { + "epoch": 0.3557903738089421, + "grad_norm": 2.3315913677215576, + "learning_rate": 9.98776607210243e-06, + "loss": 2.9488, + "step": 361150 + }, + { + "epoch": 0.3558396317867642, + "grad_norm": 2.3207244873046875, + "learning_rate": 9.987754681412168e-06, + "loss": 2.9376, + "step": 361200 + }, + { + "epoch": 0.35588888976458627, + "grad_norm": 2.247079849243164, + "learning_rate": 9.987743285428089e-06, + "loss": 3.038, + "step": 361250 + }, + { + "epoch": 0.35593814774240834, + "grad_norm": 2.353811025619507, + "learning_rate": 9.987731884150204e-06, + "loss": 2.9644, + "step": 361300 + }, + { + "epoch": 
0.35598740572023047, + "grad_norm": 2.222411632537842, + "learning_rate": 9.987720477578527e-06, + "loss": 2.9213, + "step": 361350 + }, + { + "epoch": 0.35603666369805254, + "grad_norm": 2.2937471866607666, + "learning_rate": 9.987709065713066e-06, + "loss": 3.0056, + "step": 361400 + }, + { + "epoch": 0.3560859216758746, + "grad_norm": 2.420436143875122, + "learning_rate": 9.987697648553835e-06, + "loss": 2.9928, + "step": 361450 + }, + { + "epoch": 0.35613517965369673, + "grad_norm": 2.3408195972442627, + "learning_rate": 9.987686226100847e-06, + "loss": 2.9229, + "step": 361500 + }, + { + "epoch": 0.3561844376315188, + "grad_norm": 2.26583194732666, + "learning_rate": 9.987674798354113e-06, + "loss": 2.984, + "step": 361550 + }, + { + "epoch": 0.3562336956093409, + "grad_norm": 2.393211841583252, + "learning_rate": 9.987663365313645e-06, + "loss": 2.8888, + "step": 361600 + }, + { + "epoch": 0.356282953587163, + "grad_norm": 2.5020511150360107, + "learning_rate": 9.987651926979456e-06, + "loss": 2.9762, + "step": 361650 + }, + { + "epoch": 0.35633221156498507, + "grad_norm": 2.3906404972076416, + "learning_rate": 9.987640483351557e-06, + "loss": 2.9536, + "step": 361700 + }, + { + "epoch": 0.35638146954280714, + "grad_norm": 2.4819223880767822, + "learning_rate": 9.987629034429962e-06, + "loss": 2.9712, + "step": 361750 + }, + { + "epoch": 0.35643072752062926, + "grad_norm": 2.050848960876465, + "learning_rate": 9.987617580214683e-06, + "loss": 2.9817, + "step": 361800 + }, + { + "epoch": 0.35647998549845133, + "grad_norm": 2.185004711151123, + "learning_rate": 9.98760612070573e-06, + "loss": 2.9406, + "step": 361850 + }, + { + "epoch": 0.3565292434762734, + "grad_norm": 2.222330093383789, + "learning_rate": 9.987594655903116e-06, + "loss": 2.9584, + "step": 361900 + }, + { + "epoch": 0.35657850145409553, + "grad_norm": 2.568225860595703, + "learning_rate": 9.987583185806854e-06, + "loss": 2.9547, + "step": 361950 + }, + { + "epoch": 0.3566277594319176, + 
"grad_norm": 2.1780872344970703, + "learning_rate": 9.987571710416956e-06, + "loss": 2.9979, + "step": 362000 + }, + { + "epoch": 0.35667701740973967, + "grad_norm": 2.386396884918213, + "learning_rate": 9.987560229733434e-06, + "loss": 2.9594, + "step": 362050 + }, + { + "epoch": 0.35672627538756174, + "grad_norm": 2.1966021060943604, + "learning_rate": 9.9875487437563e-06, + "loss": 2.9675, + "step": 362100 + }, + { + "epoch": 0.35677553336538387, + "grad_norm": 2.2054474353790283, + "learning_rate": 9.987537252485566e-06, + "loss": 2.9671, + "step": 362150 + }, + { + "epoch": 0.35682479134320594, + "grad_norm": 2.214151382446289, + "learning_rate": 9.987525755921246e-06, + "loss": 3.0101, + "step": 362200 + }, + { + "epoch": 0.356874049321028, + "grad_norm": 2.341277599334717, + "learning_rate": 9.987514254063349e-06, + "loss": 2.9643, + "step": 362250 + }, + { + "epoch": 0.35692330729885013, + "grad_norm": 2.472186326980591, + "learning_rate": 9.98750274691189e-06, + "loss": 2.9857, + "step": 362300 + }, + { + "epoch": 0.3569725652766722, + "grad_norm": 2.230236768722534, + "learning_rate": 9.98749123446688e-06, + "loss": 2.959, + "step": 362350 + }, + { + "epoch": 0.3570218232544943, + "grad_norm": 2.503843069076538, + "learning_rate": 9.98747971672833e-06, + "loss": 3.0442, + "step": 362400 + }, + { + "epoch": 0.3570710812323164, + "grad_norm": 2.2867438793182373, + "learning_rate": 9.987468193696255e-06, + "loss": 3.0416, + "step": 362450 + }, + { + "epoch": 0.35712033921013847, + "grad_norm": 2.368241786956787, + "learning_rate": 9.987456665370665e-06, + "loss": 2.9049, + "step": 362500 + }, + { + "epoch": 0.35716959718796054, + "grad_norm": 2.510207176208496, + "learning_rate": 9.987445131751574e-06, + "loss": 2.9384, + "step": 362550 + }, + { + "epoch": 0.35721885516578267, + "grad_norm": 2.4941651821136475, + "learning_rate": 9.987433592838992e-06, + "loss": 2.9454, + "step": 362600 + }, + { + "epoch": 0.35726811314360474, + "grad_norm": 
2.286174774169922, + "learning_rate": 9.987422048632934e-06, + "loss": 2.9053, + "step": 362650 + }, + { + "epoch": 0.3573173711214268, + "grad_norm": 2.355470657348633, + "learning_rate": 9.98741049913341e-06, + "loss": 2.9402, + "step": 362700 + }, + { + "epoch": 0.35736662909924893, + "grad_norm": 3.3021585941314697, + "learning_rate": 9.987398944340433e-06, + "loss": 2.908, + "step": 362750 + }, + { + "epoch": 0.357415887077071, + "grad_norm": 2.586327314376831, + "learning_rate": 9.987387384254016e-06, + "loss": 2.9315, + "step": 362800 + }, + { + "epoch": 0.3574651450548931, + "grad_norm": 2.2139227390289307, + "learning_rate": 9.987375818874169e-06, + "loss": 3.0591, + "step": 362850 + }, + { + "epoch": 0.3575144030327152, + "grad_norm": 2.154409885406494, + "learning_rate": 9.987364248200908e-06, + "loss": 3.0009, + "step": 362900 + }, + { + "epoch": 0.35756366101053727, + "grad_norm": 2.3353519439697266, + "learning_rate": 9.987352672234242e-06, + "loss": 3.0115, + "step": 362950 + }, + { + "epoch": 0.35761291898835934, + "grad_norm": 2.2507901191711426, + "learning_rate": 9.987341090974183e-06, + "loss": 3.0133, + "step": 363000 + }, + { + "epoch": 0.35766217696618147, + "grad_norm": 2.3147101402282715, + "learning_rate": 9.987329504420747e-06, + "loss": 3.0013, + "step": 363050 + }, + { + "epoch": 0.35771143494400354, + "grad_norm": 2.372318744659424, + "learning_rate": 9.987317912573944e-06, + "loss": 2.9555, + "step": 363100 + }, + { + "epoch": 0.3577606929218256, + "grad_norm": 2.5550198554992676, + "learning_rate": 9.987306315433785e-06, + "loss": 3.0199, + "step": 363150 + }, + { + "epoch": 0.35780995089964773, + "grad_norm": 2.389742136001587, + "learning_rate": 9.987294713000284e-06, + "loss": 2.9651, + "step": 363200 + }, + { + "epoch": 0.3578592088774698, + "grad_norm": 2.2752418518066406, + "learning_rate": 9.987283105273453e-06, + "loss": 2.9912, + "step": 363250 + }, + { + "epoch": 0.3579084668552919, + "grad_norm": 2.1332128047943115, + 
"learning_rate": 9.987271492253305e-06, + "loss": 3.0228, + "step": 363300 + }, + { + "epoch": 0.35795772483311394, + "grad_norm": 2.503787040710449, + "learning_rate": 9.987259873939851e-06, + "loss": 2.8871, + "step": 363350 + }, + { + "epoch": 0.35800698281093607, + "grad_norm": 2.158021926879883, + "learning_rate": 9.987248250333103e-06, + "loss": 2.9359, + "step": 363400 + }, + { + "epoch": 0.35805624078875814, + "grad_norm": 2.12276029586792, + "learning_rate": 9.987236621433076e-06, + "loss": 3.0293, + "step": 363450 + }, + { + "epoch": 0.3581054987665802, + "grad_norm": 2.621997833251953, + "learning_rate": 9.987224987239779e-06, + "loss": 2.9296, + "step": 363500 + }, + { + "epoch": 0.35815475674440234, + "grad_norm": 2.2211179733276367, + "learning_rate": 9.987213347753226e-06, + "loss": 2.8561, + "step": 363550 + }, + { + "epoch": 0.3582040147222244, + "grad_norm": 2.472642660140991, + "learning_rate": 9.987201702973432e-06, + "loss": 2.9218, + "step": 363600 + }, + { + "epoch": 0.3582532727000465, + "grad_norm": 2.535191059112549, + "learning_rate": 9.987190052900403e-06, + "loss": 2.9592, + "step": 363650 + }, + { + "epoch": 0.3583025306778686, + "grad_norm": 2.1980667114257812, + "learning_rate": 9.987178397534156e-06, + "loss": 2.9872, + "step": 363700 + }, + { + "epoch": 0.3583517886556907, + "grad_norm": 2.2821850776672363, + "learning_rate": 9.987166736874703e-06, + "loss": 2.9709, + "step": 363750 + }, + { + "epoch": 0.35840104663351274, + "grad_norm": 2.5591869354248047, + "learning_rate": 9.987155070922055e-06, + "loss": 2.9498, + "step": 363800 + }, + { + "epoch": 0.35845030461133487, + "grad_norm": 1.9769644737243652, + "learning_rate": 9.987143399676227e-06, + "loss": 2.9963, + "step": 363850 + }, + { + "epoch": 0.35849956258915694, + "grad_norm": 2.4309816360473633, + "learning_rate": 9.987131723137229e-06, + "loss": 2.9249, + "step": 363900 + }, + { + "epoch": 0.358548820566979, + "grad_norm": 2.3071231842041016, + "learning_rate": 
9.987120041305073e-06, + "loss": 2.9601, + "step": 363950 + }, + { + "epoch": 0.35859807854480114, + "grad_norm": 2.614656686782837, + "learning_rate": 9.987108354179773e-06, + "loss": 3.0029, + "step": 364000 + }, + { + "epoch": 0.3586473365226232, + "grad_norm": 2.2864067554473877, + "learning_rate": 9.987096661761341e-06, + "loss": 2.9195, + "step": 364050 + }, + { + "epoch": 0.3586965945004453, + "grad_norm": 2.4196524620056152, + "learning_rate": 9.987084964049787e-06, + "loss": 2.9755, + "step": 364100 + }, + { + "epoch": 0.3587458524782674, + "grad_norm": 2.383248805999756, + "learning_rate": 9.987073261045128e-06, + "loss": 2.9514, + "step": 364150 + }, + { + "epoch": 0.35879511045608947, + "grad_norm": 2.295335054397583, + "learning_rate": 9.987061552747374e-06, + "loss": 2.8639, + "step": 364200 + }, + { + "epoch": 0.35884436843391154, + "grad_norm": 2.1179449558258057, + "learning_rate": 9.987049839156535e-06, + "loss": 2.8941, + "step": 364250 + }, + { + "epoch": 0.35889362641173367, + "grad_norm": 2.1878271102905273, + "learning_rate": 9.987038120272627e-06, + "loss": 2.8847, + "step": 364300 + }, + { + "epoch": 0.35894288438955574, + "grad_norm": 2.256791591644287, + "learning_rate": 9.987026396095663e-06, + "loss": 3.0081, + "step": 364350 + }, + { + "epoch": 0.3589921423673778, + "grad_norm": 2.2335755825042725, + "learning_rate": 9.987014666625652e-06, + "loss": 2.9417, + "step": 364400 + }, + { + "epoch": 0.35904140034519993, + "grad_norm": 2.2680416107177734, + "learning_rate": 9.987002931862609e-06, + "loss": 2.9282, + "step": 364450 + }, + { + "epoch": 0.359090658323022, + "grad_norm": 2.2982795238494873, + "learning_rate": 9.986991191806545e-06, + "loss": 2.9667, + "step": 364500 + }, + { + "epoch": 0.3591399163008441, + "grad_norm": 2.4332163333892822, + "learning_rate": 9.986979446457474e-06, + "loss": 3.01, + "step": 364550 + }, + { + "epoch": 0.35918917427866615, + "grad_norm": 2.370990037918091, + "learning_rate": 9.986967695815405e-06, + 
"loss": 2.9812, + "step": 364600 + }, + { + "epoch": 0.35923843225648827, + "grad_norm": 2.5364742279052734, + "learning_rate": 9.986955939880355e-06, + "loss": 2.945, + "step": 364650 + }, + { + "epoch": 0.35928769023431034, + "grad_norm": 2.8644158840179443, + "learning_rate": 9.986944178652336e-06, + "loss": 2.9634, + "step": 364700 + }, + { + "epoch": 0.3593369482121324, + "grad_norm": 2.2953004837036133, + "learning_rate": 9.986932412131356e-06, + "loss": 2.8936, + "step": 364750 + }, + { + "epoch": 0.35938620618995454, + "grad_norm": 2.4109456539154053, + "learning_rate": 9.986920640317432e-06, + "loss": 2.8999, + "step": 364800 + }, + { + "epoch": 0.3594354641677766, + "grad_norm": 2.4547135829925537, + "learning_rate": 9.986908863210576e-06, + "loss": 2.8984, + "step": 364850 + }, + { + "epoch": 0.3594847221455987, + "grad_norm": 2.2411201000213623, + "learning_rate": 9.986897080810796e-06, + "loss": 3.0263, + "step": 364900 + }, + { + "epoch": 0.3595339801234208, + "grad_norm": 2.384784460067749, + "learning_rate": 9.986885293118111e-06, + "loss": 2.9623, + "step": 364950 + }, + { + "epoch": 0.3595832381012429, + "grad_norm": 2.492511034011841, + "learning_rate": 9.98687350013253e-06, + "loss": 2.9564, + "step": 365000 + }, + { + "epoch": 0.35963249607906494, + "grad_norm": 2.6095516681671143, + "learning_rate": 9.986861701854066e-06, + "loss": 2.9609, + "step": 365050 + }, + { + "epoch": 0.35968175405688707, + "grad_norm": 2.2681164741516113, + "learning_rate": 9.986849898282731e-06, + "loss": 2.9927, + "step": 365100 + }, + { + "epoch": 0.35973101203470914, + "grad_norm": 2.4563984870910645, + "learning_rate": 9.986838089418539e-06, + "loss": 2.9934, + "step": 365150 + }, + { + "epoch": 0.3597802700125312, + "grad_norm": 2.186204671859741, + "learning_rate": 9.9868262752615e-06, + "loss": 2.9799, + "step": 365200 + }, + { + "epoch": 0.35982952799035334, + "grad_norm": 2.279505729675293, + "learning_rate": 9.98681445581163e-06, + "loss": 2.9062, + "step": 
365250 + }, + { + "epoch": 0.3598787859681754, + "grad_norm": 2.197453260421753, + "learning_rate": 9.986802631068937e-06, + "loss": 2.93, + "step": 365300 + }, + { + "epoch": 0.3599280439459975, + "grad_norm": 2.3760247230529785, + "learning_rate": 9.986790801033439e-06, + "loss": 2.9809, + "step": 365350 + }, + { + "epoch": 0.3599773019238196, + "grad_norm": 2.4089417457580566, + "learning_rate": 9.986778965705145e-06, + "loss": 2.9856, + "step": 365400 + }, + { + "epoch": 0.3600265599016417, + "grad_norm": 2.1843948364257812, + "learning_rate": 9.986767125084068e-06, + "loss": 2.9609, + "step": 365450 + }, + { + "epoch": 0.36007581787946374, + "grad_norm": 2.1791772842407227, + "learning_rate": 9.98675527917022e-06, + "loss": 3.0356, + "step": 365500 + }, + { + "epoch": 0.36012507585728587, + "grad_norm": 2.2925949096679688, + "learning_rate": 9.986743427963617e-06, + "loss": 2.9506, + "step": 365550 + }, + { + "epoch": 0.36017433383510794, + "grad_norm": 2.3661959171295166, + "learning_rate": 9.986731571464266e-06, + "loss": 2.9346, + "step": 365600 + }, + { + "epoch": 0.36022359181293, + "grad_norm": 2.3750007152557373, + "learning_rate": 9.986719709672183e-06, + "loss": 2.9629, + "step": 365650 + }, + { + "epoch": 0.36027284979075214, + "grad_norm": 2.163908004760742, + "learning_rate": 9.986707842587383e-06, + "loss": 2.9306, + "step": 365700 + }, + { + "epoch": 0.3603221077685742, + "grad_norm": 2.4724814891815186, + "learning_rate": 9.986695970209873e-06, + "loss": 2.9844, + "step": 365750 + }, + { + "epoch": 0.3603713657463963, + "grad_norm": 2.3490304946899414, + "learning_rate": 9.98668409253967e-06, + "loss": 3.0254, + "step": 365800 + }, + { + "epoch": 0.36042062372421835, + "grad_norm": 2.229485273361206, + "learning_rate": 9.986672209576784e-06, + "loss": 2.9513, + "step": 365850 + }, + { + "epoch": 0.3604698817020405, + "grad_norm": 2.306624412536621, + "learning_rate": 9.98666032132123e-06, + "loss": 3.0177, + "step": 365900 + }, + { + "epoch": 
0.36051913967986254, + "grad_norm": 2.2640554904937744, + "learning_rate": 9.986648427773019e-06, + "loss": 2.9233, + "step": 365950 + }, + { + "epoch": 0.3605683976576846, + "grad_norm": 2.309554100036621, + "learning_rate": 9.986636528932162e-06, + "loss": 2.8838, + "step": 366000 + }, + { + "epoch": 0.36061765563550674, + "grad_norm": 2.2442033290863037, + "learning_rate": 9.986624624798676e-06, + "loss": 2.9352, + "step": 366050 + }, + { + "epoch": 0.3606669136133288, + "grad_norm": 2.1522607803344727, + "learning_rate": 9.986612715372568e-06, + "loss": 2.9641, + "step": 366100 + }, + { + "epoch": 0.3607161715911509, + "grad_norm": 2.265014886856079, + "learning_rate": 9.986600800653857e-06, + "loss": 2.8896, + "step": 366150 + }, + { + "epoch": 0.360765429568973, + "grad_norm": 2.417192220687866, + "learning_rate": 9.986588880642551e-06, + "loss": 2.97, + "step": 366200 + }, + { + "epoch": 0.3608146875467951, + "grad_norm": 2.3026037216186523, + "learning_rate": 9.986576955338666e-06, + "loss": 2.9248, + "step": 366250 + }, + { + "epoch": 0.36086394552461715, + "grad_norm": 2.2389063835144043, + "learning_rate": 9.98656502474221e-06, + "loss": 3.057, + "step": 366300 + }, + { + "epoch": 0.3609132035024393, + "grad_norm": 2.346858263015747, + "learning_rate": 9.986553088853199e-06, + "loss": 3.041, + "step": 366350 + }, + { + "epoch": 0.36096246148026134, + "grad_norm": 2.509387493133545, + "learning_rate": 9.986541147671647e-06, + "loss": 2.9944, + "step": 366400 + }, + { + "epoch": 0.3610117194580834, + "grad_norm": 2.3504912853240967, + "learning_rate": 9.986529201197563e-06, + "loss": 2.971, + "step": 366450 + }, + { + "epoch": 0.36106097743590554, + "grad_norm": 2.295961380004883, + "learning_rate": 9.98651724943096e-06, + "loss": 2.8939, + "step": 366500 + }, + { + "epoch": 0.3611102354137276, + "grad_norm": 2.0386202335357666, + "learning_rate": 9.986505292371855e-06, + "loss": 2.925, + "step": 366550 + }, + { + "epoch": 0.3611594933915497, + 
"grad_norm": 2.1347155570983887, + "learning_rate": 9.986493330020256e-06, + "loss": 2.9795, + "step": 366600 + }, + { + "epoch": 0.3612087513693718, + "grad_norm": 2.1900064945220947, + "learning_rate": 9.986481362376178e-06, + "loss": 3.007, + "step": 366650 + }, + { + "epoch": 0.3612580093471939, + "grad_norm": 2.499217987060547, + "learning_rate": 9.986469389439634e-06, + "loss": 2.9398, + "step": 366700 + }, + { + "epoch": 0.36130726732501595, + "grad_norm": 2.3247182369232178, + "learning_rate": 9.986457411210636e-06, + "loss": 2.9411, + "step": 366750 + }, + { + "epoch": 0.36135652530283807, + "grad_norm": 2.3710410594940186, + "learning_rate": 9.986445427689195e-06, + "loss": 2.9323, + "step": 366800 + }, + { + "epoch": 0.36140578328066014, + "grad_norm": 2.4857873916625977, + "learning_rate": 9.986433438875326e-06, + "loss": 2.9854, + "step": 366850 + }, + { + "epoch": 0.3614550412584822, + "grad_norm": 2.5338656902313232, + "learning_rate": 9.986421444769042e-06, + "loss": 2.932, + "step": 366900 + }, + { + "epoch": 0.36150429923630434, + "grad_norm": 2.296097755432129, + "learning_rate": 9.986409445370355e-06, + "loss": 2.9773, + "step": 366950 + }, + { + "epoch": 0.3615535572141264, + "grad_norm": 2.2456724643707275, + "learning_rate": 9.986397440679275e-06, + "loss": 2.9337, + "step": 367000 + }, + { + "epoch": 0.3616028151919485, + "grad_norm": 2.162834405899048, + "learning_rate": 9.986385430695819e-06, + "loss": 2.9935, + "step": 367050 + }, + { + "epoch": 0.36165207316977055, + "grad_norm": 2.3892998695373535, + "learning_rate": 9.986373415419998e-06, + "loss": 2.9845, + "step": 367100 + }, + { + "epoch": 0.3617013311475927, + "grad_norm": 2.4545092582702637, + "learning_rate": 9.986361394851825e-06, + "loss": 2.9134, + "step": 367150 + }, + { + "epoch": 0.36175058912541475, + "grad_norm": 2.2731893062591553, + "learning_rate": 9.986349368991313e-06, + "loss": 3.0407, + "step": 367200 + }, + { + "epoch": 0.3617998471032368, + "grad_norm": 
2.473620891571045, + "learning_rate": 9.986337337838473e-06, + "loss": 3.0378, + "step": 367250 + }, + { + "epoch": 0.36184910508105894, + "grad_norm": 2.3677704334259033, + "learning_rate": 9.98632530139332e-06, + "loss": 2.9873, + "step": 367300 + }, + { + "epoch": 0.361898363058881, + "grad_norm": 2.3013105392456055, + "learning_rate": 9.986313259655865e-06, + "loss": 2.967, + "step": 367350 + }, + { + "epoch": 0.3619476210367031, + "grad_norm": 2.243058443069458, + "learning_rate": 9.986301212626123e-06, + "loss": 2.9289, + "step": 367400 + }, + { + "epoch": 0.3619968790145252, + "grad_norm": 2.2675092220306396, + "learning_rate": 9.986289160304104e-06, + "loss": 2.9242, + "step": 367450 + }, + { + "epoch": 0.3620461369923473, + "grad_norm": 2.5041918754577637, + "learning_rate": 9.986277102689823e-06, + "loss": 2.999, + "step": 367500 + }, + { + "epoch": 0.36209539497016935, + "grad_norm": 2.413787603378296, + "learning_rate": 9.986265039783292e-06, + "loss": 2.9146, + "step": 367550 + }, + { + "epoch": 0.3621446529479915, + "grad_norm": 2.516357660293579, + "learning_rate": 9.986252971584522e-06, + "loss": 3.0032, + "step": 367600 + }, + { + "epoch": 0.36219391092581354, + "grad_norm": 2.228656768798828, + "learning_rate": 9.986240898093529e-06, + "loss": 2.8708, + "step": 367650 + }, + { + "epoch": 0.3622431689036356, + "grad_norm": 2.2819879055023193, + "learning_rate": 9.986228819310326e-06, + "loss": 2.9564, + "step": 367700 + }, + { + "epoch": 0.36229242688145774, + "grad_norm": 2.2887861728668213, + "learning_rate": 9.986216735234922e-06, + "loss": 2.9807, + "step": 367750 + }, + { + "epoch": 0.3623416848592798, + "grad_norm": 2.261397123336792, + "learning_rate": 9.986204645867332e-06, + "loss": 2.9571, + "step": 367800 + }, + { + "epoch": 0.3623909428371019, + "grad_norm": 2.3557450771331787, + "learning_rate": 9.98619255120757e-06, + "loss": 2.9528, + "step": 367850 + }, + { + "epoch": 0.362440200814924, + "grad_norm": 2.3293778896331787, + 
"learning_rate": 9.986180451255649e-06, + "loss": 2.9899, + "step": 367900 + }, + { + "epoch": 0.3624894587927461, + "grad_norm": 2.6079020500183105, + "learning_rate": 9.986168346011577e-06, + "loss": 2.9645, + "step": 367950 + }, + { + "epoch": 0.36253871677056815, + "grad_norm": 2.277489185333252, + "learning_rate": 9.986156235475372e-06, + "loss": 2.9775, + "step": 368000 + }, + { + "epoch": 0.3625879747483903, + "grad_norm": 2.3319947719573975, + "learning_rate": 9.986144119647047e-06, + "loss": 2.9454, + "step": 368050 + }, + { + "epoch": 0.36263723272621234, + "grad_norm": 2.243565320968628, + "learning_rate": 9.986131998526612e-06, + "loss": 2.9363, + "step": 368100 + }, + { + "epoch": 0.3626864907040344, + "grad_norm": 2.294163465499878, + "learning_rate": 9.98611987211408e-06, + "loss": 2.9562, + "step": 368150 + }, + { + "epoch": 0.36273574868185654, + "grad_norm": 2.332155227661133, + "learning_rate": 9.986107740409466e-06, + "loss": 2.9653, + "step": 368200 + }, + { + "epoch": 0.3627850066596786, + "grad_norm": 2.35566782951355, + "learning_rate": 9.986095603412782e-06, + "loss": 2.9554, + "step": 368250 + }, + { + "epoch": 0.3628342646375007, + "grad_norm": 2.2926464080810547, + "learning_rate": 9.98608346112404e-06, + "loss": 2.9571, + "step": 368300 + }, + { + "epoch": 0.36288352261532275, + "grad_norm": 2.4502904415130615, + "learning_rate": 9.986071313543253e-06, + "loss": 3.0489, + "step": 368350 + }, + { + "epoch": 0.3629327805931449, + "grad_norm": 2.2514894008636475, + "learning_rate": 9.986059160670436e-06, + "loss": 2.9877, + "step": 368400 + }, + { + "epoch": 0.36298203857096695, + "grad_norm": 2.3496105670928955, + "learning_rate": 9.986047002505599e-06, + "loss": 2.978, + "step": 368450 + }, + { + "epoch": 0.363031296548789, + "grad_norm": 2.636286735534668, + "learning_rate": 9.986034839048757e-06, + "loss": 2.9124, + "step": 368500 + }, + { + "epoch": 0.36308055452661114, + "grad_norm": 2.268375873565674, + "learning_rate": 
9.986022670299922e-06, + "loss": 2.9387, + "step": 368550 + }, + { + "epoch": 0.3631298125044332, + "grad_norm": 2.1338846683502197, + "learning_rate": 9.986010496259107e-06, + "loss": 2.9319, + "step": 368600 + }, + { + "epoch": 0.3631790704822553, + "grad_norm": 2.427481174468994, + "learning_rate": 9.985998316926324e-06, + "loss": 2.9621, + "step": 368650 + }, + { + "epoch": 0.3632283284600774, + "grad_norm": 2.7243635654449463, + "learning_rate": 9.985986132301588e-06, + "loss": 2.9761, + "step": 368700 + }, + { + "epoch": 0.3632775864378995, + "grad_norm": 2.272566080093384, + "learning_rate": 9.98597394238491e-06, + "loss": 2.912, + "step": 368750 + }, + { + "epoch": 0.36332684441572155, + "grad_norm": 2.305267095565796, + "learning_rate": 9.985961747176305e-06, + "loss": 2.9062, + "step": 368800 + }, + { + "epoch": 0.3633761023935437, + "grad_norm": 2.4928536415100098, + "learning_rate": 9.985949546675784e-06, + "loss": 2.9573, + "step": 368850 + }, + { + "epoch": 0.36342536037136575, + "grad_norm": 2.3475279808044434, + "learning_rate": 9.98593734088336e-06, + "loss": 3.0469, + "step": 368900 + }, + { + "epoch": 0.3634746183491878, + "grad_norm": 2.2307870388031006, + "learning_rate": 9.985925129799046e-06, + "loss": 2.9711, + "step": 368950 + }, + { + "epoch": 0.36352387632700994, + "grad_norm": 2.524484395980835, + "learning_rate": 9.98591291342286e-06, + "loss": 2.964, + "step": 369000 + }, + { + "epoch": 0.363573134304832, + "grad_norm": 2.4868838787078857, + "learning_rate": 9.985900691754805e-06, + "loss": 2.9731, + "step": 369050 + }, + { + "epoch": 0.3636223922826541, + "grad_norm": 2.344097137451172, + "learning_rate": 9.985888464794902e-06, + "loss": 2.9503, + "step": 369100 + }, + { + "epoch": 0.3636716502604762, + "grad_norm": 2.3688931465148926, + "learning_rate": 9.985876232543161e-06, + "loss": 2.9353, + "step": 369150 + }, + { + "epoch": 0.3637209082382983, + "grad_norm": 2.55981183052063, + "learning_rate": 9.985863994999597e-06, + "loss": 
3.0133, + "step": 369200 + }, + { + "epoch": 0.36377016621612035, + "grad_norm": 2.342165470123291, + "learning_rate": 9.98585175216422e-06, + "loss": 2.9629, + "step": 369250 + }, + { + "epoch": 0.3638194241939425, + "grad_norm": 2.3682823181152344, + "learning_rate": 9.985839504037044e-06, + "loss": 2.9878, + "step": 369300 + }, + { + "epoch": 0.36386868217176455, + "grad_norm": 2.286450147628784, + "learning_rate": 9.985827250618083e-06, + "loss": 2.9082, + "step": 369350 + }, + { + "epoch": 0.3639179401495866, + "grad_norm": 2.179332971572876, + "learning_rate": 9.98581499190735e-06, + "loss": 2.9408, + "step": 369400 + }, + { + "epoch": 0.36396719812740874, + "grad_norm": 2.163161277770996, + "learning_rate": 9.985802727904857e-06, + "loss": 2.8897, + "step": 369450 + }, + { + "epoch": 0.3640164561052308, + "grad_norm": 2.451552152633667, + "learning_rate": 9.985790458610617e-06, + "loss": 2.9046, + "step": 369500 + }, + { + "epoch": 0.3640657140830529, + "grad_norm": 2.3201611042022705, + "learning_rate": 9.985778184024645e-06, + "loss": 2.9703, + "step": 369550 + }, + { + "epoch": 0.36411497206087495, + "grad_norm": 2.236283302307129, + "learning_rate": 9.985765904146952e-06, + "loss": 2.9671, + "step": 369600 + }, + { + "epoch": 0.3641642300386971, + "grad_norm": 2.3461666107177734, + "learning_rate": 9.985753618977549e-06, + "loss": 3.0061, + "step": 369650 + }, + { + "epoch": 0.36421348801651915, + "grad_norm": 2.5266242027282715, + "learning_rate": 9.985741328516455e-06, + "loss": 2.9636, + "step": 369700 + }, + { + "epoch": 0.3642627459943412, + "grad_norm": 2.3190064430236816, + "learning_rate": 9.985729032763678e-06, + "loss": 3.0236, + "step": 369750 + }, + { + "epoch": 0.36431200397216335, + "grad_norm": 2.3034493923187256, + "learning_rate": 9.985716731719234e-06, + "loss": 2.9802, + "step": 369800 + }, + { + "epoch": 0.3643612619499854, + "grad_norm": 2.4126546382904053, + "learning_rate": 9.985704425383134e-06, + "loss": 2.8808, + "step": 369850 
+ }, + { + "epoch": 0.3644105199278075, + "grad_norm": 2.2490317821502686, + "learning_rate": 9.985692113755391e-06, + "loss": 2.9698, + "step": 369900 + }, + { + "epoch": 0.3644597779056296, + "grad_norm": 2.1654293537139893, + "learning_rate": 9.98567979683602e-06, + "loss": 2.9584, + "step": 369950 + }, + { + "epoch": 0.3645090358834517, + "grad_norm": 2.415447235107422, + "learning_rate": 9.985667474625034e-06, + "loss": 2.9243, + "step": 370000 + }, + { + "epoch": 0.36455829386127375, + "grad_norm": 2.400291919708252, + "learning_rate": 9.985655147122442e-06, + "loss": 2.9454, + "step": 370050 + }, + { + "epoch": 0.3646075518390959, + "grad_norm": 2.5096678733825684, + "learning_rate": 9.985642814328263e-06, + "loss": 3.0104, + "step": 370100 + }, + { + "epoch": 0.36465680981691795, + "grad_norm": 2.4434659481048584, + "learning_rate": 9.985630476242504e-06, + "loss": 2.8913, + "step": 370150 + }, + { + "epoch": 0.36470606779474, + "grad_norm": 2.177313804626465, + "learning_rate": 9.985618132865184e-06, + "loss": 2.9738, + "step": 370200 + }, + { + "epoch": 0.36475532577256214, + "grad_norm": 2.4110448360443115, + "learning_rate": 9.985605784196311e-06, + "loss": 2.9989, + "step": 370250 + }, + { + "epoch": 0.3648045837503842, + "grad_norm": 2.401695489883423, + "learning_rate": 9.985593430235904e-06, + "loss": 2.9327, + "step": 370300 + }, + { + "epoch": 0.3648538417282063, + "grad_norm": 2.3993358612060547, + "learning_rate": 9.98558107098397e-06, + "loss": 2.892, + "step": 370350 + }, + { + "epoch": 0.3649030997060284, + "grad_norm": 2.3527796268463135, + "learning_rate": 9.985568706440525e-06, + "loss": 2.9539, + "step": 370400 + }, + { + "epoch": 0.3649523576838505, + "grad_norm": 2.3178253173828125, + "learning_rate": 9.985556336605583e-06, + "loss": 2.9045, + "step": 370450 + }, + { + "epoch": 0.36500161566167255, + "grad_norm": 2.186147451400757, + "learning_rate": 9.985543961479156e-06, + "loss": 2.9434, + "step": 370500 + }, + { + "epoch": 
0.3650508736394947, + "grad_norm": 2.193699836730957, + "learning_rate": 9.985531581061255e-06, + "loss": 2.9492, + "step": 370550 + }, + { + "epoch": 0.36510013161731675, + "grad_norm": 2.316592216491699, + "learning_rate": 9.985519195351898e-06, + "loss": 2.9169, + "step": 370600 + }, + { + "epoch": 0.3651493895951388, + "grad_norm": 2.2591137886047363, + "learning_rate": 9.985506804351094e-06, + "loss": 2.9958, + "step": 370650 + }, + { + "epoch": 0.36519864757296094, + "grad_norm": 2.52751088142395, + "learning_rate": 9.985494408058857e-06, + "loss": 2.9704, + "step": 370700 + }, + { + "epoch": 0.365247905550783, + "grad_norm": 2.3038623332977295, + "learning_rate": 9.985482006475201e-06, + "loss": 2.9745, + "step": 370750 + }, + { + "epoch": 0.3652971635286051, + "grad_norm": 2.0761172771453857, + "learning_rate": 9.98546959960014e-06, + "loss": 2.9095, + "step": 370800 + }, + { + "epoch": 0.36534642150642715, + "grad_norm": 2.328183889389038, + "learning_rate": 9.985457187433685e-06, + "loss": 2.9857, + "step": 370850 + }, + { + "epoch": 0.3653956794842493, + "grad_norm": 2.258840560913086, + "learning_rate": 9.985444769975852e-06, + "loss": 3.008, + "step": 370900 + }, + { + "epoch": 0.36544493746207135, + "grad_norm": 2.78092098236084, + "learning_rate": 9.98543234722665e-06, + "loss": 3.0109, + "step": 370950 + }, + { + "epoch": 0.3654941954398934, + "grad_norm": 2.2364814281463623, + "learning_rate": 9.985419919186097e-06, + "loss": 2.9273, + "step": 371000 + }, + { + "epoch": 0.36554345341771555, + "grad_norm": 2.4338488578796387, + "learning_rate": 9.9854074858542e-06, + "loss": 2.9747, + "step": 371050 + }, + { + "epoch": 0.3655927113955376, + "grad_norm": 2.2652275562286377, + "learning_rate": 9.98539504723098e-06, + "loss": 3.0107, + "step": 371100 + }, + { + "epoch": 0.3656419693733597, + "grad_norm": 2.472808837890625, + "learning_rate": 9.985382603316445e-06, + "loss": 2.9147, + "step": 371150 + }, + { + "epoch": 0.3656912273511818, + "grad_norm": 
2.108581304550171, + "learning_rate": 9.98537015411061e-06, + "loss": 3.007, + "step": 371200 + }, + { + "epoch": 0.3657404853290039, + "grad_norm": 2.147116184234619, + "learning_rate": 9.985357699613485e-06, + "loss": 2.917, + "step": 371250 + }, + { + "epoch": 0.36578974330682595, + "grad_norm": 2.3535375595092773, + "learning_rate": 9.985345239825087e-06, + "loss": 2.9482, + "step": 371300 + }, + { + "epoch": 0.3658390012846481, + "grad_norm": 2.345290184020996, + "learning_rate": 9.985332774745428e-06, + "loss": 2.9712, + "step": 371350 + }, + { + "epoch": 0.36588825926247015, + "grad_norm": 2.2744669914245605, + "learning_rate": 9.985320304374522e-06, + "loss": 2.9224, + "step": 371400 + }, + { + "epoch": 0.3659375172402922, + "grad_norm": 2.3576161861419678, + "learning_rate": 9.985307828712382e-06, + "loss": 2.9867, + "step": 371450 + }, + { + "epoch": 0.36598677521811435, + "grad_norm": 2.1783623695373535, + "learning_rate": 9.98529534775902e-06, + "loss": 2.9896, + "step": 371500 + }, + { + "epoch": 0.3660360331959364, + "grad_norm": 2.198509693145752, + "learning_rate": 9.98528286151445e-06, + "loss": 3.0207, + "step": 371550 + }, + { + "epoch": 0.3660852911737585, + "grad_norm": 2.1839842796325684, + "learning_rate": 9.985270369978684e-06, + "loss": 2.9209, + "step": 371600 + }, + { + "epoch": 0.3661345491515806, + "grad_norm": 2.2869932651519775, + "learning_rate": 9.98525787315174e-06, + "loss": 2.9513, + "step": 371650 + }, + { + "epoch": 0.3661838071294027, + "grad_norm": 2.234440326690674, + "learning_rate": 9.985245371033625e-06, + "loss": 3.0109, + "step": 371700 + }, + { + "epoch": 0.36623306510722475, + "grad_norm": 2.219343900680542, + "learning_rate": 9.985232863624356e-06, + "loss": 2.9843, + "step": 371750 + }, + { + "epoch": 0.3662823230850469, + "grad_norm": 2.0281782150268555, + "learning_rate": 9.985220350923944e-06, + "loss": 2.9324, + "step": 371800 + }, + { + "epoch": 0.36633158106286895, + "grad_norm": 2.2894396781921387, + 
"learning_rate": 9.985207832932405e-06, + "loss": 2.9609, + "step": 371850 + }, + { + "epoch": 0.366380839040691, + "grad_norm": 2.1873865127563477, + "learning_rate": 9.98519530964975e-06, + "loss": 2.9367, + "step": 371900 + }, + { + "epoch": 0.36643009701851315, + "grad_norm": 2.3196494579315186, + "learning_rate": 9.985182781075994e-06, + "loss": 2.9752, + "step": 371950 + }, + { + "epoch": 0.3664793549963352, + "grad_norm": 2.3310279846191406, + "learning_rate": 9.985170247211148e-06, + "loss": 2.9706, + "step": 372000 + }, + { + "epoch": 0.3665286129741573, + "grad_norm": 2.1667609214782715, + "learning_rate": 9.985157708055228e-06, + "loss": 2.9831, + "step": 372050 + }, + { + "epoch": 0.36657787095197936, + "grad_norm": 2.3297908306121826, + "learning_rate": 9.985145163608246e-06, + "loss": 2.9683, + "step": 372100 + }, + { + "epoch": 0.3666271289298015, + "grad_norm": 2.489253520965576, + "learning_rate": 9.985132613870216e-06, + "loss": 2.8663, + "step": 372150 + }, + { + "epoch": 0.36667638690762355, + "grad_norm": 2.3462092876434326, + "learning_rate": 9.98512005884115e-06, + "loss": 2.9662, + "step": 372200 + }, + { + "epoch": 0.3667256448854456, + "grad_norm": 2.2548539638519287, + "learning_rate": 9.985107498521061e-06, + "loss": 2.8724, + "step": 372250 + }, + { + "epoch": 0.36677490286326775, + "grad_norm": 2.333889961242676, + "learning_rate": 9.985094932909964e-06, + "loss": 2.934, + "step": 372300 + }, + { + "epoch": 0.3668241608410898, + "grad_norm": 2.2352466583251953, + "learning_rate": 9.985082362007873e-06, + "loss": 2.9945, + "step": 372350 + }, + { + "epoch": 0.3668734188189119, + "grad_norm": 2.476855754852295, + "learning_rate": 9.985069785814799e-06, + "loss": 2.9613, + "step": 372400 + }, + { + "epoch": 0.366922676796734, + "grad_norm": 2.2411882877349854, + "learning_rate": 9.985057204330757e-06, + "loss": 2.994, + "step": 372450 + }, + { + "epoch": 0.3669719347745561, + "grad_norm": 2.365665912628174, + "learning_rate": 
9.985044617555759e-06, + "loss": 2.9658, + "step": 372500 + }, + { + "epoch": 0.36702119275237816, + "grad_norm": 2.410503387451172, + "learning_rate": 9.98503202548982e-06, + "loss": 2.9761, + "step": 372550 + }, + { + "epoch": 0.3670704507302003, + "grad_norm": 2.296252727508545, + "learning_rate": 9.985019428132951e-06, + "loss": 3.0214, + "step": 372600 + }, + { + "epoch": 0.36711970870802235, + "grad_norm": 2.4165725708007812, + "learning_rate": 9.98500682548517e-06, + "loss": 2.9707, + "step": 372650 + }, + { + "epoch": 0.3671689666858444, + "grad_norm": 2.514622211456299, + "learning_rate": 9.984994217546484e-06, + "loss": 2.9524, + "step": 372700 + }, + { + "epoch": 0.36721822466366655, + "grad_norm": 2.3531386852264404, + "learning_rate": 9.98498160431691e-06, + "loss": 2.9862, + "step": 372750 + }, + { + "epoch": 0.3672674826414886, + "grad_norm": 2.316027879714966, + "learning_rate": 9.984968985796462e-06, + "loss": 3.0338, + "step": 372800 + }, + { + "epoch": 0.3673167406193107, + "grad_norm": 2.230767011642456, + "learning_rate": 9.984956361985152e-06, + "loss": 2.9049, + "step": 372850 + }, + { + "epoch": 0.3673659985971328, + "grad_norm": 2.3280653953552246, + "learning_rate": 9.984943732882993e-06, + "loss": 2.9116, + "step": 372900 + }, + { + "epoch": 0.3674152565749549, + "grad_norm": 2.170419454574585, + "learning_rate": 9.98493109849e-06, + "loss": 2.9143, + "step": 372950 + }, + { + "epoch": 0.36746451455277696, + "grad_norm": 2.414006233215332, + "learning_rate": 9.984918458806186e-06, + "loss": 2.9215, + "step": 373000 + }, + { + "epoch": 0.3675137725305991, + "grad_norm": 2.1818084716796875, + "learning_rate": 9.984905813831563e-06, + "loss": 2.9626, + "step": 373050 + }, + { + "epoch": 0.36756303050842115, + "grad_norm": 2.2205467224121094, + "learning_rate": 9.984893163566147e-06, + "loss": 2.9607, + "step": 373100 + }, + { + "epoch": 0.3676122884862432, + "grad_norm": 2.4412145614624023, + "learning_rate": 9.98488050800995e-06, + "loss": 
2.9775, + "step": 373150 + }, + { + "epoch": 0.36766154646406535, + "grad_norm": 2.4067630767822266, + "learning_rate": 9.984867847162983e-06, + "loss": 2.9083, + "step": 373200 + }, + { + "epoch": 0.3677108044418874, + "grad_norm": 2.3929998874664307, + "learning_rate": 9.984855181025263e-06, + "loss": 2.9626, + "step": 373250 + }, + { + "epoch": 0.3677600624197095, + "grad_norm": 2.305105447769165, + "learning_rate": 9.984842509596803e-06, + "loss": 2.9716, + "step": 373300 + }, + { + "epoch": 0.36780932039753156, + "grad_norm": 2.321188449859619, + "learning_rate": 9.984829832877613e-06, + "loss": 3.0035, + "step": 373350 + }, + { + "epoch": 0.3678585783753537, + "grad_norm": 2.369049310684204, + "learning_rate": 9.984817150867711e-06, + "loss": 2.9729, + "step": 373400 + }, + { + "epoch": 0.36790783635317575, + "grad_norm": 2.4235341548919678, + "learning_rate": 9.98480446356711e-06, + "loss": 2.9881, + "step": 373450 + }, + { + "epoch": 0.3679570943309978, + "grad_norm": 2.2693064212799072, + "learning_rate": 9.98479177097582e-06, + "loss": 2.9061, + "step": 373500 + }, + { + "epoch": 0.36800635230881995, + "grad_norm": 2.517242670059204, + "learning_rate": 9.984779073093857e-06, + "loss": 2.9807, + "step": 373550 + }, + { + "epoch": 0.368055610286642, + "grad_norm": 2.0748016834259033, + "learning_rate": 9.984766369921233e-06, + "loss": 2.9643, + "step": 373600 + }, + { + "epoch": 0.3681048682644641, + "grad_norm": 2.199024200439453, + "learning_rate": 9.984753661457964e-06, + "loss": 2.9868, + "step": 373650 + }, + { + "epoch": 0.3681541262422862, + "grad_norm": 2.3255419731140137, + "learning_rate": 9.98474094770406e-06, + "loss": 2.9676, + "step": 373700 + }, + { + "epoch": 0.3682033842201083, + "grad_norm": 2.3928890228271484, + "learning_rate": 9.984728228659539e-06, + "loss": 2.9904, + "step": 373750 + }, + { + "epoch": 0.36825264219793036, + "grad_norm": 2.1849429607391357, + "learning_rate": 9.984715504324412e-06, + "loss": 2.9982, + "step": 373800 + 
}, + { + "epoch": 0.3683019001757525, + "grad_norm": 2.254878520965576, + "learning_rate": 9.984702774698688e-06, + "loss": 2.9344, + "step": 373850 + }, + { + "epoch": 0.36835115815357455, + "grad_norm": 2.37884783744812, + "learning_rate": 9.984690039782389e-06, + "loss": 2.946, + "step": 373900 + }, + { + "epoch": 0.3684004161313966, + "grad_norm": 2.3156776428222656, + "learning_rate": 9.984677299575524e-06, + "loss": 2.9555, + "step": 373950 + }, + { + "epoch": 0.36844967410921875, + "grad_norm": 2.351181745529175, + "learning_rate": 9.984664554078106e-06, + "loss": 2.9793, + "step": 374000 + }, + { + "epoch": 0.3684989320870408, + "grad_norm": 2.1720635890960693, + "learning_rate": 9.98465180329015e-06, + "loss": 2.9607, + "step": 374050 + }, + { + "epoch": 0.3685481900648629, + "grad_norm": 2.1921885013580322, + "learning_rate": 9.98463904721167e-06, + "loss": 2.9083, + "step": 374100 + }, + { + "epoch": 0.368597448042685, + "grad_norm": 2.349820137023926, + "learning_rate": 9.984626285842676e-06, + "loss": 2.9387, + "step": 374150 + }, + { + "epoch": 0.3686467060205071, + "grad_norm": 2.197554349899292, + "learning_rate": 9.984613519183185e-06, + "loss": 2.9459, + "step": 374200 + }, + { + "epoch": 0.36869596399832916, + "grad_norm": 2.1750175952911377, + "learning_rate": 9.98460074723321e-06, + "loss": 2.9614, + "step": 374250 + }, + { + "epoch": 0.3687452219761513, + "grad_norm": 2.2455074787139893, + "learning_rate": 9.984587969992765e-06, + "loss": 2.9816, + "step": 374300 + }, + { + "epoch": 0.36879447995397335, + "grad_norm": 2.44236421585083, + "learning_rate": 9.984575187461863e-06, + "loss": 2.9058, + "step": 374350 + }, + { + "epoch": 0.3688437379317954, + "grad_norm": 2.3136444091796875, + "learning_rate": 9.984562399640516e-06, + "loss": 2.9997, + "step": 374400 + }, + { + "epoch": 0.36889299590961755, + "grad_norm": 2.4760191440582275, + "learning_rate": 9.98454960652874e-06, + "loss": 2.9829, + "step": 374450 + }, + { + "epoch": 
0.3689422538874396, + "grad_norm": 2.095914602279663, + "learning_rate": 9.984536808126546e-06, + "loss": 2.9528, + "step": 374500 + }, + { + "epoch": 0.3689915118652617, + "grad_norm": 2.1936299800872803, + "learning_rate": 9.984524004433951e-06, + "loss": 2.9808, + "step": 374550 + }, + { + "epoch": 0.36904076984308376, + "grad_norm": 2.119621992111206, + "learning_rate": 9.984511195450966e-06, + "loss": 2.9285, + "step": 374600 + }, + { + "epoch": 0.3690900278209059, + "grad_norm": 2.307997703552246, + "learning_rate": 9.984498381177605e-06, + "loss": 2.9547, + "step": 374650 + }, + { + "epoch": 0.36913928579872796, + "grad_norm": 2.356783628463745, + "learning_rate": 9.984485561613882e-06, + "loss": 2.8964, + "step": 374700 + }, + { + "epoch": 0.36918854377655, + "grad_norm": 2.4519779682159424, + "learning_rate": 9.98447273675981e-06, + "loss": 2.9427, + "step": 374750 + }, + { + "epoch": 0.36923780175437215, + "grad_norm": 2.169921875, + "learning_rate": 9.984459906615403e-06, + "loss": 2.9656, + "step": 374800 + }, + { + "epoch": 0.3692870597321942, + "grad_norm": 2.3823702335357666, + "learning_rate": 9.984447071180675e-06, + "loss": 2.9071, + "step": 374850 + }, + { + "epoch": 0.3693363177100163, + "grad_norm": 2.482151746749878, + "learning_rate": 9.98443423045564e-06, + "loss": 2.8651, + "step": 374900 + }, + { + "epoch": 0.3693855756878384, + "grad_norm": 2.277364730834961, + "learning_rate": 9.984421384440312e-06, + "loss": 2.9314, + "step": 374950 + }, + { + "epoch": 0.3694348336656605, + "grad_norm": 2.3988230228424072, + "learning_rate": 9.984408533134703e-06, + "loss": 2.972, + "step": 375000 + }, + { + "epoch": 0.36948409164348256, + "grad_norm": 2.3329312801361084, + "learning_rate": 9.984395676538825e-06, + "loss": 3.0057, + "step": 375050 + }, + { + "epoch": 0.3695333496213047, + "grad_norm": 2.27927303314209, + "learning_rate": 9.984382814652695e-06, + "loss": 2.9203, + "step": 375100 + }, + { + "epoch": 0.36958260759912676, + "grad_norm": 
2.181896448135376, + "learning_rate": 9.984369947476327e-06, + "loss": 2.9744, + "step": 375150 + }, + { + "epoch": 0.3696318655769488, + "grad_norm": 2.4688825607299805, + "learning_rate": 9.984357075009732e-06, + "loss": 2.9081, + "step": 375200 + }, + { + "epoch": 0.36968112355477095, + "grad_norm": 2.3291006088256836, + "learning_rate": 9.984344197252924e-06, + "loss": 2.9749, + "step": 375250 + }, + { + "epoch": 0.369730381532593, + "grad_norm": 2.0842764377593994, + "learning_rate": 9.98433131420592e-06, + "loss": 2.9621, + "step": 375300 + }, + { + "epoch": 0.3697796395104151, + "grad_norm": 2.230250358581543, + "learning_rate": 9.984318425868728e-06, + "loss": 2.9321, + "step": 375350 + }, + { + "epoch": 0.3698288974882372, + "grad_norm": 2.2620532512664795, + "learning_rate": 9.984305532241368e-06, + "loss": 2.957, + "step": 375400 + }, + { + "epoch": 0.3698781554660593, + "grad_norm": 2.3934977054595947, + "learning_rate": 9.984292633323848e-06, + "loss": 2.9044, + "step": 375450 + }, + { + "epoch": 0.36992741344388136, + "grad_norm": 2.244781494140625, + "learning_rate": 9.984279729116186e-06, + "loss": 2.8956, + "step": 375500 + }, + { + "epoch": 0.3699766714217035, + "grad_norm": 2.2515761852264404, + "learning_rate": 9.984266819618393e-06, + "loss": 2.9966, + "step": 375550 + }, + { + "epoch": 0.37002592939952555, + "grad_norm": 2.2890937328338623, + "learning_rate": 9.984253904830483e-06, + "loss": 2.9551, + "step": 375600 + }, + { + "epoch": 0.3700751873773476, + "grad_norm": 2.1997873783111572, + "learning_rate": 9.98424098475247e-06, + "loss": 2.9423, + "step": 375650 + }, + { + "epoch": 0.3701244453551697, + "grad_norm": 2.3467485904693604, + "learning_rate": 9.98422805938437e-06, + "loss": 2.9414, + "step": 375700 + }, + { + "epoch": 0.3701737033329918, + "grad_norm": 2.1842405796051025, + "learning_rate": 9.984215128726195e-06, + "loss": 2.9733, + "step": 375750 + }, + { + "epoch": 0.3702229613108139, + "grad_norm": 2.2888901233673096, + 
"learning_rate": 9.984202192777957e-06, + "loss": 2.9493, + "step": 375800 + }, + { + "epoch": 0.37027221928863596, + "grad_norm": 2.376406192779541, + "learning_rate": 9.984189251539671e-06, + "loss": 2.9967, + "step": 375850 + }, + { + "epoch": 0.3703214772664581, + "grad_norm": 2.2927086353302, + "learning_rate": 9.984176305011353e-06, + "loss": 2.9621, + "step": 375900 + }, + { + "epoch": 0.37037073524428016, + "grad_norm": 2.871405601501465, + "learning_rate": 9.984163353193014e-06, + "loss": 2.9996, + "step": 375950 + }, + { + "epoch": 0.37041999322210223, + "grad_norm": 2.3754570484161377, + "learning_rate": 9.984150396084668e-06, + "loss": 3.0148, + "step": 376000 + }, + { + "epoch": 0.37046925119992435, + "grad_norm": 2.215423107147217, + "learning_rate": 9.984137433686328e-06, + "loss": 2.9743, + "step": 376050 + }, + { + "epoch": 0.3705185091777464, + "grad_norm": 2.1638331413269043, + "learning_rate": 9.98412446599801e-06, + "loss": 2.9501, + "step": 376100 + }, + { + "epoch": 0.3705677671555685, + "grad_norm": 2.2707722187042236, + "learning_rate": 9.984111493019726e-06, + "loss": 2.8654, + "step": 376150 + }, + { + "epoch": 0.3706170251333906, + "grad_norm": 2.252742290496826, + "learning_rate": 9.984098514751491e-06, + "loss": 2.9567, + "step": 376200 + }, + { + "epoch": 0.3706662831112127, + "grad_norm": 2.2318477630615234, + "learning_rate": 9.98408553119332e-06, + "loss": 2.9694, + "step": 376250 + }, + { + "epoch": 0.37071554108903476, + "grad_norm": 2.3586008548736572, + "learning_rate": 9.984072542345223e-06, + "loss": 2.9453, + "step": 376300 + }, + { + "epoch": 0.3707647990668569, + "grad_norm": 2.398897409439087, + "learning_rate": 9.984059548207217e-06, + "loss": 2.939, + "step": 376350 + }, + { + "epoch": 0.37081405704467896, + "grad_norm": 2.1523945331573486, + "learning_rate": 9.984046548779315e-06, + "loss": 3.0162, + "step": 376400 + }, + { + "epoch": 0.37086331502250103, + "grad_norm": 2.14494252204895, + "learning_rate": 
9.984033544061529e-06, + "loss": 2.9228, + "step": 376450 + }, + { + "epoch": 0.37091257300032315, + "grad_norm": 2.288625478744507, + "learning_rate": 9.984020534053874e-06, + "loss": 2.9365, + "step": 376500 + }, + { + "epoch": 0.3709618309781452, + "grad_norm": 2.6372318267822266, + "learning_rate": 9.984007518756366e-06, + "loss": 2.9431, + "step": 376550 + }, + { + "epoch": 0.3710110889559673, + "grad_norm": 2.268389940261841, + "learning_rate": 9.983994498169016e-06, + "loss": 2.9589, + "step": 376600 + }, + { + "epoch": 0.3710603469337894, + "grad_norm": 2.0807669162750244, + "learning_rate": 9.983981472291839e-06, + "loss": 2.9207, + "step": 376650 + }, + { + "epoch": 0.3711096049116115, + "grad_norm": 2.4463517665863037, + "learning_rate": 9.983968441124847e-06, + "loss": 3.0308, + "step": 376700 + }, + { + "epoch": 0.37115886288943356, + "grad_norm": 2.3623292446136475, + "learning_rate": 9.983955404668056e-06, + "loss": 2.9293, + "step": 376750 + }, + { + "epoch": 0.3712081208672557, + "grad_norm": 2.2119033336639404, + "learning_rate": 9.983942362921479e-06, + "loss": 2.9782, + "step": 376800 + }, + { + "epoch": 0.37125737884507776, + "grad_norm": 2.3923895359039307, + "learning_rate": 9.98392931588513e-06, + "loss": 2.9648, + "step": 376850 + }, + { + "epoch": 0.3713066368228998, + "grad_norm": 2.366384983062744, + "learning_rate": 9.983916263559024e-06, + "loss": 2.9679, + "step": 376900 + }, + { + "epoch": 0.3713558948007219, + "grad_norm": 2.367220878601074, + "learning_rate": 9.983903205943174e-06, + "loss": 2.9611, + "step": 376950 + }, + { + "epoch": 0.371405152778544, + "grad_norm": 2.2120461463928223, + "learning_rate": 9.983890143037592e-06, + "loss": 2.9881, + "step": 377000 + }, + { + "epoch": 0.3714544107563661, + "grad_norm": 2.301856756210327, + "learning_rate": 9.983877074842295e-06, + "loss": 2.978, + "step": 377050 + }, + { + "epoch": 0.37150366873418816, + "grad_norm": 2.3141276836395264, + "learning_rate": 9.983864001357293e-06, + 
"loss": 2.9988, + "step": 377100 + }, + { + "epoch": 0.3715529267120103, + "grad_norm": 2.574843406677246, + "learning_rate": 9.983850922582605e-06, + "loss": 3.03, + "step": 377150 + }, + { + "epoch": 0.37160218468983236, + "grad_norm": 2.3167924880981445, + "learning_rate": 9.98383783851824e-06, + "loss": 3.0272, + "step": 377200 + }, + { + "epoch": 0.37165144266765443, + "grad_norm": 2.2507688999176025, + "learning_rate": 9.983824749164214e-06, + "loss": 3.0214, + "step": 377250 + }, + { + "epoch": 0.37170070064547656, + "grad_norm": 2.3787684440612793, + "learning_rate": 9.983811654520542e-06, + "loss": 3.0381, + "step": 377300 + }, + { + "epoch": 0.3717499586232986, + "grad_norm": 2.086843967437744, + "learning_rate": 9.983798554587237e-06, + "loss": 2.9696, + "step": 377350 + }, + { + "epoch": 0.3717992166011207, + "grad_norm": 2.3180222511291504, + "learning_rate": 9.983785449364312e-06, + "loss": 3.0184, + "step": 377400 + }, + { + "epoch": 0.3718484745789428, + "grad_norm": 2.451256275177002, + "learning_rate": 9.98377233885178e-06, + "loss": 2.9659, + "step": 377450 + }, + { + "epoch": 0.3718977325567649, + "grad_norm": 2.152449131011963, + "learning_rate": 9.983759223049657e-06, + "loss": 2.9323, + "step": 377500 + }, + { + "epoch": 0.37194699053458696, + "grad_norm": 2.3356964588165283, + "learning_rate": 9.983746101957959e-06, + "loss": 2.9107, + "step": 377550 + }, + { + "epoch": 0.3719962485124091, + "grad_norm": 2.23681640625, + "learning_rate": 9.983732975576695e-06, + "loss": 3.0137, + "step": 377600 + }, + { + "epoch": 0.37204550649023116, + "grad_norm": 2.232588529586792, + "learning_rate": 9.983719843905882e-06, + "loss": 2.9537, + "step": 377650 + }, + { + "epoch": 0.37209476446805323, + "grad_norm": 2.1547937393188477, + "learning_rate": 9.983706706945533e-06, + "loss": 2.904, + "step": 377700 + }, + { + "epoch": 0.37214402244587536, + "grad_norm": 2.462756872177124, + "learning_rate": 9.983693564695663e-06, + "loss": 2.9208, + "step": 377750 
+ }, + { + "epoch": 0.3721932804236974, + "grad_norm": 2.170409679412842, + "learning_rate": 9.983680417156284e-06, + "loss": 2.9105, + "step": 377800 + }, + { + "epoch": 0.3722425384015195, + "grad_norm": 2.18009877204895, + "learning_rate": 9.983667264327411e-06, + "loss": 2.9855, + "step": 377850 + }, + { + "epoch": 0.3722917963793416, + "grad_norm": 2.197504758834839, + "learning_rate": 9.983654106209058e-06, + "loss": 2.8638, + "step": 377900 + }, + { + "epoch": 0.3723410543571637, + "grad_norm": 2.2677419185638428, + "learning_rate": 9.98364094280124e-06, + "loss": 2.9591, + "step": 377950 + }, + { + "epoch": 0.37239031233498576, + "grad_norm": 2.124849319458008, + "learning_rate": 9.983627774103969e-06, + "loss": 2.9426, + "step": 378000 + }, + { + "epoch": 0.3724395703128079, + "grad_norm": 2.1744275093078613, + "learning_rate": 9.98361460011726e-06, + "loss": 2.9031, + "step": 378050 + }, + { + "epoch": 0.37248882829062996, + "grad_norm": 2.2415709495544434, + "learning_rate": 9.983601420841127e-06, + "loss": 2.9301, + "step": 378100 + }, + { + "epoch": 0.37253808626845203, + "grad_norm": 2.313066244125366, + "learning_rate": 9.983588236275583e-06, + "loss": 2.9666, + "step": 378150 + }, + { + "epoch": 0.3725873442462741, + "grad_norm": 2.235607385635376, + "learning_rate": 9.983575046420645e-06, + "loss": 2.9215, + "step": 378200 + }, + { + "epoch": 0.3726366022240962, + "grad_norm": 2.7131216526031494, + "learning_rate": 9.983561851276322e-06, + "loss": 3.0446, + "step": 378250 + }, + { + "epoch": 0.3726858602019183, + "grad_norm": 2.1737685203552246, + "learning_rate": 9.983548650842632e-06, + "loss": 2.982, + "step": 378300 + }, + { + "epoch": 0.37273511817974037, + "grad_norm": 2.439265489578247, + "learning_rate": 9.983535445119589e-06, + "loss": 2.9811, + "step": 378350 + }, + { + "epoch": 0.3727843761575625, + "grad_norm": 2.48960018157959, + "learning_rate": 9.983522234107205e-06, + "loss": 2.9064, + "step": 378400 + }, + { + "epoch": 
0.37283363413538456, + "grad_norm": 3.1482553482055664, + "learning_rate": 9.983509017805496e-06, + "loss": 2.9722, + "step": 378450 + }, + { + "epoch": 0.37288289211320663, + "grad_norm": 2.257810354232788, + "learning_rate": 9.983495796214474e-06, + "loss": 2.9485, + "step": 378500 + }, + { + "epoch": 0.37293215009102876, + "grad_norm": 2.324298143386841, + "learning_rate": 9.983482569334154e-06, + "loss": 2.9941, + "step": 378550 + }, + { + "epoch": 0.37298140806885083, + "grad_norm": 2.148759126663208, + "learning_rate": 9.98346933716455e-06, + "loss": 2.9585, + "step": 378600 + }, + { + "epoch": 0.3730306660466729, + "grad_norm": 2.2838656902313232, + "learning_rate": 9.983456099705676e-06, + "loss": 2.9601, + "step": 378650 + }, + { + "epoch": 0.373079924024495, + "grad_norm": 2.3178234100341797, + "learning_rate": 9.983442856957546e-06, + "loss": 2.9077, + "step": 378700 + }, + { + "epoch": 0.3731291820023171, + "grad_norm": 2.3086631298065186, + "learning_rate": 9.983429608920174e-06, + "loss": 2.8466, + "step": 378750 + }, + { + "epoch": 0.37317843998013916, + "grad_norm": 2.237861156463623, + "learning_rate": 9.983416355593576e-06, + "loss": 2.9318, + "step": 378800 + }, + { + "epoch": 0.3732276979579613, + "grad_norm": 2.318366527557373, + "learning_rate": 9.983403096977763e-06, + "loss": 2.8833, + "step": 378850 + }, + { + "epoch": 0.37327695593578336, + "grad_norm": 2.258074998855591, + "learning_rate": 9.98338983307275e-06, + "loss": 2.9152, + "step": 378900 + }, + { + "epoch": 0.37332621391360543, + "grad_norm": 2.2780075073242188, + "learning_rate": 9.983376563878552e-06, + "loss": 2.9505, + "step": 378950 + }, + { + "epoch": 0.37337547189142756, + "grad_norm": 2.4153873920440674, + "learning_rate": 9.983363289395182e-06, + "loss": 2.9706, + "step": 379000 + }, + { + "epoch": 0.3734247298692496, + "grad_norm": 2.261320114135742, + "learning_rate": 9.983350009622655e-06, + "loss": 2.9845, + "step": 379050 + }, + { + "epoch": 0.3734739878470717, + 
"grad_norm": 2.2231285572052, + "learning_rate": 9.983336724560985e-06, + "loss": 2.9666, + "step": 379100 + }, + { + "epoch": 0.3735232458248938, + "grad_norm": 2.301809072494507, + "learning_rate": 9.983323434210184e-06, + "loss": 2.9267, + "step": 379150 + }, + { + "epoch": 0.3735725038027159, + "grad_norm": 2.3009023666381836, + "learning_rate": 9.98331013857027e-06, + "loss": 2.9643, + "step": 379200 + }, + { + "epoch": 0.37362176178053796, + "grad_norm": 2.296684503555298, + "learning_rate": 9.983296837641257e-06, + "loss": 2.9944, + "step": 379250 + }, + { + "epoch": 0.3736710197583601, + "grad_norm": 2.2427263259887695, + "learning_rate": 9.983283531423153e-06, + "loss": 2.9733, + "step": 379300 + }, + { + "epoch": 0.37372027773618216, + "grad_norm": 2.1772594451904297, + "learning_rate": 9.983270219915977e-06, + "loss": 2.981, + "step": 379350 + }, + { + "epoch": 0.37376953571400423, + "grad_norm": 2.2721621990203857, + "learning_rate": 9.983256903119743e-06, + "loss": 2.9747, + "step": 379400 + }, + { + "epoch": 0.3738187936918263, + "grad_norm": 2.5400753021240234, + "learning_rate": 9.983243581034467e-06, + "loss": 2.954, + "step": 379450 + }, + { + "epoch": 0.3738680516696484, + "grad_norm": 2.2973482608795166, + "learning_rate": 9.983230253660157e-06, + "loss": 2.9293, + "step": 379500 + }, + { + "epoch": 0.3739173096474705, + "grad_norm": 2.349046230316162, + "learning_rate": 9.983216920996833e-06, + "loss": 2.9491, + "step": 379550 + }, + { + "epoch": 0.37396656762529257, + "grad_norm": 2.2408735752105713, + "learning_rate": 9.983203583044505e-06, + "loss": 2.9659, + "step": 379600 + }, + { + "epoch": 0.3740158256031147, + "grad_norm": 2.256530284881592, + "learning_rate": 9.983190239803192e-06, + "loss": 2.9685, + "step": 379650 + }, + { + "epoch": 0.37406508358093676, + "grad_norm": 2.320786476135254, + "learning_rate": 9.983176891272903e-06, + "loss": 2.9333, + "step": 379700 + }, + { + "epoch": 0.37411434155875883, + "grad_norm": 
2.120586395263672, + "learning_rate": 9.983163537453655e-06, + "loss": 2.955, + "step": 379750 + }, + { + "epoch": 0.37416359953658096, + "grad_norm": 2.2356503009796143, + "learning_rate": 9.983150178345463e-06, + "loss": 2.9932, + "step": 379800 + }, + { + "epoch": 0.37421285751440303, + "grad_norm": 2.387631893157959, + "learning_rate": 9.983136813948338e-06, + "loss": 2.9027, + "step": 379850 + }, + { + "epoch": 0.3742621154922251, + "grad_norm": 2.155264377593994, + "learning_rate": 9.983123444262297e-06, + "loss": 2.9777, + "step": 379900 + }, + { + "epoch": 0.3743113734700472, + "grad_norm": 2.4461753368377686, + "learning_rate": 9.983110069287353e-06, + "loss": 3.0014, + "step": 379950 + }, + { + "epoch": 0.3743606314478693, + "grad_norm": 2.2724449634552, + "learning_rate": 9.98309668902352e-06, + "loss": 2.9499, + "step": 380000 + }, + { + "epoch": 0.37440988942569137, + "grad_norm": 2.389441967010498, + "learning_rate": 9.983083303470813e-06, + "loss": 2.9624, + "step": 380050 + }, + { + "epoch": 0.3744591474035135, + "grad_norm": 2.2443060874938965, + "learning_rate": 9.983069912629245e-06, + "loss": 2.8841, + "step": 380100 + }, + { + "epoch": 0.37450840538133556, + "grad_norm": 2.349874496459961, + "learning_rate": 9.983056516498832e-06, + "loss": 2.8847, + "step": 380150 + }, + { + "epoch": 0.37455766335915763, + "grad_norm": 2.331456184387207, + "learning_rate": 9.983043115079587e-06, + "loss": 2.9813, + "step": 380200 + }, + { + "epoch": 0.37460692133697976, + "grad_norm": 2.2243292331695557, + "learning_rate": 9.983029708371523e-06, + "loss": 2.8511, + "step": 380250 + }, + { + "epoch": 0.37465617931480183, + "grad_norm": 2.399918794631958, + "learning_rate": 9.983016296374658e-06, + "loss": 2.9639, + "step": 380300 + }, + { + "epoch": 0.3747054372926239, + "grad_norm": 2.401742696762085, + "learning_rate": 9.983002879089002e-06, + "loss": 2.9453, + "step": 380350 + }, + { + "epoch": 0.374754695270446, + "grad_norm": 2.1671876907348633, + 
"learning_rate": 9.982989456514572e-06, + "loss": 2.9845, + "step": 380400 + }, + { + "epoch": 0.3748039532482681, + "grad_norm": 2.22387433052063, + "learning_rate": 9.982976028651383e-06, + "loss": 2.9935, + "step": 380450 + }, + { + "epoch": 0.37485321122609017, + "grad_norm": 2.282721519470215, + "learning_rate": 9.982962595499444e-06, + "loss": 2.9534, + "step": 380500 + }, + { + "epoch": 0.3749024692039123, + "grad_norm": 2.334825038909912, + "learning_rate": 9.982949157058777e-06, + "loss": 2.9438, + "step": 380550 + }, + { + "epoch": 0.37495172718173436, + "grad_norm": 2.341357469558716, + "learning_rate": 9.982935713329389e-06, + "loss": 2.9912, + "step": 380600 + }, + { + "epoch": 0.37500098515955643, + "grad_norm": 2.213778257369995, + "learning_rate": 9.982922264311297e-06, + "loss": 2.9659, + "step": 380650 + }, + { + "epoch": 0.3750502431373785, + "grad_norm": 2.7562761306762695, + "learning_rate": 9.982908810004518e-06, + "loss": 2.8802, + "step": 380700 + }, + { + "epoch": 0.37509950111520063, + "grad_norm": 2.3115639686584473, + "learning_rate": 9.982895350409065e-06, + "loss": 2.9607, + "step": 380750 + }, + { + "epoch": 0.3751487590930227, + "grad_norm": 2.305588483810425, + "learning_rate": 9.982881885524949e-06, + "loss": 2.9551, + "step": 380800 + }, + { + "epoch": 0.37519801707084477, + "grad_norm": 2.3463213443756104, + "learning_rate": 9.982868415352187e-06, + "loss": 2.9875, + "step": 380850 + }, + { + "epoch": 0.3752472750486669, + "grad_norm": 2.2627429962158203, + "learning_rate": 9.982854939890794e-06, + "loss": 2.9236, + "step": 380900 + }, + { + "epoch": 0.37529653302648897, + "grad_norm": 2.2784066200256348, + "learning_rate": 9.982841459140781e-06, + "loss": 2.9761, + "step": 380950 + }, + { + "epoch": 0.37534579100431104, + "grad_norm": 2.6127262115478516, + "learning_rate": 9.982827973102166e-06, + "loss": 2.9523, + "step": 381000 + }, + { + "epoch": 0.37539504898213316, + "grad_norm": 2.474867820739746, + "learning_rate": 
9.982814481774961e-06, + "loss": 2.9892, + "step": 381050 + }, + { + "epoch": 0.37544430695995523, + "grad_norm": 2.3431968688964844, + "learning_rate": 9.982800985159181e-06, + "loss": 2.9654, + "step": 381100 + }, + { + "epoch": 0.3754935649377773, + "grad_norm": 2.230130195617676, + "learning_rate": 9.982787483254843e-06, + "loss": 2.9834, + "step": 381150 + }, + { + "epoch": 0.37554282291559943, + "grad_norm": 2.3372015953063965, + "learning_rate": 9.982773976061955e-06, + "loss": 2.955, + "step": 381200 + }, + { + "epoch": 0.3755920808934215, + "grad_norm": 2.236506700515747, + "learning_rate": 9.982760463580538e-06, + "loss": 2.9096, + "step": 381250 + }, + { + "epoch": 0.37564133887124357, + "grad_norm": 2.1727795600891113, + "learning_rate": 9.982746945810602e-06, + "loss": 3.0035, + "step": 381300 + }, + { + "epoch": 0.3756905968490657, + "grad_norm": 2.188779592514038, + "learning_rate": 9.982733422752162e-06, + "loss": 2.9821, + "step": 381350 + }, + { + "epoch": 0.37573985482688776, + "grad_norm": 2.1022939682006836, + "learning_rate": 9.982719894405235e-06, + "loss": 2.9635, + "step": 381400 + }, + { + "epoch": 0.37578911280470984, + "grad_norm": 2.4634947776794434, + "learning_rate": 9.982706360769833e-06, + "loss": 3.0527, + "step": 381450 + }, + { + "epoch": 0.37583837078253196, + "grad_norm": 2.3958916664123535, + "learning_rate": 9.98269282184597e-06, + "loss": 2.9836, + "step": 381500 + }, + { + "epoch": 0.37588762876035403, + "grad_norm": 2.3277928829193115, + "learning_rate": 9.982679277633662e-06, + "loss": 2.8822, + "step": 381550 + }, + { + "epoch": 0.3759368867381761, + "grad_norm": 2.15012788772583, + "learning_rate": 9.982665728132923e-06, + "loss": 2.9239, + "step": 381600 + }, + { + "epoch": 0.3759861447159982, + "grad_norm": 2.4942073822021484, + "learning_rate": 9.982652173343765e-06, + "loss": 2.9144, + "step": 381650 + }, + { + "epoch": 0.3760354026938203, + "grad_norm": 2.2041287422180176, + "learning_rate": 9.982638613266206e-06, 
+ "loss": 2.9738, + "step": 381700 + }, + { + "epoch": 0.37608466067164237, + "grad_norm": 2.3448901176452637, + "learning_rate": 9.982625047900259e-06, + "loss": 2.9674, + "step": 381750 + }, + { + "epoch": 0.3761339186494645, + "grad_norm": 2.1779625415802, + "learning_rate": 9.982611477245939e-06, + "loss": 2.9622, + "step": 381800 + }, + { + "epoch": 0.37618317662728656, + "grad_norm": 2.3161866664886475, + "learning_rate": 9.982597901303258e-06, + "loss": 2.9463, + "step": 381850 + }, + { + "epoch": 0.37623243460510863, + "grad_norm": 2.3850579261779785, + "learning_rate": 9.98258432007223e-06, + "loss": 2.9491, + "step": 381900 + }, + { + "epoch": 0.3762816925829307, + "grad_norm": 2.595911979675293, + "learning_rate": 9.982570733552877e-06, + "loss": 2.9195, + "step": 381950 + }, + { + "epoch": 0.37633095056075283, + "grad_norm": 2.4508371353149414, + "learning_rate": 9.982557141745203e-06, + "loss": 2.9322, + "step": 382000 + }, + { + "epoch": 0.3763802085385749, + "grad_norm": 2.613887310028076, + "learning_rate": 9.982543544649228e-06, + "loss": 2.8852, + "step": 382050 + }, + { + "epoch": 0.37642946651639697, + "grad_norm": 2.1819987297058105, + "learning_rate": 9.982529942264967e-06, + "loss": 2.9761, + "step": 382100 + }, + { + "epoch": 0.3764787244942191, + "grad_norm": 2.2993366718292236, + "learning_rate": 9.982516334592433e-06, + "loss": 2.8982, + "step": 382150 + }, + { + "epoch": 0.37652798247204117, + "grad_norm": 2.30232572555542, + "learning_rate": 9.98250272163164e-06, + "loss": 2.8999, + "step": 382200 + }, + { + "epoch": 0.37657724044986324, + "grad_norm": 2.351127862930298, + "learning_rate": 9.982489103382602e-06, + "loss": 2.9606, + "step": 382250 + }, + { + "epoch": 0.37662649842768536, + "grad_norm": 2.365689516067505, + "learning_rate": 9.982475479845337e-06, + "loss": 2.9566, + "step": 382300 + }, + { + "epoch": 0.37667575640550743, + "grad_norm": 2.505145311355591, + "learning_rate": 9.982461851019855e-06, + "loss": 2.9188, + 
"step": 382350 + }, + { + "epoch": 0.3767250143833295, + "grad_norm": 2.3287723064422607, + "learning_rate": 9.982448216906174e-06, + "loss": 2.9526, + "step": 382400 + }, + { + "epoch": 0.37677427236115163, + "grad_norm": 2.4133005142211914, + "learning_rate": 9.982434577504305e-06, + "loss": 2.9188, + "step": 382450 + }, + { + "epoch": 0.3768235303389737, + "grad_norm": 2.5473077297210693, + "learning_rate": 9.982420932814265e-06, + "loss": 2.9247, + "step": 382500 + }, + { + "epoch": 0.37687278831679577, + "grad_norm": 2.6773362159729004, + "learning_rate": 9.982407282836067e-06, + "loss": 3.0252, + "step": 382550 + }, + { + "epoch": 0.3769220462946179, + "grad_norm": 2.104987144470215, + "learning_rate": 9.982393627569728e-06, + "loss": 2.9268, + "step": 382600 + }, + { + "epoch": 0.37697130427243997, + "grad_norm": 2.200910806655884, + "learning_rate": 9.98237996701526e-06, + "loss": 3.0066, + "step": 382650 + }, + { + "epoch": 0.37702056225026204, + "grad_norm": 2.27367901802063, + "learning_rate": 9.98236630117268e-06, + "loss": 2.9697, + "step": 382700 + }, + { + "epoch": 0.37706982022808416, + "grad_norm": 2.525668144226074, + "learning_rate": 9.982352630041998e-06, + "loss": 2.9623, + "step": 382750 + }, + { + "epoch": 0.37711907820590623, + "grad_norm": 2.3522837162017822, + "learning_rate": 9.982338953623231e-06, + "loss": 2.931, + "step": 382800 + }, + { + "epoch": 0.3771683361837283, + "grad_norm": 2.35286545753479, + "learning_rate": 9.982325271916395e-06, + "loss": 2.9289, + "step": 382850 + }, + { + "epoch": 0.37721759416155043, + "grad_norm": 2.290729522705078, + "learning_rate": 9.982311584921505e-06, + "loss": 2.9473, + "step": 382900 + }, + { + "epoch": 0.3772668521393725, + "grad_norm": 2.3276634216308594, + "learning_rate": 9.982297892638571e-06, + "loss": 2.9352, + "step": 382950 + }, + { + "epoch": 0.37731611011719457, + "grad_norm": 2.5626871585845947, + "learning_rate": 9.982284195067612e-06, + "loss": 2.9355, + "step": 383000 + }, + { + 
"epoch": 0.3773653680950167, + "grad_norm": 2.207639694213867, + "learning_rate": 9.98227049220864e-06, + "loss": 2.9153, + "step": 383050 + }, + { + "epoch": 0.37741462607283877, + "grad_norm": 2.305419921875, + "learning_rate": 9.982256784061673e-06, + "loss": 3.0122, + "step": 383100 + }, + { + "epoch": 0.37746388405066084, + "grad_norm": 2.1661837100982666, + "learning_rate": 9.98224307062672e-06, + "loss": 2.9663, + "step": 383150 + }, + { + "epoch": 0.3775131420284829, + "grad_norm": 2.267343282699585, + "learning_rate": 9.9822293519038e-06, + "loss": 2.9553, + "step": 383200 + }, + { + "epoch": 0.37756240000630503, + "grad_norm": 2.2342965602874756, + "learning_rate": 9.982215627892925e-06, + "loss": 2.9212, + "step": 383250 + }, + { + "epoch": 0.3776116579841271, + "grad_norm": 2.263211488723755, + "learning_rate": 9.982201898594113e-06, + "loss": 2.919, + "step": 383300 + }, + { + "epoch": 0.3776609159619492, + "grad_norm": 2.8387789726257324, + "learning_rate": 9.982188164007374e-06, + "loss": 2.9201, + "step": 383350 + }, + { + "epoch": 0.3777101739397713, + "grad_norm": 2.2591090202331543, + "learning_rate": 9.982174424132727e-06, + "loss": 3.0324, + "step": 383400 + }, + { + "epoch": 0.37775943191759337, + "grad_norm": 3.034109354019165, + "learning_rate": 9.982160678970182e-06, + "loss": 2.9244, + "step": 383450 + }, + { + "epoch": 0.37780868989541544, + "grad_norm": 2.061087131500244, + "learning_rate": 9.982146928519758e-06, + "loss": 3.0109, + "step": 383500 + }, + { + "epoch": 0.37785794787323757, + "grad_norm": 2.2751572132110596, + "learning_rate": 9.982133172781468e-06, + "loss": 3.0019, + "step": 383550 + }, + { + "epoch": 0.37790720585105964, + "grad_norm": 2.3569588661193848, + "learning_rate": 9.982119411755324e-06, + "loss": 2.9185, + "step": 383600 + }, + { + "epoch": 0.3779564638288817, + "grad_norm": 2.2831270694732666, + "learning_rate": 9.982105645441345e-06, + "loss": 2.9403, + "step": 383650 + }, + { + "epoch": 0.37800572180670383, 
+ "grad_norm": 2.432507038116455, + "learning_rate": 9.982091873839542e-06, + "loss": 2.9767, + "step": 383700 + }, + { + "epoch": 0.3780549797845259, + "grad_norm": 2.270585536956787, + "learning_rate": 9.982078096949931e-06, + "loss": 2.929, + "step": 383750 + }, + { + "epoch": 0.37810423776234797, + "grad_norm": 2.340792417526245, + "learning_rate": 9.982064314772528e-06, + "loss": 2.9184, + "step": 383800 + }, + { + "epoch": 0.3781534957401701, + "grad_norm": 2.3089921474456787, + "learning_rate": 9.982050527307345e-06, + "loss": 2.8877, + "step": 383850 + }, + { + "epoch": 0.37820275371799217, + "grad_norm": 2.2184929847717285, + "learning_rate": 9.982036734554397e-06, + "loss": 2.9103, + "step": 383900 + }, + { + "epoch": 0.37825201169581424, + "grad_norm": 2.3943469524383545, + "learning_rate": 9.982022936513703e-06, + "loss": 2.9159, + "step": 383950 + }, + { + "epoch": 0.37830126967363636, + "grad_norm": 2.278113842010498, + "learning_rate": 9.982009133185272e-06, + "loss": 2.928, + "step": 384000 + }, + { + "epoch": 0.37835052765145843, + "grad_norm": 2.2057759761810303, + "learning_rate": 9.98199532456912e-06, + "loss": 2.9584, + "step": 384050 + }, + { + "epoch": 0.3783997856292805, + "grad_norm": 2.3828728199005127, + "learning_rate": 9.981981510665264e-06, + "loss": 2.9749, + "step": 384100 + }, + { + "epoch": 0.37844904360710263, + "grad_norm": 2.357083797454834, + "learning_rate": 9.981967691473717e-06, + "loss": 2.9828, + "step": 384150 + }, + { + "epoch": 0.3784983015849247, + "grad_norm": 2.1632978916168213, + "learning_rate": 9.981953866994493e-06, + "loss": 3.0096, + "step": 384200 + }, + { + "epoch": 0.37854755956274677, + "grad_norm": 2.434342384338379, + "learning_rate": 9.981940037227607e-06, + "loss": 2.9527, + "step": 384250 + }, + { + "epoch": 0.3785968175405689, + "grad_norm": 2.4327685832977295, + "learning_rate": 9.981926202173078e-06, + "loss": 2.941, + "step": 384300 + }, + { + "epoch": 0.37864607551839097, + "grad_norm": 
2.2512011528015137, + "learning_rate": 9.981912361830913e-06, + "loss": 2.9878, + "step": 384350 + }, + { + "epoch": 0.37869533349621304, + "grad_norm": 2.3989973068237305, + "learning_rate": 9.981898516201134e-06, + "loss": 2.8809, + "step": 384400 + }, + { + "epoch": 0.3787445914740351, + "grad_norm": 2.205296277999878, + "learning_rate": 9.981884665283749e-06, + "loss": 2.8709, + "step": 384450 + }, + { + "epoch": 0.37879384945185723, + "grad_norm": 2.297117233276367, + "learning_rate": 9.981870809078776e-06, + "loss": 2.9554, + "step": 384500 + }, + { + "epoch": 0.3788431074296793, + "grad_norm": 2.2439632415771484, + "learning_rate": 9.981856947586232e-06, + "loss": 2.9371, + "step": 384550 + }, + { + "epoch": 0.3788923654075014, + "grad_norm": 2.375781536102295, + "learning_rate": 9.98184308080613e-06, + "loss": 2.9318, + "step": 384600 + }, + { + "epoch": 0.3789416233853235, + "grad_norm": 2.498847723007202, + "learning_rate": 9.981829208738482e-06, + "loss": 2.9316, + "step": 384650 + }, + { + "epoch": 0.37899088136314557, + "grad_norm": 2.691211700439453, + "learning_rate": 9.981815331383303e-06, + "loss": 2.9711, + "step": 384700 + }, + { + "epoch": 0.37904013934096764, + "grad_norm": 2.297635316848755, + "learning_rate": 9.981801448740613e-06, + "loss": 2.9477, + "step": 384750 + }, + { + "epoch": 0.37908939731878977, + "grad_norm": 2.405432939529419, + "learning_rate": 9.981787560810424e-06, + "loss": 2.9108, + "step": 384800 + }, + { + "epoch": 0.37913865529661184, + "grad_norm": 2.379842758178711, + "learning_rate": 9.981773667592747e-06, + "loss": 2.9015, + "step": 384850 + }, + { + "epoch": 0.3791879132744339, + "grad_norm": 2.2931039333343506, + "learning_rate": 9.981759769087601e-06, + "loss": 2.9823, + "step": 384900 + }, + { + "epoch": 0.37923717125225603, + "grad_norm": 2.2205679416656494, + "learning_rate": 9.981745865294999e-06, + "loss": 2.9363, + "step": 384950 + }, + { + "epoch": 0.3792864292300781, + "grad_norm": 2.728055238723755, + 
"learning_rate": 9.981731956214957e-06, + "loss": 2.9761, + "step": 385000 + }, + { + "epoch": 0.3793356872079002, + "grad_norm": 2.280641794204712, + "learning_rate": 9.981718041847489e-06, + "loss": 3.0008, + "step": 385050 + }, + { + "epoch": 0.3793849451857223, + "grad_norm": 2.257351875305176, + "learning_rate": 9.981704122192611e-06, + "loss": 2.9602, + "step": 385100 + }, + { + "epoch": 0.37943420316354437, + "grad_norm": 3.0179498195648193, + "learning_rate": 9.981690197250333e-06, + "loss": 2.9687, + "step": 385150 + }, + { + "epoch": 0.37948346114136644, + "grad_norm": 2.336813449859619, + "learning_rate": 9.981676267020678e-06, + "loss": 2.9693, + "step": 385200 + }, + { + "epoch": 0.37953271911918857, + "grad_norm": 2.234058141708374, + "learning_rate": 9.981662331503655e-06, + "loss": 2.9492, + "step": 385250 + }, + { + "epoch": 0.37958197709701064, + "grad_norm": 2.5932741165161133, + "learning_rate": 9.981648390699277e-06, + "loss": 2.8995, + "step": 385300 + }, + { + "epoch": 0.3796312350748327, + "grad_norm": 2.118471622467041, + "learning_rate": 9.981634444607564e-06, + "loss": 2.9583, + "step": 385350 + }, + { + "epoch": 0.37968049305265483, + "grad_norm": 2.3443875312805176, + "learning_rate": 9.981620493228528e-06, + "loss": 2.9432, + "step": 385400 + }, + { + "epoch": 0.3797297510304769, + "grad_norm": 2.5241475105285645, + "learning_rate": 9.981606536562185e-06, + "loss": 2.9689, + "step": 385450 + }, + { + "epoch": 0.379779009008299, + "grad_norm": 2.3216278553009033, + "learning_rate": 9.981592574608548e-06, + "loss": 2.9448, + "step": 385500 + }, + { + "epoch": 0.3798282669861211, + "grad_norm": 2.2835676670074463, + "learning_rate": 9.981578607367635e-06, + "loss": 2.9422, + "step": 385550 + }, + { + "epoch": 0.37987752496394317, + "grad_norm": 2.383307933807373, + "learning_rate": 9.981564634839457e-06, + "loss": 2.892, + "step": 385600 + }, + { + "epoch": 0.37992678294176524, + "grad_norm": 2.352965831756592, + "learning_rate": 
9.981550657024032e-06, + "loss": 2.9538, + "step": 385650 + }, + { + "epoch": 0.3799760409195873, + "grad_norm": 2.384941339492798, + "learning_rate": 9.981536673921372e-06, + "loss": 2.9642, + "step": 385700 + }, + { + "epoch": 0.38002529889740944, + "grad_norm": 2.2934842109680176, + "learning_rate": 9.981522685531494e-06, + "loss": 2.9216, + "step": 385750 + }, + { + "epoch": 0.3800745568752315, + "grad_norm": 2.301699161529541, + "learning_rate": 9.981508691854414e-06, + "loss": 2.9439, + "step": 385800 + }, + { + "epoch": 0.3801238148530536, + "grad_norm": 2.949601173400879, + "learning_rate": 9.981494692890142e-06, + "loss": 2.9173, + "step": 385850 + }, + { + "epoch": 0.3801730728308757, + "grad_norm": 2.270137071609497, + "learning_rate": 9.981480688638697e-06, + "loss": 2.9662, + "step": 385900 + }, + { + "epoch": 0.3802223308086978, + "grad_norm": 2.2989695072174072, + "learning_rate": 9.981466679100093e-06, + "loss": 2.9579, + "step": 385950 + }, + { + "epoch": 0.38027158878651984, + "grad_norm": 2.3157379627227783, + "learning_rate": 9.981452664274345e-06, + "loss": 2.9569, + "step": 386000 + }, + { + "epoch": 0.38032084676434197, + "grad_norm": 2.313140392303467, + "learning_rate": 9.981438644161466e-06, + "loss": 2.9406, + "step": 386050 + }, + { + "epoch": 0.38037010474216404, + "grad_norm": 2.3458917140960693, + "learning_rate": 9.981424618761474e-06, + "loss": 2.8884, + "step": 386100 + }, + { + "epoch": 0.3804193627199861, + "grad_norm": 2.3566503524780273, + "learning_rate": 9.981410588074382e-06, + "loss": 2.9198, + "step": 386150 + }, + { + "epoch": 0.38046862069780824, + "grad_norm": 2.458625316619873, + "learning_rate": 9.981396552100205e-06, + "loss": 2.924, + "step": 386200 + }, + { + "epoch": 0.3805178786756303, + "grad_norm": 2.2177822589874268, + "learning_rate": 9.981382510838958e-06, + "loss": 2.9881, + "step": 386250 + }, + { + "epoch": 0.3805671366534524, + "grad_norm": 2.2473883628845215, + "learning_rate": 9.981368464290656e-06, + 
"loss": 2.8866, + "step": 386300 + }, + { + "epoch": 0.3806163946312745, + "grad_norm": 2.1755101680755615, + "learning_rate": 9.981354412455314e-06, + "loss": 2.93, + "step": 386350 + }, + { + "epoch": 0.38066565260909657, + "grad_norm": 2.4807536602020264, + "learning_rate": 9.981340355332947e-06, + "loss": 2.9347, + "step": 386400 + }, + { + "epoch": 0.38071491058691864, + "grad_norm": 2.348172187805176, + "learning_rate": 9.98132629292357e-06, + "loss": 3.0173, + "step": 386450 + }, + { + "epoch": 0.38076416856474077, + "grad_norm": 2.2987067699432373, + "learning_rate": 9.981312225227196e-06, + "loss": 2.9281, + "step": 386500 + }, + { + "epoch": 0.38081342654256284, + "grad_norm": 2.388678789138794, + "learning_rate": 9.981298152243842e-06, + "loss": 2.9713, + "step": 386550 + }, + { + "epoch": 0.3808626845203849, + "grad_norm": 2.2215285301208496, + "learning_rate": 9.981284073973522e-06, + "loss": 2.9481, + "step": 386600 + }, + { + "epoch": 0.38091194249820703, + "grad_norm": 2.215832471847534, + "learning_rate": 9.981269990416253e-06, + "loss": 2.9948, + "step": 386650 + }, + { + "epoch": 0.3809612004760291, + "grad_norm": 2.3434252738952637, + "learning_rate": 9.98125590157205e-06, + "loss": 2.8829, + "step": 386700 + }, + { + "epoch": 0.3810104584538512, + "grad_norm": 2.145595073699951, + "learning_rate": 9.981241807440922e-06, + "loss": 2.936, + "step": 386750 + }, + { + "epoch": 0.3810597164316733, + "grad_norm": 2.2813658714294434, + "learning_rate": 9.98122770802289e-06, + "loss": 2.9104, + "step": 386800 + }, + { + "epoch": 0.38110897440949537, + "grad_norm": 2.4013724327087402, + "learning_rate": 9.981213603317969e-06, + "loss": 2.8793, + "step": 386850 + }, + { + "epoch": 0.38115823238731744, + "grad_norm": 2.357405424118042, + "learning_rate": 9.98119949332617e-06, + "loss": 2.9712, + "step": 386900 + }, + { + "epoch": 0.3812074903651395, + "grad_norm": 2.4944064617156982, + "learning_rate": 9.981185378047512e-06, + "loss": 2.8684, + "step": 
386950 + }, + { + "epoch": 0.38125674834296164, + "grad_norm": 2.2425084114074707, + "learning_rate": 9.981171257482006e-06, + "loss": 2.8901, + "step": 387000 + }, + { + "epoch": 0.3813060063207837, + "grad_norm": 2.3172616958618164, + "learning_rate": 9.981157131629671e-06, + "loss": 3.005, + "step": 387050 + }, + { + "epoch": 0.3813552642986058, + "grad_norm": 2.245866060256958, + "learning_rate": 9.98114300049052e-06, + "loss": 2.9596, + "step": 387100 + }, + { + "epoch": 0.3814045222764279, + "grad_norm": 2.287627696990967, + "learning_rate": 9.981128864064567e-06, + "loss": 2.894, + "step": 387150 + }, + { + "epoch": 0.38145378025425, + "grad_norm": 2.3936431407928467, + "learning_rate": 9.98111472235183e-06, + "loss": 2.9409, + "step": 387200 + }, + { + "epoch": 0.38150303823207204, + "grad_norm": 2.4696266651153564, + "learning_rate": 9.98110057535232e-06, + "loss": 2.9893, + "step": 387250 + }, + { + "epoch": 0.38155229620989417, + "grad_norm": 2.525493621826172, + "learning_rate": 9.981086423066057e-06, + "loss": 2.9332, + "step": 387300 + }, + { + "epoch": 0.38160155418771624, + "grad_norm": 2.131305694580078, + "learning_rate": 9.98107226549305e-06, + "loss": 2.9005, + "step": 387350 + }, + { + "epoch": 0.3816508121655383, + "grad_norm": 2.3775036334991455, + "learning_rate": 9.981058102633319e-06, + "loss": 2.8762, + "step": 387400 + }, + { + "epoch": 0.38170007014336044, + "grad_norm": 2.217301368713379, + "learning_rate": 9.981043934486877e-06, + "loss": 2.9507, + "step": 387450 + }, + { + "epoch": 0.3817493281211825, + "grad_norm": 2.307715654373169, + "learning_rate": 9.98102976105374e-06, + "loss": 2.9253, + "step": 387500 + }, + { + "epoch": 0.3817985860990046, + "grad_norm": 2.32776141166687, + "learning_rate": 9.98101558233392e-06, + "loss": 2.9376, + "step": 387550 + }, + { + "epoch": 0.3818478440768267, + "grad_norm": 2.397231340408325, + "learning_rate": 9.981001398327438e-06, + "loss": 2.8954, + "step": 387600 + }, + { + "epoch": 
0.3818971020546488, + "grad_norm": 2.1324849128723145, + "learning_rate": 9.980987209034303e-06, + "loss": 2.9716, + "step": 387650 + }, + { + "epoch": 0.38194636003247084, + "grad_norm": 2.6258606910705566, + "learning_rate": 9.980973014454534e-06, + "loss": 2.9195, + "step": 387700 + }, + { + "epoch": 0.38199561801029297, + "grad_norm": 2.2931880950927734, + "learning_rate": 9.980958814588143e-06, + "loss": 2.909, + "step": 387750 + }, + { + "epoch": 0.38204487598811504, + "grad_norm": 2.3988630771636963, + "learning_rate": 9.980944609435147e-06, + "loss": 2.9345, + "step": 387800 + }, + { + "epoch": 0.3820941339659371, + "grad_norm": 2.296689510345459, + "learning_rate": 9.980930398995563e-06, + "loss": 2.9057, + "step": 387850 + }, + { + "epoch": 0.38214339194375924, + "grad_norm": 2.304396152496338, + "learning_rate": 9.9809161832694e-06, + "loss": 2.9478, + "step": 387900 + }, + { + "epoch": 0.3821926499215813, + "grad_norm": 2.404599905014038, + "learning_rate": 9.98090196225668e-06, + "loss": 2.9499, + "step": 387950 + }, + { + "epoch": 0.3822419078994034, + "grad_norm": 2.4150235652923584, + "learning_rate": 9.980887735957413e-06, + "loss": 2.9602, + "step": 388000 + }, + { + "epoch": 0.3822911658772255, + "grad_norm": 2.237356662750244, + "learning_rate": 9.980873504371615e-06, + "loss": 2.9024, + "step": 388050 + }, + { + "epoch": 0.3823404238550476, + "grad_norm": 2.144479751586914, + "learning_rate": 9.980859267499306e-06, + "loss": 2.9968, + "step": 388100 + }, + { + "epoch": 0.38238968183286964, + "grad_norm": 2.637641668319702, + "learning_rate": 9.980845025340494e-06, + "loss": 2.8828, + "step": 388150 + }, + { + "epoch": 0.3824389398106917, + "grad_norm": 2.304641008377075, + "learning_rate": 9.9808307778952e-06, + "loss": 2.9123, + "step": 388200 + }, + { + "epoch": 0.38248819778851384, + "grad_norm": 2.3309876918792725, + "learning_rate": 9.980816525163436e-06, + "loss": 2.9351, + "step": 388250 + }, + { + "epoch": 0.3825374557663359, + 
"grad_norm": 2.467850685119629, + "learning_rate": 9.980802267145216e-06, + "loss": 2.9444, + "step": 388300 + }, + { + "epoch": 0.382586713744158, + "grad_norm": 2.334174156188965, + "learning_rate": 9.980788003840557e-06, + "loss": 2.911, + "step": 388350 + }, + { + "epoch": 0.3826359717219801, + "grad_norm": 2.2261812686920166, + "learning_rate": 9.980773735249476e-06, + "loss": 2.9438, + "step": 388400 + }, + { + "epoch": 0.3826852296998022, + "grad_norm": 2.416165590286255, + "learning_rate": 9.980759461371984e-06, + "loss": 2.9935, + "step": 388450 + }, + { + "epoch": 0.38273448767762425, + "grad_norm": 2.4472458362579346, + "learning_rate": 9.9807451822081e-06, + "loss": 2.9617, + "step": 388500 + }, + { + "epoch": 0.3827837456554464, + "grad_norm": 2.564047336578369, + "learning_rate": 9.980730897757837e-06, + "loss": 2.9265, + "step": 388550 + }, + { + "epoch": 0.38283300363326844, + "grad_norm": 2.3975324630737305, + "learning_rate": 9.98071660802121e-06, + "loss": 2.9481, + "step": 388600 + }, + { + "epoch": 0.3828822616110905, + "grad_norm": 2.257317304611206, + "learning_rate": 9.980702312998235e-06, + "loss": 2.931, + "step": 388650 + }, + { + "epoch": 0.38293151958891264, + "grad_norm": 2.3408148288726807, + "learning_rate": 9.980688012688926e-06, + "loss": 2.8911, + "step": 388700 + }, + { + "epoch": 0.3829807775667347, + "grad_norm": 2.4286556243896484, + "learning_rate": 9.9806737070933e-06, + "loss": 2.8983, + "step": 388750 + }, + { + "epoch": 0.3830300355445568, + "grad_norm": 2.3013904094696045, + "learning_rate": 9.98065939621137e-06, + "loss": 2.9112, + "step": 388800 + }, + { + "epoch": 0.3830792935223789, + "grad_norm": 2.1755692958831787, + "learning_rate": 9.980645080043154e-06, + "loss": 2.9062, + "step": 388850 + }, + { + "epoch": 0.383128551500201, + "grad_norm": 2.4384419918060303, + "learning_rate": 9.980630758588665e-06, + "loss": 2.9605, + "step": 388900 + }, + { + "epoch": 0.38317780947802305, + "grad_norm": 2.3144309520721436, + 
"learning_rate": 9.98061643184792e-06, + "loss": 2.9, + "step": 388950 + }, + { + "epoch": 0.38322706745584517, + "grad_norm": 2.4883787631988525, + "learning_rate": 9.980602099820932e-06, + "loss": 2.9426, + "step": 389000 + }, + { + "epoch": 0.38327632543366724, + "grad_norm": 2.3406410217285156, + "learning_rate": 9.980587762507716e-06, + "loss": 2.9644, + "step": 389050 + }, + { + "epoch": 0.3833255834114893, + "grad_norm": 2.0897815227508545, + "learning_rate": 9.980573419908291e-06, + "loss": 2.9962, + "step": 389100 + }, + { + "epoch": 0.38337484138931144, + "grad_norm": 2.359740972518921, + "learning_rate": 9.980559072022668e-06, + "loss": 2.9128, + "step": 389150 + }, + { + "epoch": 0.3834240993671335, + "grad_norm": 2.1455585956573486, + "learning_rate": 9.980544718850864e-06, + "loss": 2.918, + "step": 389200 + }, + { + "epoch": 0.3834733573449556, + "grad_norm": 2.3495569229125977, + "learning_rate": 9.980530360392894e-06, + "loss": 2.8974, + "step": 389250 + }, + { + "epoch": 0.3835226153227777, + "grad_norm": 2.241229772567749, + "learning_rate": 9.980515996648776e-06, + "loss": 2.9793, + "step": 389300 + }, + { + "epoch": 0.3835718733005998, + "grad_norm": 2.5353002548217773, + "learning_rate": 9.98050162761852e-06, + "loss": 2.9812, + "step": 389350 + }, + { + "epoch": 0.38362113127842185, + "grad_norm": 2.265974283218384, + "learning_rate": 9.980487253302145e-06, + "loss": 2.8892, + "step": 389400 + }, + { + "epoch": 0.3836703892562439, + "grad_norm": 2.4591147899627686, + "learning_rate": 9.980472873699663e-06, + "loss": 2.9266, + "step": 389450 + }, + { + "epoch": 0.38371964723406604, + "grad_norm": 2.5734171867370605, + "learning_rate": 9.980458488811093e-06, + "loss": 2.8837, + "step": 389500 + }, + { + "epoch": 0.3837689052118881, + "grad_norm": 2.7331769466400146, + "learning_rate": 9.98044409863645e-06, + "loss": 2.949, + "step": 389550 + }, + { + "epoch": 0.3838181631897102, + "grad_norm": 2.399404287338257, + "learning_rate": 
9.980429703175746e-06, + "loss": 2.9357, + "step": 389600 + }, + { + "epoch": 0.3838674211675323, + "grad_norm": 2.356882095336914, + "learning_rate": 9.980415302428998e-06, + "loss": 2.9902, + "step": 389650 + }, + { + "epoch": 0.3839166791453544, + "grad_norm": 2.3405539989471436, + "learning_rate": 9.980400896396223e-06, + "loss": 2.9746, + "step": 389700 + }, + { + "epoch": 0.38396593712317645, + "grad_norm": 2.1536624431610107, + "learning_rate": 9.980386485077434e-06, + "loss": 2.9951, + "step": 389750 + }, + { + "epoch": 0.3840151951009986, + "grad_norm": 2.2097647190093994, + "learning_rate": 9.98037206847265e-06, + "loss": 2.9882, + "step": 389800 + }, + { + "epoch": 0.38406445307882064, + "grad_norm": 2.235062599182129, + "learning_rate": 9.98035764658188e-06, + "loss": 2.9422, + "step": 389850 + }, + { + "epoch": 0.3841137110566427, + "grad_norm": 2.4241256713867188, + "learning_rate": 9.980343219405143e-06, + "loss": 2.968, + "step": 389900 + }, + { + "epoch": 0.38416296903446484, + "grad_norm": 2.419393301010132, + "learning_rate": 9.980328786942455e-06, + "loss": 2.9558, + "step": 389950 + }, + { + "epoch": 0.3842122270122869, + "grad_norm": 2.4655115604400635, + "learning_rate": 9.980314349193828e-06, + "loss": 2.939, + "step": 390000 + }, + { + "epoch": 0.384261484990109, + "grad_norm": 2.3204212188720703, + "learning_rate": 9.980299906159283e-06, + "loss": 2.9337, + "step": 390050 + }, + { + "epoch": 0.3843107429679311, + "grad_norm": 2.180722713470459, + "learning_rate": 9.98028545783883e-06, + "loss": 2.9663, + "step": 390100 + }, + { + "epoch": 0.3843600009457532, + "grad_norm": 2.111448049545288, + "learning_rate": 9.980271004232487e-06, + "loss": 2.9666, + "step": 390150 + }, + { + "epoch": 0.38440925892357525, + "grad_norm": 2.310415744781494, + "learning_rate": 9.980256545340269e-06, + "loss": 2.9857, + "step": 390200 + }, + { + "epoch": 0.3844585169013974, + "grad_norm": 2.3165862560272217, + "learning_rate": 9.98024208116219e-06, + "loss": 
2.9814, + "step": 390250 + }, + { + "epoch": 0.38450777487921944, + "grad_norm": 2.269986152648926, + "learning_rate": 9.980227611698267e-06, + "loss": 2.9886, + "step": 390300 + }, + { + "epoch": 0.3845570328570415, + "grad_norm": 2.1247880458831787, + "learning_rate": 9.980213136948514e-06, + "loss": 2.9267, + "step": 390350 + }, + { + "epoch": 0.38460629083486364, + "grad_norm": 2.4749484062194824, + "learning_rate": 9.980198656912947e-06, + "loss": 2.9791, + "step": 390400 + }, + { + "epoch": 0.3846555488126857, + "grad_norm": 2.229966402053833, + "learning_rate": 9.980184171591585e-06, + "loss": 2.9246, + "step": 390450 + }, + { + "epoch": 0.3847048067905078, + "grad_norm": 2.3978431224823, + "learning_rate": 9.980169680984435e-06, + "loss": 2.9576, + "step": 390500 + }, + { + "epoch": 0.3847540647683299, + "grad_norm": 2.3180367946624756, + "learning_rate": 9.98015518509152e-06, + "loss": 2.9483, + "step": 390550 + }, + { + "epoch": 0.384803322746152, + "grad_norm": 2.3679287433624268, + "learning_rate": 9.980140683912852e-06, + "loss": 2.8582, + "step": 390600 + }, + { + "epoch": 0.38485258072397405, + "grad_norm": 2.435758113861084, + "learning_rate": 9.980126177448446e-06, + "loss": 2.9315, + "step": 390650 + }, + { + "epoch": 0.3849018387017961, + "grad_norm": 2.3825714588165283, + "learning_rate": 9.980111665698321e-06, + "loss": 2.9572, + "step": 390700 + }, + { + "epoch": 0.38495109667961824, + "grad_norm": 2.192493200302124, + "learning_rate": 9.980097148662486e-06, + "loss": 2.9259, + "step": 390750 + }, + { + "epoch": 0.3850003546574403, + "grad_norm": 2.3830983638763428, + "learning_rate": 9.980082626340964e-06, + "loss": 2.9253, + "step": 390800 + }, + { + "epoch": 0.3850496126352624, + "grad_norm": 2.2133421897888184, + "learning_rate": 9.980068098733764e-06, + "loss": 2.9066, + "step": 390850 + }, + { + "epoch": 0.3850988706130845, + "grad_norm": 2.2772750854492188, + "learning_rate": 9.980053565840905e-06, + "loss": 2.9671, + "step": 390900 + 
}, + { + "epoch": 0.3851481285909066, + "grad_norm": 2.212965726852417, + "learning_rate": 9.980039027662401e-06, + "loss": 2.9365, + "step": 390950 + }, + { + "epoch": 0.38519738656872865, + "grad_norm": 2.531747817993164, + "learning_rate": 9.980024484198268e-06, + "loss": 2.8745, + "step": 391000 + }, + { + "epoch": 0.3852466445465508, + "grad_norm": 2.2650229930877686, + "learning_rate": 9.980009935448522e-06, + "loss": 2.9549, + "step": 391050 + }, + { + "epoch": 0.38529590252437285, + "grad_norm": 2.329008102416992, + "learning_rate": 9.979995381413177e-06, + "loss": 2.9974, + "step": 391100 + }, + { + "epoch": 0.3853451605021949, + "grad_norm": 2.1057419776916504, + "learning_rate": 9.97998082209225e-06, + "loss": 2.8691, + "step": 391150 + }, + { + "epoch": 0.38539441848001704, + "grad_norm": 2.4120092391967773, + "learning_rate": 9.979966257485754e-06, + "loss": 2.9484, + "step": 391200 + }, + { + "epoch": 0.3854436764578391, + "grad_norm": 2.2876040935516357, + "learning_rate": 9.979951687593708e-06, + "loss": 2.8632, + "step": 391250 + }, + { + "epoch": 0.3854929344356612, + "grad_norm": 2.1893272399902344, + "learning_rate": 9.979937112416126e-06, + "loss": 2.8805, + "step": 391300 + }, + { + "epoch": 0.3855421924134833, + "grad_norm": 2.0460052490234375, + "learning_rate": 9.979922531953021e-06, + "loss": 3.0156, + "step": 391350 + }, + { + "epoch": 0.3855914503913054, + "grad_norm": 2.2820894718170166, + "learning_rate": 9.979907946204412e-06, + "loss": 3.0276, + "step": 391400 + }, + { + "epoch": 0.38564070836912745, + "grad_norm": 2.1148838996887207, + "learning_rate": 9.979893355170312e-06, + "loss": 3.0004, + "step": 391450 + }, + { + "epoch": 0.3856899663469496, + "grad_norm": 2.232616662979126, + "learning_rate": 9.979878758850738e-06, + "loss": 3.0147, + "step": 391500 + }, + { + "epoch": 0.38573922432477165, + "grad_norm": 2.332245111465454, + "learning_rate": 9.979864157245706e-06, + "loss": 2.9141, + "step": 391550 + }, + { + "epoch": 
0.3857884823025937, + "grad_norm": 2.227760076522827, + "learning_rate": 9.979849550355229e-06, + "loss": 2.9422, + "step": 391600 + }, + { + "epoch": 0.38583774028041584, + "grad_norm": 2.28507924079895, + "learning_rate": 9.979834938179324e-06, + "loss": 2.9111, + "step": 391650 + }, + { + "epoch": 0.3858869982582379, + "grad_norm": 2.2557334899902344, + "learning_rate": 9.979820320718008e-06, + "loss": 2.874, + "step": 391700 + }, + { + "epoch": 0.38593625623606, + "grad_norm": 2.42170786857605, + "learning_rate": 9.979805697971294e-06, + "loss": 2.9151, + "step": 391750 + }, + { + "epoch": 0.38598551421388205, + "grad_norm": 2.4137589931488037, + "learning_rate": 9.979791069939198e-06, + "loss": 2.9202, + "step": 391800 + }, + { + "epoch": 0.3860347721917042, + "grad_norm": 2.3238813877105713, + "learning_rate": 9.979776436621737e-06, + "loss": 2.9524, + "step": 391850 + }, + { + "epoch": 0.38608403016952625, + "grad_norm": 2.295217514038086, + "learning_rate": 9.979761798018924e-06, + "loss": 2.9432, + "step": 391900 + }, + { + "epoch": 0.3861332881473483, + "grad_norm": 2.3724780082702637, + "learning_rate": 9.979747154130778e-06, + "loss": 2.9694, + "step": 391950 + }, + { + "epoch": 0.38618254612517044, + "grad_norm": 2.45386004447937, + "learning_rate": 9.979732504957312e-06, + "loss": 2.9135, + "step": 392000 + }, + { + "epoch": 0.3862318041029925, + "grad_norm": 2.141925573348999, + "learning_rate": 9.979717850498542e-06, + "loss": 3.0031, + "step": 392050 + }, + { + "epoch": 0.3862810620808146, + "grad_norm": 2.257509708404541, + "learning_rate": 9.979703190754485e-06, + "loss": 2.923, + "step": 392100 + }, + { + "epoch": 0.3863303200586367, + "grad_norm": 2.45166015625, + "learning_rate": 9.979688525725154e-06, + "loss": 2.9685, + "step": 392150 + }, + { + "epoch": 0.3863795780364588, + "grad_norm": 2.0280039310455322, + "learning_rate": 9.979673855410565e-06, + "loss": 2.9469, + "step": 392200 + }, + { + "epoch": 0.38642883601428085, + "grad_norm": 
2.0004684925079346, + "learning_rate": 9.979659179810736e-06, + "loss": 2.9351, + "step": 392250 + }, + { + "epoch": 0.386478093992103, + "grad_norm": 2.286041021347046, + "learning_rate": 9.979644498925681e-06, + "loss": 2.9328, + "step": 392300 + }, + { + "epoch": 0.38652735196992505, + "grad_norm": 2.3231406211853027, + "learning_rate": 9.979629812755416e-06, + "loss": 2.8741, + "step": 392350 + }, + { + "epoch": 0.3865766099477471, + "grad_norm": 2.4011967182159424, + "learning_rate": 9.979615121299954e-06, + "loss": 2.9646, + "step": 392400 + }, + { + "epoch": 0.38662586792556924, + "grad_norm": 2.19826078414917, + "learning_rate": 9.979600424559315e-06, + "loss": 2.912, + "step": 392450 + }, + { + "epoch": 0.3866751259033913, + "grad_norm": 2.4304044246673584, + "learning_rate": 9.979585722533511e-06, + "loss": 2.9234, + "step": 392500 + }, + { + "epoch": 0.3867243838812134, + "grad_norm": 2.1641976833343506, + "learning_rate": 9.97957101522256e-06, + "loss": 2.9399, + "step": 392550 + }, + { + "epoch": 0.3867736418590355, + "grad_norm": 2.334501266479492, + "learning_rate": 9.979556302626478e-06, + "loss": 2.9586, + "step": 392600 + }, + { + "epoch": 0.3868228998368576, + "grad_norm": 2.230888605117798, + "learning_rate": 9.979541584745275e-06, + "loss": 2.8994, + "step": 392650 + }, + { + "epoch": 0.38687215781467965, + "grad_norm": 2.181016206741333, + "learning_rate": 9.979526861578974e-06, + "loss": 2.9058, + "step": 392700 + }, + { + "epoch": 0.3869214157925018, + "grad_norm": 2.206455707550049, + "learning_rate": 9.979512133127588e-06, + "loss": 2.9382, + "step": 392750 + }, + { + "epoch": 0.38697067377032385, + "grad_norm": 2.3430042266845703, + "learning_rate": 9.97949739939113e-06, + "loss": 2.9415, + "step": 392800 + }, + { + "epoch": 0.3870199317481459, + "grad_norm": 2.262279510498047, + "learning_rate": 9.979482660369619e-06, + "loss": 2.939, + "step": 392850 + }, + { + "epoch": 0.38706918972596804, + "grad_norm": 2.2696304321289062, + 
"learning_rate": 9.97946791606307e-06, + "loss": 2.9772, + "step": 392900 + }, + { + "epoch": 0.3871184477037901, + "grad_norm": 2.395158529281616, + "learning_rate": 9.979453166471498e-06, + "loss": 2.936, + "step": 392950 + }, + { + "epoch": 0.3871677056816122, + "grad_norm": 2.3531689643859863, + "learning_rate": 9.979438411594917e-06, + "loss": 2.9255, + "step": 393000 + }, + { + "epoch": 0.38721696365943425, + "grad_norm": 2.2111704349517822, + "learning_rate": 9.979423651433345e-06, + "loss": 2.9817, + "step": 393050 + }, + { + "epoch": 0.3872662216372564, + "grad_norm": 2.2347002029418945, + "learning_rate": 9.979408885986798e-06, + "loss": 2.9635, + "step": 393100 + }, + { + "epoch": 0.38731547961507845, + "grad_norm": 2.2006163597106934, + "learning_rate": 9.97939411525529e-06, + "loss": 2.9003, + "step": 393150 + }, + { + "epoch": 0.3873647375929005, + "grad_norm": 2.330244779586792, + "learning_rate": 9.979379339238836e-06, + "loss": 2.9877, + "step": 393200 + }, + { + "epoch": 0.38741399557072265, + "grad_norm": 2.5020036697387695, + "learning_rate": 9.979364557937454e-06, + "loss": 2.9181, + "step": 393250 + }, + { + "epoch": 0.3874632535485447, + "grad_norm": 2.2434589862823486, + "learning_rate": 9.97934977135116e-06, + "loss": 2.9512, + "step": 393300 + }, + { + "epoch": 0.3875125115263668, + "grad_norm": 2.4554030895233154, + "learning_rate": 9.979334979479968e-06, + "loss": 2.8939, + "step": 393350 + }, + { + "epoch": 0.3875617695041889, + "grad_norm": 2.2982890605926514, + "learning_rate": 9.979320182323894e-06, + "loss": 2.9071, + "step": 393400 + }, + { + "epoch": 0.387611027482011, + "grad_norm": 2.199631929397583, + "learning_rate": 9.979305379882954e-06, + "loss": 2.9793, + "step": 393450 + }, + { + "epoch": 0.38766028545983305, + "grad_norm": 2.4045679569244385, + "learning_rate": 9.979290572157163e-06, + "loss": 2.9522, + "step": 393500 + }, + { + "epoch": 0.3877095434376552, + "grad_norm": 2.3802435398101807, + "learning_rate": 
9.979275759146538e-06, + "loss": 2.9835, + "step": 393550 + }, + { + "epoch": 0.38775880141547725, + "grad_norm": 2.3602936267852783, + "learning_rate": 9.979260940851094e-06, + "loss": 2.9622, + "step": 393600 + }, + { + "epoch": 0.3878080593932993, + "grad_norm": 2.3720545768737793, + "learning_rate": 9.979246117270846e-06, + "loss": 2.968, + "step": 393650 + }, + { + "epoch": 0.38785731737112145, + "grad_norm": 2.466792583465576, + "learning_rate": 9.979231288405808e-06, + "loss": 2.9853, + "step": 393700 + }, + { + "epoch": 0.3879065753489435, + "grad_norm": 2.3967325687408447, + "learning_rate": 9.979216454256002e-06, + "loss": 2.9462, + "step": 393750 + }, + { + "epoch": 0.3879558333267656, + "grad_norm": 2.1932873725891113, + "learning_rate": 9.97920161482144e-06, + "loss": 2.9118, + "step": 393800 + }, + { + "epoch": 0.3880050913045877, + "grad_norm": 2.2917845249176025, + "learning_rate": 9.979186770102136e-06, + "loss": 2.9933, + "step": 393850 + }, + { + "epoch": 0.3880543492824098, + "grad_norm": 2.3030731678009033, + "learning_rate": 9.979171920098107e-06, + "loss": 2.9489, + "step": 393900 + }, + { + "epoch": 0.38810360726023185, + "grad_norm": 2.384669065475464, + "learning_rate": 9.97915706480937e-06, + "loss": 2.9821, + "step": 393950 + }, + { + "epoch": 0.388152865238054, + "grad_norm": 2.3065404891967773, + "learning_rate": 9.979142204235941e-06, + "loss": 3.008, + "step": 394000 + }, + { + "epoch": 0.38820212321587605, + "grad_norm": 2.3541007041931152, + "learning_rate": 9.979127338377834e-06, + "loss": 2.8413, + "step": 394050 + }, + { + "epoch": 0.3882513811936981, + "grad_norm": 2.270364999771118, + "learning_rate": 9.979112467235065e-06, + "loss": 2.9201, + "step": 394100 + }, + { + "epoch": 0.38830063917152025, + "grad_norm": 2.312594175338745, + "learning_rate": 9.97909759080765e-06, + "loss": 2.9364, + "step": 394150 + }, + { + "epoch": 0.3883498971493423, + "grad_norm": 2.14455509185791, + "learning_rate": 9.979082709095607e-06, + 
"loss": 2.91, + "step": 394200 + }, + { + "epoch": 0.3883991551271644, + "grad_norm": 2.552051067352295, + "learning_rate": 9.979067822098949e-06, + "loss": 2.9603, + "step": 394250 + }, + { + "epoch": 0.38844841310498646, + "grad_norm": 2.354018211364746, + "learning_rate": 9.979052929817691e-06, + "loss": 2.9459, + "step": 394300 + }, + { + "epoch": 0.3884976710828086, + "grad_norm": 2.487475633621216, + "learning_rate": 9.979038032251853e-06, + "loss": 2.9601, + "step": 394350 + }, + { + "epoch": 0.38854692906063065, + "grad_norm": 2.2067630290985107, + "learning_rate": 9.979023129401447e-06, + "loss": 2.921, + "step": 394400 + }, + { + "epoch": 0.3885961870384527, + "grad_norm": 2.3017663955688477, + "learning_rate": 9.97900822126649e-06, + "loss": 2.9652, + "step": 394450 + }, + { + "epoch": 0.38864544501627485, + "grad_norm": 2.801220178604126, + "learning_rate": 9.978993307846999e-06, + "loss": 2.9422, + "step": 394500 + }, + { + "epoch": 0.3886947029940969, + "grad_norm": 2.1518797874450684, + "learning_rate": 9.978978389142988e-06, + "loss": 3.0358, + "step": 394550 + }, + { + "epoch": 0.388743960971919, + "grad_norm": 2.2775542736053467, + "learning_rate": 9.978963465154473e-06, + "loss": 2.9091, + "step": 394600 + }, + { + "epoch": 0.3887932189497411, + "grad_norm": 2.538243055343628, + "learning_rate": 9.97894853588147e-06, + "loss": 2.8619, + "step": 394650 + }, + { + "epoch": 0.3888424769275632, + "grad_norm": 2.184316635131836, + "learning_rate": 9.978933601323997e-06, + "loss": 2.9463, + "step": 394700 + }, + { + "epoch": 0.38889173490538526, + "grad_norm": 2.2088029384613037, + "learning_rate": 9.978918661482069e-06, + "loss": 2.9625, + "step": 394750 + }, + { + "epoch": 0.3889409928832074, + "grad_norm": 2.1962997913360596, + "learning_rate": 9.9789037163557e-06, + "loss": 2.9286, + "step": 394800 + }, + { + "epoch": 0.38899025086102945, + "grad_norm": 2.1585006713867188, + "learning_rate": 9.978888765944905e-06, + "loss": 2.9684, + "step": 394850 
+ }, + { + "epoch": 0.3890395088388515, + "grad_norm": 2.433762788772583, + "learning_rate": 9.978873810249704e-06, + "loss": 2.8823, + "step": 394900 + }, + { + "epoch": 0.38908876681667365, + "grad_norm": 2.336325168609619, + "learning_rate": 9.978858849270109e-06, + "loss": 2.9762, + "step": 394950 + }, + { + "epoch": 0.3891380247944957, + "grad_norm": 2.327550172805786, + "learning_rate": 9.978843883006138e-06, + "loss": 2.9386, + "step": 395000 + }, + { + "epoch": 0.3891872827723178, + "grad_norm": 2.145416021347046, + "learning_rate": 9.978828911457807e-06, + "loss": 2.96, + "step": 395050 + }, + { + "epoch": 0.3892365407501399, + "grad_norm": 2.2875404357910156, + "learning_rate": 9.97881393462513e-06, + "loss": 2.95, + "step": 395100 + }, + { + "epoch": 0.389285798727962, + "grad_norm": 2.2477335929870605, + "learning_rate": 9.978798952508125e-06, + "loss": 2.9663, + "step": 395150 + }, + { + "epoch": 0.38933505670578405, + "grad_norm": 2.6066479682922363, + "learning_rate": 9.978783965106808e-06, + "loss": 2.9516, + "step": 395200 + }, + { + "epoch": 0.3893843146836062, + "grad_norm": 2.2401375770568848, + "learning_rate": 9.978768972421192e-06, + "loss": 2.8868, + "step": 395250 + }, + { + "epoch": 0.38943357266142825, + "grad_norm": 2.476761817932129, + "learning_rate": 9.978753974451297e-06, + "loss": 2.919, + "step": 395300 + }, + { + "epoch": 0.3894828306392503, + "grad_norm": 2.2410004138946533, + "learning_rate": 9.978738971197136e-06, + "loss": 2.9159, + "step": 395350 + }, + { + "epoch": 0.38953208861707245, + "grad_norm": 2.38810396194458, + "learning_rate": 9.978723962658724e-06, + "loss": 2.968, + "step": 395400 + }, + { + "epoch": 0.3895813465948945, + "grad_norm": 2.720501184463501, + "learning_rate": 9.97870894883608e-06, + "loss": 2.9265, + "step": 395450 + }, + { + "epoch": 0.3896306045727166, + "grad_norm": 2.3998937606811523, + "learning_rate": 9.97869392972922e-06, + "loss": 2.9485, + "step": 395500 + }, + { + "epoch": 
0.38967986255053866, + "grad_norm": 2.3676793575286865, + "learning_rate": 9.978678905338158e-06, + "loss": 2.9389, + "step": 395550 + }, + { + "epoch": 0.3897291205283608, + "grad_norm": 2.835904359817505, + "learning_rate": 9.978663875662909e-06, + "loss": 2.9099, + "step": 395600 + }, + { + "epoch": 0.38977837850618285, + "grad_norm": 2.4374372959136963, + "learning_rate": 9.978648840703491e-06, + "loss": 2.9211, + "step": 395650 + }, + { + "epoch": 0.3898276364840049, + "grad_norm": 2.2885894775390625, + "learning_rate": 9.978633800459918e-06, + "loss": 2.9723, + "step": 395700 + }, + { + "epoch": 0.38987689446182705, + "grad_norm": 2.2738661766052246, + "learning_rate": 9.97861875493221e-06, + "loss": 2.9602, + "step": 395750 + }, + { + "epoch": 0.3899261524396491, + "grad_norm": 2.3410656452178955, + "learning_rate": 9.978603704120379e-06, + "loss": 2.9655, + "step": 395800 + }, + { + "epoch": 0.3899754104174712, + "grad_norm": 2.549328565597534, + "learning_rate": 9.978588648024444e-06, + "loss": 2.9178, + "step": 395850 + }, + { + "epoch": 0.3900246683952933, + "grad_norm": 2.274117946624756, + "learning_rate": 9.978573586644417e-06, + "loss": 2.9096, + "step": 395900 + }, + { + "epoch": 0.3900739263731154, + "grad_norm": 2.539698362350464, + "learning_rate": 9.978558519980317e-06, + "loss": 2.9022, + "step": 395950 + }, + { + "epoch": 0.39012318435093746, + "grad_norm": 2.4561045169830322, + "learning_rate": 9.978543448032159e-06, + "loss": 3.001, + "step": 396000 + }, + { + "epoch": 0.3901724423287596, + "grad_norm": 2.276322364807129, + "learning_rate": 9.97852837079996e-06, + "loss": 2.9612, + "step": 396050 + }, + { + "epoch": 0.39022170030658165, + "grad_norm": 2.2897722721099854, + "learning_rate": 9.978513288283735e-06, + "loss": 2.8889, + "step": 396100 + }, + { + "epoch": 0.3902709582844037, + "grad_norm": 2.2371175289154053, + "learning_rate": 9.978498200483499e-06, + "loss": 2.9893, + "step": 396150 + }, + { + "epoch": 0.39032021626222585, + 
"grad_norm": 2.3963043689727783, + "learning_rate": 9.97848310739927e-06, + "loss": 2.8752, + "step": 396200 + }, + { + "epoch": 0.3903694742400479, + "grad_norm": 2.3227274417877197, + "learning_rate": 9.978468009031064e-06, + "loss": 2.8799, + "step": 396250 + }, + { + "epoch": 0.39041873221787, + "grad_norm": 2.6712234020233154, + "learning_rate": 9.978452905378896e-06, + "loss": 2.9923, + "step": 396300 + }, + { + "epoch": 0.3904679901956921, + "grad_norm": 2.091886281967163, + "learning_rate": 9.978437796442782e-06, + "loss": 2.8841, + "step": 396350 + }, + { + "epoch": 0.3905172481735142, + "grad_norm": 2.4216017723083496, + "learning_rate": 9.978422682222739e-06, + "loss": 2.952, + "step": 396400 + }, + { + "epoch": 0.39056650615133626, + "grad_norm": 2.2659919261932373, + "learning_rate": 9.978407562718781e-06, + "loss": 2.9838, + "step": 396450 + }, + { + "epoch": 0.3906157641291584, + "grad_norm": 2.1451048851013184, + "learning_rate": 9.978392437930927e-06, + "loss": 2.9622, + "step": 396500 + }, + { + "epoch": 0.39066502210698045, + "grad_norm": 2.209489345550537, + "learning_rate": 9.978377307859191e-06, + "loss": 2.9325, + "step": 396550 + }, + { + "epoch": 0.3907142800848025, + "grad_norm": 2.251373052597046, + "learning_rate": 9.97836217250359e-06, + "loss": 2.9034, + "step": 396600 + }, + { + "epoch": 0.39076353806262465, + "grad_norm": 2.5070114135742188, + "learning_rate": 9.978347031864139e-06, + "loss": 2.8591, + "step": 396650 + }, + { + "epoch": 0.3908127960404467, + "grad_norm": 2.1659536361694336, + "learning_rate": 9.978331885940856e-06, + "loss": 2.8722, + "step": 396700 + }, + { + "epoch": 0.3908620540182688, + "grad_norm": 2.306225299835205, + "learning_rate": 9.978316734733754e-06, + "loss": 2.978, + "step": 396750 + }, + { + "epoch": 0.39091131199609086, + "grad_norm": 2.2403042316436768, + "learning_rate": 9.978301578242852e-06, + "loss": 2.962, + "step": 396800 + }, + { + "epoch": 0.390960569973913, + "grad_norm": 
2.1655938625335693, + "learning_rate": 9.978286416468164e-06, + "loss": 2.879, + "step": 396850 + }, + { + "epoch": 0.39100982795173506, + "grad_norm": 2.3574774265289307, + "learning_rate": 9.978271249409708e-06, + "loss": 2.9399, + "step": 396900 + }, + { + "epoch": 0.3910590859295571, + "grad_norm": 2.4231810569763184, + "learning_rate": 9.978256077067498e-06, + "loss": 2.9583, + "step": 396950 + }, + { + "epoch": 0.39110834390737925, + "grad_norm": 2.3082828521728516, + "learning_rate": 9.978240899441552e-06, + "loss": 2.9233, + "step": 397000 + }, + { + "epoch": 0.3911576018852013, + "grad_norm": 2.370938539505005, + "learning_rate": 9.978225716531886e-06, + "loss": 2.9713, + "step": 397050 + }, + { + "epoch": 0.3912068598630234, + "grad_norm": 2.240556001663208, + "learning_rate": 9.978210528338516e-06, + "loss": 2.8558, + "step": 397100 + }, + { + "epoch": 0.3912561178408455, + "grad_norm": 2.2416858673095703, + "learning_rate": 9.978195334861455e-06, + "loss": 2.8649, + "step": 397150 + }, + { + "epoch": 0.3913053758186676, + "grad_norm": 2.497138500213623, + "learning_rate": 9.978180136100724e-06, + "loss": 2.9395, + "step": 397200 + }, + { + "epoch": 0.39135463379648966, + "grad_norm": 2.19053053855896, + "learning_rate": 9.978164932056336e-06, + "loss": 2.9558, + "step": 397250 + }, + { + "epoch": 0.3914038917743118, + "grad_norm": 2.421168565750122, + "learning_rate": 9.978149722728309e-06, + "loss": 2.9061, + "step": 397300 + }, + { + "epoch": 0.39145314975213386, + "grad_norm": 2.1900625228881836, + "learning_rate": 9.978134508116656e-06, + "loss": 2.9542, + "step": 397350 + }, + { + "epoch": 0.3915024077299559, + "grad_norm": 2.2385172843933105, + "learning_rate": 9.978119288221397e-06, + "loss": 2.9121, + "step": 397400 + }, + { + "epoch": 0.39155166570777805, + "grad_norm": 2.3285586833953857, + "learning_rate": 9.978104063042544e-06, + "loss": 2.9024, + "step": 397450 + }, + { + "epoch": 0.3916009236856001, + "grad_norm": 2.2399396896362305, + 
"learning_rate": 9.978088832580118e-06, + "loss": 2.9237, + "step": 397500 + }, + { + "epoch": 0.3916501816634222, + "grad_norm": 2.3268065452575684, + "learning_rate": 9.978073596834134e-06, + "loss": 2.9792, + "step": 397550 + }, + { + "epoch": 0.3916994396412443, + "grad_norm": 2.268904447555542, + "learning_rate": 9.978058355804604e-06, + "loss": 2.9624, + "step": 397600 + }, + { + "epoch": 0.3917486976190664, + "grad_norm": 2.361743688583374, + "learning_rate": 9.978043109491549e-06, + "loss": 3.0123, + "step": 397650 + }, + { + "epoch": 0.39179795559688846, + "grad_norm": 2.2938709259033203, + "learning_rate": 9.978027857894982e-06, + "loss": 2.9391, + "step": 397700 + }, + { + "epoch": 0.3918472135747106, + "grad_norm": 2.1634230613708496, + "learning_rate": 9.97801260101492e-06, + "loss": 2.9888, + "step": 397750 + }, + { + "epoch": 0.39189647155253265, + "grad_norm": 2.225513458251953, + "learning_rate": 9.977997338851382e-06, + "loss": 2.8892, + "step": 397800 + }, + { + "epoch": 0.3919457295303547, + "grad_norm": 2.2949130535125732, + "learning_rate": 9.97798207140438e-06, + "loss": 2.9009, + "step": 397850 + }, + { + "epoch": 0.39199498750817685, + "grad_norm": 2.3509914875030518, + "learning_rate": 9.977966798673933e-06, + "loss": 2.9352, + "step": 397900 + }, + { + "epoch": 0.3920442454859989, + "grad_norm": 2.3153235912323, + "learning_rate": 9.977951520660056e-06, + "loss": 2.9001, + "step": 397950 + }, + { + "epoch": 0.392093503463821, + "grad_norm": 2.3168156147003174, + "learning_rate": 9.977936237362765e-06, + "loss": 2.9225, + "step": 398000 + }, + { + "epoch": 0.39214276144164306, + "grad_norm": 2.1682627201080322, + "learning_rate": 9.977920948782078e-06, + "loss": 2.8744, + "step": 398050 + }, + { + "epoch": 0.3921920194194652, + "grad_norm": 2.2787418365478516, + "learning_rate": 9.977905654918008e-06, + "loss": 2.861, + "step": 398100 + }, + { + "epoch": 0.39224127739728726, + "grad_norm": 2.220623016357422, + "learning_rate": 
9.977890355770576e-06, + "loss": 2.9313, + "step": 398150 + }, + { + "epoch": 0.39229053537510933, + "grad_norm": 2.3804590702056885, + "learning_rate": 9.977875051339793e-06, + "loss": 2.9003, + "step": 398200 + }, + { + "epoch": 0.39233979335293145, + "grad_norm": 2.463669538497925, + "learning_rate": 9.977859741625678e-06, + "loss": 2.9559, + "step": 398250 + }, + { + "epoch": 0.3923890513307535, + "grad_norm": 2.3708572387695312, + "learning_rate": 9.977844426628247e-06, + "loss": 2.9613, + "step": 398300 + }, + { + "epoch": 0.3924383093085756, + "grad_norm": 2.1908140182495117, + "learning_rate": 9.977829106347516e-06, + "loss": 2.9203, + "step": 398350 + }, + { + "epoch": 0.3924875672863977, + "grad_norm": 2.513256072998047, + "learning_rate": 9.977813780783502e-06, + "loss": 2.9374, + "step": 398400 + }, + { + "epoch": 0.3925368252642198, + "grad_norm": 2.361107110977173, + "learning_rate": 9.97779844993622e-06, + "loss": 2.9556, + "step": 398450 + }, + { + "epoch": 0.39258608324204186, + "grad_norm": 2.299482822418213, + "learning_rate": 9.977783113805688e-06, + "loss": 2.9135, + "step": 398500 + }, + { + "epoch": 0.392635341219864, + "grad_norm": 2.1812620162963867, + "learning_rate": 9.97776777239192e-06, + "loss": 2.9223, + "step": 398550 + }, + { + "epoch": 0.39268459919768606, + "grad_norm": 2.2581756114959717, + "learning_rate": 9.977752425694936e-06, + "loss": 2.9202, + "step": 398600 + }, + { + "epoch": 0.3927338571755081, + "grad_norm": 2.318518877029419, + "learning_rate": 9.977737073714747e-06, + "loss": 3.0141, + "step": 398650 + }, + { + "epoch": 0.39278311515333025, + "grad_norm": 2.2614905834198, + "learning_rate": 9.977721716451373e-06, + "loss": 2.9381, + "step": 398700 + }, + { + "epoch": 0.3928323731311523, + "grad_norm": 2.1833560466766357, + "learning_rate": 9.97770635390483e-06, + "loss": 2.9181, + "step": 398750 + }, + { + "epoch": 0.3928816311089744, + "grad_norm": 2.477851152420044, + "learning_rate": 9.977690986075134e-06, + 
"loss": 2.9414, + "step": 398800 + }, + { + "epoch": 0.3929308890867965, + "grad_norm": 2.2162816524505615, + "learning_rate": 9.9776756129623e-06, + "loss": 2.9049, + "step": 398850 + }, + { + "epoch": 0.3929801470646186, + "grad_norm": 2.256434679031372, + "learning_rate": 9.977660234566345e-06, + "loss": 2.9332, + "step": 398900 + }, + { + "epoch": 0.39302940504244066, + "grad_norm": 2.2473368644714355, + "learning_rate": 9.977644850887285e-06, + "loss": 2.9607, + "step": 398950 + }, + { + "epoch": 0.3930786630202628, + "grad_norm": Infinity, + "learning_rate": 9.97762946192514e-06, + "loss": 2.9444, + "step": 399000 + }, + { + "epoch": 0.39312792099808486, + "grad_norm": 2.2653214931488037, + "learning_rate": 9.977614067679922e-06, + "loss": 2.8776, + "step": 399050 + }, + { + "epoch": 0.3931771789759069, + "grad_norm": 2.21346116065979, + "learning_rate": 9.977598668151648e-06, + "loss": 2.9114, + "step": 399100 + }, + { + "epoch": 0.39322643695372905, + "grad_norm": 2.409407377243042, + "learning_rate": 9.977583263340335e-06, + "loss": 3.0093, + "step": 399150 + }, + { + "epoch": 0.3932756949315511, + "grad_norm": 2.3705365657806396, + "learning_rate": 9.977567853245999e-06, + "loss": 2.9951, + "step": 399200 + }, + { + "epoch": 0.3933249529093732, + "grad_norm": 2.5002593994140625, + "learning_rate": 9.977552437868657e-06, + "loss": 2.98, + "step": 399250 + }, + { + "epoch": 0.39337421088719526, + "grad_norm": 2.295125722885132, + "learning_rate": 9.977537017208327e-06, + "loss": 2.8861, + "step": 399300 + }, + { + "epoch": 0.3934234688650174, + "grad_norm": 2.2658581733703613, + "learning_rate": 9.977521591265021e-06, + "loss": 2.9057, + "step": 399350 + }, + { + "epoch": 0.39347272684283946, + "grad_norm": 2.1383934020996094, + "learning_rate": 9.977506160038758e-06, + "loss": 2.8123, + "step": 399400 + }, + { + "epoch": 0.39352198482066153, + "grad_norm": 2.384601593017578, + "learning_rate": 9.977490723529555e-06, + "loss": 2.8951, + "step": 399450 + }, 
+ { + "epoch": 0.39357124279848366, + "grad_norm": 2.2938709259033203, + "learning_rate": 9.977475281737428e-06, + "loss": 2.961, + "step": 399500 + }, + { + "epoch": 0.3936205007763057, + "grad_norm": 2.4790265560150146, + "learning_rate": 9.977459834662392e-06, + "loss": 2.9497, + "step": 399550 + }, + { + "epoch": 0.3936697587541278, + "grad_norm": 2.2960147857666016, + "learning_rate": 9.977444382304466e-06, + "loss": 2.914, + "step": 399600 + }, + { + "epoch": 0.3937190167319499, + "grad_norm": 2.3548178672790527, + "learning_rate": 9.977428924663663e-06, + "loss": 2.8769, + "step": 399650 + }, + { + "epoch": 0.393768274709772, + "grad_norm": 2.4988021850585938, + "learning_rate": 9.977413461740002e-06, + "loss": 2.9032, + "step": 399700 + }, + { + "epoch": 0.39381753268759406, + "grad_norm": 2.2795305252075195, + "learning_rate": 9.977397993533498e-06, + "loss": 2.9452, + "step": 399750 + }, + { + "epoch": 0.3938667906654162, + "grad_norm": 2.4802985191345215, + "learning_rate": 9.977382520044168e-06, + "loss": 2.8146, + "step": 399800 + }, + { + "epoch": 0.39391604864323826, + "grad_norm": 2.148482322692871, + "learning_rate": 9.977367041272028e-06, + "loss": 2.9883, + "step": 399850 + }, + { + "epoch": 0.39396530662106033, + "grad_norm": 2.322467565536499, + "learning_rate": 9.977351557217095e-06, + "loss": 2.9484, + "step": 399900 + }, + { + "epoch": 0.39401456459888246, + "grad_norm": 2.3714652061462402, + "learning_rate": 9.977336067879388e-06, + "loss": 2.9774, + "step": 399950 + }, + { + "epoch": 0.3940638225767045, + "grad_norm": 2.3027822971343994, + "learning_rate": 9.977320573258917e-06, + "loss": 2.9511, + "step": 400000 + }, + { + "epoch": 0.3941130805545266, + "grad_norm": 2.3692619800567627, + "learning_rate": 9.977305073355703e-06, + "loss": 2.9347, + "step": 400050 + }, + { + "epoch": 0.3941623385323487, + "grad_norm": 2.2894206047058105, + "learning_rate": 9.977289568169762e-06, + "loss": 2.9267, + "step": 400100 + }, + { + "epoch": 
0.3942115965101708, + "grad_norm": 2.2817955017089844, + "learning_rate": 9.977274057701112e-06, + "loss": 2.9367, + "step": 400150 + }, + { + "epoch": 0.39426085448799286, + "grad_norm": 2.12579607963562, + "learning_rate": 9.977258541949766e-06, + "loss": 2.8881, + "step": 400200 + }, + { + "epoch": 0.394310112465815, + "grad_norm": 2.5247082710266113, + "learning_rate": 9.977243020915741e-06, + "loss": 2.9906, + "step": 400250 + }, + { + "epoch": 0.39435937044363706, + "grad_norm": 2.270658016204834, + "learning_rate": 9.977227494599056e-06, + "loss": 2.9566, + "step": 400300 + }, + { + "epoch": 0.39440862842145913, + "grad_norm": 2.174466848373413, + "learning_rate": 9.977211962999725e-06, + "loss": 3.0, + "step": 400350 + }, + { + "epoch": 0.39445788639928125, + "grad_norm": 2.253131151199341, + "learning_rate": 9.977196426117765e-06, + "loss": 2.9104, + "step": 400400 + }, + { + "epoch": 0.3945071443771033, + "grad_norm": 2.2439017295837402, + "learning_rate": 9.977180883953194e-06, + "loss": 2.9818, + "step": 400450 + }, + { + "epoch": 0.3945564023549254, + "grad_norm": 2.3574743270874023, + "learning_rate": 9.977165336506028e-06, + "loss": 2.8921, + "step": 400500 + }, + { + "epoch": 0.39460566033274747, + "grad_norm": 2.3558509349823, + "learning_rate": 9.977149783776282e-06, + "loss": 2.8732, + "step": 400550 + }, + { + "epoch": 0.3946549183105696, + "grad_norm": 2.0467140674591064, + "learning_rate": 9.977134225763973e-06, + "loss": 2.9146, + "step": 400600 + }, + { + "epoch": 0.39470417628839166, + "grad_norm": 2.331331729888916, + "learning_rate": 9.977118662469119e-06, + "loss": 2.9554, + "step": 400650 + }, + { + "epoch": 0.39475343426621373, + "grad_norm": 2.303225040435791, + "learning_rate": 9.977103093891733e-06, + "loss": 2.8676, + "step": 400700 + }, + { + "epoch": 0.39480269224403586, + "grad_norm": 2.2612719535827637, + "learning_rate": 9.977087520031837e-06, + "loss": 3.0032, + "step": 400750 + }, + { + "epoch": 0.39485195022185793, + 
"grad_norm": 2.3666911125183105, + "learning_rate": 9.977071940889443e-06, + "loss": 2.9534, + "step": 400800 + } + ], + "logging_steps": 50, + "max_steps": 5075320, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 10150, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.7226592062149755e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}