{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 21440, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011660447761194029, "grad_norm": 2.31470614163852, "learning_rate": 2.3320895522388058e-07, "loss": 0.859, "step": 5 }, { "epoch": 0.0023320895522388058, "grad_norm": 2.1806524067205517, "learning_rate": 4.6641791044776116e-07, "loss": 0.8409, "step": 10 }, { "epoch": 0.003498134328358209, "grad_norm": 2.0336016383542637, "learning_rate": 6.996268656716418e-07, "loss": 0.8403, "step": 15 }, { "epoch": 0.0046641791044776115, "grad_norm": 1.8107765601305243, "learning_rate": 9.328358208955223e-07, "loss": 0.8402, "step": 20 }, { "epoch": 0.005830223880597015, "grad_norm": 1.5214305765409384, "learning_rate": 1.1660447761194032e-06, "loss": 0.8399, "step": 25 }, { "epoch": 0.006996268656716418, "grad_norm": 1.3729288900773917, "learning_rate": 1.3992537313432837e-06, "loss": 0.8529, "step": 30 }, { "epoch": 0.00816231343283582, "grad_norm": 1.110318324330149, "learning_rate": 1.6324626865671642e-06, "loss": 0.8044, "step": 35 }, { "epoch": 0.009328358208955223, "grad_norm": 0.91480035139308, "learning_rate": 1.8656716417910446e-06, "loss": 0.7846, "step": 40 }, { "epoch": 0.010494402985074628, "grad_norm": 0.8043185199854158, "learning_rate": 2.0988805970149257e-06, "loss": 0.771, "step": 45 }, { "epoch": 0.01166044776119403, "grad_norm": 0.7383123349724773, "learning_rate": 2.3320895522388064e-06, "loss": 0.7736, "step": 50 }, { "epoch": 0.012826492537313433, "grad_norm": 0.7490548326880867, "learning_rate": 2.5652985074626867e-06, "loss": 0.736, "step": 55 }, { "epoch": 0.013992537313432836, "grad_norm": 0.6375751886591362, "learning_rate": 2.7985074626865674e-06, "loss": 0.7309, "step": 60 }, { "epoch": 0.01515858208955224, "grad_norm": 0.6475768724712477, "learning_rate": 3.031716417910448e-06, "loss": 0.711, "step": 65 }, { "epoch": 0.01632462686567164, "grad_norm": 0.6106272866800404, "learning_rate": 3.2649253731343283e-06, "loss": 0.7357, "step": 70 }, { "epoch": 0.017490671641791043, "grad_norm": 0.6589391454469965, "learning_rate": 3.498134328358209e-06, "loss": 0.7276, "step": 75 }, { "epoch": 0.018656716417910446, "grad_norm": 0.6409686727344286, "learning_rate": 3.7313432835820893e-06, "loss": 0.6865, "step": 80 }, { "epoch": 0.019822761194029852, "grad_norm": 0.6336098527814419, "learning_rate": 3.96455223880597e-06, "loss": 0.6994, "step": 85 }, { "epoch": 0.020988805970149255, "grad_norm": 0.7346724850292417, "learning_rate": 4.1977611940298515e-06, "loss": 0.7019, "step": 90 }, { "epoch": 0.022154850746268658, "grad_norm": 0.6583598026893135, "learning_rate": 4.430970149253732e-06, "loss": 0.6827, "step": 95 }, { "epoch": 0.02332089552238806, "grad_norm": 0.5990590447236036, "learning_rate": 4.664179104477613e-06, "loss": 0.6674, "step": 100 }, { "epoch": 0.024486940298507464, "grad_norm": 0.5795709703569923, "learning_rate": 4.897388059701493e-06, "loss": 0.6599, "step": 105 }, { "epoch": 0.025652985074626867, "grad_norm": 0.5892028434107501, "learning_rate": 5.130597014925373e-06, "loss": 0.6777, "step": 110 }, { "epoch": 0.02681902985074627, "grad_norm": 0.6253265633727113, "learning_rate": 5.3638059701492545e-06, "loss": 0.6602, "step": 115 }, { "epoch": 0.027985074626865673, "grad_norm": 0.5779782549880188, "learning_rate": 5.597014925373135e-06, "loss": 0.6746, "step": 120 }, { "epoch": 0.029151119402985076, "grad_norm": 0.5651696911532211, "learning_rate": 5.830223880597015e-06, "loss": 0.6584, "step": 125 }, { "epoch": 0.03031716417910448, "grad_norm": 0.5548638684876713, "learning_rate": 6.063432835820896e-06, "loss": 0.6538, "step": 130 }, { "epoch": 0.03148320895522388, "grad_norm": 0.5527878439636197, "learning_rate": 6.2966417910447755e-06, "loss": 0.6355, "step": 135 }, { "epoch": 0.03264925373134328, "grad_norm": 0.5588523850308228, "learning_rate": 6.529850746268657e-06, "loss": 0.6243, "step": 140 }, { "epoch": 0.033815298507462684, "grad_norm": 0.5680280651347817, "learning_rate": 6.763059701492537e-06, "loss": 0.6406, "step": 145 }, { "epoch": 0.034981343283582086, "grad_norm": 0.5680483228113563, "learning_rate": 6.996268656716418e-06, "loss": 0.6318, "step": 150 }, { "epoch": 0.03614738805970149, "grad_norm": 0.562964090892077, "learning_rate": 7.229477611940298e-06, "loss": 0.6278, "step": 155 }, { "epoch": 0.03731343283582089, "grad_norm": 0.635553701478694, "learning_rate": 7.4626865671641785e-06, "loss": 0.658, "step": 160 }, { "epoch": 0.038479477611940295, "grad_norm": 0.6557036509575375, "learning_rate": 7.69589552238806e-06, "loss": 0.6405, "step": 165 }, { "epoch": 0.039645522388059705, "grad_norm": 0.5867982800908484, "learning_rate": 7.92910447761194e-06, "loss": 0.633, "step": 170 }, { "epoch": 0.04081156716417911, "grad_norm": 0.6141705031654435, "learning_rate": 8.162313432835822e-06, "loss": 0.6229, "step": 175 }, { "epoch": 0.04197761194029851, "grad_norm": 0.6248136285598961, "learning_rate": 8.395522388059703e-06, "loss": 0.6502, "step": 180 }, { "epoch": 0.043143656716417914, "grad_norm": 0.6019477617952123, "learning_rate": 8.628731343283582e-06, "loss": 0.6216, "step": 185 }, { "epoch": 0.044309701492537316, "grad_norm": 0.6240291503277375, "learning_rate": 8.861940298507463e-06, "loss": 0.6232, "step": 190 }, { "epoch": 0.04547574626865672, "grad_norm": 0.5871751457850535, "learning_rate": 9.095149253731345e-06, "loss": 0.6121, "step": 195 }, { "epoch": 0.04664179104477612, "grad_norm": 0.5469566198329419, "learning_rate": 9.328358208955226e-06, "loss": 0.6208, "step": 200 }, { "epoch": 0.047807835820895525, "grad_norm": 0.620559082451502, "learning_rate": 9.561567164179105e-06, "loss": 0.6371, "step": 205 }, { "epoch": 0.04897388059701493, "grad_norm": 0.6006349876167401, "learning_rate": 9.794776119402986e-06, "loss": 0.6478, "step": 210 }, { "epoch": 0.05013992537313433, "grad_norm": 0.5744573514050928, "learning_rate": 1.0027985074626867e-05, "loss": 0.6437, "step": 215 }, { "epoch": 0.051305970149253734, "grad_norm": 0.6132138150827955, "learning_rate": 1.0261194029850747e-05, "loss": 0.6296, "step": 220 }, { "epoch": 0.05247201492537314, "grad_norm": 0.6550090348688805, "learning_rate": 1.0494402985074628e-05, "loss": 0.6359, "step": 225 }, { "epoch": 0.05363805970149254, "grad_norm": 0.6070565138983731, "learning_rate": 1.0727611940298509e-05, "loss": 0.636, "step": 230 }, { "epoch": 0.05480410447761194, "grad_norm": 0.612999711828607, "learning_rate": 1.0960820895522388e-05, "loss": 0.6391, "step": 235 }, { "epoch": 0.055970149253731345, "grad_norm": 0.636063155653856, "learning_rate": 1.119402985074627e-05, "loss": 0.6206, "step": 240 }, { "epoch": 0.05713619402985075, "grad_norm": 0.5896975464405161, "learning_rate": 1.142723880597015e-05, "loss": 0.6052, "step": 245 }, { "epoch": 0.05830223880597015, "grad_norm": 0.5590212693105078, "learning_rate": 1.166044776119403e-05, "loss": 0.62, "step": 250 }, { "epoch": 0.059468283582089554, "grad_norm": 0.6286006057858667, "learning_rate": 1.1893656716417911e-05, "loss": 0.6121, "step": 255 }, { "epoch": 0.06063432835820896, "grad_norm": 0.6319032629223692, "learning_rate": 1.2126865671641792e-05, "loss": 0.6111, "step": 260 }, { "epoch": 0.06180037313432836, "grad_norm": 0.6227274913468611, "learning_rate": 1.2360074626865673e-05, "loss": 0.667, "step": 265 }, { "epoch": 0.06296641791044776, "grad_norm": 0.6291796320514405, "learning_rate": 1.2593283582089551e-05, "loss": 0.645, "step": 270 }, { "epoch": 0.06413246268656717, "grad_norm": 0.5883497046558631, "learning_rate": 1.2826492537313434e-05, "loss": 0.6205, "step": 275 }, { "epoch": 0.06529850746268656, "grad_norm": 0.6117993081700775, "learning_rate": 1.3059701492537313e-05, "loss": 0.633, "step": 280 }, { "epoch": 0.06646455223880597, "grad_norm": 0.6115079820522081, "learning_rate": 1.3292910447761194e-05, "loss": 0.6186, "step": 285 }, { "epoch": 0.06763059701492537, "grad_norm": 0.6264704135890352, "learning_rate": 1.3526119402985074e-05, "loss": 0.6043, "step": 290 }, { "epoch": 0.06879664179104478, "grad_norm": 0.6143832560745185, "learning_rate": 1.3759328358208957e-05, "loss": 0.6327, "step": 295 }, { "epoch": 0.06996268656716417, "grad_norm": 0.6263813560874937, "learning_rate": 1.3992537313432836e-05, "loss": 0.6209, "step": 300 }, { "epoch": 0.07112873134328358, "grad_norm": 0.5709188077991506, "learning_rate": 1.4225746268656717e-05, "loss": 0.6292, "step": 305 }, { "epoch": 0.07229477611940298, "grad_norm": 0.6034458566222612, "learning_rate": 1.4458955223880596e-05, "loss": 0.5954, "step": 310 }, { "epoch": 0.07346082089552239, "grad_norm": 0.6105447525677008, "learning_rate": 1.4692164179104478e-05, "loss": 0.6142, "step": 315 }, { "epoch": 0.07462686567164178, "grad_norm": 0.6082492860147549, "learning_rate": 1.4925373134328357e-05, "loss": 0.6181, "step": 320 }, { "epoch": 0.0757929104477612, "grad_norm": 0.6127880023341611, "learning_rate": 1.515858208955224e-05, "loss": 0.6181, "step": 325 }, { "epoch": 0.07695895522388059, "grad_norm": 0.5851972780654783, "learning_rate": 1.539179104477612e-05, "loss": 0.6136, "step": 330 }, { "epoch": 0.078125, "grad_norm": 0.6325445573033428, "learning_rate": 1.5625e-05, "loss": 0.5949, "step": 335 }, { "epoch": 0.07929104477611941, "grad_norm": 0.6544545895796555, "learning_rate": 1.585820895522388e-05, "loss": 0.5996, "step": 340 }, { "epoch": 0.0804570895522388, "grad_norm": 0.6136147810235407, "learning_rate": 1.6091417910447763e-05, "loss": 0.5878, "step": 345 }, { "epoch": 0.08162313432835822, "grad_norm": 0.6292253029881693, "learning_rate": 1.6324626865671644e-05, "loss": 0.5868, "step": 350 }, { "epoch": 0.08278917910447761, "grad_norm": 0.6639712455230693, "learning_rate": 1.6557835820895525e-05, "loss": 0.6326, "step": 355 }, { "epoch": 0.08395522388059702, "grad_norm": 0.6149576522483596, "learning_rate": 1.6791044776119406e-05, "loss": 0.5785, "step": 360 }, { "epoch": 0.08512126865671642, "grad_norm": 0.6157016571844574, "learning_rate": 1.7024253731343284e-05, "loss": 0.5942, "step": 365 }, { "epoch": 0.08628731343283583, "grad_norm": 0.5731436313839582, "learning_rate": 1.7257462686567165e-05, "loss": 0.6061, "step": 370 }, { "epoch": 0.08745335820895522, "grad_norm": 0.572969266685798, "learning_rate": 1.7490671641791046e-05, "loss": 0.5917, "step": 375 }, { "epoch": 0.08861940298507463, "grad_norm": 0.5474810292542341, "learning_rate": 1.7723880597014927e-05, "loss": 0.6018, "step": 380 }, { "epoch": 0.08978544776119403, "grad_norm": 0.580804803981787, "learning_rate": 1.7957089552238808e-05, "loss": 0.6128, "step": 385 }, { "epoch": 0.09095149253731344, "grad_norm": 0.6481861841112424, "learning_rate": 1.819029850746269e-05, "loss": 0.6279, "step": 390 }, { "epoch": 0.09211753731343283, "grad_norm": 0.5838423347872619, "learning_rate": 1.8423507462686567e-05, "loss": 0.6042, "step": 395 }, { "epoch": 0.09328358208955224, "grad_norm": 0.6455198727922007, "learning_rate": 1.865671641791045e-05, "loss": 0.6024, "step": 400 }, { "epoch": 0.09444962686567164, "grad_norm": 0.6523931295134143, "learning_rate": 1.888992537313433e-05, "loss": 0.6048, "step": 405 }, { "epoch": 0.09561567164179105, "grad_norm": 0.6173638885707777, "learning_rate": 1.912313432835821e-05, "loss": 0.6359, "step": 410 }, { "epoch": 0.09678171641791045, "grad_norm": 0.6620853772560328, "learning_rate": 1.935634328358209e-05, "loss": 0.5996, "step": 415 }, { "epoch": 0.09794776119402986, "grad_norm": 0.6565267779653624, "learning_rate": 1.9589552238805972e-05, "loss": 0.63, "step": 420 }, { "epoch": 0.09911380597014925, "grad_norm": 0.7395452985055094, "learning_rate": 1.982276119402985e-05, "loss": 0.5999, "step": 425 }, { "epoch": 0.10027985074626866, "grad_norm": 0.7276853929918868, "learning_rate": 2.0055970149253735e-05, "loss": 0.6167, "step": 430 }, { "epoch": 0.10144589552238806, "grad_norm": 0.5711327807157935, "learning_rate": 2.0289179104477612e-05, "loss": 0.6055, "step": 435 }, { "epoch": 0.10261194029850747, "grad_norm": 0.6225503982377574, "learning_rate": 2.0522388059701493e-05, "loss": 0.5971, "step": 440 }, { "epoch": 0.10377798507462686, "grad_norm": 0.639440056865781, "learning_rate": 2.0755597014925375e-05, "loss": 0.6072, "step": 445 }, { "epoch": 0.10494402985074627, "grad_norm": 0.5900464509968311, "learning_rate": 2.0988805970149256e-05, "loss": 0.5899, "step": 450 }, { "epoch": 0.10611007462686567, "grad_norm": 0.6018506954073033, "learning_rate": 2.1222014925373133e-05, "loss": 0.5785, "step": 455 }, { "epoch": 0.10727611940298508, "grad_norm": 0.6321374013938041, "learning_rate": 2.1455223880597018e-05, "loss": 0.6034, "step": 460 }, { "epoch": 0.10844216417910447, "grad_norm": 0.5882403627611281, "learning_rate": 2.1688432835820896e-05, "loss": 0.5884, "step": 465 }, { "epoch": 0.10960820895522388, "grad_norm": 0.6225348600881592, "learning_rate": 2.1921641791044777e-05, "loss": 0.5711, "step": 470 }, { "epoch": 0.11077425373134328, "grad_norm": 0.6262344050303364, "learning_rate": 2.2154850746268658e-05, "loss": 0.6288, "step": 475 }, { "epoch": 0.11194029850746269, "grad_norm": 0.6555452742873066, "learning_rate": 2.238805970149254e-05, "loss": 0.6226, "step": 480 }, { "epoch": 0.11310634328358209, "grad_norm": 0.6775582702092503, "learning_rate": 2.262126865671642e-05, "loss": 0.6283, "step": 485 }, { "epoch": 0.1142723880597015, "grad_norm": 0.8254727401525609, "learning_rate": 2.28544776119403e-05, "loss": 0.5886, "step": 490 }, { "epoch": 0.11543843283582089, "grad_norm": 0.6610453358422199, "learning_rate": 2.308768656716418e-05, "loss": 0.5993, "step": 495 }, { "epoch": 0.1166044776119403, "grad_norm": 0.6934901928302911, "learning_rate": 2.332089552238806e-05, "loss": 0.5882, "step": 500 }, { "epoch": 0.1177705223880597, "grad_norm": 0.6135522526969803, "learning_rate": 2.355410447761194e-05, "loss": 0.5861, "step": 505 }, { "epoch": 0.11893656716417911, "grad_norm": 0.5960309586201494, "learning_rate": 2.3787313432835822e-05, "loss": 0.5918, "step": 510 }, { "epoch": 0.1201026119402985, "grad_norm": 0.6740541197005134, "learning_rate": 2.4020522388059703e-05, "loss": 0.6014, "step": 515 }, { "epoch": 0.12126865671641791, "grad_norm": 0.712642754013648, "learning_rate": 2.4253731343283584e-05, "loss": 0.5931, "step": 520 }, { "epoch": 0.12243470149253731, "grad_norm": 0.6138777829198983, "learning_rate": 2.4486940298507462e-05, "loss": 0.5993, "step": 525 }, { "epoch": 0.12360074626865672, "grad_norm": 0.6860446691695918, "learning_rate": 2.4720149253731347e-05, "loss": 0.5894, "step": 530 }, { "epoch": 0.12476679104477612, "grad_norm": 0.5600239225937539, "learning_rate": 2.4953358208955224e-05, "loss": 0.5653, "step": 535 }, { "epoch": 0.1259328358208955, "grad_norm": 0.7015962241233562, "learning_rate": 2.5186567164179102e-05, "loss": 0.6196, "step": 540 }, { "epoch": 0.12709888059701493, "grad_norm": 0.6556531567053278, "learning_rate": 2.5419776119402987e-05, "loss": 0.5905, "step": 545 }, { "epoch": 0.12826492537313433, "grad_norm": 0.5759020094439726, "learning_rate": 2.5652985074626868e-05, "loss": 0.5882, "step": 550 }, { "epoch": 0.12943097014925373, "grad_norm": 0.6590948750158928, "learning_rate": 2.5886194029850745e-05, "loss": 0.6075, "step": 555 }, { "epoch": 0.13059701492537312, "grad_norm": 0.5946418488501953, "learning_rate": 2.6119402985074626e-05, "loss": 0.5944, "step": 560 }, { "epoch": 0.13176305970149255, "grad_norm": 0.552275921080956, "learning_rate": 2.635261194029851e-05, "loss": 0.603, "step": 565 }, { "epoch": 0.13292910447761194, "grad_norm": 0.6102324631143958, "learning_rate": 2.658582089552239e-05, "loss": 0.5911, "step": 570 }, { "epoch": 0.13409514925373134, "grad_norm": 0.6615381703401475, "learning_rate": 2.681902985074627e-05, "loss": 0.6299, "step": 575 }, { "epoch": 0.13526119402985073, "grad_norm": 0.571717872663118, "learning_rate": 2.7052238805970147e-05, "loss": 0.572, "step": 580 }, { "epoch": 0.13642723880597016, "grad_norm": 0.6475707634151389, "learning_rate": 2.7285447761194032e-05, "loss": 0.5779, "step": 585 }, { "epoch": 0.13759328358208955, "grad_norm": 0.6368002029643398, "learning_rate": 2.7518656716417913e-05, "loss": 0.5809, "step": 590 }, { "epoch": 0.13875932835820895, "grad_norm": 0.6827885325554015, "learning_rate": 2.775186567164179e-05, "loss": 0.583, "step": 595 }, { "epoch": 0.13992537313432835, "grad_norm": 0.6642314706691799, "learning_rate": 2.7985074626865672e-05, "loss": 0.5899, "step": 600 }, { "epoch": 0.14109141791044777, "grad_norm": 0.6762868283771502, "learning_rate": 2.8218283582089556e-05, "loss": 0.574, "step": 605 }, { "epoch": 0.14225746268656717, "grad_norm": 0.6481495750991825, "learning_rate": 2.8451492537313434e-05, "loss": 0.5772, "step": 610 }, { "epoch": 0.14342350746268656, "grad_norm": 0.5892324009182466, "learning_rate": 2.8684701492537315e-05, "loss": 0.6066, "step": 615 }, { "epoch": 0.14458955223880596, "grad_norm": 0.6254959051349941, "learning_rate": 2.8917910447761193e-05, "loss": 0.5924, "step": 620 }, { "epoch": 0.14575559701492538, "grad_norm": 0.5847897324041825, "learning_rate": 2.9151119402985077e-05, "loss": 0.5887, "step": 625 }, { "epoch": 0.14692164179104478, "grad_norm": 0.6649402988506068, "learning_rate": 2.9384328358208955e-05, "loss": 0.616, "step": 630 }, { "epoch": 0.14808768656716417, "grad_norm": 0.6320253638275484, "learning_rate": 2.9617537313432836e-05, "loss": 0.5718, "step": 635 }, { "epoch": 0.14925373134328357, "grad_norm": 0.635075778375142, "learning_rate": 2.9850746268656714e-05, "loss": 0.6073, "step": 640 }, { "epoch": 0.150419776119403, "grad_norm": 0.6976457881972329, "learning_rate": 3.00839552238806e-05, "loss": 0.5889, "step": 645 }, { "epoch": 0.1515858208955224, "grad_norm": 0.5937396394518106, "learning_rate": 3.031716417910448e-05, "loss": 0.5907, "step": 650 }, { "epoch": 0.15275186567164178, "grad_norm": 0.5801081729770634, "learning_rate": 3.055037313432836e-05, "loss": 0.5913, "step": 655 }, { "epoch": 0.15391791044776118, "grad_norm": 0.6667536375513913, "learning_rate": 3.078358208955224e-05, "loss": 0.5969, "step": 660 }, { "epoch": 0.1550839552238806, "grad_norm": 0.565340013321232, "learning_rate": 3.101679104477612e-05, "loss": 0.5858, "step": 665 }, { "epoch": 0.15625, "grad_norm": 0.5992712228605234, "learning_rate": 3.125e-05, "loss": 0.5978, "step": 670 }, { "epoch": 0.1574160447761194, "grad_norm": 0.6016621578315273, "learning_rate": 3.148320895522388e-05, "loss": 0.6107, "step": 675 }, { "epoch": 0.15858208955223882, "grad_norm": 0.6624206201218021, "learning_rate": 3.171641791044776e-05, "loss": 0.6197, "step": 680 }, { "epoch": 0.15974813432835822, "grad_norm": 0.5656129717345737, "learning_rate": 3.1949626865671644e-05, "loss": 0.6051, "step": 685 }, { "epoch": 0.1609141791044776, "grad_norm": 0.5716458801031394, "learning_rate": 3.2182835820895525e-05, "loss": 0.5917, "step": 690 }, { "epoch": 0.162080223880597, "grad_norm": 0.56275562236787, "learning_rate": 3.2416044776119406e-05, "loss": 0.606, "step": 695 }, { "epoch": 0.16324626865671643, "grad_norm": 0.6196256045734249, "learning_rate": 3.264925373134329e-05, "loss": 0.5887, "step": 700 }, { "epoch": 0.16441231343283583, "grad_norm": 0.6125006602113579, "learning_rate": 3.288246268656717e-05, "loss": 0.6054, "step": 705 }, { "epoch": 0.16557835820895522, "grad_norm": 0.5797375121091652, "learning_rate": 3.311567164179105e-05, "loss": 0.5748, "step": 710 }, { "epoch": 0.16674440298507462, "grad_norm": 0.555974564114352, "learning_rate": 3.3348880597014924e-05, "loss": 0.618, "step": 715 }, { "epoch": 0.16791044776119404, "grad_norm": 0.6230701097857418, "learning_rate": 3.358208955223881e-05, "loss": 0.5775, "step": 720 }, { "epoch": 0.16907649253731344, "grad_norm": 0.5883157056075351, "learning_rate": 3.3815298507462686e-05, "loss": 0.5935, "step": 725 }, { "epoch": 0.17024253731343283, "grad_norm": 0.5313103469638913, "learning_rate": 3.404850746268657e-05, "loss": 0.5903, "step": 730 }, { "epoch": 0.17140858208955223, "grad_norm": 0.5269472732980944, "learning_rate": 3.428171641791045e-05, "loss": 0.5589, "step": 735 }, { "epoch": 0.17257462686567165, "grad_norm": 0.5883967944258575, "learning_rate": 3.451492537313433e-05, "loss": 0.6068, "step": 740 }, { "epoch": 0.17374067164179105, "grad_norm": 0.6430325388079234, "learning_rate": 3.474813432835821e-05, "loss": 0.569, "step": 745 }, { "epoch": 0.17490671641791045, "grad_norm": 0.6426885251690175, "learning_rate": 3.498134328358209e-05, "loss": 0.5806, "step": 750 }, { "epoch": 0.17607276119402984, "grad_norm": 0.6733446219892759, "learning_rate": 3.521455223880597e-05, "loss": 0.5958, "step": 755 }, { "epoch": 0.17723880597014927, "grad_norm": 0.5943382748973256, "learning_rate": 3.5447761194029854e-05, "loss": 0.5766, "step": 760 }, { "epoch": 0.17840485074626866, "grad_norm": 0.6034703115342142, "learning_rate": 3.5680970149253735e-05, "loss": 0.5677, "step": 765 }, { "epoch": 0.17957089552238806, "grad_norm": 0.543048749101742, "learning_rate": 3.5914179104477616e-05, "loss": 0.582, "step": 770 }, { "epoch": 0.18073694029850745, "grad_norm": 0.6062053068378871, "learning_rate": 3.614738805970149e-05, "loss": 0.5948, "step": 775 }, { "epoch": 0.18190298507462688, "grad_norm": 0.5318696868283348, "learning_rate": 3.638059701492538e-05, "loss": 0.5803, "step": 780 }, { "epoch": 0.18306902985074627, "grad_norm": 0.614282680422898, "learning_rate": 3.661380597014926e-05, "loss": 0.5725, "step": 785 }, { "epoch": 0.18423507462686567, "grad_norm": 0.5822339500378908, "learning_rate": 3.6847014925373134e-05, "loss": 0.566, "step": 790 }, { "epoch": 0.18540111940298507, "grad_norm": 0.578500299025908, "learning_rate": 3.7080223880597015e-05, "loss": 0.6007, "step": 795 }, { "epoch": 0.1865671641791045, "grad_norm": 0.6855748875441977, "learning_rate": 3.73134328358209e-05, "loss": 0.5875, "step": 800 }, { "epoch": 0.18773320895522388, "grad_norm": 0.5831814969944836, "learning_rate": 3.754664179104478e-05, "loss": 0.5998, "step": 805 }, { "epoch": 0.18889925373134328, "grad_norm": 0.5260055027606848, "learning_rate": 3.777985074626866e-05, "loss": 0.5854, "step": 810 }, { "epoch": 0.19006529850746268, "grad_norm": 0.5964193533820314, "learning_rate": 3.801305970149254e-05, "loss": 0.6024, "step": 815 }, { "epoch": 0.1912313432835821, "grad_norm": 0.5107453949700954, "learning_rate": 3.824626865671642e-05, "loss": 0.5935, "step": 820 }, { "epoch": 0.1923973880597015, "grad_norm": 0.5648317168782171, "learning_rate": 3.84794776119403e-05, "loss": 0.5498, "step": 825 }, { "epoch": 0.1935634328358209, "grad_norm": 0.5562713574675907, "learning_rate": 3.871268656716418e-05, "loss": 0.6104, "step": 830 }, { "epoch": 0.1947294776119403, "grad_norm": 0.6192932504991097, "learning_rate": 3.894589552238806e-05, "loss": 0.601, "step": 835 }, { "epoch": 0.1958955223880597, "grad_norm": 0.6121054128335968, "learning_rate": 3.9179104477611945e-05, "loss": 0.5878, "step": 840 }, { "epoch": 0.1970615671641791, "grad_norm": 0.5906087442373493, "learning_rate": 3.9412313432835826e-05, "loss": 0.5657, "step": 845 }, { "epoch": 0.1982276119402985, "grad_norm": 0.5816800031851801, "learning_rate": 3.96455223880597e-05, "loss": 0.5676, "step": 850 }, { "epoch": 0.1993936567164179, "grad_norm": 0.5977952262779495, "learning_rate": 3.987873134328358e-05, "loss": 0.5999, "step": 855 }, { "epoch": 0.20055970149253732, "grad_norm": 0.5432525463268602, "learning_rate": 4.011194029850747e-05, "loss": 0.6065, "step": 860 }, { "epoch": 0.20172574626865672, "grad_norm": 0.5662596636748775, "learning_rate": 4.0345149253731344e-05, "loss": 0.6021, "step": 865 }, { "epoch": 0.20289179104477612, "grad_norm": 0.5550451000970232, "learning_rate": 4.0578358208955225e-05, "loss": 0.5767, "step": 870 }, { "epoch": 0.2040578358208955, "grad_norm": 0.6584490116997408, "learning_rate": 4.0811567164179106e-05, "loss": 0.5811, "step": 875 }, { "epoch": 0.20522388059701493, "grad_norm": 0.6196068395971027, "learning_rate": 4.104477611940299e-05, "loss": 0.5678, "step": 880 }, { "epoch": 0.20638992537313433, "grad_norm": 0.569578990966687, "learning_rate": 4.127798507462687e-05, "loss": 0.576, "step": 885 }, { "epoch": 0.20755597014925373, "grad_norm": 0.540443760471811, "learning_rate": 4.151119402985075e-05, "loss": 0.5549, "step": 890 }, { "epoch": 0.20872201492537312, "grad_norm": 0.5079297415494084, "learning_rate": 4.174440298507462e-05, "loss": 0.5682, "step": 895 }, { "epoch": 0.20988805970149255, "grad_norm": 0.6069256063141537, "learning_rate": 4.197761194029851e-05, "loss": 0.602, "step": 900 }, { "epoch": 0.21105410447761194, "grad_norm": 0.5255861953251896, "learning_rate": 4.221082089552239e-05, "loss": 0.5739, "step": 905 }, { "epoch": 0.21222014925373134, "grad_norm": 0.6771457707390894, "learning_rate": 4.244402985074627e-05, "loss": 0.5969, "step": 910 }, { "epoch": 0.21338619402985073, "grad_norm": 0.5924999294439693, "learning_rate": 4.267723880597015e-05, "loss": 0.5716, "step": 915 }, { "epoch": 0.21455223880597016, "grad_norm": 0.5628296247978087, "learning_rate": 4.2910447761194036e-05, "loss": 0.6034, "step": 920 }, { "epoch": 0.21571828358208955, "grad_norm": 0.6335505374297353, "learning_rate": 4.314365671641791e-05, "loss": 0.594, "step": 925 }, { "epoch": 0.21688432835820895, "grad_norm": 0.5933156164806249, "learning_rate": 4.337686567164179e-05, "loss": 0.6059, "step": 930 }, { "epoch": 0.21805037313432835, "grad_norm": 0.5577452592820481, "learning_rate": 4.361007462686567e-05, "loss": 0.5821, "step": 935 }, { "epoch": 0.21921641791044777, "grad_norm": 0.5447180264806184, "learning_rate": 4.384328358208955e-05, "loss": 0.5892, "step": 940 }, { "epoch": 0.22038246268656717, "grad_norm": 0.532791679164916, "learning_rate": 4.4076492537313434e-05, "loss": 0.5828, "step": 945 }, { "epoch": 0.22154850746268656, "grad_norm": 0.5583820131957794, "learning_rate": 4.4309701492537316e-05, "loss": 0.601, "step": 950 }, { "epoch": 0.22271455223880596, "grad_norm": 0.5003777558691288, "learning_rate": 4.45429104477612e-05, "loss": 0.5959, "step": 955 }, { "epoch": 0.22388059701492538, "grad_norm": 0.5358343725297245, "learning_rate": 4.477611940298508e-05, "loss": 0.5906, "step": 960 }, { "epoch": 0.22504664179104478, "grad_norm": 0.5479409267391404, "learning_rate": 4.500932835820896e-05, "loss": 0.5963, "step": 965 }, { "epoch": 0.22621268656716417, "grad_norm": 0.6183224784323952, "learning_rate": 4.524253731343284e-05, "loss": 0.6274, "step": 970 }, { "epoch": 0.22737873134328357, "grad_norm": 0.5433881231736056, "learning_rate": 4.5475746268656714e-05, "loss": 0.527, "step": 975 }, { "epoch": 0.228544776119403, "grad_norm": 0.5506055406053556, "learning_rate": 4.57089552238806e-05, "loss": 0.5657, "step": 980 }, { "epoch": 0.2297108208955224, "grad_norm": 0.5536131078091451, "learning_rate": 4.5942164179104477e-05, "loss": 0.59, "step": 985 }, { "epoch": 0.23087686567164178, "grad_norm": 0.5299963995134134, "learning_rate": 4.617537313432836e-05, "loss": 0.5938, "step": 990 }, { "epoch": 0.23204291044776118, "grad_norm": 0.5530020286887515, "learning_rate": 4.640858208955224e-05, "loss": 0.5513, "step": 995 }, { "epoch": 0.2332089552238806, "grad_norm": 0.5652090655865671, "learning_rate": 4.664179104477612e-05, "loss": 0.5808, "step": 1000 }, { "epoch": 0.234375, "grad_norm": 0.5442231136582252, "learning_rate": 4.6875e-05, "loss": 0.5729, "step": 1005 }, { "epoch": 0.2355410447761194, "grad_norm": 0.5201566882142586, "learning_rate": 4.710820895522388e-05, "loss": 0.5683, "step": 1010 }, { "epoch": 0.23670708955223882, "grad_norm": 0.564490392879513, "learning_rate": 4.734141791044776e-05, "loss": 0.5725, "step": 1015 }, { "epoch": 0.23787313432835822, "grad_norm": 0.5585859112380516, "learning_rate": 4.7574626865671644e-05, "loss": 0.5866, "step": 1020 }, { "epoch": 0.2390391791044776, "grad_norm": 0.5332960766216192, "learning_rate": 4.7807835820895525e-05, "loss": 0.58, "step": 1025 }, { "epoch": 0.240205223880597, "grad_norm": 0.5684421565070324, "learning_rate": 4.8041044776119407e-05, "loss": 0.5862, "step": 1030 }, { "epoch": 0.24137126865671643, "grad_norm": 0.519204840668517, "learning_rate": 4.827425373134329e-05, "loss": 0.5804, "step": 1035 }, { "epoch": 0.24253731343283583, "grad_norm": 0.5549119910611328, "learning_rate": 4.850746268656717e-05, "loss": 0.5909, "step": 1040 }, { "epoch": 0.24370335820895522, "grad_norm": 0.5344473129466101, "learning_rate": 4.874067164179105e-05, "loss": 0.5844, "step": 1045 }, { "epoch": 0.24486940298507462, "grad_norm": 0.46984697348471055, "learning_rate": 4.8973880597014924e-05, "loss": 0.5558, "step": 1050 }, { "epoch": 0.24603544776119404, "grad_norm": 0.5049122569293845, "learning_rate": 4.920708955223881e-05, "loss": 0.5767, "step": 1055 }, { "epoch": 0.24720149253731344, "grad_norm": 0.5165157903140213, "learning_rate": 4.944029850746269e-05, "loss": 0.5775, "step": 1060 }, { "epoch": 0.24836753731343283, "grad_norm": 0.5432283053409598, "learning_rate": 4.967350746268657e-05, "loss": 0.5873, "step": 1065 }, { "epoch": 0.24953358208955223, "grad_norm": 0.5957847227963389, "learning_rate": 4.990671641791045e-05, "loss": 0.5697, "step": 1070 }, { "epoch": 0.25069962686567165, "grad_norm": 0.5251045167384748, "learning_rate": 4.999999759121523e-05, "loss": 0.5791, "step": 1075 }, { "epoch": 0.251865671641791, "grad_norm": 0.5305719836101288, "learning_rate": 4.9999982870865717e-05, "loss": 0.5533, "step": 1080 }, { "epoch": 0.25303171641791045, "grad_norm": 0.489282165443993, "learning_rate": 4.9999954768389194e-05, "loss": 0.5584, "step": 1085 }, { "epoch": 0.25419776119402987, "grad_norm": 0.4792650955406527, "learning_rate": 4.999991328380238e-05, "loss": 0.5997, "step": 1090 }, { "epoch": 0.25536380597014924, "grad_norm": 0.5067911392365152, "learning_rate": 4.999985841712994e-05, "loss": 0.5724, "step": 1095 }, { "epoch": 0.25652985074626866, "grad_norm": 0.5906860460933071, "learning_rate": 4.999979016840452e-05, "loss": 0.5778, "step": 1100 }, { "epoch": 0.2576958955223881, "grad_norm": 0.5296167398051762, "learning_rate": 4.9999708537666696e-05, "loss": 0.5459, "step": 1105 }, { "epoch": 0.25886194029850745, "grad_norm": 0.5638313474773577, "learning_rate": 4.999961352496503e-05, "loss": 0.5883, "step": 1110 }, { "epoch": 0.2600279850746269, "grad_norm": 0.5396231839865687, "learning_rate": 4.999950513035602e-05, "loss": 0.5948, "step": 1115 }, { "epoch": 0.26119402985074625, "grad_norm": 0.5387745529714087, "learning_rate": 4.9999383353904156e-05, "loss": 0.5907, "step": 1120 }, { "epoch": 0.26236007462686567, "grad_norm": 0.4823196011549847, "learning_rate": 4.999924819568185e-05, "loss": 0.5747, "step": 1125 }, { "epoch": 0.2635261194029851, "grad_norm": 0.5093099848396279, "learning_rate": 4.999909965576949e-05, "loss": 0.5915, "step": 1130 }, { "epoch": 0.26469216417910446, "grad_norm": 0.5302438248697375, "learning_rate": 4.9998937734255424e-05, "loss": 0.5987, "step": 1135 }, { "epoch": 0.2658582089552239, "grad_norm": 0.5670135020812648, "learning_rate": 4.9998762431235955e-05, "loss": 0.578, "step": 1140 }, { "epoch": 0.2670242537313433, "grad_norm": 0.5283848808200766, "learning_rate": 4.9998573746815355e-05, "loss": 0.5697, "step": 1145 }, { "epoch": 0.2681902985074627, "grad_norm": 0.48975246214458645, "learning_rate": 4.999837168110584e-05, "loss": 0.5644, "step": 1150 }, { "epoch": 0.2693563432835821, "grad_norm": 0.4776034322287359, "learning_rate": 4.9998156234227586e-05, "loss": 0.5581, "step": 1155 }, { "epoch": 0.27052238805970147, "grad_norm": 0.5252762499974786, "learning_rate": 4.999792740630874e-05, "loss": 0.5861, "step": 1160 }, { "epoch": 0.2716884328358209, "grad_norm": 0.448611905979275, "learning_rate": 4.9997685197485396e-05, "loss": 0.5753, "step": 1165 }, { "epoch": 0.2728544776119403, "grad_norm": 0.49406828117713886, "learning_rate": 4.999742960790161e-05, "loss": 0.5687, "step": 1170 }, { "epoch": 0.2740205223880597, "grad_norm": 0.49948080039138953, "learning_rate": 4.9997160637709395e-05, "loss": 0.5639, "step": 1175 }, { "epoch": 0.2751865671641791, "grad_norm": 0.46726100492164313, "learning_rate": 4.999687828706874e-05, "loss": 0.5454, "step": 1180 }, { "epoch": 0.27635261194029853, "grad_norm": 0.472396240079208, "learning_rate": 4.999658255614756e-05, "loss": 0.5729, "step": 1185 }, { "epoch": 0.2775186567164179, "grad_norm": 0.5824879991684005, "learning_rate": 4.9996273445121744e-05, "loss": 0.5766, "step": 1190 }, { "epoch": 0.2786847014925373, "grad_norm": 0.5120369900726413, "learning_rate": 4.9995950954175145e-05, "loss": 0.5564, "step": 1195 }, { "epoch": 0.2798507462686567, "grad_norm": 0.6821504634222254, "learning_rate": 4.999561508349957e-05, "loss": 0.6108, "step": 1200 }, { "epoch": 0.2810167910447761, "grad_norm": 0.45526066922122244, "learning_rate": 4.9995265833294774e-05, "loss": 0.5493, "step": 1205 }, { "epoch": 0.28218283582089554, "grad_norm": 0.5120804639574477, "learning_rate": 4.9994903203768486e-05, "loss": 0.5515, "step": 1210 }, { "epoch": 0.2833488805970149, "grad_norm": 0.4541809372974789, "learning_rate": 4.999452719513638e-05, "loss": 0.5698, "step": 1215 }, { "epoch": 0.28451492537313433, "grad_norm": 0.49158525259783803, "learning_rate": 4.99941378076221e-05, "loss": 0.5799, "step": 1220 }, { "epoch": 0.28568097014925375, "grad_norm": 0.5289894603163678, "learning_rate": 4.9993735041457226e-05, "loss": 0.5995, "step": 1225 }, { "epoch": 0.2868470149253731, "grad_norm": 0.4641712754839069, "learning_rate": 4.999331889688131e-05, "loss": 0.5662, "step": 1230 }, { "epoch": 0.28801305970149255, "grad_norm": 0.4868098256257308, "learning_rate": 4.999288937414186e-05, "loss": 0.5688, "step": 1235 }, { "epoch": 0.2891791044776119, "grad_norm": 0.4924844059462168, "learning_rate": 4.999244647349435e-05, "loss": 0.5506, "step": 1240 }, { "epoch": 0.29034514925373134, "grad_norm": 0.4983358257275362, "learning_rate": 4.999199019520219e-05, "loss": 0.5883, "step": 1245 }, { "epoch": 0.29151119402985076, "grad_norm": 0.48195597056031647, "learning_rate": 4.999152053953675e-05, "loss": 0.5601, "step": 1250 }, { "epoch": 0.29267723880597013, "grad_norm": 0.5242089410028173, "learning_rate": 4.9991037506777384e-05, "loss": 0.586, "step": 1255 }, { "epoch": 0.29384328358208955, "grad_norm": 0.48781671854762115, "learning_rate": 4.999054109721136e-05, "loss": 0.588, "step": 1260 }, { "epoch": 0.295009328358209, "grad_norm": 0.5635402788279851, "learning_rate": 4.9990031311133944e-05, "loss": 0.5743, "step": 1265 }, { "epoch": 0.29617537313432835, "grad_norm": 0.4928633568357669, "learning_rate": 4.9989508148848315e-05, "loss": 0.6074, "step": 1270 }, { "epoch": 0.29734141791044777, "grad_norm": 0.49767968993096073, "learning_rate": 4.9988971610665645e-05, "loss": 0.5875, "step": 1275 }, { "epoch": 0.29850746268656714, "grad_norm": 0.46249236791400944, "learning_rate": 4.998842169690504e-05, "loss": 0.5737, "step": 1280 }, { "epoch": 0.29967350746268656, "grad_norm": 0.4495065875541309, "learning_rate": 4.9987858407893576e-05, "loss": 0.5775, "step": 1285 }, { "epoch": 0.300839552238806, "grad_norm": 0.48948231779530305, "learning_rate": 4.998728174396626e-05, "loss": 0.5536, "step": 1290 }, { "epoch": 0.30200559701492535, "grad_norm": 0.543688758605199, "learning_rate": 4.998669170546609e-05, "loss": 0.574, "step": 1295 }, { "epoch": 0.3031716417910448, "grad_norm": 0.45285718788241364, "learning_rate": 4.998608829274398e-05, "loss": 0.5724, "step": 1300 }, { "epoch": 0.3043376865671642, "grad_norm": 0.5269509970734879, "learning_rate": 4.998547150615882e-05, "loss": 0.577, "step": 1305 }, { "epoch": 0.30550373134328357, "grad_norm": 0.4651413513800348, "learning_rate": 4.998484134607746e-05, "loss": 0.5762, "step": 1310 }, { "epoch": 0.306669776119403, "grad_norm": 0.5329583323973838, "learning_rate": 4.998419781287469e-05, "loss": 0.5686, "step": 1315 }, { "epoch": 0.30783582089552236, "grad_norm": 0.4906742183579196, "learning_rate": 4.998354090693326e-05, "loss": 0.5846, "step": 1320 }, { "epoch": 0.3090018656716418, "grad_norm": 0.4935895797614555, "learning_rate": 4.9982870628643876e-05, "loss": 0.5611, "step": 1325 }, { "epoch": 0.3101679104477612, "grad_norm": 0.4636994251106214, "learning_rate": 4.9982186978405175e-05, "loss": 0.5575, "step": 1330 }, { "epoch": 0.3113339552238806, "grad_norm": 0.4891057406321723, "learning_rate": 4.998148995662379e-05, "loss": 0.5684, "step": 1335 }, { "epoch": 0.3125, "grad_norm": 0.47100059226298446, "learning_rate": 4.9980779563714274e-05, "loss": 0.5953, "step": 1340 }, { "epoch": 0.3136660447761194, "grad_norm": 0.5743422188018308, "learning_rate": 4.998005580009914e-05, "loss": 0.5369, "step": 1345 }, { "epoch": 0.3148320895522388, "grad_norm": 0.49522739813094924, "learning_rate": 4.9979318666208855e-05, "loss": 0.5655, "step": 1350 }, { "epoch": 0.3159981343283582, "grad_norm": 0.4412980180642329, "learning_rate": 4.997856816248184e-05, "loss": 0.5468, "step": 1355 }, { "epoch": 0.31716417910447764, "grad_norm": 0.4472553803069325, "learning_rate": 4.997780428936446e-05, "loss": 0.575, "step": 1360 }, { "epoch": 0.318330223880597, "grad_norm": 0.49682919492884364, "learning_rate": 4.9977027047311046e-05, "loss": 0.5861, "step": 1365 }, { "epoch": 0.31949626865671643, "grad_norm": 0.4695043714505331, "learning_rate": 4.9976236436783865e-05, "loss": 0.5753, "step": 1370 }, { "epoch": 0.3206623134328358, "grad_norm": 0.4433496667188403, "learning_rate": 4.9975432458253136e-05, "loss": 0.5601, "step": 1375 }, { "epoch": 0.3218283582089552, "grad_norm": 0.4621878380740119, "learning_rate": 4.997461511219705e-05, "loss": 0.5604, "step": 1380 }, { "epoch": 0.32299440298507465, "grad_norm": 0.4406248363104525, "learning_rate": 4.997378439910173e-05, "loss": 0.57, "step": 1385 }, { "epoch": 0.324160447761194, "grad_norm": 0.48970285941888225, "learning_rate": 4.997294031946124e-05, "loss": 0.5926, "step": 1390 }, { "epoch": 0.32532649253731344, "grad_norm": 0.4813570655780272, "learning_rate": 4.9972082873777626e-05, "loss": 0.5606, "step": 1395 }, { "epoch": 0.32649253731343286, "grad_norm": 0.4937323426247838, "learning_rate": 4.9971212062560844e-05, "loss": 0.5564, "step": 1400 }, { "epoch": 0.32765858208955223, "grad_norm": 0.4738148221539011, "learning_rate": 4.9970327886328824e-05, "loss": 0.5634, "step": 1405 }, { "epoch": 0.32882462686567165, "grad_norm": 0.4944510657021552, "learning_rate": 4.9969430345607445e-05, "loss": 0.5651, "step": 1410 }, { "epoch": 0.329990671641791, "grad_norm": 0.4603449829292863, "learning_rate": 4.9968519440930536e-05, "loss": 0.5655, "step": 1415 }, { "epoch": 0.33115671641791045, "grad_norm": 0.43710997519260814, "learning_rate": 4.996759517283986e-05, "loss": 0.555, "step": 1420 }, { "epoch": 0.33232276119402987, "grad_norm": 0.5016530800822935, "learning_rate": 4.996665754188513e-05, "loss": 0.5965, "step": 1425 }, { "epoch": 0.33348880597014924, "grad_norm": 0.45950447005990486, "learning_rate": 4.996570654862402e-05, "loss": 0.5744, "step": 1430 }, { "epoch": 0.33465485074626866, "grad_norm": 0.479158893542782, "learning_rate": 4.996474219362215e-05, "loss": 0.5513, "step": 1435 }, { "epoch": 0.3358208955223881, "grad_norm": 0.47063062571255804, "learning_rate": 4.996376447745307e-05, "loss": 0.5935, "step": 1440 }, { "epoch": 0.33698694029850745, "grad_norm": 0.46312430894878515, "learning_rate": 4.9962773400698295e-05, "loss": 0.5565, "step": 1445 }, { "epoch": 0.3381529850746269, "grad_norm": 0.48463397201420777, "learning_rate": 4.996176896394728e-05, "loss": 0.5787, "step": 1450 }, { "epoch": 0.33931902985074625, "grad_norm": 0.43419167021782, "learning_rate": 4.9960751167797414e-05, "loss": 0.5711, "step": 1455 }, { "epoch": 0.34048507462686567, "grad_norm": 0.4653074420498655, "learning_rate": 4.995972001285406e-05, "loss": 0.5868, "step": 1460 }, { "epoch": 0.3416511194029851, "grad_norm": 0.45197311826393544, "learning_rate": 4.99586754997305e-05, "loss": 0.5659, "step": 1465 }, { "epoch": 0.34281716417910446, "grad_norm": 0.4493851701904994, "learning_rate": 4.995761762904797e-05, "loss": 0.5677, "step": 1470 }, { "epoch": 0.3439832089552239, "grad_norm": 0.48396020059964967, "learning_rate": 4.9956546401435654e-05, "loss": 0.5941, "step": 1475 }, { "epoch": 0.3451492537313433, "grad_norm": 0.45853252669502864, "learning_rate": 4.995546181753069e-05, "loss": 0.5726, "step": 1480 }, { "epoch": 0.3463152985074627, "grad_norm": 0.45960292628732224, "learning_rate": 4.995436387797811e-05, "loss": 0.566, "step": 1485 }, { "epoch": 0.3474813432835821, "grad_norm": 0.45722447478413303, "learning_rate": 4.9953252583430965e-05, "loss": 0.5482, "step": 1490 }, { "epoch": 0.34864738805970147, "grad_norm": 0.49733374956686327, "learning_rate": 4.995212793455019e-05, "loss": 0.5787, "step": 1495 }, { "epoch": 0.3498134328358209, "grad_norm": 0.4475154420431213, "learning_rate": 4.9950989932004684e-05, "loss": 0.5575, "step": 1500 }, { "epoch": 0.3509794776119403, "grad_norm": 0.5039137900484434, "learning_rate": 4.9949838576471296e-05, "loss": 0.5628, "step": 1505 }, { "epoch": 0.3521455223880597, "grad_norm": 0.4607828784861782, "learning_rate": 4.9948673868634806e-05, "loss": 0.565, "step": 1510 }, { "epoch": 0.3533115671641791, "grad_norm": 0.45572689063215227, "learning_rate": 4.994749580918793e-05, "loss": 0.5534, "step": 1515 }, { "epoch": 0.35447761194029853, "grad_norm": 0.47030198648203614, "learning_rate": 4.9946304398831336e-05, "loss": 0.5679, "step": 1520 }, { "epoch": 0.3556436567164179, "grad_norm": 0.4596430481486753, "learning_rate": 4.9945099638273635e-05, "loss": 0.5642, "step": 1525 }, { "epoch": 0.3568097014925373, "grad_norm": 0.45333240594045915, "learning_rate": 4.9943881528231365e-05, "loss": 0.5635, "step": 1530 }, { "epoch": 0.3579757462686567, "grad_norm": 0.5450126141011754, "learning_rate": 4.9942650069429016e-05, "loss": 0.5619, "step": 1535 }, { "epoch": 0.3591417910447761, "grad_norm": 0.4638764230629832, "learning_rate": 4.994140526259901e-05, "loss": 0.5564, "step": 1540 }, { "epoch": 0.36030783582089554, "grad_norm": 0.461856082768502, "learning_rate": 4.994014710848171e-05, "loss": 0.5579, "step": 1545 }, { "epoch": 0.3614738805970149, "grad_norm": 0.46665112067850467, "learning_rate": 4.993887560782541e-05, "loss": 0.5638, "step": 1550 }, { "epoch": 0.36263992537313433, "grad_norm": 0.6118091321397457, "learning_rate": 4.993759076138637e-05, "loss": 0.5976, "step": 1555 }, { "epoch": 0.36380597014925375, "grad_norm": 0.5380381051727333, "learning_rate": 4.993629256992876e-05, "loss": 0.5377, "step": 1560 }, { "epoch": 0.3649720149253731, "grad_norm": 0.46844246829034303, "learning_rate": 4.993498103422469e-05, "loss": 0.5647, "step": 1565 }, { "epoch": 0.36613805970149255, "grad_norm": 0.4639832657096169, "learning_rate": 4.99336561550542e-05, "loss": 0.548, "step": 1570 }, { "epoch": 0.3673041044776119, "grad_norm": 0.43083325820687435, "learning_rate": 4.993231793320529e-05, "loss": 0.5766, "step": 1575 }, { "epoch": 0.36847014925373134, "grad_norm": 0.4376649250540398, "learning_rate": 4.993096636947389e-05, "loss": 0.5876, "step": 1580 }, { "epoch": 0.36963619402985076, "grad_norm": 0.46991883724171807, "learning_rate": 4.992960146466384e-05, "loss": 0.5543, "step": 1585 }, { "epoch": 0.37080223880597013, "grad_norm": 0.4442143989894822, "learning_rate": 4.992822321958695e-05, "loss": 0.5414, "step": 1590 }, { "epoch": 0.37196828358208955, "grad_norm": 0.4352259309288988, "learning_rate": 4.9926831635062955e-05, "loss": 0.5681, "step": 1595 }, { "epoch": 0.373134328358209, "grad_norm": 0.4481233562706409, "learning_rate": 4.992542671191948e-05, "loss": 0.5777, "step": 1600 }, { "epoch": 0.37430037313432835, "grad_norm": 0.43213203607139844, "learning_rate": 4.992400845099215e-05, "loss": 0.5631, "step": 1605 }, { "epoch": 0.37546641791044777, "grad_norm": 0.46838479756333634, "learning_rate": 4.992257685312448e-05, "loss": 0.5615, "step": 1610 }, { "epoch": 0.37663246268656714, "grad_norm": 0.4790157694826662, "learning_rate": 4.992113191916794e-05, "loss": 0.5722, "step": 1615 }, { "epoch": 0.37779850746268656, "grad_norm": 0.5074206106763016, "learning_rate": 4.991967364998191e-05, "loss": 0.5569, "step": 1620 }, { "epoch": 0.378964552238806, "grad_norm": 0.4939666577027776, "learning_rate": 4.9918202046433714e-05, "loss": 0.5443, "step": 1625 }, { "epoch": 0.38013059701492535, "grad_norm": 0.4603493031471122, "learning_rate": 4.991671710939861e-05, "loss": 0.5605, "step": 1630 }, { "epoch": 0.3812966417910448, "grad_norm": 0.41270849678671384, "learning_rate": 4.991521883975978e-05, "loss": 0.5456, "step": 1635 }, { "epoch": 0.3824626865671642, "grad_norm": 0.4164437714939307, "learning_rate": 4.991370723840834e-05, "loss": 0.5416, "step": 1640 }, { "epoch": 0.38362873134328357, "grad_norm": 0.44774976729021265, "learning_rate": 4.991218230624332e-05, "loss": 0.543, "step": 1645 }, { "epoch": 0.384794776119403, "grad_norm": 0.4256827310125916, "learning_rate": 4.9910644044171714e-05, "loss": 0.5646, "step": 1650 }, { "epoch": 0.38596082089552236, "grad_norm": 0.5176286353898649, "learning_rate": 4.9909092453108394e-05, "loss": 0.5705, "step": 1655 }, { "epoch": 0.3871268656716418, "grad_norm": 0.4399705219539844, "learning_rate": 4.9907527533976214e-05, "loss": 0.56, "step": 1660 }, { "epoch": 0.3882929104477612, "grad_norm": 0.518195076648471, "learning_rate": 4.990594928770591e-05, "loss": 0.5725, "step": 1665 }, { "epoch": 0.3894589552238806, "grad_norm": 0.43061197075942564, "learning_rate": 4.9904357715236164e-05, "loss": 0.5644, "step": 1670 }, { "epoch": 0.390625, "grad_norm": 0.4619368440512794, "learning_rate": 4.9902752817513586e-05, "loss": 0.5664, "step": 1675 }, { "epoch": 0.3917910447761194, "grad_norm": 0.4638437137264606, "learning_rate": 4.990113459549271e-05, "loss": 0.5562, "step": 1680 }, { "epoch": 0.3929570895522388, "grad_norm": 0.42811331971642497, "learning_rate": 4.989950305013599e-05, "loss": 0.5712, "step": 1685 }, { "epoch": 0.3941231343283582, "grad_norm": 0.43336616831376673, "learning_rate": 4.98978581824138e-05, "loss": 0.5701, "step": 1690 }, { "epoch": 0.39528917910447764, "grad_norm": 0.5021149588786751, "learning_rate": 4.989619999330446e-05, "loss": 0.5485, "step": 1695 }, { "epoch": 0.396455223880597, "grad_norm": 0.5041381985514305, "learning_rate": 4.9894528483794175e-05, "loss": 0.5793, "step": 1700 }, { "epoch": 0.39762126865671643, "grad_norm": 0.500144974307226, "learning_rate": 4.989284365487712e-05, "loss": 0.5757, "step": 1705 }, { "epoch": 0.3987873134328358, "grad_norm": 0.4306528925895123, "learning_rate": 4.9891145507555346e-05, "loss": 0.5597, "step": 1710 }, { "epoch": 0.3999533582089552, "grad_norm": 0.44274857897206693, "learning_rate": 4.988943404283886e-05, "loss": 0.5675, "step": 1715 }, { "epoch": 0.40111940298507465, "grad_norm": 0.42368506541882556, "learning_rate": 4.9887709261745566e-05, "loss": 0.5544, "step": 1720 }, { "epoch": 0.402285447761194, "grad_norm": 0.41733710751037534, "learning_rate": 4.9885971165301296e-05, "loss": 0.5637, "step": 1725 }, { "epoch": 0.40345149253731344, "grad_norm": 0.46279885315214764, "learning_rate": 4.988421975453982e-05, "loss": 0.5812, "step": 1730 }, { "epoch": 0.40461753731343286, "grad_norm": 0.474553185605738, "learning_rate": 4.988245503050279e-05, "loss": 0.5759, "step": 1735 }, { "epoch": 0.40578358208955223, "grad_norm": 0.4577653422711988, "learning_rate": 4.9880676994239805e-05, "loss": 0.5513, "step": 1740 }, { "epoch": 0.40694962686567165, "grad_norm": 0.4187272091284333, "learning_rate": 4.987888564680837e-05, "loss": 0.5652, "step": 1745 }, { "epoch": 0.408115671641791, "grad_norm": 0.5184535633687853, "learning_rate": 4.9877080989273925e-05, "loss": 0.5877, "step": 1750 }, { "epoch": 0.40928171641791045, "grad_norm": 0.44602635138498936, "learning_rate": 4.9875263022709786e-05, "loss": 0.5398, "step": 1755 }, { "epoch": 0.41044776119402987, "grad_norm": 0.559920348015096, "learning_rate": 4.987343174819723e-05, "loss": 0.549, "step": 1760 }, { "epoch": 0.41161380597014924, "grad_norm": 0.40212601810851556, "learning_rate": 4.9871587166825405e-05, "loss": 0.5261, "step": 1765 }, { "epoch": 0.41277985074626866, "grad_norm": 0.46302360712943924, "learning_rate": 4.9869729279691425e-05, "loss": 0.5612, "step": 1770 }, { "epoch": 0.4139458955223881, "grad_norm": 0.48495455648218605, "learning_rate": 4.986785808790028e-05, "loss": 0.5824, "step": 1775 }, { "epoch": 0.41511194029850745, "grad_norm": 0.4303272638769056, "learning_rate": 4.9865973592564876e-05, "loss": 0.5646, "step": 1780 }, { "epoch": 0.4162779850746269, "grad_norm": 0.4124997140863719, "learning_rate": 4.986407579480604e-05, "loss": 0.5603, "step": 1785 }, { "epoch": 0.41744402985074625, "grad_norm": 0.4311398379117858, "learning_rate": 4.9862164695752524e-05, "loss": 0.558, "step": 1790 }, { "epoch": 0.41861007462686567, "grad_norm": 0.4241377020007968, "learning_rate": 4.986024029654095e-05, "loss": 0.5536, "step": 1795 }, { "epoch": 0.4197761194029851, "grad_norm": 0.4248495962681604, "learning_rate": 4.98583025983159e-05, "loss": 0.5693, "step": 1800 }, { "epoch": 0.42094216417910446, "grad_norm": 0.4431106772288949, "learning_rate": 4.9856351602229846e-05, "loss": 0.5431, "step": 1805 }, { "epoch": 0.4221082089552239, "grad_norm": 0.4791949644486913, "learning_rate": 4.985438730944314e-05, "loss": 0.5585, "step": 1810 }, { "epoch": 0.4232742537313433, "grad_norm": 0.4377501102334117, "learning_rate": 4.985240972112409e-05, "loss": 0.5805, "step": 1815 }, { "epoch": 0.4244402985074627, "grad_norm": 0.4507178181055429, "learning_rate": 4.985041883844888e-05, "loss": 0.5373, "step": 1820 }, { "epoch": 0.4256063432835821, "grad_norm": 0.4705472621000598, "learning_rate": 4.984841466260161e-05, "loss": 0.5602, "step": 1825 }, { "epoch": 0.42677238805970147, "grad_norm": 0.4140988235587778, "learning_rate": 4.9846397194774294e-05, "loss": 0.5389, "step": 1830 }, { "epoch": 0.4279384328358209, "grad_norm": 0.40544501813240136, "learning_rate": 4.9844366436166837e-05, "loss": 0.5418, "step": 1835 }, { "epoch": 0.4291044776119403, "grad_norm": 0.42668985298136625, "learning_rate": 4.984232238798707e-05, "loss": 0.5638, "step": 1840 }, { "epoch": 0.4302705223880597, "grad_norm": 0.4618355005213451, "learning_rate": 4.9840265051450694e-05, "loss": 0.5533, "step": 1845 }, { "epoch": 0.4314365671641791, "grad_norm": 0.405012555937919, "learning_rate": 4.983819442778134e-05, "loss": 0.5423, "step": 1850 }, { "epoch": 0.43260261194029853, "grad_norm": 0.44687169495578805, "learning_rate": 4.983611051821055e-05, "loss": 0.5439, "step": 1855 }, { "epoch": 0.4337686567164179, "grad_norm": 0.40695918232481937, "learning_rate": 4.983401332397775e-05, "loss": 0.5232, "step": 1860 }, { "epoch": 0.4349347014925373, "grad_norm": 0.42811566851360106, "learning_rate": 4.983190284633025e-05, "loss": 0.5604, "step": 1865 }, { "epoch": 0.4361007462686567, "grad_norm": 0.42869973179034077, "learning_rate": 4.9829779086523295e-05, "loss": 0.5711, "step": 1870 }, { "epoch": 0.4372667910447761, "grad_norm": 0.5494959455877744, "learning_rate": 4.9827642045820016e-05, "loss": 0.5419, "step": 1875 }, { "epoch": 0.43843283582089554, "grad_norm": 0.42568192378292363, "learning_rate": 4.982549172549145e-05, "loss": 0.5825, "step": 1880 }, { "epoch": 0.4395988805970149, "grad_norm": 0.46959577972035027, "learning_rate": 4.982332812681651e-05, "loss": 0.5814, "step": 1885 }, { "epoch": 0.44076492537313433, "grad_norm": 0.4708004823112155, "learning_rate": 4.9821151251082035e-05, "loss": 0.5735, "step": 1890 }, { "epoch": 0.44193097014925375, "grad_norm": 0.4279741612325164, "learning_rate": 4.981896109958274e-05, "loss": 0.5437, "step": 1895 }, { "epoch": 0.4430970149253731, "grad_norm": 0.4095434730326501, "learning_rate": 4.981675767362125e-05, "loss": 0.5593, "step": 1900 }, { "epoch": 0.44426305970149255, "grad_norm": 0.42685723472991594, "learning_rate": 4.981454097450806e-05, "loss": 0.5661, "step": 1905 }, { "epoch": 0.4454291044776119, "grad_norm": 0.44948867506310025, "learning_rate": 4.98123110035616e-05, "loss": 0.5457, "step": 1910 }, { "epoch": 0.44659514925373134, "grad_norm": 0.43929816483518985, "learning_rate": 4.981006776210816e-05, "loss": 0.5571, "step": 1915 }, { "epoch": 0.44776119402985076, "grad_norm": 0.43876619295403296, "learning_rate": 4.980781125148194e-05, "loss": 0.5525, "step": 1920 }, { "epoch": 0.44892723880597013, "grad_norm": 0.45083808200498976, "learning_rate": 4.9805541473025016e-05, "loss": 0.5722, "step": 1925 }, { "epoch": 0.45009328358208955, "grad_norm": 0.5087947632583268, "learning_rate": 4.980325842808737e-05, "loss": 0.5602, "step": 1930 }, { "epoch": 0.451259328358209, "grad_norm": 0.4298992123911525, "learning_rate": 4.980096211802688e-05, "loss": 0.5527, "step": 1935 }, { "epoch": 0.45242537313432835, "grad_norm": 0.42488150264102736, "learning_rate": 4.979865254420929e-05, "loss": 0.5231, "step": 1940 }, { "epoch": 0.45359141791044777, "grad_norm": 0.4139456996916272, "learning_rate": 4.979632970800824e-05, "loss": 0.5634, "step": 1945 }, { "epoch": 0.45475746268656714, "grad_norm": 0.4697915547711206, "learning_rate": 4.9793993610805276e-05, "loss": 0.5489, "step": 1950 }, { "epoch": 0.45592350746268656, "grad_norm": 0.4583626661754686, "learning_rate": 4.979164425398983e-05, "loss": 0.5864, "step": 1955 }, { "epoch": 0.457089552238806, "grad_norm": 0.446217179949066, "learning_rate": 4.9789281638959184e-05, "loss": 0.5608, "step": 1960 }, { "epoch": 0.45825559701492535, "grad_norm": 0.45063170677328174, "learning_rate": 4.978690576711855e-05, "loss": 0.5463, "step": 1965 }, { "epoch": 0.4594216417910448, "grad_norm": 0.4335952872820952, "learning_rate": 4.978451663988099e-05, "loss": 0.5537, "step": 1970 }, { "epoch": 0.4605876865671642, "grad_norm": 0.4198703921554321, "learning_rate": 4.978211425866748e-05, "loss": 0.5202, "step": 1975 }, { "epoch": 0.46175373134328357, "grad_norm": 0.44012417799479364, "learning_rate": 4.977969862490685e-05, "loss": 0.576, "step": 1980 }, { "epoch": 0.462919776119403, "grad_norm": 0.4106293282394111, "learning_rate": 4.9777269740035844e-05, "loss": 0.5417, "step": 1985 }, { "epoch": 0.46408582089552236, "grad_norm": 0.42664083312692597, "learning_rate": 4.977482760549905e-05, "loss": 0.5507, "step": 1990 }, { "epoch": 0.4652518656716418, "grad_norm": 0.37376513992329213, "learning_rate": 4.977237222274897e-05, "loss": 0.5352, "step": 1995 }, { "epoch": 0.4664179104477612, "grad_norm": 0.4130064098570384, "learning_rate": 4.976990359324597e-05, "loss": 0.5497, "step": 2000 }, { "epoch": 0.4675839552238806, "grad_norm": 0.43241982641910154, "learning_rate": 4.9767421718458304e-05, "loss": 0.5532, "step": 2005 }, { "epoch": 0.46875, "grad_norm": 0.42166614491803217, "learning_rate": 4.9764926599862065e-05, "loss": 0.574, "step": 2010 }, { "epoch": 0.4699160447761194, "grad_norm": 0.469266061518071, "learning_rate": 4.9762418238941285e-05, "loss": 0.5414, "step": 2015 }, { "epoch": 0.4710820895522388, "grad_norm": 0.441909513049608, "learning_rate": 4.9759896637187826e-05, "loss": 0.5596, "step": 2020 }, { "epoch": 0.4722481343283582, "grad_norm": 0.4679809012304934, "learning_rate": 4.9757361796101445e-05, "loss": 0.5538, "step": 2025 }, { "epoch": 0.47341417910447764, "grad_norm": 0.4136933429405254, "learning_rate": 4.9754813717189765e-05, "loss": 0.5648, "step": 2030 }, { "epoch": 0.474580223880597, "grad_norm": 0.3977775912844767, "learning_rate": 4.975225240196829e-05, "loss": 0.5524, "step": 2035 }, { "epoch": 0.47574626865671643, "grad_norm": 0.40316601277755537, "learning_rate": 4.974967785196039e-05, "loss": 0.5347, "step": 2040 }, { "epoch": 0.4769123134328358, "grad_norm": 0.417585225911704, "learning_rate": 4.974709006869731e-05, "loss": 0.5568, "step": 2045 }, { "epoch": 0.4780783582089552, "grad_norm": 0.3891660537025456, "learning_rate": 4.974448905371816e-05, "loss": 0.5504, "step": 2050 }, { "epoch": 0.47924440298507465, "grad_norm": 0.44591964309369564, "learning_rate": 4.974187480856993e-05, "loss": 0.5529, "step": 2055 }, { "epoch": 0.480410447761194, "grad_norm": 0.39423545578129715, "learning_rate": 4.973924733480747e-05, "loss": 0.5407, "step": 2060 }, { "epoch": 0.48157649253731344, "grad_norm": 0.368291270463148, "learning_rate": 4.973660663399349e-05, "loss": 0.5369, "step": 2065 }, { "epoch": 0.48274253731343286, "grad_norm": 0.4301251852936052, "learning_rate": 4.9733952707698606e-05, "loss": 0.5387, "step": 2070 }, { "epoch": 0.48390858208955223, "grad_norm": 0.4207409252637043, "learning_rate": 4.9731285557501245e-05, "loss": 0.5431, "step": 2075 }, { "epoch": 0.48507462686567165, "grad_norm": 0.4336520003007907, "learning_rate": 4.9728605184987724e-05, "loss": 0.5489, "step": 2080 }, { "epoch": 0.486240671641791, "grad_norm": 0.40021246117533077, "learning_rate": 4.972591159175225e-05, "loss": 0.5396, "step": 2085 }, { "epoch": 0.48740671641791045, "grad_norm": 0.40834453386389635, "learning_rate": 4.972320477939685e-05, "loss": 0.531, "step": 2090 }, { "epoch": 0.48857276119402987, "grad_norm": 0.431163778817016, "learning_rate": 4.9720484749531434e-05, "loss": 0.5372, "step": 2095 }, { "epoch": 0.48973880597014924, "grad_norm": 0.41321888129096157, "learning_rate": 4.971775150377378e-05, "loss": 0.5712, "step": 2100 }, { "epoch": 0.49090485074626866, "grad_norm": 0.4000479374442276, "learning_rate": 4.971500504374951e-05, "loss": 0.5161, "step": 2105 }, { "epoch": 0.4920708955223881, "grad_norm": 0.4125452194985789, "learning_rate": 4.971224537109211e-05, "loss": 0.5427, "step": 2110 }, { "epoch": 0.49323694029850745, "grad_norm": 0.4335278042799295, "learning_rate": 4.970947248744294e-05, "loss": 0.5255, "step": 2115 }, { "epoch": 0.4944029850746269, "grad_norm": 0.45345614437500326, "learning_rate": 4.970668639445119e-05, "loss": 0.5548, "step": 2120 }, { "epoch": 0.49556902985074625, "grad_norm": 0.4104769008914523, "learning_rate": 4.9703887093773935e-05, "loss": 0.5674, "step": 2125 }, { "epoch": 0.49673507462686567, "grad_norm": 0.42388957127023674, "learning_rate": 4.970107458707608e-05, "loss": 0.5321, "step": 2130 }, { "epoch": 0.4979011194029851, "grad_norm": 0.41865598540915067, "learning_rate": 4.969824887603042e-05, "loss": 0.5641, "step": 2135 }, { "epoch": 0.49906716417910446, "grad_norm": 0.40493387297681477, "learning_rate": 4.969540996231754e-05, "loss": 0.562, "step": 2140 }, { "epoch": 0.5002332089552238, "grad_norm": 0.433698724664108, "learning_rate": 4.9692557847625946e-05, "loss": 0.5411, "step": 2145 }, { "epoch": 0.5013992537313433, "grad_norm": 0.462819257352637, "learning_rate": 4.968969253365196e-05, "loss": 0.5661, "step": 2150 }, { "epoch": 0.5025652985074627, "grad_norm": 0.414743883976037, "learning_rate": 4.968681402209976e-05, "loss": 0.554, "step": 2155 }, { "epoch": 0.503731343283582, "grad_norm": 0.4180694749018957, "learning_rate": 4.9683922314681374e-05, "loss": 0.5502, "step": 2160 }, { "epoch": 0.5048973880597015, "grad_norm": 0.43182947210847594, "learning_rate": 4.968101741311668e-05, "loss": 0.5465, "step": 2165 }, { "epoch": 0.5060634328358209, "grad_norm": 0.4134036726718013, "learning_rate": 4.96780993191334e-05, "loss": 0.5533, "step": 2170 }, { "epoch": 0.5072294776119403, "grad_norm": 0.39978988822243555, "learning_rate": 4.96751680344671e-05, "loss": 0.5354, "step": 2175 }, { "epoch": 0.5083955223880597, "grad_norm": 0.4325283182624236, "learning_rate": 4.9672223560861204e-05, "loss": 0.5883, "step": 2180 }, { "epoch": 0.5095615671641791, "grad_norm": 0.43897133429666435, "learning_rate": 4.966926590006697e-05, "loss": 0.5403, "step": 2185 }, { "epoch": 0.5107276119402985, "grad_norm": 0.4411985754252747, "learning_rate": 4.9666295053843495e-05, "loss": 0.5451, "step": 2190 }, { "epoch": 0.511893656716418, "grad_norm": 0.42414306291691434, "learning_rate": 4.9663311023957744e-05, "loss": 0.5588, "step": 2195 }, { "epoch": 0.5130597014925373, "grad_norm": 0.45615950583419457, "learning_rate": 4.966031381218447e-05, "loss": 0.5688, "step": 2200 }, { "epoch": 0.5142257462686567, "grad_norm": 0.4025743805608124, "learning_rate": 4.965730342030633e-05, "loss": 0.5581, "step": 2205 }, { "epoch": 0.5153917910447762, "grad_norm": 0.449274095779492, "learning_rate": 4.9654279850113775e-05, "loss": 0.5497, "step": 2210 }, { "epoch": 0.5165578358208955, "grad_norm": 0.4099326830474774, "learning_rate": 4.965124310340511e-05, "loss": 0.5507, "step": 2215 }, { "epoch": 0.5177238805970149, "grad_norm": 0.42211103230606756, "learning_rate": 4.964819318198648e-05, "loss": 0.5403, "step": 2220 }, { "epoch": 0.5188899253731343, "grad_norm": 0.3977512342414685, "learning_rate": 4.9645130087671866e-05, "loss": 0.5292, "step": 2225 }, { "epoch": 0.5200559701492538, "grad_norm": 0.4104065309174799, "learning_rate": 4.9642053822283066e-05, "loss": 0.5459, "step": 2230 }, { "epoch": 0.5212220149253731, "grad_norm": 0.4442365588240732, "learning_rate": 4.963896438764973e-05, "loss": 0.5705, "step": 2235 }, { "epoch": 0.5223880597014925, "grad_norm": 0.380223914558644, "learning_rate": 4.9635861785609333e-05, "loss": 0.5587, "step": 2240 }, { "epoch": 0.523554104477612, "grad_norm": 0.39221509175325275, "learning_rate": 4.9632746018007184e-05, "loss": 0.54, "step": 2245 }, { "epoch": 0.5247201492537313, "grad_norm": 0.4218750790208063, "learning_rate": 4.9629617086696434e-05, "loss": 0.5431, "step": 2250 }, { "epoch": 0.5258861940298507, "grad_norm": 0.42184482397484385, "learning_rate": 4.962647499353803e-05, "loss": 0.5647, "step": 2255 }, { "epoch": 0.5270522388059702, "grad_norm": 0.39624179577870444, "learning_rate": 4.962331974040079e-05, "loss": 0.5229, "step": 2260 }, { "epoch": 0.5282182835820896, "grad_norm": 0.3884637349231805, "learning_rate": 4.962015132916133e-05, "loss": 0.5403, "step": 2265 }, { "epoch": 0.5293843283582089, "grad_norm": 0.38503446280263115, "learning_rate": 4.961696976170409e-05, "loss": 0.5268, "step": 2270 }, { "epoch": 0.5305503731343284, "grad_norm": 0.4014110774875387, "learning_rate": 4.9613775039921355e-05, "loss": 0.5279, "step": 2275 }, { "epoch": 0.5317164179104478, "grad_norm": 0.36379396965145155, "learning_rate": 4.961056716571322e-05, "loss": 0.5349, "step": 2280 }, { "epoch": 0.5328824626865671, "grad_norm": 0.4325186706011893, "learning_rate": 4.96073461409876e-05, "loss": 0.5406, "step": 2285 }, { "epoch": 0.5340485074626866, "grad_norm": 0.42859599239320695, "learning_rate": 4.960411196766025e-05, "loss": 0.5715, "step": 2290 }, { "epoch": 0.535214552238806, "grad_norm": 0.43631387946986677, "learning_rate": 4.960086464765472e-05, "loss": 0.5356, "step": 2295 }, { "epoch": 0.5363805970149254, "grad_norm": 0.3908360471093934, "learning_rate": 4.95976041829024e-05, "loss": 0.5413, "step": 2300 }, { "epoch": 0.5375466417910447, "grad_norm": 0.412186673321893, "learning_rate": 4.959433057534248e-05, "loss": 0.5594, "step": 2305 }, { "epoch": 0.5387126865671642, "grad_norm": 0.38206980527085477, "learning_rate": 4.9591043826921984e-05, "loss": 0.5525, "step": 2310 }, { "epoch": 0.5398787313432836, "grad_norm": 0.42150140836879146, "learning_rate": 4.958774393959574e-05, "loss": 0.5312, "step": 2315 }, { "epoch": 0.5410447761194029, "grad_norm": 0.38256885412752933, "learning_rate": 4.95844309153264e-05, "loss": 0.5662, "step": 2320 }, { "epoch": 0.5422108208955224, "grad_norm": 0.4221705350921838, "learning_rate": 4.958110475608442e-05, "loss": 0.5328, "step": 2325 }, { "epoch": 0.5433768656716418, "grad_norm": 0.41339404123608425, "learning_rate": 4.9577765463848065e-05, "loss": 0.5214, "step": 2330 }, { "epoch": 0.5445429104477612, "grad_norm": 0.4489741191179022, "learning_rate": 4.957441304060343e-05, "loss": 0.5502, "step": 2335 }, { "epoch": 0.5457089552238806, "grad_norm": 0.45846439883111634, "learning_rate": 4.957104748834441e-05, "loss": 0.5552, "step": 2340 }, { "epoch": 0.546875, "grad_norm": 0.46384208003104427, "learning_rate": 4.956766880907269e-05, "loss": 0.5333, "step": 2345 }, { "epoch": 0.5480410447761194, "grad_norm": 0.38904250662100903, "learning_rate": 4.9564277004797784e-05, "loss": 0.5402, "step": 2350 }, { "epoch": 0.5492070895522388, "grad_norm": 0.41833862097627633, "learning_rate": 4.956087207753702e-05, "loss": 0.5452, "step": 2355 }, { "epoch": 0.5503731343283582, "grad_norm": 0.40467595517570154, "learning_rate": 4.95574540293155e-05, "loss": 0.5321, "step": 2360 }, { "epoch": 0.5515391791044776, "grad_norm": 0.4036471233217711, "learning_rate": 4.955402286216617e-05, "loss": 0.5471, "step": 2365 }, { "epoch": 0.5527052238805971, "grad_norm": 0.4446716550676256, "learning_rate": 4.9550578578129734e-05, "loss": 0.564, "step": 2370 }, { "epoch": 0.5538712686567164, "grad_norm": 0.4307597876599641, "learning_rate": 4.954712117925473e-05, "loss": 0.536, "step": 2375 }, { "epoch": 0.5550373134328358, "grad_norm": 0.42377340401702124, "learning_rate": 4.954365066759748e-05, "loss": 0.5451, "step": 2380 }, { "epoch": 0.5562033582089553, "grad_norm": 0.40759561883047934, "learning_rate": 4.954016704522213e-05, "loss": 0.5427, "step": 2385 }, { "epoch": 0.5573694029850746, "grad_norm": 0.38288930360428963, "learning_rate": 4.95366703142006e-05, "loss": 0.5438, "step": 2390 }, { "epoch": 0.558535447761194, "grad_norm": 0.47471663867410535, "learning_rate": 4.9533160476612584e-05, "loss": 0.5344, "step": 2395 }, { "epoch": 0.5597014925373134, "grad_norm": 0.4128590228252018, "learning_rate": 4.952963753454563e-05, "loss": 0.5356, "step": 2400 }, { "epoch": 0.5608675373134329, "grad_norm": 0.4126062052427728, "learning_rate": 4.9526101490095035e-05, "loss": 0.5332, "step": 2405 }, { "epoch": 0.5620335820895522, "grad_norm": 0.3850389069038923, "learning_rate": 4.95225523453639e-05, "loss": 0.5359, "step": 2410 }, { "epoch": 0.5631996268656716, "grad_norm": 0.4051951555671248, "learning_rate": 4.9518990102463133e-05, "loss": 0.5306, "step": 2415 }, { "epoch": 0.5643656716417911, "grad_norm": 0.4093554562992661, "learning_rate": 4.951541476351141e-05, "loss": 0.5227, "step": 2420 }, { "epoch": 0.5655317164179104, "grad_norm": 0.4260462395399217, "learning_rate": 4.9511826330635205e-05, "loss": 0.5402, "step": 2425 }, { "epoch": 0.5666977611940298, "grad_norm": 0.3955030974367377, "learning_rate": 4.9508224805968784e-05, "loss": 0.5397, "step": 2430 }, { "epoch": 0.5678638059701493, "grad_norm": 0.4258726059708142, "learning_rate": 4.9504610191654195e-05, "loss": 0.5324, "step": 2435 }, { "epoch": 0.5690298507462687, "grad_norm": 0.41515662260898495, "learning_rate": 4.950098248984127e-05, "loss": 0.5691, "step": 2440 }, { "epoch": 0.570195895522388, "grad_norm": 0.42425090749716615, "learning_rate": 4.949734170268763e-05, "loss": 0.5533, "step": 2445 }, { "epoch": 0.5713619402985075, "grad_norm": 0.4252546180107593, "learning_rate": 4.949368783235867e-05, "loss": 0.5656, "step": 2450 }, { "epoch": 0.5725279850746269, "grad_norm": 0.3825097097032984, "learning_rate": 4.949002088102758e-05, "loss": 0.5532, "step": 2455 }, { "epoch": 0.5736940298507462, "grad_norm": 0.39714340156998973, "learning_rate": 4.9486340850875316e-05, "loss": 0.536, "step": 2460 }, { "epoch": 0.5748600746268657, "grad_norm": 0.4280746733530717, "learning_rate": 4.948264774409062e-05, "loss": 0.5616, "step": 2465 }, { "epoch": 0.5760261194029851, "grad_norm": 0.40412029049673165, "learning_rate": 4.947894156287001e-05, "loss": 0.5221, "step": 2470 }, { "epoch": 0.5771921641791045, "grad_norm": 0.43128378795650346, "learning_rate": 4.947522230941779e-05, "loss": 0.5358, "step": 2475 }, { "epoch": 0.5783582089552238, "grad_norm": 0.3985041609839594, "learning_rate": 4.947148998594601e-05, "loss": 0.5254, "step": 2480 }, { "epoch": 0.5795242537313433, "grad_norm": 0.3772993372811755, "learning_rate": 4.946774459467454e-05, "loss": 0.5261, "step": 2485 }, { "epoch": 0.5806902985074627, "grad_norm": 0.4130113004940263, "learning_rate": 4.946398613783096e-05, "loss": 0.5473, "step": 2490 }, { "epoch": 0.581856343283582, "grad_norm": 0.4357760759909216, "learning_rate": 4.946021461765069e-05, "loss": 0.5768, "step": 2495 }, { "epoch": 0.5830223880597015, "grad_norm": 0.3935256613674847, "learning_rate": 4.945643003637686e-05, "loss": 0.5115, "step": 2500 }, { "epoch": 0.5841884328358209, "grad_norm": 0.4039689209345031, "learning_rate": 4.945263239626039e-05, "loss": 0.5665, "step": 2505 }, { "epoch": 0.5853544776119403, "grad_norm": 0.39673543418986956, "learning_rate": 4.944882169956001e-05, "loss": 0.5298, "step": 2510 }, { "epoch": 0.5865205223880597, "grad_norm": 0.393928069739327, "learning_rate": 4.944499794854215e-05, "loss": 0.5308, "step": 2515 }, { "epoch": 0.5876865671641791, "grad_norm": 0.43568570484896163, "learning_rate": 4.9441161145481016e-05, "loss": 0.5399, "step": 2520 }, { "epoch": 0.5888526119402985, "grad_norm": 0.459960614460092, "learning_rate": 4.943731129265862e-05, "loss": 0.5547, "step": 2525 }, { "epoch": 0.590018656716418, "grad_norm": 0.41496044754685024, "learning_rate": 4.9433448392364694e-05, "loss": 0.5536, "step": 2530 }, { "epoch": 0.5911847014925373, "grad_norm": 0.4025528875737623, "learning_rate": 4.942957244689673e-05, "loss": 0.5266, "step": 2535 }, { "epoch": 0.5923507462686567, "grad_norm": 0.3808899346150937, "learning_rate": 4.942568345856002e-05, "loss": 0.5442, "step": 2540 }, { "epoch": 0.5935167910447762, "grad_norm": 0.6016204533773503, "learning_rate": 4.9421781429667555e-05, "loss": 0.5517, "step": 2545 }, { "epoch": 0.5946828358208955, "grad_norm": 0.3901280993232808, "learning_rate": 4.941786636254014e-05, "loss": 0.5069, "step": 2550 }, { "epoch": 0.5958488805970149, "grad_norm": 0.4296414713053638, "learning_rate": 4.9413938259506286e-05, "loss": 0.5407, "step": 2555 }, { "epoch": 0.5970149253731343, "grad_norm": 0.4542224493780557, "learning_rate": 4.940999712290229e-05, "loss": 0.5215, "step": 2560 }, { "epoch": 0.5981809701492538, "grad_norm": 0.3699306372038709, "learning_rate": 4.940604295507218e-05, "loss": 0.5334, "step": 2565 }, { "epoch": 0.5993470149253731, "grad_norm": 0.39978710700203823, "learning_rate": 4.940207575836775e-05, "loss": 0.546, "step": 2570 }, { "epoch": 0.6005130597014925, "grad_norm": 0.38181128292940997, "learning_rate": 4.9398095535148535e-05, "loss": 0.5446, "step": 2575 }, { "epoch": 0.601679104477612, "grad_norm": 0.5004932568037355, "learning_rate": 4.9394102287781816e-05, "loss": 0.5456, "step": 2580 }, { "epoch": 0.6028451492537313, "grad_norm": 0.3768598673122833, "learning_rate": 4.939009601864263e-05, "loss": 0.538, "step": 2585 }, { "epoch": 0.6040111940298507, "grad_norm": 0.39413056994014956, "learning_rate": 4.938607673011375e-05, "loss": 0.5137, "step": 2590 }, { "epoch": 0.6051772388059702, "grad_norm": 0.410121290723826, "learning_rate": 4.938204442458569e-05, "loss": 0.5384, "step": 2595 }, { "epoch": 0.6063432835820896, "grad_norm": 0.371103195286021, "learning_rate": 4.9377999104456704e-05, "loss": 0.5225, "step": 2600 }, { "epoch": 0.6075093283582089, "grad_norm": 0.428376461747673, "learning_rate": 4.937394077213281e-05, "loss": 0.5392, "step": 2605 }, { "epoch": 0.6086753731343284, "grad_norm": 0.40068274167948215, "learning_rate": 4.9369869430027756e-05, "loss": 0.5147, "step": 2610 }, { "epoch": 0.6098414179104478, "grad_norm": 0.46353704587455485, "learning_rate": 4.9365785080562984e-05, "loss": 0.5504, "step": 2615 }, { "epoch": 0.6110074626865671, "grad_norm": 0.41837179049201895, "learning_rate": 4.9361687726167746e-05, "loss": 0.5504, "step": 2620 }, { "epoch": 0.6121735074626866, "grad_norm": 0.46141765429478143, "learning_rate": 4.935757736927896e-05, "loss": 0.5411, "step": 2625 }, { "epoch": 0.613339552238806, "grad_norm": 0.4184873410957794, "learning_rate": 4.9353454012341346e-05, "loss": 0.5357, "step": 2630 }, { "epoch": 0.6145055970149254, "grad_norm": 0.420514986213068, "learning_rate": 4.934931765780727e-05, "loss": 0.5401, "step": 2635 }, { "epoch": 0.6156716417910447, "grad_norm": 0.43917359395612987, "learning_rate": 4.934516830813693e-05, "loss": 0.5524, "step": 2640 }, { "epoch": 0.6168376865671642, "grad_norm": 0.35672615682155906, "learning_rate": 4.9341005965798155e-05, "loss": 0.5401, "step": 2645 }, { "epoch": 0.6180037313432836, "grad_norm": 0.3786713219886741, "learning_rate": 4.9336830633266565e-05, "loss": 0.5363, "step": 2650 }, { "epoch": 0.6191697761194029, "grad_norm": 0.4064460236883073, "learning_rate": 4.9332642313025495e-05, "loss": 0.5464, "step": 2655 }, { "epoch": 0.6203358208955224, "grad_norm": 0.39567223901783555, "learning_rate": 4.932844100756599e-05, "loss": 0.5263, "step": 2660 }, { "epoch": 0.6215018656716418, "grad_norm": 0.40123811323141495, "learning_rate": 4.9324226719386826e-05, "loss": 0.5356, "step": 2665 }, { "epoch": 0.6226679104477612, "grad_norm": 0.3915925580507895, "learning_rate": 4.931999945099449e-05, "loss": 0.5473, "step": 2670 }, { "epoch": 0.6238339552238806, "grad_norm": 0.38500253649294836, "learning_rate": 4.931575920490322e-05, "loss": 0.5256, "step": 2675 }, { "epoch": 0.625, "grad_norm": 0.3604303898684078, "learning_rate": 4.931150598363494e-05, "loss": 0.5288, "step": 2680 }, { "epoch": 0.6261660447761194, "grad_norm": 0.41682983974343424, "learning_rate": 4.93072397897193e-05, "loss": 0.5577, "step": 2685 }, { "epoch": 0.6273320895522388, "grad_norm": 0.383675309143092, "learning_rate": 4.9302960625693666e-05, "loss": 0.5102, "step": 2690 }, { "epoch": 0.6284981343283582, "grad_norm": 0.3994211112597252, "learning_rate": 4.929866849410313e-05, "loss": 0.5522, "step": 2695 }, { "epoch": 0.6296641791044776, "grad_norm": 0.39450428451057956, "learning_rate": 4.929436339750049e-05, "loss": 0.5529, "step": 2700 }, { "epoch": 0.6308302238805971, "grad_norm": 0.4217539177584992, "learning_rate": 4.9290045338446245e-05, "loss": 0.5395, "step": 2705 }, { "epoch": 0.6319962686567164, "grad_norm": 0.39723731617923325, "learning_rate": 4.9285714319508607e-05, "loss": 0.5216, "step": 2710 }, { "epoch": 0.6331623134328358, "grad_norm": 0.38234661399826736, "learning_rate": 4.9281370343263514e-05, "loss": 0.5255, "step": 2715 }, { "epoch": 0.6343283582089553, "grad_norm": 0.4738795364697418, "learning_rate": 4.927701341229457e-05, "loss": 0.5538, "step": 2720 }, { "epoch": 0.6354944029850746, "grad_norm": 0.4140546339821553, "learning_rate": 4.927264352919315e-05, "loss": 0.5401, "step": 2725 }, { "epoch": 0.636660447761194, "grad_norm": 0.37663373107692194, "learning_rate": 4.9268260696558264e-05, "loss": 0.536, "step": 2730 }, { "epoch": 0.6378264925373134, "grad_norm": 0.3963608702532208, "learning_rate": 4.926386491699665e-05, "loss": 0.5087, "step": 2735 }, { "epoch": 0.6389925373134329, "grad_norm": 0.38669349050721274, "learning_rate": 4.925945619312277e-05, "loss": 0.5526, "step": 2740 }, { "epoch": 0.6401585820895522, "grad_norm": 0.3836229304354064, "learning_rate": 4.925503452755875e-05, "loss": 0.5296, "step": 2745 }, { "epoch": 0.6413246268656716, "grad_norm": 0.3667584538602013, "learning_rate": 4.925059992293443e-05, "loss": 0.5473, "step": 2750 }, { "epoch": 0.6424906716417911, "grad_norm": 0.3896534523747938, "learning_rate": 4.924615238188734e-05, "loss": 0.5065, "step": 2755 }, { "epoch": 0.6436567164179104, "grad_norm": 0.391843351397095, "learning_rate": 4.924169190706271e-05, "loss": 0.5279, "step": 2760 }, { "epoch": 0.6448227611940298, "grad_norm": 0.40227777754572197, "learning_rate": 4.9237218501113466e-05, "loss": 0.5119, "step": 2765 }, { "epoch": 0.6459888059701493, "grad_norm": 0.4109208776707434, "learning_rate": 4.92327321667002e-05, "loss": 0.5287, "step": 2770 }, { "epoch": 0.6471548507462687, "grad_norm": 0.40814412746294954, "learning_rate": 4.922823290649122e-05, "loss": 0.5428, "step": 2775 }, { "epoch": 0.648320895522388, "grad_norm": 0.38176058409566654, "learning_rate": 4.922372072316253e-05, "loss": 0.5451, "step": 2780 }, { "epoch": 0.6494869402985075, "grad_norm": 0.40985499403940806, "learning_rate": 4.921919561939779e-05, "loss": 0.5766, "step": 2785 }, { "epoch": 0.6506529850746269, "grad_norm": 0.3987752571923002, "learning_rate": 4.9214657597888354e-05, "loss": 0.5466, "step": 2790 }, { "epoch": 0.6518190298507462, "grad_norm": 0.42378717634202273, "learning_rate": 4.921010666133326e-05, "loss": 0.5351, "step": 2795 }, { "epoch": 0.6529850746268657, "grad_norm": 0.43538203338320064, "learning_rate": 4.920554281243925e-05, "loss": 0.5397, "step": 2800 }, { "epoch": 0.6541511194029851, "grad_norm": 0.4422239695813655, "learning_rate": 4.920096605392071e-05, "loss": 0.5279, "step": 2805 }, { "epoch": 0.6553171641791045, "grad_norm": 0.3863682728766982, "learning_rate": 4.919637638849972e-05, "loss": 0.5223, "step": 2810 }, { "epoch": 0.6564832089552238, "grad_norm": 0.3928925654796181, "learning_rate": 4.9191773818906044e-05, "loss": 0.5528, "step": 2815 }, { "epoch": 0.6576492537313433, "grad_norm": 0.38761234569742775, "learning_rate": 4.918715834787711e-05, "loss": 0.5246, "step": 2820 }, { "epoch": 0.6588152985074627, "grad_norm": 0.3866078885204687, "learning_rate": 4.918252997815802e-05, "loss": 0.539, "step": 2825 }, { "epoch": 0.659981343283582, "grad_norm": 0.42287402253037515, "learning_rate": 4.917788871250157e-05, "loss": 0.5194, "step": 2830 }, { "epoch": 0.6611473880597015, "grad_norm": 0.38153243447938595, "learning_rate": 4.9173234553668194e-05, "loss": 0.4957, "step": 2835 }, { "epoch": 0.6623134328358209, "grad_norm": 0.3988851192480335, "learning_rate": 4.9168567504425994e-05, "loss": 0.5428, "step": 2840 }, { "epoch": 0.6634794776119403, "grad_norm": 0.38608291875633494, "learning_rate": 4.916388756755077e-05, "loss": 0.5207, "step": 2845 }, { "epoch": 0.6646455223880597, "grad_norm": 0.3857104904637888, "learning_rate": 4.915919474582596e-05, "loss": 0.5365, "step": 2850 }, { "epoch": 0.6658115671641791, "grad_norm": 0.37253461634433516, "learning_rate": 4.915448904204268e-05, "loss": 0.5273, "step": 2855 }, { "epoch": 0.6669776119402985, "grad_norm": 0.3947450807549253, "learning_rate": 4.914977045899969e-05, "loss": 0.5238, "step": 2860 }, { "epoch": 0.668143656716418, "grad_norm": 0.4063505083191156, "learning_rate": 4.914503899950344e-05, "loss": 0.5359, "step": 2865 }, { "epoch": 0.6693097014925373, "grad_norm": 0.3857521939474964, "learning_rate": 4.914029466636801e-05, "loss": 0.5255, "step": 2870 }, { "epoch": 0.6704757462686567, "grad_norm": 0.3912979781077615, "learning_rate": 4.9135537462415146e-05, "loss": 0.513, "step": 2875 }, { "epoch": 0.6716417910447762, "grad_norm": 0.3745982543502313, "learning_rate": 4.913076739047425e-05, "loss": 0.5424, "step": 2880 }, { "epoch": 0.6728078358208955, "grad_norm": 0.3968073349699562, "learning_rate": 4.91259844533824e-05, "loss": 0.5394, "step": 2885 }, { "epoch": 0.6739738805970149, "grad_norm": 0.39206443475901537, "learning_rate": 4.9121188653984266e-05, "loss": 0.5319, "step": 2890 }, { "epoch": 0.6751399253731343, "grad_norm": 0.39111573373033887, "learning_rate": 4.911637999513224e-05, "loss": 0.537, "step": 2895 }, { "epoch": 0.6763059701492538, "grad_norm": 0.3889569274195957, "learning_rate": 4.9111558479686296e-05, "loss": 0.5307, "step": 2900 }, { "epoch": 0.6774720149253731, "grad_norm": 2.951639883433305, "learning_rate": 4.910672411051412e-05, "loss": 0.5514, "step": 2905 }, { "epoch": 0.6786380597014925, "grad_norm": 0.6846608488178547, "learning_rate": 4.910187689049099e-05, "loss": 0.5314, "step": 2910 }, { "epoch": 0.679804104477612, "grad_norm": 0.4093428473728174, "learning_rate": 4.909701682249985e-05, "loss": 0.5348, "step": 2915 }, { "epoch": 0.6809701492537313, "grad_norm": 0.38578253112754274, "learning_rate": 4.909214390943127e-05, "loss": 0.5223, "step": 2920 }, { "epoch": 0.6821361940298507, "grad_norm": 0.3898653529650086, "learning_rate": 4.908725815418349e-05, "loss": 0.5358, "step": 2925 }, { "epoch": 0.6833022388059702, "grad_norm": 0.3911821894109652, "learning_rate": 4.908235955966236e-05, "loss": 0.5313, "step": 2930 }, { "epoch": 0.6844682835820896, "grad_norm": 0.39835380288671635, "learning_rate": 4.907744812878138e-05, "loss": 0.5287, "step": 2935 }, { "epoch": 0.6856343283582089, "grad_norm": 0.4294557267238251, "learning_rate": 4.907252386446169e-05, "loss": 0.5306, "step": 2940 }, { "epoch": 0.6868003731343284, "grad_norm": 0.38270702565340614, "learning_rate": 4.906758676963204e-05, "loss": 0.5236, "step": 2945 }, { "epoch": 0.6879664179104478, "grad_norm": 0.36823714228457355, "learning_rate": 4.906263684722883e-05, "loss": 0.5326, "step": 2950 }, { "epoch": 0.6891324626865671, "grad_norm": 0.39194862605492836, "learning_rate": 4.905767410019607e-05, "loss": 0.5315, "step": 2955 }, { "epoch": 0.6902985074626866, "grad_norm": 0.43321836299604577, "learning_rate": 4.905269853148543e-05, "loss": 0.5349, "step": 2960 }, { "epoch": 0.691464552238806, "grad_norm": 0.37809749169598833, "learning_rate": 4.904771014405618e-05, "loss": 0.5163, "step": 2965 }, { "epoch": 0.6926305970149254, "grad_norm": 0.38605371180030074, "learning_rate": 4.9042708940875225e-05, "loss": 0.5278, "step": 2970 }, { "epoch": 0.6937966417910447, "grad_norm": 0.38021129180239116, "learning_rate": 4.903769492491709e-05, "loss": 0.5306, "step": 2975 }, { "epoch": 0.6949626865671642, "grad_norm": 0.42687512055029797, "learning_rate": 4.903266809916392e-05, "loss": 0.5307, "step": 2980 }, { "epoch": 0.6961287313432836, "grad_norm": 0.42495050285927866, "learning_rate": 4.902762846660546e-05, "loss": 0.5327, "step": 2985 }, { "epoch": 0.6972947761194029, "grad_norm": 0.4115848460391631, "learning_rate": 4.902257603023912e-05, "loss": 0.5659, "step": 2990 }, { "epoch": 0.6984608208955224, "grad_norm": 0.4092517042755935, "learning_rate": 4.901751079306987e-05, "loss": 0.5362, "step": 2995 }, { "epoch": 0.6996268656716418, "grad_norm": 0.4108255952331313, "learning_rate": 4.901243275811034e-05, "loss": 0.5357, "step": 3000 }, { "epoch": 0.7007929104477612, "grad_norm": 0.4044470832031311, "learning_rate": 4.900734192838073e-05, "loss": 0.541, "step": 3005 }, { "epoch": 0.7019589552238806, "grad_norm": 0.3735587539523039, "learning_rate": 4.9002238306908884e-05, "loss": 0.5273, "step": 3010 }, { "epoch": 0.703125, "grad_norm": 0.3767725560311478, "learning_rate": 4.899712189673022e-05, "loss": 0.5547, "step": 3015 }, { "epoch": 0.7042910447761194, "grad_norm": 0.3892345398916338, "learning_rate": 4.899199270088782e-05, "loss": 0.5485, "step": 3020 }, { "epoch": 0.7054570895522388, "grad_norm": 0.3860586851895598, "learning_rate": 4.898685072243231e-05, "loss": 0.537, "step": 3025 }, { "epoch": 0.7066231343283582, "grad_norm": 0.38368334621088224, "learning_rate": 4.8981695964421934e-05, "loss": 0.5197, "step": 3030 }, { "epoch": 0.7077891791044776, "grad_norm": 0.3722183706729155, "learning_rate": 4.897652842992256e-05, "loss": 0.5286, "step": 3035 }, { "epoch": 0.7089552238805971, "grad_norm": 0.40111216056211474, "learning_rate": 4.897134812200763e-05, "loss": 0.5276, "step": 3040 }, { "epoch": 0.7101212686567164, "grad_norm": 0.3677350592197864, "learning_rate": 4.896615504375819e-05, "loss": 0.5176, "step": 3045 }, { "epoch": 0.7112873134328358, "grad_norm": 0.43186027733762905, "learning_rate": 4.8960949198262896e-05, "loss": 0.5504, "step": 3050 }, { "epoch": 0.7124533582089553, "grad_norm": 0.3754092397371525, "learning_rate": 4.895573058861798e-05, "loss": 0.5298, "step": 3055 }, { "epoch": 0.7136194029850746, "grad_norm": 0.37437284767163637, "learning_rate": 4.895049921792727e-05, "loss": 0.5494, "step": 3060 }, { "epoch": 0.714785447761194, "grad_norm": 0.36959904830475193, "learning_rate": 4.894525508930218e-05, "loss": 0.5259, "step": 3065 }, { "epoch": 0.7159514925373134, "grad_norm": 0.43939464674563955, "learning_rate": 4.893999820586172e-05, "loss": 0.5716, "step": 3070 }, { "epoch": 0.7171175373134329, "grad_norm": 0.40922209897051337, "learning_rate": 4.893472857073249e-05, "loss": 0.5361, "step": 3075 }, { "epoch": 0.7182835820895522, "grad_norm": 0.40570250040927736, "learning_rate": 4.892944618704865e-05, "loss": 0.526, "step": 3080 }, { "epoch": 0.7194496268656716, "grad_norm": 0.3758106893416268, "learning_rate": 4.892415105795197e-05, "loss": 0.5365, "step": 3085 }, { "epoch": 0.7206156716417911, "grad_norm": 0.3762267615069055, "learning_rate": 4.89188431865918e-05, "loss": 0.5489, "step": 3090 }, { "epoch": 0.7217817164179104, "grad_norm": 0.47300186208433503, "learning_rate": 4.891352257612505e-05, "loss": 0.5592, "step": 3095 }, { "epoch": 0.7229477611940298, "grad_norm": 0.42577961562317984, "learning_rate": 4.89081892297162e-05, "loss": 0.5392, "step": 3100 }, { "epoch": 0.7241138059701493, "grad_norm": 0.3636682370248496, "learning_rate": 4.8902843150537345e-05, "loss": 0.5126, "step": 3105 }, { "epoch": 0.7252798507462687, "grad_norm": 0.37366394466126196, "learning_rate": 4.8897484341768104e-05, "loss": 0.5455, "step": 3110 }, { "epoch": 0.726445895522388, "grad_norm": 0.374747080930876, "learning_rate": 4.88921128065957e-05, "loss": 0.5413, "step": 3115 }, { "epoch": 0.7276119402985075, "grad_norm": 0.41837222928371043, "learning_rate": 4.8886728548214933e-05, "loss": 0.5101, "step": 3120 }, { "epoch": 0.7287779850746269, "grad_norm": 0.38118401717070516, "learning_rate": 4.8881331569828134e-05, "loss": 0.5252, "step": 3125 }, { "epoch": 0.7299440298507462, "grad_norm": 0.4354876996923206, "learning_rate": 4.887592187464522e-05, "loss": 0.5367, "step": 3130 }, { "epoch": 0.7311100746268657, "grad_norm": 0.3823240177123982, "learning_rate": 4.8870499465883676e-05, "loss": 0.5354, "step": 3135 }, { "epoch": 0.7322761194029851, "grad_norm": 0.38921016614206255, "learning_rate": 4.886506434676854e-05, "loss": 0.5246, "step": 3140 }, { "epoch": 0.7334421641791045, "grad_norm": 0.37598254128147834, "learning_rate": 4.885961652053242e-05, "loss": 0.5228, "step": 3145 }, { "epoch": 0.7346082089552238, "grad_norm": 0.37488353157168236, "learning_rate": 4.885415599041545e-05, "loss": 0.5421, "step": 3150 }, { "epoch": 0.7357742537313433, "grad_norm": 0.38206799211213144, "learning_rate": 4.884868275966538e-05, "loss": 0.5567, "step": 3155 }, { "epoch": 0.7369402985074627, "grad_norm": 0.3697930725962389, "learning_rate": 4.884319683153746e-05, "loss": 0.5313, "step": 3160 }, { "epoch": 0.738106343283582, "grad_norm": 0.35928636492820076, "learning_rate": 4.88376982092945e-05, "loss": 0.519, "step": 3165 }, { "epoch": 0.7392723880597015, "grad_norm": 0.39872417925703796, "learning_rate": 4.883218689620688e-05, "loss": 0.5382, "step": 3170 }, { "epoch": 0.7404384328358209, "grad_norm": 0.3603561466972036, "learning_rate": 4.882666289555251e-05, "loss": 0.5057, "step": 3175 }, { "epoch": 0.7416044776119403, "grad_norm": 0.3817389594188072, "learning_rate": 4.882112621061687e-05, "loss": 0.5379, "step": 3180 }, { "epoch": 0.7427705223880597, "grad_norm": 0.3856273538293339, "learning_rate": 4.881557684469295e-05, "loss": 0.516, "step": 3185 }, { "epoch": 0.7439365671641791, "grad_norm": 0.3921503674609452, "learning_rate": 4.881001480108131e-05, "loss": 0.5499, "step": 3190 }, { "epoch": 0.7451026119402985, "grad_norm": 0.3624242863944163, "learning_rate": 4.880444008309004e-05, "loss": 0.5241, "step": 3195 }, { "epoch": 0.746268656716418, "grad_norm": 0.36529864721183014, "learning_rate": 4.8798852694034775e-05, "loss": 0.5375, "step": 3200 }, { "epoch": 0.7474347014925373, "grad_norm": 0.3683021692125095, "learning_rate": 4.8793252637238656e-05, "loss": 0.5361, "step": 3205 }, { "epoch": 0.7486007462686567, "grad_norm": 0.41150876515923124, "learning_rate": 4.878763991603241e-05, "loss": 0.5565, "step": 3210 }, { "epoch": 0.7497667910447762, "grad_norm": 0.40867839984227117, "learning_rate": 4.878201453375425e-05, "loss": 0.5523, "step": 3215 }, { "epoch": 0.7509328358208955, "grad_norm": 0.4095340603407869, "learning_rate": 4.877637649374994e-05, "loss": 0.5351, "step": 3220 }, { "epoch": 0.7520988805970149, "grad_norm": 0.36660890111556915, "learning_rate": 4.877072579937278e-05, "loss": 0.5238, "step": 3225 }, { "epoch": 0.7532649253731343, "grad_norm": 0.3693950147252224, "learning_rate": 4.876506245398358e-05, "loss": 0.5265, "step": 3230 }, { "epoch": 0.7544309701492538, "grad_norm": 0.39183392565814196, "learning_rate": 4.8759386460950676e-05, "loss": 0.5343, "step": 3235 }, { "epoch": 0.7555970149253731, "grad_norm": 0.542803265870136, "learning_rate": 4.875369782364994e-05, "loss": 0.5145, "step": 3240 }, { "epoch": 0.7567630597014925, "grad_norm": 0.3796918877465813, "learning_rate": 4.8747996545464746e-05, "loss": 0.5387, "step": 3245 }, { "epoch": 0.757929104477612, "grad_norm": 0.36993099878408103, "learning_rate": 4.8742282629786005e-05, "loss": 0.5257, "step": 3250 }, { "epoch": 0.7590951492537313, "grad_norm": 0.35937870943600997, "learning_rate": 4.8736556080012125e-05, "loss": 0.5138, "step": 3255 }, { "epoch": 0.7602611940298507, "grad_norm": 0.3685247204565086, "learning_rate": 4.8730816899549046e-05, "loss": 0.5396, "step": 3260 }, { "epoch": 0.7614272388059702, "grad_norm": 0.34953024881079675, "learning_rate": 4.872506509181021e-05, "loss": 0.5058, "step": 3265 }, { "epoch": 0.7625932835820896, "grad_norm": 0.38937210431785063, "learning_rate": 4.871930066021658e-05, "loss": 0.5121, "step": 3270 }, { "epoch": 0.7637593283582089, "grad_norm": 0.3713733929526791, "learning_rate": 4.8713523608196595e-05, "loss": 0.5077, "step": 3275 }, { "epoch": 0.7649253731343284, "grad_norm": 0.34539804422553044, "learning_rate": 4.8707733939186254e-05, "loss": 0.5244, "step": 3280 }, { "epoch": 0.7660914179104478, "grad_norm": 0.3953032545102582, "learning_rate": 4.8701931656629e-05, "loss": 0.5126, "step": 3285 }, { "epoch": 0.7672574626865671, "grad_norm": 0.4274727255915733, "learning_rate": 4.869611676397584e-05, "loss": 0.5425, "step": 3290 }, { "epoch": 0.7684235074626866, "grad_norm": 0.3992773367160212, "learning_rate": 4.8690289264685226e-05, "loss": 0.5522, "step": 3295 }, { "epoch": 0.769589552238806, "grad_norm": 0.37929867188570593, "learning_rate": 4.868444916222313e-05, "loss": 0.527, "step": 3300 }, { "epoch": 0.7707555970149254, "grad_norm": 0.38375418179681725, "learning_rate": 4.8678596460063046e-05, "loss": 0.523, "step": 3305 }, { "epoch": 0.7719216417910447, "grad_norm": 0.35887407029642154, "learning_rate": 4.867273116168591e-05, "loss": 0.538, "step": 3310 }, { "epoch": 0.7730876865671642, "grad_norm": 0.33680304878390593, "learning_rate": 4.866685327058018e-05, "loss": 0.5438, "step": 3315 }, { "epoch": 0.7742537313432836, "grad_norm": 0.40367292142093464, "learning_rate": 4.8660962790241824e-05, "loss": 0.5474, "step": 3320 }, { "epoch": 0.7754197761194029, "grad_norm": 0.3535253816287866, "learning_rate": 4.865505972417424e-05, "loss": 0.5398, "step": 3325 }, { "epoch": 0.7765858208955224, "grad_norm": 0.367223155545177, "learning_rate": 4.864914407588837e-05, "loss": 0.5104, "step": 3330 }, { "epoch": 0.7777518656716418, "grad_norm": 0.35517393232062466, "learning_rate": 4.864321584890261e-05, "loss": 0.5204, "step": 3335 }, { "epoch": 0.7789179104477612, "grad_norm": 0.35960337444973683, "learning_rate": 4.863727504674282e-05, "loss": 0.5402, "step": 3340 }, { "epoch": 0.7800839552238806, "grad_norm": 0.35899268759291825, "learning_rate": 4.86313216729424e-05, "loss": 0.5414, "step": 3345 }, { "epoch": 0.78125, "grad_norm": 0.3854500530158688, "learning_rate": 4.8625355731042174e-05, "loss": 0.5311, "step": 3350 }, { "epoch": 0.7824160447761194, "grad_norm": 0.37821710706001427, "learning_rate": 4.8619377224590435e-05, "loss": 0.5235, "step": 3355 }, { "epoch": 0.7835820895522388, "grad_norm": 0.3665037290417063, "learning_rate": 4.861338615714299e-05, "loss": 0.5135, "step": 3360 }, { "epoch": 0.7847481343283582, "grad_norm": 0.3713560738949342, "learning_rate": 4.8607382532263085e-05, "loss": 0.5047, "step": 3365 }, { "epoch": 0.7859141791044776, "grad_norm": 0.3635466877964919, "learning_rate": 4.860136635352145e-05, "loss": 0.5379, "step": 3370 }, { "epoch": 0.7870802238805971, "grad_norm": 0.40042517177476933, "learning_rate": 4.8595337624496284e-05, "loss": 0.5421, "step": 3375 }, { "epoch": 0.7882462686567164, "grad_norm": 0.38015847460292385, "learning_rate": 4.8589296348773244e-05, "loss": 0.5252, "step": 3380 }, { "epoch": 0.7894123134328358, "grad_norm": 0.3554037084450133, "learning_rate": 4.858324252994543e-05, "loss": 0.5466, "step": 3385 }, { "epoch": 0.7905783582089553, "grad_norm": 0.36613079355211486, "learning_rate": 4.857717617161345e-05, "loss": 0.5002, "step": 3390 }, { "epoch": 0.7917444029850746, "grad_norm": 0.37651594332970095, "learning_rate": 4.857109727738532e-05, "loss": 0.5387, "step": 3395 }, { "epoch": 0.792910447761194, "grad_norm": 0.4389797234162256, "learning_rate": 4.856500585087654e-05, "loss": 0.531, "step": 3400 }, { "epoch": 0.7940764925373134, "grad_norm": 0.3738498054416888, "learning_rate": 4.855890189571005e-05, "loss": 0.505, "step": 3405 }, { "epoch": 0.7952425373134329, "grad_norm": 0.3674465646905238, "learning_rate": 4.855278541551626e-05, "loss": 0.5459, "step": 3410 }, { "epoch": 0.7964085820895522, "grad_norm": 0.39058597736030176, "learning_rate": 4.8546656413933014e-05, "loss": 0.5207, "step": 3415 }, { "epoch": 0.7975746268656716, "grad_norm": 0.38345389631005905, "learning_rate": 4.85405148946056e-05, "loss": 0.4985, "step": 3420 }, { "epoch": 0.7987406716417911, "grad_norm": 0.3612601910233199, "learning_rate": 4.853436086118677e-05, "loss": 0.5405, "step": 3425 }, { "epoch": 0.7999067164179104, "grad_norm": 0.43426171790928614, "learning_rate": 4.8528194317336703e-05, "loss": 0.5468, "step": 3430 }, { "epoch": 0.8010727611940298, "grad_norm": 0.364692653323174, "learning_rate": 4.852201526672302e-05, "loss": 0.5414, "step": 3435 }, { "epoch": 0.8022388059701493, "grad_norm": 0.35787657334869305, "learning_rate": 4.851582371302078e-05, "loss": 0.5147, "step": 3440 }, { "epoch": 0.8034048507462687, "grad_norm": 0.4025786719654805, "learning_rate": 4.8509619659912486e-05, "loss": 0.5584, "step": 3445 }, { "epoch": 0.804570895522388, "grad_norm": 0.3653941639823354, "learning_rate": 4.8503403111088075e-05, "loss": 0.5346, "step": 3450 }, { "epoch": 0.8057369402985075, "grad_norm": 0.3807238713664878, "learning_rate": 4.849717407024491e-05, "loss": 0.535, "step": 3455 }, { "epoch": 0.8069029850746269, "grad_norm": 0.38322383255794096, "learning_rate": 4.849093254108778e-05, "loss": 0.5185, "step": 3460 }, { "epoch": 0.8080690298507462, "grad_norm": 0.3574305340915936, "learning_rate": 4.8484678527328906e-05, "loss": 0.505, "step": 3465 }, { "epoch": 0.8092350746268657, "grad_norm": 0.36188591159275424, "learning_rate": 4.8478412032687956e-05, "loss": 0.5416, "step": 3470 }, { "epoch": 0.8104011194029851, "grad_norm": 0.36872477781138113, "learning_rate": 4.847213306089197e-05, "loss": 0.5254, "step": 3475 }, { "epoch": 0.8115671641791045, "grad_norm": 0.37999000283471834, "learning_rate": 4.8465841615675464e-05, "loss": 0.5047, "step": 3480 }, { "epoch": 0.8127332089552238, "grad_norm": 0.35855482155978396, "learning_rate": 4.845953770078032e-05, "loss": 0.5329, "step": 3485 }, { "epoch": 0.8138992537313433, "grad_norm": 0.37240742569138474, "learning_rate": 4.84532213199559e-05, "loss": 0.5269, "step": 3490 }, { "epoch": 0.8150652985074627, "grad_norm": 0.3537955013147227, "learning_rate": 4.844689247695893e-05, "loss": 0.5139, "step": 3495 }, { "epoch": 0.816231343283582, "grad_norm": 0.5994885720859646, "learning_rate": 4.844055117555355e-05, "loss": 0.5179, "step": 3500 }, { "epoch": 0.8173973880597015, "grad_norm": 0.3571813302219918, "learning_rate": 4.8434197419511346e-05, "loss": 0.5156, "step": 3505 }, { "epoch": 0.8185634328358209, "grad_norm": 0.3558376327730202, "learning_rate": 4.8427831212611276e-05, "loss": 0.5085, "step": 3510 }, { "epoch": 0.8197294776119403, "grad_norm": 0.3828186825763461, "learning_rate": 4.8421452558639715e-05, "loss": 0.5229, "step": 3515 }, { "epoch": 0.8208955223880597, "grad_norm": 0.35916409943555344, "learning_rate": 4.8415061461390444e-05, "loss": 0.537, "step": 3520 }, { "epoch": 0.8220615671641791, "grad_norm": 0.3779972441000561, "learning_rate": 4.840865792466464e-05, "loss": 0.5249, "step": 3525 }, { "epoch": 0.8232276119402985, "grad_norm": 0.38367392212571394, "learning_rate": 4.840224195227088e-05, "loss": 0.5385, "step": 3530 }, { "epoch": 0.824393656716418, "grad_norm": 0.4144581958197853, "learning_rate": 4.839581354802516e-05, "loss": 0.5313, "step": 3535 }, { "epoch": 0.8255597014925373, "grad_norm": 0.39554430330589657, "learning_rate": 4.8389372715750814e-05, "loss": 0.5347, "step": 3540 }, { "epoch": 0.8267257462686567, "grad_norm": 0.4231326348522534, "learning_rate": 4.838291945927862e-05, "loss": 0.5338, "step": 3545 }, { "epoch": 0.8278917910447762, "grad_norm": 0.33844234195933864, "learning_rate": 4.8376453782446724e-05, "loss": 0.5347, "step": 3550 }, { "epoch": 0.8290578358208955, "grad_norm": 0.40521372891231033, "learning_rate": 4.836997568910067e-05, "loss": 0.5362, "step": 3555 }, { "epoch": 0.8302238805970149, "grad_norm": 0.41223456724651164, "learning_rate": 4.836348518309337e-05, "loss": 0.5272, "step": 3560 }, { "epoch": 0.8313899253731343, "grad_norm": 0.38998789028488645, "learning_rate": 4.835698226828513e-05, "loss": 0.5334, "step": 3565 }, { "epoch": 0.8325559701492538, "grad_norm": 0.36292234494655784, "learning_rate": 4.835046694854364e-05, "loss": 0.5287, "step": 3570 }, { "epoch": 0.8337220149253731, "grad_norm": 0.37198758549059624, "learning_rate": 4.834393922774397e-05, "loss": 0.525, "step": 3575 }, { "epoch": 0.8348880597014925, "grad_norm": 0.3588469223074909, "learning_rate": 4.833739910976853e-05, "loss": 0.5204, "step": 3580 }, { "epoch": 0.836054104477612, "grad_norm": 0.3613777440664703, "learning_rate": 4.833084659850715e-05, "loss": 0.5194, "step": 3585 }, { "epoch": 0.8372201492537313, "grad_norm": 0.3669099169462927, "learning_rate": 4.8324281697857024e-05, "loss": 0.5301, "step": 3590 }, { "epoch": 0.8383861940298507, "grad_norm": 0.37356480434573064, "learning_rate": 4.8317704411722676e-05, "loss": 0.5372, "step": 3595 }, { "epoch": 0.8395522388059702, "grad_norm": 0.3830462031746556, "learning_rate": 4.831111474401604e-05, "loss": 0.5333, "step": 3600 }, { "epoch": 0.8407182835820896, "grad_norm": 0.34145441939187476, "learning_rate": 4.830451269865639e-05, "loss": 0.5087, "step": 3605 }, { "epoch": 0.8418843283582089, "grad_norm": 0.3549704149825061, "learning_rate": 4.8297898279570385e-05, "loss": 0.5031, "step": 3610 }, { "epoch": 0.8430503731343284, "grad_norm": 0.3908631136507107, "learning_rate": 4.829127149069201e-05, "loss": 0.5187, "step": 3615 }, { "epoch": 0.8442164179104478, "grad_norm": 0.36814413083033554, "learning_rate": 4.828463233596264e-05, "loss": 0.5418, "step": 3620 }, { "epoch": 0.8453824626865671, "grad_norm": 0.39205408593529883, "learning_rate": 4.827798081933097e-05, "loss": 0.5283, "step": 3625 }, { "epoch": 0.8465485074626866, "grad_norm": 0.3728855870818213, "learning_rate": 4.827131694475309e-05, "loss": 0.5195, "step": 3630 }, { "epoch": 0.847714552238806, "grad_norm": 0.4423186199754924, "learning_rate": 4.826464071619239e-05, "loss": 0.5288, "step": 3635 }, { "epoch": 0.8488805970149254, "grad_norm": 0.34581435883322015, "learning_rate": 4.825795213761967e-05, "loss": 0.5259, "step": 3640 }, { "epoch": 0.8500466417910447, "grad_norm": 0.36748786969462993, "learning_rate": 4.825125121301301e-05, "loss": 0.5219, "step": 3645 }, { "epoch": 0.8512126865671642, "grad_norm": 0.3455389751097042, "learning_rate": 4.824453794635788e-05, "loss": 0.5374, "step": 3650 }, { "epoch": 0.8523787313432836, "grad_norm": 0.3697450381090462, "learning_rate": 4.823781234164706e-05, "loss": 0.5176, "step": 3655 }, { "epoch": 0.8535447761194029, "grad_norm": 0.37672444877375977, "learning_rate": 4.8231074402880686e-05, "loss": 0.5047, "step": 3660 }, { "epoch": 0.8547108208955224, "grad_norm": 0.35794911351156683, "learning_rate": 4.822432413406624e-05, "loss": 0.5145, "step": 3665 }, { "epoch": 0.8558768656716418, "grad_norm": 0.34990335094508795, "learning_rate": 4.82175615392185e-05, "loss": 0.5349, "step": 3670 }, { "epoch": 0.8570429104477612, "grad_norm": 0.38359837586324985, "learning_rate": 4.821078662235962e-05, "loss": 0.5568, "step": 3675 }, { "epoch": 0.8582089552238806, "grad_norm": 0.3352674153689726, "learning_rate": 4.8203999387519036e-05, "loss": 0.5576, "step": 3680 }, { "epoch": 0.859375, "grad_norm": 0.36520638496981045, "learning_rate": 4.8197199838733567e-05, "loss": 0.5563, "step": 3685 }, { "epoch": 0.8605410447761194, "grad_norm": 0.3574284015097453, "learning_rate": 4.81903879800473e-05, "loss": 0.5404, "step": 3690 }, { "epoch": 0.8617070895522388, "grad_norm": 0.359570530567946, "learning_rate": 4.818356381551167e-05, "loss": 0.5166, "step": 3695 }, { "epoch": 0.8628731343283582, "grad_norm": 0.3710298218316339, "learning_rate": 4.817672734918543e-05, "loss": 0.5443, "step": 3700 }, { "epoch": 0.8640391791044776, "grad_norm": 0.3374403237529766, "learning_rate": 4.816987858513465e-05, "loss": 0.5205, "step": 3705 }, { "epoch": 0.8652052238805971, "grad_norm": 0.348318746440703, "learning_rate": 4.816301752743271e-05, "loss": 0.5352, "step": 3710 }, { "epoch": 0.8663712686567164, "grad_norm": 0.38484207390002323, "learning_rate": 4.8156144180160315e-05, "loss": 0.5366, "step": 3715 }, { "epoch": 0.8675373134328358, "grad_norm": 0.37445796526863606, "learning_rate": 4.8149258547405466e-05, "loss": 0.5285, "step": 3720 }, { "epoch": 0.8687033582089553, "grad_norm": 0.3712118304785722, "learning_rate": 4.814236063326345e-05, "loss": 0.5278, "step": 3725 }, { "epoch": 0.8698694029850746, "grad_norm": 0.4048809909580681, "learning_rate": 4.8135450441836905e-05, "loss": 0.5245, "step": 3730 }, { "epoch": 0.871035447761194, "grad_norm": 0.35286186346457543, "learning_rate": 4.812852797723574e-05, "loss": 0.5091, "step": 3735 }, { "epoch": 0.8722014925373134, "grad_norm": 0.3885939730822929, "learning_rate": 4.8121593243577176e-05, "loss": 0.5213, "step": 3740 }, { "epoch": 0.8733675373134329, "grad_norm": 0.3509136590403021, "learning_rate": 4.8114646244985734e-05, "loss": 0.4985, "step": 3745 }, { "epoch": 0.8745335820895522, "grad_norm": 0.379327379093406, "learning_rate": 4.8107686985593194e-05, "loss": 0.5369, "step": 3750 }, { "epoch": 0.8756996268656716, "grad_norm": 0.36629198233004384, "learning_rate": 4.810071546953868e-05, "loss": 0.5157, "step": 3755 }, { "epoch": 0.8768656716417911, "grad_norm": 0.361590813988436, "learning_rate": 4.809373170096859e-05, "loss": 0.4995, "step": 3760 }, { "epoch": 0.8780317164179104, "grad_norm": 0.38058987259985083, "learning_rate": 4.808673568403657e-05, "loss": 0.5403, "step": 3765 }, { "epoch": 0.8791977611940298, "grad_norm": 0.3687321217294468, "learning_rate": 4.8079727422903615e-05, "loss": 0.5056, "step": 3770 }, { "epoch": 0.8803638059701493, "grad_norm": 0.36448187208791616, "learning_rate": 4.807270692173795e-05, "loss": 0.5467, "step": 3775 }, { "epoch": 0.8815298507462687, "grad_norm": 0.3518354824504398, "learning_rate": 4.806567418471511e-05, "loss": 0.4974, "step": 3780 }, { "epoch": 0.882695895522388, "grad_norm": 0.3517464627873222, "learning_rate": 4.8058629216017884e-05, "loss": 0.5245, "step": 3785 }, { "epoch": 0.8838619402985075, "grad_norm": 0.3501120459290764, "learning_rate": 4.805157201983637e-05, "loss": 0.5203, "step": 3790 }, { "epoch": 0.8850279850746269, "grad_norm": 0.3752725179333214, "learning_rate": 4.804450260036791e-05, "loss": 0.5052, "step": 3795 }, { "epoch": 0.8861940298507462, "grad_norm": 0.33128678885131735, "learning_rate": 4.803742096181711e-05, "loss": 0.5042, "step": 3800 }, { "epoch": 0.8873600746268657, "grad_norm": 0.3893343866092791, "learning_rate": 4.803032710839587e-05, "loss": 0.5054, "step": 3805 }, { "epoch": 0.8885261194029851, "grad_norm": 0.38272290845756407, "learning_rate": 4.802322104432334e-05, "loss": 0.5198, "step": 3810 }, { "epoch": 0.8896921641791045, "grad_norm": 0.3518699066231957, "learning_rate": 4.801610277382593e-05, "loss": 0.5327, "step": 3815 }, { "epoch": 0.8908582089552238, "grad_norm": 0.35723026748106745, "learning_rate": 4.800897230113732e-05, "loss": 0.5294, "step": 3820 }, { "epoch": 0.8920242537313433, "grad_norm": 0.3648234750324044, "learning_rate": 4.8001829630498445e-05, "loss": 0.5333, "step": 3825 }, { "epoch": 0.8931902985074627, "grad_norm": 0.35539484373934965, "learning_rate": 4.799467476615748e-05, "loss": 0.5176, "step": 3830 }, { "epoch": 0.894356343283582, "grad_norm": 0.3506025617608877, "learning_rate": 4.798750771236988e-05, "loss": 0.5406, "step": 3835 }, { "epoch": 0.8955223880597015, "grad_norm": 0.3604809232860036, "learning_rate": 4.7980328473398314e-05, "loss": 0.5093, "step": 3840 }, { "epoch": 0.8966884328358209, "grad_norm": 0.37745557495788395, "learning_rate": 4.797313705351273e-05, "loss": 0.533, "step": 3845 }, { "epoch": 0.8978544776119403, "grad_norm": 0.37778332560546296, "learning_rate": 4.7965933456990306e-05, "loss": 0.5249, "step": 3850 }, { "epoch": 0.8990205223880597, "grad_norm": 0.3597428246838284, "learning_rate": 4.795871768811547e-05, "loss": 0.5498, "step": 3855 }, { "epoch": 0.9001865671641791, "grad_norm": 0.3503018368391822, "learning_rate": 4.795148975117988e-05, "loss": 0.5155, "step": 3860 }, { "epoch": 0.9013526119402985, "grad_norm": 0.36434390383698934, "learning_rate": 4.794424965048243e-05, "loss": 0.5427, "step": 3865 }, { "epoch": 0.902518656716418, "grad_norm": 0.3364509220521192, "learning_rate": 4.7936997390329266e-05, "loss": 0.5033, "step": 3870 }, { "epoch": 0.9036847014925373, "grad_norm": 0.3743439294123426, "learning_rate": 4.7929732975033744e-05, "loss": 0.5225, "step": 3875 }, { "epoch": 0.9048507462686567, "grad_norm": 0.377358056832661, "learning_rate": 4.7922456408916465e-05, "loss": 0.5522, "step": 3880 }, { "epoch": 0.9060167910447762, "grad_norm": 0.356898913926242, "learning_rate": 4.791516769630526e-05, "loss": 0.5231, "step": 3885 }, { "epoch": 0.9071828358208955, "grad_norm": 0.3567691938451185, "learning_rate": 4.790786684153516e-05, "loss": 0.5088, "step": 3890 }, { "epoch": 0.9083488805970149, "grad_norm": 0.34726261524405555, "learning_rate": 4.790055384894844e-05, "loss": 0.5243, "step": 3895 }, { "epoch": 0.9095149253731343, "grad_norm": 0.3341187282693523, "learning_rate": 4.7893228722894584e-05, "loss": 0.5042, "step": 3900 }, { "epoch": 0.9106809701492538, "grad_norm": 0.3734105353776965, "learning_rate": 4.78858914677303e-05, "loss": 0.5276, "step": 3905 }, { "epoch": 0.9118470149253731, "grad_norm": 0.3646933409594863, "learning_rate": 4.787854208781951e-05, "loss": 0.5176, "step": 3910 }, { "epoch": 0.9130130597014925, "grad_norm": 0.35598729767991766, "learning_rate": 4.787118058753334e-05, "loss": 0.5281, "step": 3915 }, { "epoch": 0.914179104477612, "grad_norm": 0.3482901756658386, "learning_rate": 4.786380697125012e-05, "loss": 0.5029, "step": 3920 }, { "epoch": 0.9153451492537313, "grad_norm": 0.34946469107236366, "learning_rate": 4.7856421243355414e-05, "loss": 0.5089, "step": 3925 }, { "epoch": 0.9165111940298507, "grad_norm": 0.3374850022842466, "learning_rate": 4.784902340824195e-05, "loss": 0.5424, "step": 3930 }, { "epoch": 0.9176772388059702, "grad_norm": 0.3451628345775439, "learning_rate": 4.784161347030968e-05, "loss": 0.5067, "step": 3935 }, { "epoch": 0.9188432835820896, "grad_norm": 0.3396991426296273, "learning_rate": 4.7834191433965756e-05, "loss": 0.5368, "step": 3940 }, { "epoch": 0.9200093283582089, "grad_norm": 0.35650498827828714, "learning_rate": 4.782675730362452e-05, "loss": 0.5236, "step": 3945 }, { "epoch": 0.9211753731343284, "grad_norm": 0.3579016732583666, "learning_rate": 4.781931108370751e-05, "loss": 0.5091, "step": 3950 }, { "epoch": 0.9223414179104478, "grad_norm": 0.38378319243031106, "learning_rate": 4.781185277864344e-05, "loss": 0.5163, "step": 3955 }, { "epoch": 0.9235074626865671, "grad_norm": 0.34607506200104415, "learning_rate": 4.780438239286824e-05, "loss": 0.4975, "step": 3960 }, { "epoch": 0.9246735074626866, "grad_norm": 0.3586809835931925, "learning_rate": 4.7796899930825004e-05, "loss": 0.5122, "step": 3965 }, { "epoch": 0.925839552238806, "grad_norm": 0.38780022598471536, "learning_rate": 4.7789405396964004e-05, "loss": 0.525, "step": 3970 }, { "epoch": 0.9270055970149254, "grad_norm": 0.3432810643625677, "learning_rate": 4.7781898795742716e-05, "loss": 0.5337, "step": 3975 }, { "epoch": 0.9281716417910447, "grad_norm": 0.3648287666200129, "learning_rate": 4.777438013162576e-05, "loss": 0.5084, "step": 3980 }, { "epoch": 0.9293376865671642, "grad_norm": 0.3570869085594697, "learning_rate": 4.7766849409084976e-05, "loss": 0.5153, "step": 3985 }, { "epoch": 0.9305037313432836, "grad_norm": 0.3407670830006965, "learning_rate": 4.775930663259932e-05, "loss": 0.5007, "step": 3990 }, { "epoch": 0.9316697761194029, "grad_norm": 0.3878252654284208, "learning_rate": 4.7751751806654966e-05, "loss": 0.5511, "step": 3995 }, { "epoch": 0.9328358208955224, "grad_norm": 0.3734064688870557, "learning_rate": 4.774418493574523e-05, "loss": 0.5177, "step": 4000 }, { "epoch": 0.9340018656716418, "grad_norm": 0.38658474263973996, "learning_rate": 4.773660602437059e-05, "loss": 0.5173, "step": 4005 }, { "epoch": 0.9351679104477612, "grad_norm": 0.37559941555662035, "learning_rate": 4.77290150770387e-05, "loss": 0.5184, "step": 4010 }, { "epoch": 0.9363339552238806, "grad_norm": 0.3649865471875475, "learning_rate": 4.772141209826435e-05, "loss": 0.52, "step": 4015 }, { "epoch": 0.9375, "grad_norm": 0.3666204894036579, "learning_rate": 4.771379709256953e-05, "loss": 0.5016, "step": 4020 }, { "epoch": 0.9386660447761194, "grad_norm": 0.3855570618196418, "learning_rate": 4.770617006448332e-05, "loss": 0.5139, "step": 4025 }, { "epoch": 0.9398320895522388, "grad_norm": 0.3688556011606313, "learning_rate": 4.769853101854201e-05, "loss": 0.5314, "step": 4030 }, { "epoch": 0.9409981343283582, "grad_norm": 0.35422971055747376, "learning_rate": 4.7690879959288994e-05, "loss": 0.502, "step": 4035 }, { "epoch": 0.9421641791044776, "grad_norm": 0.41163266033343787, "learning_rate": 4.768321689127483e-05, "loss": 0.5191, "step": 4040 }, { "epoch": 0.9433302238805971, "grad_norm": 0.35927951351420284, "learning_rate": 4.767554181905723e-05, "loss": 0.5037, "step": 4045 }, { "epoch": 0.9444962686567164, "grad_norm": 0.4080315217338119, "learning_rate": 4.766785474720102e-05, "loss": 0.5289, "step": 4050 }, { "epoch": 0.9456623134328358, "grad_norm": 0.3643245613714343, "learning_rate": 4.766015568027818e-05, "loss": 0.5248, "step": 4055 }, { "epoch": 0.9468283582089553, "grad_norm": 0.36984525668316576, "learning_rate": 4.765244462286782e-05, "loss": 0.5178, "step": 4060 }, { "epoch": 0.9479944029850746, "grad_norm": 0.357614208945119, "learning_rate": 4.7644721579556184e-05, "loss": 0.4851, "step": 4065 }, { "epoch": 0.949160447761194, "grad_norm": 0.3625643886650782, "learning_rate": 4.763698655493664e-05, "loss": 0.5176, "step": 4070 }, { "epoch": 0.9503264925373134, "grad_norm": 0.4340708025546624, "learning_rate": 4.762923955360968e-05, "loss": 0.5231, "step": 4075 }, { "epoch": 0.9514925373134329, "grad_norm": 0.3867495253416601, "learning_rate": 4.7621480580182925e-05, "loss": 0.5242, "step": 4080 }, { "epoch": 0.9526585820895522, "grad_norm": 0.33670691155146837, "learning_rate": 4.761370963927112e-05, "loss": 0.5014, "step": 4085 }, { "epoch": 0.9538246268656716, "grad_norm": 0.3910906714932446, "learning_rate": 4.760592673549611e-05, "loss": 0.5507, "step": 4090 }, { "epoch": 0.9549906716417911, "grad_norm": 0.3427975875880022, "learning_rate": 4.759813187348688e-05, "loss": 0.5198, "step": 4095 }, { "epoch": 0.9561567164179104, "grad_norm": 0.3448331826575159, "learning_rate": 4.759032505787952e-05, "loss": 0.5037, "step": 4100 }, { "epoch": 0.9573227611940298, "grad_norm": 0.4176221837743691, "learning_rate": 4.758250629331721e-05, "loss": 0.5309, "step": 4105 }, { "epoch": 0.9584888059701493, "grad_norm": 0.3474680474425099, "learning_rate": 4.7574675584450256e-05, "loss": 0.5299, "step": 4110 }, { "epoch": 0.9596548507462687, "grad_norm": 0.3380216012321242, "learning_rate": 4.756683293593607e-05, "loss": 0.4861, "step": 4115 }, { "epoch": 0.960820895522388, "grad_norm": 0.3689763159872039, "learning_rate": 4.755897835243916e-05, "loss": 0.534, "step": 4120 }, { "epoch": 0.9619869402985075, "grad_norm": 0.36353135415568916, "learning_rate": 4.755111183863111e-05, "loss": 0.5172, "step": 4125 }, { "epoch": 0.9631529850746269, "grad_norm": 0.4102867037516881, "learning_rate": 4.754323339919064e-05, "loss": 0.5279, "step": 4130 }, { "epoch": 0.9643190298507462, "grad_norm": 0.3629563614507666, "learning_rate": 4.753534303880353e-05, "loss": 0.5263, "step": 4135 }, { "epoch": 0.9654850746268657, "grad_norm": 0.38282803357753165, "learning_rate": 4.752744076216268e-05, "loss": 0.5327, "step": 4140 }, { "epoch": 0.9666511194029851, "grad_norm": 0.3759862448767285, "learning_rate": 4.751952657396807e-05, "loss": 0.4998, "step": 4145 }, { "epoch": 0.9678171641791045, "grad_norm": 0.4225602759399125, "learning_rate": 4.751160047892672e-05, "loss": 0.5368, "step": 4150 }, { "epoch": 0.9689832089552238, "grad_norm": 0.3667189317271687, "learning_rate": 4.75036624817528e-05, "loss": 0.5123, "step": 4155 }, { "epoch": 0.9701492537313433, "grad_norm": 0.3759800202584298, "learning_rate": 4.74957125871675e-05, "loss": 0.5284, "step": 4160 }, { "epoch": 0.9713152985074627, "grad_norm": 0.3516404484578329, "learning_rate": 4.748775079989913e-05, "loss": 0.513, "step": 4165 }, { "epoch": 0.972481343283582, "grad_norm": 0.3667735402807629, "learning_rate": 4.747977712468305e-05, "loss": 0.5346, "step": 4170 }, { "epoch": 0.9736473880597015, "grad_norm": 0.34657495681092837, "learning_rate": 4.747179156626171e-05, "loss": 0.5259, "step": 4175 }, { "epoch": 0.9748134328358209, "grad_norm": 0.3686808027950596, "learning_rate": 4.746379412938459e-05, "loss": 0.5289, "step": 4180 }, { "epoch": 0.9759794776119403, "grad_norm": 0.3710221094823407, "learning_rate": 4.745578481880827e-05, "loss": 0.5325, "step": 4185 }, { "epoch": 0.9771455223880597, "grad_norm": 0.3718992110394827, "learning_rate": 4.7447763639296384e-05, "loss": 0.5192, "step": 4190 }, { "epoch": 0.9783115671641791, "grad_norm": 0.38811688254320986, "learning_rate": 4.743973059561962e-05, "loss": 0.5166, "step": 4195 }, { "epoch": 0.9794776119402985, "grad_norm": 0.37797901051638616, "learning_rate": 4.743168569255572e-05, "loss": 0.5152, "step": 4200 }, { "epoch": 0.980643656716418, "grad_norm": 0.354444010887117, "learning_rate": 4.742362893488949e-05, "loss": 0.5089, "step": 4205 }, { "epoch": 0.9818097014925373, "grad_norm": 0.38839405719200126, "learning_rate": 4.741556032741278e-05, "loss": 0.5149, "step": 4210 }, { "epoch": 0.9829757462686567, "grad_norm": 0.3527632951015326, "learning_rate": 4.7407479874924474e-05, "loss": 0.4977, "step": 4215 }, { "epoch": 0.9841417910447762, "grad_norm": 0.36559685649539897, "learning_rate": 4.739938758223055e-05, "loss": 0.532, "step": 4220 }, { "epoch": 0.9853078358208955, "grad_norm": 0.40877482441563245, "learning_rate": 4.739128345414395e-05, "loss": 0.5419, "step": 4225 }, { "epoch": 0.9864738805970149, "grad_norm": 0.39497249782350524, "learning_rate": 4.738316749548473e-05, "loss": 0.5458, "step": 4230 }, { "epoch": 0.9876399253731343, "grad_norm": 0.3544546491958244, "learning_rate": 4.737503971107994e-05, "loss": 0.5106, "step": 4235 }, { "epoch": 0.9888059701492538, "grad_norm": 0.3569979280697408, "learning_rate": 4.736690010576368e-05, "loss": 0.5265, "step": 4240 }, { "epoch": 0.9899720149253731, "grad_norm": 0.36246491758499877, "learning_rate": 4.735874868437705e-05, "loss": 0.527, "step": 4245 }, { "epoch": 0.9911380597014925, "grad_norm": 0.3605631002640691, "learning_rate": 4.735058545176824e-05, "loss": 0.5045, "step": 4250 }, { "epoch": 0.992304104477612, "grad_norm": 0.3501311525847066, "learning_rate": 4.73424104127924e-05, "loss": 0.5352, "step": 4255 }, { "epoch": 0.9934701492537313, "grad_norm": 0.3531197991557556, "learning_rate": 4.733422357231176e-05, "loss": 0.5201, "step": 4260 }, { "epoch": 0.9946361940298507, "grad_norm": 0.3673995578841256, "learning_rate": 4.7326024935195504e-05, "loss": 0.524, "step": 4265 }, { "epoch": 0.9958022388059702, "grad_norm": 0.3650499869368064, "learning_rate": 4.731781450631988e-05, "loss": 0.5138, "step": 4270 }, { "epoch": 0.9969682835820896, "grad_norm": 0.3576361014814151, "learning_rate": 4.7309592290568144e-05, "loss": 0.5222, "step": 4275 }, { "epoch": 0.9981343283582089, "grad_norm": 0.45808390293359386, "learning_rate": 4.730135829283055e-05, "loss": 0.5235, "step": 4280 }, { "epoch": 0.9993003731343284, "grad_norm": 0.34157809235910325, "learning_rate": 4.7293112518004357e-05, "loss": 0.5038, "step": 4285 }, { "epoch": 1.0004664179104477, "grad_norm": 0.3703872875286972, "learning_rate": 4.728485497099385e-05, "loss": 0.5048, "step": 4290 }, { "epoch": 1.0016324626865671, "grad_norm": 0.39107106437997685, "learning_rate": 4.7276585656710295e-05, "loss": 0.463, "step": 4295 }, { "epoch": 1.0027985074626866, "grad_norm": 0.3740341373445779, "learning_rate": 4.726830458007194e-05, "loss": 0.4489, "step": 4300 }, { "epoch": 1.0039645522388059, "grad_norm": 0.3615235344255957, "learning_rate": 4.72600117460041e-05, "loss": 0.4654, "step": 4305 }, { "epoch": 1.0051305970149254, "grad_norm": 0.37819710152198327, "learning_rate": 4.725170715943898e-05, "loss": 0.4807, "step": 4310 }, { "epoch": 1.0062966417910448, "grad_norm": 0.3650150836539973, "learning_rate": 4.724339082531588e-05, "loss": 0.4477, "step": 4315 }, { "epoch": 1.007462686567164, "grad_norm": 0.3564277718182905, "learning_rate": 4.723506274858101e-05, "loss": 0.4681, "step": 4320 }, { "epoch": 1.0086287313432836, "grad_norm": 0.38019322833322156, "learning_rate": 4.722672293418759e-05, "loss": 0.4848, "step": 4325 }, { "epoch": 1.009794776119403, "grad_norm": 0.3831442682341317, "learning_rate": 4.721837138709582e-05, "loss": 0.505, "step": 4330 }, { "epoch": 1.0109608208955223, "grad_norm": 0.3757318543553241, "learning_rate": 4.7210008112272895e-05, "loss": 0.4911, "step": 4335 }, { "epoch": 1.0121268656716418, "grad_norm": 0.4222622673290297, "learning_rate": 4.720163311469296e-05, "loss": 0.4661, "step": 4340 }, { "epoch": 1.0132929104477613, "grad_norm": 0.3419450515547992, "learning_rate": 4.7193246399337146e-05, "loss": 0.4922, "step": 4345 }, { "epoch": 1.0144589552238805, "grad_norm": 0.41543713936485166, "learning_rate": 4.718484797119355e-05, "loss": 0.4954, "step": 4350 }, { "epoch": 1.015625, "grad_norm": 0.33108739537354487, "learning_rate": 4.717643783525722e-05, "loss": 0.4568, "step": 4355 }, { "epoch": 1.0167910447761195, "grad_norm": 0.372470897375687, "learning_rate": 4.7168015996530204e-05, "loss": 0.4705, "step": 4360 }, { "epoch": 1.0179570895522387, "grad_norm": 0.34207144199133666, "learning_rate": 4.715958246002148e-05, "loss": 0.4614, "step": 4365 }, { "epoch": 1.0191231343283582, "grad_norm": 0.3749619013088552, "learning_rate": 4.715113723074699e-05, "loss": 0.4599, "step": 4370 }, { "epoch": 1.0202891791044777, "grad_norm": 0.37088493349069046, "learning_rate": 4.714268031372964e-05, "loss": 0.4732, "step": 4375 }, { "epoch": 1.021455223880597, "grad_norm": 0.3395518739628772, "learning_rate": 4.7134211713999264e-05, "loss": 0.4466, "step": 4380 }, { "epoch": 1.0226212686567164, "grad_norm": 0.38790472672739523, "learning_rate": 4.712573143659268e-05, "loss": 0.4697, "step": 4385 }, { "epoch": 1.023787313432836, "grad_norm": 0.34352109900158556, "learning_rate": 4.711723948655362e-05, "loss": 0.4672, "step": 4390 }, { "epoch": 1.0249533582089552, "grad_norm": 0.36000936651907434, "learning_rate": 4.710873586893276e-05, "loss": 0.4472, "step": 4395 }, { "epoch": 1.0261194029850746, "grad_norm": 0.3269748917403646, "learning_rate": 4.7100220588787755e-05, "loss": 0.469, "step": 4400 }, { "epoch": 1.0272854477611941, "grad_norm": 0.3437252814245387, "learning_rate": 4.7091693651183144e-05, "loss": 0.483, "step": 4405 }, { "epoch": 1.0284514925373134, "grad_norm": 0.36295468075037257, "learning_rate": 4.7083155061190426e-05, "loss": 0.4938, "step": 4410 }, { "epoch": 1.0296175373134329, "grad_norm": 0.34170727382592825, "learning_rate": 4.707460482388804e-05, "loss": 0.4682, "step": 4415 }, { "epoch": 1.0307835820895523, "grad_norm": 0.40347198297014825, "learning_rate": 4.706604294436132e-05, "loss": 0.4588, "step": 4420 }, { "epoch": 1.0319496268656716, "grad_norm": 0.3798756515973087, "learning_rate": 4.705746942770255e-05, "loss": 0.4701, "step": 4425 }, { "epoch": 1.033115671641791, "grad_norm": 0.37895617706651286, "learning_rate": 4.704888427901094e-05, "loss": 0.444, "step": 4430 }, { "epoch": 1.0342817164179103, "grad_norm": 0.3426791493033279, "learning_rate": 4.70402875033926e-05, "loss": 0.4815, "step": 4435 }, { "epoch": 1.0354477611940298, "grad_norm": 0.37433436951614435, "learning_rate": 4.703167910596055e-05, "loss": 0.4916, "step": 4440 }, { "epoch": 1.0366138059701493, "grad_norm": 0.3648609106638476, "learning_rate": 4.702305909183475e-05, "loss": 0.4646, "step": 4445 }, { "epoch": 1.0377798507462686, "grad_norm": 0.3292380253195559, "learning_rate": 4.701442746614206e-05, "loss": 0.4936, "step": 4450 }, { "epoch": 1.038945895522388, "grad_norm": 0.34478649180279286, "learning_rate": 4.700578423401622e-05, "loss": 0.4571, "step": 4455 }, { "epoch": 1.0401119402985075, "grad_norm": 0.3646134103678016, "learning_rate": 4.699712940059791e-05, "loss": 0.4653, "step": 4460 }, { "epoch": 1.0412779850746268, "grad_norm": 0.35114381717321036, "learning_rate": 4.6988462971034676e-05, "loss": 0.4844, "step": 4465 }, { "epoch": 1.0424440298507462, "grad_norm": 0.3298159067089294, "learning_rate": 4.697978495048099e-05, "loss": 0.4696, "step": 4470 }, { "epoch": 1.0436100746268657, "grad_norm": 0.4339146415373856, "learning_rate": 4.697109534409821e-05, "loss": 0.4758, "step": 4475 }, { "epoch": 1.044776119402985, "grad_norm": 0.37461865408584955, "learning_rate": 4.696239415705458e-05, "loss": 0.479, "step": 4480 }, { "epoch": 1.0459421641791045, "grad_norm": 0.3365744173673012, "learning_rate": 4.695368139452521e-05, "loss": 0.4683, "step": 4485 }, { "epoch": 1.047108208955224, "grad_norm": 0.3397519146528062, "learning_rate": 4.694495706169214e-05, "loss": 0.477, "step": 4490 }, { "epoch": 1.0482742537313432, "grad_norm": 0.39498235820057703, "learning_rate": 4.693622116374427e-05, "loss": 0.496, "step": 4495 }, { "epoch": 1.0494402985074627, "grad_norm": 0.3441985067767204, "learning_rate": 4.692747370587737e-05, "loss": 0.4739, "step": 4500 }, { "epoch": 1.0506063432835822, "grad_norm": 0.3610968244663037, "learning_rate": 4.691871469329408e-05, "loss": 0.472, "step": 4505 }, { "epoch": 1.0517723880597014, "grad_norm": 0.3646189009707427, "learning_rate": 4.690994413120394e-05, "loss": 0.4605, "step": 4510 }, { "epoch": 1.052938432835821, "grad_norm": 0.3555160621444571, "learning_rate": 4.690116202482335e-05, "loss": 0.4592, "step": 4515 }, { "epoch": 1.0541044776119404, "grad_norm": 0.37864888282318304, "learning_rate": 4.689236837937556e-05, "loss": 0.4555, "step": 4520 }, { "epoch": 1.0552705223880596, "grad_norm": 0.351622874821909, "learning_rate": 4.688356320009069e-05, "loss": 0.495, "step": 4525 }, { "epoch": 1.056436567164179, "grad_norm": 0.37630623033926713, "learning_rate": 4.687474649220573e-05, "loss": 0.478, "step": 4530 }, { "epoch": 1.0576026119402986, "grad_norm": 0.36080528635217013, "learning_rate": 4.6865918260964506e-05, "loss": 0.4664, "step": 4535 }, { "epoch": 1.0587686567164178, "grad_norm": 0.390910593040182, "learning_rate": 4.685707851161773e-05, "loss": 0.4655, "step": 4540 }, { "epoch": 1.0599347014925373, "grad_norm": 0.34939052192195547, "learning_rate": 4.6848227249422936e-05, "loss": 0.4827, "step": 4545 }, { "epoch": 1.0611007462686568, "grad_norm": 0.3776705402142592, "learning_rate": 4.683936447964452e-05, "loss": 0.4784, "step": 4550 }, { "epoch": 1.062266791044776, "grad_norm": 0.41247129015358364, "learning_rate": 4.683049020755372e-05, "loss": 0.4824, "step": 4555 }, { "epoch": 1.0634328358208955, "grad_norm": 0.37161703283515807, "learning_rate": 4.6821604438428594e-05, "loss": 0.4812, "step": 4560 }, { "epoch": 1.064598880597015, "grad_norm": 0.3855721993358096, "learning_rate": 4.681270717755409e-05, "loss": 0.4871, "step": 4565 }, { "epoch": 1.0657649253731343, "grad_norm": 0.3745003412323164, "learning_rate": 4.680379843022192e-05, "loss": 0.4653, "step": 4570 }, { "epoch": 1.0669309701492538, "grad_norm": 0.3442522560370515, "learning_rate": 4.679487820173069e-05, "loss": 0.4529, "step": 4575 }, { "epoch": 1.0680970149253732, "grad_norm": 0.36435916677044505, "learning_rate": 4.678594649738581e-05, "loss": 0.4471, "step": 4580 }, { "epoch": 1.0692630597014925, "grad_norm": 0.36181593368564924, "learning_rate": 4.67770033224995e-05, "loss": 0.4651, "step": 4585 }, { "epoch": 1.070429104477612, "grad_norm": 0.3595589483080218, "learning_rate": 4.676804868239083e-05, "loss": 0.4366, "step": 4590 }, { "epoch": 1.0715951492537314, "grad_norm": 0.3620547821336687, "learning_rate": 4.675908258238567e-05, "loss": 0.4646, "step": 4595 }, { "epoch": 1.0727611940298507, "grad_norm": 0.3665862033739012, "learning_rate": 4.6750105027816716e-05, "loss": 0.4741, "step": 4600 }, { "epoch": 1.0739272388059702, "grad_norm": 0.3522918193078635, "learning_rate": 4.6741116024023476e-05, "loss": 0.4656, "step": 4605 }, { "epoch": 1.0750932835820897, "grad_norm": 0.34441917709894415, "learning_rate": 4.673211557635225e-05, "loss": 0.4654, "step": 4610 }, { "epoch": 1.076259328358209, "grad_norm": 0.36340291750873155, "learning_rate": 4.672310369015619e-05, "loss": 0.4849, "step": 4615 }, { "epoch": 1.0774253731343284, "grad_norm": 0.3469345999215564, "learning_rate": 4.671408037079519e-05, "loss": 0.4666, "step": 4620 }, { "epoch": 1.0785914179104477, "grad_norm": 0.3538303630694054, "learning_rate": 4.670504562363598e-05, "loss": 0.4561, "step": 4625 }, { "epoch": 1.0797574626865671, "grad_norm": 0.36634890355747535, "learning_rate": 4.669599945405208e-05, "loss": 0.4921, "step": 4630 }, { "epoch": 1.0809235074626866, "grad_norm": 0.3430965755301908, "learning_rate": 4.668694186742383e-05, "loss": 0.4618, "step": 4635 }, { "epoch": 1.0820895522388059, "grad_norm": 0.36964625204254353, "learning_rate": 4.6677872869138304e-05, "loss": 0.4685, "step": 4640 }, { "epoch": 1.0832555970149254, "grad_norm": 0.40764938594719646, "learning_rate": 4.666879246458941e-05, "loss": 0.4684, "step": 4645 }, { "epoch": 1.0844216417910448, "grad_norm": 0.36940611020903746, "learning_rate": 4.6659700659177814e-05, "loss": 0.4672, "step": 4650 }, { "epoch": 1.085587686567164, "grad_norm": 0.3534666311242132, "learning_rate": 4.665059745831098e-05, "loss": 0.4786, "step": 4655 }, { "epoch": 1.0867537313432836, "grad_norm": 0.3708615966075552, "learning_rate": 4.6641482867403156e-05, "loss": 0.4615, "step": 4660 }, { "epoch": 1.087919776119403, "grad_norm": 0.35557897859045173, "learning_rate": 4.6632356891875336e-05, "loss": 0.4715, "step": 4665 }, { "epoch": 1.0890858208955223, "grad_norm": 0.3607744066748857, "learning_rate": 4.662321953715529e-05, "loss": 0.4591, "step": 4670 }, { "epoch": 1.0902518656716418, "grad_norm": 0.36007222893395574, "learning_rate": 4.661407080867759e-05, "loss": 0.4574, "step": 4675 }, { "epoch": 1.0914179104477613, "grad_norm": 0.38089461165127153, "learning_rate": 4.660491071188353e-05, "loss": 0.4846, "step": 4680 }, { "epoch": 1.0925839552238805, "grad_norm": 0.3905329009773269, "learning_rate": 4.6595739252221196e-05, "loss": 0.5093, "step": 4685 }, { "epoch": 1.09375, "grad_norm": 0.3602585268091229, "learning_rate": 4.658655643514541e-05, "loss": 0.4792, "step": 4690 }, { "epoch": 1.0949160447761195, "grad_norm": 0.4522315050176882, "learning_rate": 4.657736226611778e-05, "loss": 0.4864, "step": 4695 }, { "epoch": 1.0960820895522387, "grad_norm": 0.3405801635037323, "learning_rate": 4.656815675060662e-05, "loss": 0.4689, "step": 4700 }, { "epoch": 1.0972481343283582, "grad_norm": 0.4041410578879345, "learning_rate": 4.655893989408702e-05, "loss": 0.4792, "step": 4705 }, { "epoch": 1.0984141791044777, "grad_norm": 0.3387401672489702, "learning_rate": 4.654971170204083e-05, "loss": 0.4605, "step": 4710 }, { "epoch": 1.099580223880597, "grad_norm": 0.3553977899178839, "learning_rate": 4.6540472179956625e-05, "loss": 0.471, "step": 4715 }, { "epoch": 1.1007462686567164, "grad_norm": 0.35250369064647263, "learning_rate": 4.6531221333329694e-05, "loss": 0.4807, "step": 4720 }, { "epoch": 1.101912313432836, "grad_norm": 0.34666820032492063, "learning_rate": 4.652195916766211e-05, "loss": 0.4577, "step": 4725 }, { "epoch": 1.1030783582089552, "grad_norm": 0.3687788621939309, "learning_rate": 4.6512685688462645e-05, "loss": 0.4929, "step": 4730 }, { "epoch": 1.1042444029850746, "grad_norm": 0.3604406893897734, "learning_rate": 4.65034009012468e-05, "loss": 0.4694, "step": 4735 }, { "epoch": 1.1054104477611941, "grad_norm": 0.39492988501975834, "learning_rate": 4.649410481153683e-05, "loss": 0.4847, "step": 4740 }, { "epoch": 1.1065764925373134, "grad_norm": 0.3556350936151747, "learning_rate": 4.6484797424861675e-05, "loss": 0.4709, "step": 4745 }, { "epoch": 1.1077425373134329, "grad_norm": 0.36685815822526896, "learning_rate": 4.6475478746757025e-05, "loss": 0.4596, "step": 4750 }, { "epoch": 1.1089085820895523, "grad_norm": 0.38179365682843375, "learning_rate": 4.646614878276526e-05, "loss": 0.4903, "step": 4755 }, { "epoch": 1.1100746268656716, "grad_norm": 0.3477278156168366, "learning_rate": 4.64568075384355e-05, "loss": 0.467, "step": 4760 }, { "epoch": 1.111240671641791, "grad_norm": 0.3494902566057672, "learning_rate": 4.644745501932355e-05, "loss": 0.4527, "step": 4765 }, { "epoch": 1.1124067164179103, "grad_norm": 0.35263314338083995, "learning_rate": 4.643809123099192e-05, "loss": 0.4932, "step": 4770 }, { "epoch": 1.1135727611940298, "grad_norm": 0.3874795877039355, "learning_rate": 4.6428716179009844e-05, "loss": 0.4708, "step": 4775 }, { "epoch": 1.1147388059701493, "grad_norm": 0.36333961831243017, "learning_rate": 4.641932986895325e-05, "loss": 0.4585, "step": 4780 }, { "epoch": 1.1159048507462686, "grad_norm": 0.351190710620662, "learning_rate": 4.6409932306404735e-05, "loss": 0.4804, "step": 4785 }, { "epoch": 1.117070895522388, "grad_norm": 0.31329819772120054, "learning_rate": 4.640052349695363e-05, "loss": 0.4637, "step": 4790 }, { "epoch": 1.1182369402985075, "grad_norm": 0.3792351860722495, "learning_rate": 4.6391103446195915e-05, "loss": 0.4584, "step": 4795 }, { "epoch": 1.1194029850746268, "grad_norm": 0.3506865703413139, "learning_rate": 4.6381672159734287e-05, "loss": 0.4562, "step": 4800 }, { "epoch": 1.1205690298507462, "grad_norm": 0.4326740549088606, "learning_rate": 4.637222964317811e-05, "loss": 0.4867, "step": 4805 }, { "epoch": 1.1217350746268657, "grad_norm": 0.35337883468208786, "learning_rate": 4.636277590214344e-05, "loss": 0.4631, "step": 4810 }, { "epoch": 1.122901119402985, "grad_norm": 0.3437797335400648, "learning_rate": 4.6353310942252986e-05, "loss": 0.4711, "step": 4815 }, { "epoch": 1.1240671641791045, "grad_norm": 0.3349220385183838, "learning_rate": 4.634383476913615e-05, "loss": 0.4794, "step": 4820 }, { "epoch": 1.125233208955224, "grad_norm": 0.34586059659937457, "learning_rate": 4.6334347388429e-05, "loss": 0.4527, "step": 4825 }, { "epoch": 1.1263992537313432, "grad_norm": 0.33260407177537676, "learning_rate": 4.632484880577425e-05, "loss": 0.462, "step": 4830 }, { "epoch": 1.1275652985074627, "grad_norm": 0.34136322338964215, "learning_rate": 4.6315339026821305e-05, "loss": 0.4707, "step": 4835 }, { "epoch": 1.1287313432835822, "grad_norm": 0.3489751043295923, "learning_rate": 4.6305818057226226e-05, "loss": 0.4689, "step": 4840 }, { "epoch": 1.1298973880597014, "grad_norm": 0.3693000467333574, "learning_rate": 4.62962859026517e-05, "loss": 0.4762, "step": 4845 }, { "epoch": 1.131063432835821, "grad_norm": 0.4298711061127338, "learning_rate": 4.62867425687671e-05, "loss": 0.4969, "step": 4850 }, { "epoch": 1.1322294776119404, "grad_norm": 0.3502124625404359, "learning_rate": 4.6277188061248436e-05, "loss": 0.4893, "step": 4855 }, { "epoch": 1.1333955223880596, "grad_norm": 0.3682601185314399, "learning_rate": 4.626762238577836e-05, "loss": 0.4726, "step": 4860 }, { "epoch": 1.134561567164179, "grad_norm": 0.35690149579209757, "learning_rate": 4.6258045548046166e-05, "loss": 0.4707, "step": 4865 }, { "epoch": 1.1357276119402986, "grad_norm": 0.35016570452171414, "learning_rate": 4.624845755374779e-05, "loss": 0.4834, "step": 4870 }, { "epoch": 1.1368936567164178, "grad_norm": 0.3563033283287152, "learning_rate": 4.6238858408585804e-05, "loss": 0.4571, "step": 4875 }, { "epoch": 1.1380597014925373, "grad_norm": 0.3445818160791845, "learning_rate": 4.622924811826942e-05, "loss": 0.4722, "step": 4880 }, { "epoch": 1.1392257462686568, "grad_norm": 0.34145338015025534, "learning_rate": 4.6219626688514456e-05, "loss": 0.4741, "step": 4885 }, { "epoch": 1.140391791044776, "grad_norm": 0.36587625257751644, "learning_rate": 4.620999412504338e-05, "loss": 0.4618, "step": 4890 }, { "epoch": 1.1415578358208955, "grad_norm": 0.34599353777788416, "learning_rate": 4.620035043358526e-05, "loss": 0.4747, "step": 4895 }, { "epoch": 1.142723880597015, "grad_norm": 0.359034246944853, "learning_rate": 4.619069561987581e-05, "loss": 0.4765, "step": 4900 }, { "epoch": 1.1438899253731343, "grad_norm": 0.3316031562213355, "learning_rate": 4.618102968965733e-05, "loss": 0.4629, "step": 4905 }, { "epoch": 1.1450559701492538, "grad_norm": 0.3456782856940626, "learning_rate": 4.6171352648678755e-05, "loss": 0.4544, "step": 4910 }, { "epoch": 1.1462220149253732, "grad_norm": 0.3597782283482135, "learning_rate": 4.6161664502695606e-05, "loss": 0.476, "step": 4915 }, { "epoch": 1.1473880597014925, "grad_norm": 0.33986609170732546, "learning_rate": 4.615196525747003e-05, "loss": 0.4526, "step": 4920 }, { "epoch": 1.148554104477612, "grad_norm": 0.34494264289228443, "learning_rate": 4.6142254918770764e-05, "loss": 0.4851, "step": 4925 }, { "epoch": 1.1497201492537314, "grad_norm": 0.35939390691080125, "learning_rate": 4.613253349237314e-05, "loss": 0.4722, "step": 4930 }, { "epoch": 1.1508861940298507, "grad_norm": 0.34825796229089734, "learning_rate": 4.612280098405909e-05, "loss": 0.4799, "step": 4935 }, { "epoch": 1.1520522388059702, "grad_norm": 0.34466249371372343, "learning_rate": 4.611305739961715e-05, "loss": 0.4564, "step": 4940 }, { "epoch": 1.1532182835820897, "grad_norm": 0.37636422115279045, "learning_rate": 4.610330274484242e-05, "loss": 0.4878, "step": 4945 }, { "epoch": 1.154384328358209, "grad_norm": 0.3444610445631163, "learning_rate": 4.609353702553659e-05, "loss": 0.4591, "step": 4950 }, { "epoch": 1.1555503731343284, "grad_norm": 0.44224030162597666, "learning_rate": 4.6083760247507945e-05, "loss": 0.4888, "step": 4955 }, { "epoch": 1.1567164179104479, "grad_norm": 0.36083422304592466, "learning_rate": 4.607397241657133e-05, "loss": 0.4743, "step": 4960 }, { "epoch": 1.1578824626865671, "grad_norm": 0.41777229964352414, "learning_rate": 4.606417353854818e-05, "loss": 0.4663, "step": 4965 }, { "epoch": 1.1590485074626866, "grad_norm": 0.327211979110645, "learning_rate": 4.605436361926648e-05, "loss": 0.4544, "step": 4970 }, { "epoch": 1.1602145522388059, "grad_norm": 0.3709306815528986, "learning_rate": 4.6044542664560804e-05, "loss": 0.4793, "step": 4975 }, { "epoch": 1.1613805970149254, "grad_norm": 0.359192689249793, "learning_rate": 4.6034710680272274e-05, "loss": 0.4307, "step": 4980 }, { "epoch": 1.1625466417910448, "grad_norm": 0.4667362119738335, "learning_rate": 4.602486767224858e-05, "loss": 0.4815, "step": 4985 }, { "epoch": 1.163712686567164, "grad_norm": 0.39369758782073544, "learning_rate": 4.601501364634397e-05, "loss": 0.468, "step": 4990 }, { "epoch": 1.1648787313432836, "grad_norm": 0.35195180185798663, "learning_rate": 4.600514860841923e-05, "loss": 0.4974, "step": 4995 }, { "epoch": 1.166044776119403, "grad_norm": 0.38108608598276555, "learning_rate": 4.599527256434171e-05, "loss": 0.4731, "step": 5000 }, { "epoch": 1.1672108208955223, "grad_norm": 0.3389828422725711, "learning_rate": 4.598538551998531e-05, "loss": 0.4672, "step": 5005 }, { "epoch": 1.1683768656716418, "grad_norm": 0.3456064473874332, "learning_rate": 4.597548748123046e-05, "loss": 0.4763, "step": 5010 }, { "epoch": 1.1695429104477613, "grad_norm": 0.3897515261074159, "learning_rate": 4.596557845396412e-05, "loss": 0.477, "step": 5015 }, { "epoch": 1.1707089552238805, "grad_norm": 0.35285702888187354, "learning_rate": 4.595565844407982e-05, "loss": 0.47, "step": 5020 }, { "epoch": 1.171875, "grad_norm": 0.3436729048905015, "learning_rate": 4.59457274574776e-05, "loss": 0.476, "step": 5025 }, { "epoch": 1.1730410447761195, "grad_norm": 0.3395289915550358, "learning_rate": 4.5935785500064014e-05, "loss": 0.4514, "step": 5030 }, { "epoch": 1.1742070895522387, "grad_norm": 0.3459533878168976, "learning_rate": 4.5925832577752175e-05, "loss": 0.4803, "step": 5035 }, { "epoch": 1.1753731343283582, "grad_norm": 0.3817694396126775, "learning_rate": 4.5915868696461685e-05, "loss": 0.4901, "step": 5040 }, { "epoch": 1.1765391791044777, "grad_norm": 0.3719161382099099, "learning_rate": 4.590589386211869e-05, "loss": 0.4723, "step": 5045 }, { "epoch": 1.177705223880597, "grad_norm": 0.3589909527778616, "learning_rate": 4.589590808065583e-05, "loss": 0.4634, "step": 5050 }, { "epoch": 1.1788712686567164, "grad_norm": 0.3446017057438538, "learning_rate": 4.588591135801227e-05, "loss": 0.4736, "step": 5055 }, { "epoch": 1.180037313432836, "grad_norm": 0.36066282460265336, "learning_rate": 4.587590370013367e-05, "loss": 0.4813, "step": 5060 }, { "epoch": 1.1812033582089552, "grad_norm": 0.3348515022796862, "learning_rate": 4.5865885112972216e-05, "loss": 0.4668, "step": 5065 }, { "epoch": 1.1823694029850746, "grad_norm": 0.3806257961184635, "learning_rate": 4.585585560248657e-05, "loss": 0.464, "step": 5070 }, { "epoch": 1.1835354477611941, "grad_norm": 0.4055021260870813, "learning_rate": 4.58458151746419e-05, "loss": 0.4771, "step": 5075 }, { "epoch": 1.1847014925373134, "grad_norm": 0.36024792486991203, "learning_rate": 4.5835763835409864e-05, "loss": 0.4847, "step": 5080 }, { "epoch": 1.1858675373134329, "grad_norm": 0.36432444987862567, "learning_rate": 4.5825701590768625e-05, "loss": 0.4674, "step": 5085 }, { "epoch": 1.1870335820895521, "grad_norm": 0.35539556899013736, "learning_rate": 4.58156284467028e-05, "loss": 0.4644, "step": 5090 }, { "epoch": 1.1881996268656716, "grad_norm": 0.36012386102501925, "learning_rate": 4.5805544409203535e-05, "loss": 0.4761, "step": 5095 }, { "epoch": 1.189365671641791, "grad_norm": 0.37398174590872096, "learning_rate": 4.579544948426841e-05, "loss": 0.4799, "step": 5100 }, { "epoch": 1.1905317164179103, "grad_norm": 0.3994282420738512, "learning_rate": 4.57853436779015e-05, "loss": 0.4879, "step": 5105 }, { "epoch": 1.1916977611940298, "grad_norm": 0.39624462528288745, "learning_rate": 4.577522699611336e-05, "loss": 0.4687, "step": 5110 }, { "epoch": 1.1928638059701493, "grad_norm": 0.36822580927497833, "learning_rate": 4.576509944492101e-05, "loss": 0.4743, "step": 5115 }, { "epoch": 1.1940298507462686, "grad_norm": 0.3418725043685805, "learning_rate": 4.57549610303479e-05, "loss": 0.4653, "step": 5120 }, { "epoch": 1.195195895522388, "grad_norm": 0.371728706042081, "learning_rate": 4.5744811758424e-05, "loss": 0.4931, "step": 5125 }, { "epoch": 1.1963619402985075, "grad_norm": 0.3639777699347714, "learning_rate": 4.573465163518569e-05, "loss": 0.4556, "step": 5130 }, { "epoch": 1.1975279850746268, "grad_norm": 0.3655513971592392, "learning_rate": 4.572448066667584e-05, "loss": 0.4759, "step": 5135 }, { "epoch": 1.1986940298507462, "grad_norm": 0.3804150438485859, "learning_rate": 4.571429885894373e-05, "loss": 0.4786, "step": 5140 }, { "epoch": 1.1998600746268657, "grad_norm": 0.37662124845828593, "learning_rate": 4.5704106218045124e-05, "loss": 0.49, "step": 5145 }, { "epoch": 1.201026119402985, "grad_norm": 0.3538352986060505, "learning_rate": 4.569390275004221e-05, "loss": 0.4786, "step": 5150 }, { "epoch": 1.2021921641791045, "grad_norm": 0.3401736071338025, "learning_rate": 4.568368846100363e-05, "loss": 0.4807, "step": 5155 }, { "epoch": 1.203358208955224, "grad_norm": 0.3871943834832515, "learning_rate": 4.567346335700442e-05, "loss": 0.4951, "step": 5160 }, { "epoch": 1.2045242537313432, "grad_norm": 0.34238863528262575, "learning_rate": 4.5663227444126114e-05, "loss": 0.4796, "step": 5165 }, { "epoch": 1.2056902985074627, "grad_norm": 0.3120375829312124, "learning_rate": 4.565298072845662e-05, "loss": 0.4435, "step": 5170 }, { "epoch": 1.2068563432835822, "grad_norm": 0.34929115168215075, "learning_rate": 4.564272321609031e-05, "loss": 0.4593, "step": 5175 }, { "epoch": 1.2080223880597014, "grad_norm": 0.3642388243430463, "learning_rate": 4.563245491312793e-05, "loss": 0.4965, "step": 5180 }, { "epoch": 1.209188432835821, "grad_norm": 0.34560827023747864, "learning_rate": 4.5622175825676695e-05, "loss": 0.4455, "step": 5185 }, { "epoch": 1.2103544776119404, "grad_norm": 0.3544268002221307, "learning_rate": 4.5611885959850216e-05, "loss": 0.4656, "step": 5190 }, { "epoch": 1.2115205223880596, "grad_norm": 0.35984517231494456, "learning_rate": 4.560158532176849e-05, "loss": 0.4618, "step": 5195 }, { "epoch": 1.212686567164179, "grad_norm": 0.33659386973381583, "learning_rate": 4.559127391755796e-05, "loss": 0.452, "step": 5200 }, { "epoch": 1.2138526119402986, "grad_norm": 0.3386594459424572, "learning_rate": 4.558095175335145e-05, "loss": 0.4861, "step": 5205 }, { "epoch": 1.2150186567164178, "grad_norm": 0.38530834371843253, "learning_rate": 4.557061883528818e-05, "loss": 0.4703, "step": 5210 }, { "epoch": 1.2161847014925373, "grad_norm": 0.34018779300657914, "learning_rate": 4.5560275169513786e-05, "loss": 0.4752, "step": 5215 }, { "epoch": 1.2173507462686568, "grad_norm": 0.3665096540503898, "learning_rate": 4.554992076218026e-05, "loss": 0.4591, "step": 5220 }, { "epoch": 1.218516791044776, "grad_norm": 0.3369144313377284, "learning_rate": 4.553955561944603e-05, "loss": 0.4564, "step": 5225 }, { "epoch": 1.2196828358208955, "grad_norm": 0.3468759578660706, "learning_rate": 4.552917974747588e-05, "loss": 0.4855, "step": 5230 }, { "epoch": 1.220848880597015, "grad_norm": 0.35481173701535745, "learning_rate": 4.551879315244098e-05, "loss": 0.4617, "step": 5235 }, { "epoch": 1.2220149253731343, "grad_norm": 0.3450986925909998, "learning_rate": 4.5508395840518884e-05, "loss": 0.4499, "step": 5240 }, { "epoch": 1.2231809701492538, "grad_norm": 0.3576714143656946, "learning_rate": 4.549798781789349e-05, "loss": 0.4615, "step": 5245 }, { "epoch": 1.2243470149253732, "grad_norm": 0.34560502842767293, "learning_rate": 4.548756909075511e-05, "loss": 0.4898, "step": 5250 }, { "epoch": 1.2255130597014925, "grad_norm": 0.34514981942949313, "learning_rate": 4.5477139665300414e-05, "loss": 0.458, "step": 5255 }, { "epoch": 1.226679104477612, "grad_norm": 0.37756186234396855, "learning_rate": 4.5466699547732405e-05, "loss": 0.467, "step": 5260 }, { "epoch": 1.2278451492537314, "grad_norm": 0.34040811641059854, "learning_rate": 4.545624874426047e-05, "loss": 0.4585, "step": 5265 }, { "epoch": 1.2290111940298507, "grad_norm": 0.3559940480316581, "learning_rate": 4.544578726110035e-05, "loss": 0.4618, "step": 5270 }, { "epoch": 1.2301772388059702, "grad_norm": 0.351376294469949, "learning_rate": 4.5435315104474124e-05, "loss": 0.4867, "step": 5275 }, { "epoch": 1.2313432835820897, "grad_norm": 0.3670231506185556, "learning_rate": 4.5424832280610245e-05, "loss": 0.4791, "step": 5280 }, { "epoch": 1.232509328358209, "grad_norm": 0.34028405863163114, "learning_rate": 4.541433879574348e-05, "loss": 0.4617, "step": 5285 }, { "epoch": 1.2336753731343284, "grad_norm": 0.3393481641804084, "learning_rate": 4.540383465611496e-05, "loss": 0.457, "step": 5290 }, { "epoch": 1.2348414179104479, "grad_norm": 0.3786249770896504, "learning_rate": 4.539331986797215e-05, "loss": 0.4803, "step": 5295 }, { "epoch": 1.2360074626865671, "grad_norm": 0.374430924707444, "learning_rate": 4.5382794437568824e-05, "loss": 0.4729, "step": 5300 }, { "epoch": 1.2371735074626866, "grad_norm": 0.3422094187059447, "learning_rate": 4.537225837116512e-05, "loss": 0.47, "step": 5305 }, { "epoch": 1.2383395522388059, "grad_norm": 0.3530091405504405, "learning_rate": 4.5361711675027484e-05, "loss": 0.4682, "step": 5310 }, { "epoch": 1.2395055970149254, "grad_norm": 0.3654578778498564, "learning_rate": 4.535115435542868e-05, "loss": 0.4688, "step": 5315 }, { "epoch": 1.2406716417910448, "grad_norm": 0.3422839665065767, "learning_rate": 4.53405864186478e-05, "loss": 0.492, "step": 5320 }, { "epoch": 1.241837686567164, "grad_norm": 0.34419522904570493, "learning_rate": 4.5330007870970255e-05, "loss": 0.4661, "step": 5325 }, { "epoch": 1.2430037313432836, "grad_norm": 0.3820446962350419, "learning_rate": 4.531941871868775e-05, "loss": 0.4764, "step": 5330 }, { "epoch": 1.244169776119403, "grad_norm": 0.34931863469762414, "learning_rate": 4.530881896809831e-05, "loss": 0.4662, "step": 5335 }, { "epoch": 1.2453358208955223, "grad_norm": 0.40255326996579954, "learning_rate": 4.5298208625506253e-05, "loss": 0.4724, "step": 5340 }, { "epoch": 1.2465018656716418, "grad_norm": 0.3471022066775054, "learning_rate": 4.5287587697222215e-05, "loss": 0.4677, "step": 5345 }, { "epoch": 1.2476679104477613, "grad_norm": 0.3338908746986557, "learning_rate": 4.527695618956312e-05, "loss": 0.4906, "step": 5350 }, { "epoch": 1.2488339552238805, "grad_norm": 0.3560939004514755, "learning_rate": 4.5266314108852166e-05, "loss": 0.4806, "step": 5355 }, { "epoch": 1.25, "grad_norm": 0.34642499571322316, "learning_rate": 4.5255661461418854e-05, "loss": 0.4921, "step": 5360 }, { "epoch": 1.2511660447761195, "grad_norm": 0.35942052354277376, "learning_rate": 4.5244998253598994e-05, "loss": 0.4673, "step": 5365 }, { "epoch": 1.2523320895522387, "grad_norm": 0.33618468091116055, "learning_rate": 4.5234324491734624e-05, "loss": 0.4699, "step": 5370 }, { "epoch": 1.2534981343283582, "grad_norm": 0.3621804767634846, "learning_rate": 4.5223640182174115e-05, "loss": 0.4937, "step": 5375 }, { "epoch": 1.2546641791044777, "grad_norm": 0.39067172926492755, "learning_rate": 4.521294533127206e-05, "loss": 0.4747, "step": 5380 }, { "epoch": 1.255830223880597, "grad_norm": 0.3519068497378385, "learning_rate": 4.520223994538937e-05, "loss": 0.4577, "step": 5385 }, { "epoch": 1.2569962686567164, "grad_norm": 0.3346943899355124, "learning_rate": 4.519152403089317e-05, "loss": 0.4673, "step": 5390 }, { "epoch": 1.2581623134328357, "grad_norm": 0.33450040481508014, "learning_rate": 4.51807975941569e-05, "loss": 0.4827, "step": 5395 }, { "epoch": 1.2593283582089552, "grad_norm": 0.3395969296470776, "learning_rate": 4.517006064156023e-05, "loss": 0.456, "step": 5400 }, { "epoch": 1.2604944029850746, "grad_norm": 0.3606714318871862, "learning_rate": 4.515931317948907e-05, "loss": 0.4648, "step": 5405 }, { "epoch": 1.261660447761194, "grad_norm": 0.3598142629013287, "learning_rate": 4.5148555214335616e-05, "loss": 0.4586, "step": 5410 }, { "epoch": 1.2628264925373134, "grad_norm": 0.3417923393849799, "learning_rate": 4.5137786752498285e-05, "loss": 0.4562, "step": 5415 }, { "epoch": 1.2639925373134329, "grad_norm": 0.332696637003428, "learning_rate": 4.512700780038174e-05, "loss": 0.4686, "step": 5420 }, { "epoch": 1.2651585820895521, "grad_norm": 0.3495175993937834, "learning_rate": 4.5116218364396904e-05, "loss": 0.453, "step": 5425 }, { "epoch": 1.2663246268656716, "grad_norm": 0.35815858890808655, "learning_rate": 4.510541845096091e-05, "loss": 0.4444, "step": 5430 }, { "epoch": 1.267490671641791, "grad_norm": 0.32114652584389525, "learning_rate": 4.509460806649714e-05, "loss": 0.4622, "step": 5435 }, { "epoch": 1.2686567164179103, "grad_norm": 0.34840881577020344, "learning_rate": 4.5083787217435175e-05, "loss": 0.4686, "step": 5440 }, { "epoch": 1.2698227611940298, "grad_norm": 0.3529734030459022, "learning_rate": 4.507295591021087e-05, "loss": 0.4811, "step": 5445 }, { "epoch": 1.2709888059701493, "grad_norm": 0.3406895748034019, "learning_rate": 4.506211415126624e-05, "loss": 0.4707, "step": 5450 }, { "epoch": 1.2721548507462686, "grad_norm": 0.34948257097100904, "learning_rate": 4.505126194704958e-05, "loss": 0.4657, "step": 5455 }, { "epoch": 1.273320895522388, "grad_norm": 0.3481689704013532, "learning_rate": 4.504039930401535e-05, "loss": 0.4982, "step": 5460 }, { "epoch": 1.2744869402985075, "grad_norm": 0.36571058022165265, "learning_rate": 4.5029526228624226e-05, "loss": 0.4894, "step": 5465 }, { "epoch": 1.2756529850746268, "grad_norm": 0.3504718839400544, "learning_rate": 4.501864272734311e-05, "loss": 0.4909, "step": 5470 }, { "epoch": 1.2768190298507462, "grad_norm": 0.3712428064742856, "learning_rate": 4.500774880664508e-05, "loss": 0.4614, "step": 5475 }, { "epoch": 1.2779850746268657, "grad_norm": 0.3367222023286355, "learning_rate": 4.4996844473009425e-05, "loss": 0.4843, "step": 5480 }, { "epoch": 1.279151119402985, "grad_norm": 0.3467565130703101, "learning_rate": 4.498592973292162e-05, "loss": 0.4748, "step": 5485 }, { "epoch": 1.2803171641791045, "grad_norm": 0.352271810512614, "learning_rate": 4.497500459287335e-05, "loss": 0.4758, "step": 5490 }, { "epoch": 1.281483208955224, "grad_norm": 0.3649509574716188, "learning_rate": 4.496406905936246e-05, "loss": 0.4618, "step": 5495 }, { "epoch": 1.2826492537313432, "grad_norm": 0.34154397167538003, "learning_rate": 4.4953123138892984e-05, "loss": 0.4399, "step": 5500 }, { "epoch": 1.2838152985074627, "grad_norm": 0.35770749183416967, "learning_rate": 4.4942166837975134e-05, "loss": 0.4734, "step": 5505 }, { "epoch": 1.2849813432835822, "grad_norm": 0.33193004769625795, "learning_rate": 4.4931200163125306e-05, "loss": 0.4637, "step": 5510 }, { "epoch": 1.2861473880597014, "grad_norm": 0.32636524823546637, "learning_rate": 4.492022312086605e-05, "loss": 0.4628, "step": 5515 }, { "epoch": 1.287313432835821, "grad_norm": 0.37972064478182993, "learning_rate": 4.4909235717726086e-05, "loss": 0.4809, "step": 5520 }, { "epoch": 1.2884794776119404, "grad_norm": 0.3705003936352805, "learning_rate": 4.4898237960240315e-05, "loss": 0.4862, "step": 5525 }, { "epoch": 1.2896455223880596, "grad_norm": 0.35557349238462793, "learning_rate": 4.488722985494978e-05, "loss": 0.4713, "step": 5530 }, { "epoch": 1.290811567164179, "grad_norm": 0.3520554264053171, "learning_rate": 4.487621140840165e-05, "loss": 0.4665, "step": 5535 }, { "epoch": 1.2919776119402986, "grad_norm": 0.3493763195955227, "learning_rate": 4.486518262714931e-05, "loss": 0.4775, "step": 5540 }, { "epoch": 1.2931436567164178, "grad_norm": 0.3434809439568833, "learning_rate": 4.485414351775224e-05, "loss": 0.4677, "step": 5545 }, { "epoch": 1.2943097014925373, "grad_norm": 0.3448956584886712, "learning_rate": 4.484309408677609e-05, "loss": 0.4733, "step": 5550 }, { "epoch": 1.2954757462686568, "grad_norm": 0.3511373110368711, "learning_rate": 4.483203434079263e-05, "loss": 0.4817, "step": 5555 }, { "epoch": 1.296641791044776, "grad_norm": 0.34682648049040404, "learning_rate": 4.4820964286379764e-05, "loss": 0.4845, "step": 5560 }, { "epoch": 1.2978078358208955, "grad_norm": 0.33948525207732083, "learning_rate": 4.480988393012155e-05, "loss": 0.4629, "step": 5565 }, { "epoch": 1.298973880597015, "grad_norm": 0.357963666780957, "learning_rate": 4.479879327860816e-05, "loss": 0.4887, "step": 5570 }, { "epoch": 1.3001399253731343, "grad_norm": 0.3540736603121829, "learning_rate": 4.478769233843587e-05, "loss": 0.4706, "step": 5575 }, { "epoch": 1.3013059701492538, "grad_norm": 0.34082396033556095, "learning_rate": 4.477658111620711e-05, "loss": 0.4645, "step": 5580 }, { "epoch": 1.3024720149253732, "grad_norm": 0.35335694823393465, "learning_rate": 4.4765459618530405e-05, "loss": 0.4775, "step": 5585 }, { "epoch": 1.3036380597014925, "grad_norm": 0.35226879726747845, "learning_rate": 4.47543278520204e-05, "loss": 0.4592, "step": 5590 }, { "epoch": 1.304804104477612, "grad_norm": 0.38845177796693403, "learning_rate": 4.474318582329783e-05, "loss": 0.4393, "step": 5595 }, { "epoch": 1.3059701492537314, "grad_norm": 0.3656431063157445, "learning_rate": 4.4732033538989556e-05, "loss": 0.4551, "step": 5600 }, { "epoch": 1.3071361940298507, "grad_norm": 0.3575967154476404, "learning_rate": 4.4720871005728526e-05, "loss": 0.4803, "step": 5605 }, { "epoch": 1.3083022388059702, "grad_norm": 0.41037970036578786, "learning_rate": 4.47096982301538e-05, "loss": 0.4615, "step": 5610 }, { "epoch": 1.3094682835820897, "grad_norm": 0.35118234109058316, "learning_rate": 4.469851521891049e-05, "loss": 0.4716, "step": 5615 }, { "epoch": 1.310634328358209, "grad_norm": 0.35030180181912374, "learning_rate": 4.468732197864984e-05, "loss": 0.4825, "step": 5620 }, { "epoch": 1.3118003731343284, "grad_norm": 0.34959973159151725, "learning_rate": 4.467611851602916e-05, "loss": 0.4823, "step": 5625 }, { "epoch": 1.3129664179104479, "grad_norm": 0.34289899511328137, "learning_rate": 4.4664904837711835e-05, "loss": 0.4701, "step": 5630 }, { "epoch": 1.3141324626865671, "grad_norm": 0.3369597712695564, "learning_rate": 4.465368095036733e-05, "loss": 0.4467, "step": 5635 }, { "epoch": 1.3152985074626866, "grad_norm": 0.35788483394262643, "learning_rate": 4.4642446860671185e-05, "loss": 0.4806, "step": 5640 }, { "epoch": 1.316464552238806, "grad_norm": 0.37717250387952733, "learning_rate": 4.463120257530501e-05, "loss": 0.4709, "step": 5645 }, { "epoch": 1.3176305970149254, "grad_norm": 0.35296128412664984, "learning_rate": 4.461994810095647e-05, "loss": 0.456, "step": 5650 }, { "epoch": 1.3187966417910448, "grad_norm": 0.33699362841592273, "learning_rate": 4.46086834443193e-05, "loss": 0.4627, "step": 5655 }, { "epoch": 1.3199626865671643, "grad_norm": 0.336295217139395, "learning_rate": 4.4597408612093265e-05, "loss": 0.4826, "step": 5660 }, { "epoch": 1.3211287313432836, "grad_norm": 0.3387351126786447, "learning_rate": 4.458612361098423e-05, "loss": 0.448, "step": 5665 }, { "epoch": 1.322294776119403, "grad_norm": 0.3371270481559477, "learning_rate": 4.457482844770408e-05, "loss": 0.4835, "step": 5670 }, { "epoch": 1.3234608208955223, "grad_norm": 0.38399747806048057, "learning_rate": 4.456352312897072e-05, "loss": 0.4961, "step": 5675 }, { "epoch": 1.3246268656716418, "grad_norm": 0.34975638032156975, "learning_rate": 4.455220766150814e-05, "loss": 0.4802, "step": 5680 }, { "epoch": 1.3257929104477613, "grad_norm": 0.36240364241292927, "learning_rate": 4.454088205204634e-05, "loss": 0.4773, "step": 5685 }, { "epoch": 1.3269589552238805, "grad_norm": 0.3367159592821131, "learning_rate": 4.452954630732136e-05, "loss": 0.4494, "step": 5690 }, { "epoch": 1.328125, "grad_norm": 0.3900161527854174, "learning_rate": 4.451820043407527e-05, "loss": 0.4731, "step": 5695 }, { "epoch": 1.3292910447761195, "grad_norm": 0.3391857878049143, "learning_rate": 4.450684443905615e-05, "loss": 0.4843, "step": 5700 }, { "epoch": 1.3304570895522387, "grad_norm": 0.36324288923377723, "learning_rate": 4.4495478329018125e-05, "loss": 0.4807, "step": 5705 }, { "epoch": 1.3316231343283582, "grad_norm": 0.34646667743589166, "learning_rate": 4.44841021107213e-05, "loss": 0.4771, "step": 5710 }, { "epoch": 1.3327891791044777, "grad_norm": 0.35641048169105694, "learning_rate": 4.447271579093185e-05, "loss": 0.4623, "step": 5715 }, { "epoch": 1.333955223880597, "grad_norm": 0.3563101542477204, "learning_rate": 4.4461319376421875e-05, "loss": 0.4591, "step": 5720 }, { "epoch": 1.3351212686567164, "grad_norm": 0.4141322825166876, "learning_rate": 4.444991287396955e-05, "loss": 0.461, "step": 5725 }, { "epoch": 1.3362873134328357, "grad_norm": 0.3248529230379204, "learning_rate": 4.443849629035903e-05, "loss": 0.4478, "step": 5730 }, { "epoch": 1.3374533582089552, "grad_norm": 0.3886089588340654, "learning_rate": 4.4427069632380455e-05, "loss": 0.4628, "step": 5735 }, { "epoch": 1.3386194029850746, "grad_norm": 0.33379826652693817, "learning_rate": 4.441563290682996e-05, "loss": 0.469, "step": 5740 }, { "epoch": 1.339785447761194, "grad_norm": 0.3429521150843274, "learning_rate": 4.4404186120509674e-05, "loss": 0.474, "step": 5745 }, { "epoch": 1.3409514925373134, "grad_norm": 0.35876226703538694, "learning_rate": 4.43927292802277e-05, "loss": 0.4703, "step": 5750 }, { "epoch": 1.3421175373134329, "grad_norm": 0.3569189498874493, "learning_rate": 4.438126239279814e-05, "loss": 0.4701, "step": 5755 }, { "epoch": 1.3432835820895521, "grad_norm": 0.3189911195518358, "learning_rate": 4.436978546504105e-05, "loss": 0.4584, "step": 5760 }, { "epoch": 1.3444496268656716, "grad_norm": 0.36161508269905673, "learning_rate": 4.435829850378247e-05, "loss": 0.4892, "step": 5765 }, { "epoch": 1.345615671641791, "grad_norm": 0.3367798013900717, "learning_rate": 4.43468015158544e-05, "loss": 0.4642, "step": 5770 }, { "epoch": 1.3467817164179103, "grad_norm": 0.39248959182020965, "learning_rate": 4.433529450809481e-05, "loss": 0.4714, "step": 5775 }, { "epoch": 1.3479477611940298, "grad_norm": 0.3952304086704552, "learning_rate": 4.432377748734763e-05, "loss": 0.4659, "step": 5780 }, { "epoch": 1.3491138059701493, "grad_norm": 0.36098942675552664, "learning_rate": 4.431225046046274e-05, "loss": 0.4585, "step": 5785 }, { "epoch": 1.3502798507462686, "grad_norm": 0.34396963031792716, "learning_rate": 4.430071343429597e-05, "loss": 0.4575, "step": 5790 }, { "epoch": 1.351445895522388, "grad_norm": 0.3516631835202446, "learning_rate": 4.4289166415709096e-05, "loss": 0.467, "step": 5795 }, { "epoch": 1.3526119402985075, "grad_norm": 0.34830728074438166, "learning_rate": 4.427760941156986e-05, "loss": 0.443, "step": 5800 }, { "epoch": 1.3537779850746268, "grad_norm": 0.3401554835447212, "learning_rate": 4.426604242875191e-05, "loss": 0.4736, "step": 5805 }, { "epoch": 1.3549440298507462, "grad_norm": 0.31692934632881437, "learning_rate": 4.4254465474134856e-05, "loss": 0.4664, "step": 5810 }, { "epoch": 1.3561100746268657, "grad_norm": 0.3344355543563697, "learning_rate": 4.42428785546042e-05, "loss": 0.4418, "step": 5815 }, { "epoch": 1.357276119402985, "grad_norm": 0.3429853648105938, "learning_rate": 4.423128167705144e-05, "loss": 0.4934, "step": 5820 }, { "epoch": 1.3584421641791045, "grad_norm": 0.3372551467137175, "learning_rate": 4.4219674848373924e-05, "loss": 0.4557, "step": 5825 }, { "epoch": 1.359608208955224, "grad_norm": 0.33816161202855277, "learning_rate": 4.4208058075474945e-05, "loss": 0.4538, "step": 5830 }, { "epoch": 1.3607742537313432, "grad_norm": 0.35285268917873747, "learning_rate": 4.419643136526373e-05, "loss": 0.4811, "step": 5835 }, { "epoch": 1.3619402985074627, "grad_norm": 0.3488461077093203, "learning_rate": 4.418479472465539e-05, "loss": 0.4766, "step": 5840 }, { "epoch": 1.3631063432835822, "grad_norm": 0.35482334452327685, "learning_rate": 4.417314816057096e-05, "loss": 0.4745, "step": 5845 }, { "epoch": 1.3642723880597014, "grad_norm": 0.34765449286342404, "learning_rate": 4.416149167993737e-05, "loss": 0.4756, "step": 5850 }, { "epoch": 1.365438432835821, "grad_norm": 0.33080434875289166, "learning_rate": 4.4149825289687454e-05, "loss": 0.4808, "step": 5855 }, { "epoch": 1.3666044776119404, "grad_norm": 0.34179222147845023, "learning_rate": 4.413814899675991e-05, "loss": 0.4751, "step": 5860 }, { "epoch": 1.3677705223880596, "grad_norm": 0.3439575534468635, "learning_rate": 4.4126462808099364e-05, "loss": 0.4775, "step": 5865 }, { "epoch": 1.368936567164179, "grad_norm": 0.3184975726093386, "learning_rate": 4.411476673065631e-05, "loss": 0.4662, "step": 5870 }, { "epoch": 1.3701026119402986, "grad_norm": 0.333398032842768, "learning_rate": 4.410306077138713e-05, "loss": 0.452, "step": 5875 }, { "epoch": 1.3712686567164178, "grad_norm": 0.32159660884405294, "learning_rate": 4.409134493725409e-05, "loss": 0.4489, "step": 5880 }, { "epoch": 1.3724347014925373, "grad_norm": 0.35401452948354284, "learning_rate": 4.407961923522529e-05, "loss": 0.5064, "step": 5885 }, { "epoch": 1.3736007462686568, "grad_norm": 0.3436323801869561, "learning_rate": 4.406788367227475e-05, "loss": 0.4644, "step": 5890 }, { "epoch": 1.374766791044776, "grad_norm": 0.34815397841072016, "learning_rate": 4.4056138255382335e-05, "loss": 0.4898, "step": 5895 }, { "epoch": 1.3759328358208955, "grad_norm": 0.40504575116335834, "learning_rate": 4.404438299153376e-05, "loss": 0.5113, "step": 5900 }, { "epoch": 1.377098880597015, "grad_norm": 0.3687852567765307, "learning_rate": 4.4032617887720604e-05, "loss": 0.4967, "step": 5905 }, { "epoch": 1.3782649253731343, "grad_norm": 0.3738569955095955, "learning_rate": 4.4020842950940294e-05, "loss": 0.4832, "step": 5910 }, { "epoch": 1.3794309701492538, "grad_norm": 0.365148671617145, "learning_rate": 4.400905818819613e-05, "loss": 0.4428, "step": 5915 }, { "epoch": 1.3805970149253732, "grad_norm": 0.33811311087636564, "learning_rate": 4.3997263606497225e-05, "loss": 0.4714, "step": 5920 }, { "epoch": 1.3817630597014925, "grad_norm": 0.33101762048774236, "learning_rate": 4.3985459212858535e-05, "loss": 0.4644, "step": 5925 }, { "epoch": 1.382929104477612, "grad_norm": 0.4045628708096018, "learning_rate": 4.397364501430088e-05, "loss": 0.4933, "step": 5930 }, { "epoch": 1.3840951492537314, "grad_norm": 0.34091667587368785, "learning_rate": 4.396182101785089e-05, "loss": 0.4815, "step": 5935 }, { "epoch": 1.3852611940298507, "grad_norm": 0.330650194374689, "learning_rate": 4.3949987230541e-05, "loss": 0.4662, "step": 5940 }, { "epoch": 1.3864272388059702, "grad_norm": 0.3410642768782291, "learning_rate": 4.3938143659409515e-05, "loss": 0.4733, "step": 5945 }, { "epoch": 1.3875932835820897, "grad_norm": 0.3253332678449179, "learning_rate": 4.392629031150054e-05, "loss": 0.4757, "step": 5950 }, { "epoch": 1.388759328358209, "grad_norm": 0.32630858099074683, "learning_rate": 4.391442719386398e-05, "loss": 0.4551, "step": 5955 }, { "epoch": 1.3899253731343284, "grad_norm": 0.3713212930833286, "learning_rate": 4.390255431355557e-05, "loss": 0.452, "step": 5960 }, { "epoch": 1.3910914179104479, "grad_norm": 0.3625155387958956, "learning_rate": 4.389067167763683e-05, "loss": 0.4541, "step": 5965 }, { "epoch": 1.3922574626865671, "grad_norm": 0.3543990369217148, "learning_rate": 4.387877929317512e-05, "loss": 0.4975, "step": 5970 }, { "epoch": 1.3934235074626866, "grad_norm": 0.3374150857174104, "learning_rate": 4.3866877167243554e-05, "loss": 0.4589, "step": 5975 }, { "epoch": 1.394589552238806, "grad_norm": 0.3724150202757855, "learning_rate": 4.3854965306921064e-05, "loss": 0.4895, "step": 5980 }, { "epoch": 1.3957555970149254, "grad_norm": 0.334775208986404, "learning_rate": 4.384304371929238e-05, "loss": 0.4702, "step": 5985 }, { "epoch": 1.3969216417910448, "grad_norm": 0.34216010538815633, "learning_rate": 4.383111241144798e-05, "loss": 0.4443, "step": 5990 }, { "epoch": 1.3980876865671643, "grad_norm": 0.3412124651166191, "learning_rate": 4.3819171390484184e-05, "loss": 0.4623, "step": 5995 }, { "epoch": 1.3992537313432836, "grad_norm": 0.3425367727967644, "learning_rate": 4.380722066350303e-05, "loss": 0.474, "step": 6000 }, { "epoch": 1.400419776119403, "grad_norm": 0.33250347319099394, "learning_rate": 4.3795260237612353e-05, "loss": 0.4695, "step": 6005 }, { "epoch": 1.4015858208955223, "grad_norm": 0.35582044943792523, "learning_rate": 4.378329011992575e-05, "loss": 0.4767, "step": 6010 }, { "epoch": 1.4027518656716418, "grad_norm": 0.329136701757757, "learning_rate": 4.37713103175626e-05, "loss": 0.4575, "step": 6015 }, { "epoch": 1.4039179104477613, "grad_norm": 0.3375566200650066, "learning_rate": 4.375932083764803e-05, "loss": 0.4553, "step": 6020 }, { "epoch": 1.4050839552238805, "grad_norm": 0.36949881723621175, "learning_rate": 4.3747321687312916e-05, "loss": 0.4606, "step": 6025 }, { "epoch": 1.40625, "grad_norm": 0.3393860235345915, "learning_rate": 4.37353128736939e-05, "loss": 0.4854, "step": 6030 }, { "epoch": 1.4074160447761195, "grad_norm": 0.39783506079987735, "learning_rate": 4.3723294403933355e-05, "loss": 0.4686, "step": 6035 }, { "epoch": 1.4085820895522387, "grad_norm": 0.44253275431438494, "learning_rate": 4.3711266285179415e-05, "loss": 0.4696, "step": 6040 }, { "epoch": 1.4097481343283582, "grad_norm": 0.34565348987597855, "learning_rate": 4.369922852458594e-05, "loss": 0.4792, "step": 6045 }, { "epoch": 1.4109141791044777, "grad_norm": 0.3266358830905705, "learning_rate": 4.3687181129312534e-05, "loss": 0.4748, "step": 6050 }, { "epoch": 1.412080223880597, "grad_norm": 0.3346189911122666, "learning_rate": 4.3675124106524514e-05, "loss": 0.4863, "step": 6055 }, { "epoch": 1.4132462686567164, "grad_norm": 0.36015183728416794, "learning_rate": 4.366305746339293e-05, "loss": 0.4657, "step": 6060 }, { "epoch": 1.4144123134328357, "grad_norm": 0.3445327226090057, "learning_rate": 4.365098120709458e-05, "loss": 0.4402, "step": 6065 }, { "epoch": 1.4155783582089552, "grad_norm": 0.3578417001077714, "learning_rate": 4.363889534481195e-05, "loss": 0.4738, "step": 6070 }, { "epoch": 1.4167444029850746, "grad_norm": 0.35258504782831435, "learning_rate": 4.3626799883733236e-05, "loss": 0.4587, "step": 6075 }, { "epoch": 1.417910447761194, "grad_norm": 0.34996079677911557, "learning_rate": 4.361469483105236e-05, "loss": 0.4649, "step": 6080 }, { "epoch": 1.4190764925373134, "grad_norm": 0.33313727721456643, "learning_rate": 4.360258019396895e-05, "loss": 0.4514, "step": 6085 }, { "epoch": 1.4202425373134329, "grad_norm": 0.3737444393468166, "learning_rate": 4.3590455979688335e-05, "loss": 0.4844, "step": 6090 }, { "epoch": 1.4214085820895521, "grad_norm": 0.36666239009502494, "learning_rate": 4.357832219542151e-05, "loss": 0.5074, "step": 6095 }, { "epoch": 1.4225746268656716, "grad_norm": 0.34348607992487973, "learning_rate": 4.3566178848385194e-05, "loss": 0.4833, "step": 6100 }, { "epoch": 1.423740671641791, "grad_norm": 0.32522904396512936, "learning_rate": 4.35540259458018e-05, "loss": 0.4553, "step": 6105 }, { "epoch": 1.4249067164179103, "grad_norm": 0.3285642105741488, "learning_rate": 4.3541863494899385e-05, "loss": 0.4633, "step": 6110 }, { "epoch": 1.4260727611940298, "grad_norm": 0.33496319615756404, "learning_rate": 4.352969150291172e-05, "loss": 0.4567, "step": 6115 }, { "epoch": 1.4272388059701493, "grad_norm": 0.3280628349806029, "learning_rate": 4.351750997707824e-05, "loss": 0.4452, "step": 6120 }, { "epoch": 1.4284048507462686, "grad_norm": 0.342932705677778, "learning_rate": 4.3505318924644036e-05, "loss": 0.451, "step": 6125 }, { "epoch": 1.429570895522388, "grad_norm": 0.314679608343574, "learning_rate": 4.34931183528599e-05, "loss": 0.4588, "step": 6130 }, { "epoch": 1.4307369402985075, "grad_norm": 0.3552548606888951, "learning_rate": 4.348090826898225e-05, "loss": 0.4664, "step": 6135 }, { "epoch": 1.4319029850746268, "grad_norm": 0.3504993104642995, "learning_rate": 4.346868868027318e-05, "loss": 0.461, "step": 6140 }, { "epoch": 1.4330690298507462, "grad_norm": 0.3402055721432462, "learning_rate": 4.345645959400043e-05, "loss": 0.4601, "step": 6145 }, { "epoch": 1.4342350746268657, "grad_norm": 0.3299759511142303, "learning_rate": 4.344422101743739e-05, "loss": 0.4428, "step": 6150 }, { "epoch": 1.435401119402985, "grad_norm": 0.33594338149547776, "learning_rate": 4.3431972957863106e-05, "loss": 0.4626, "step": 6155 }, { "epoch": 1.4365671641791045, "grad_norm": 0.3746544537156815, "learning_rate": 4.341971542256225e-05, "loss": 0.475, "step": 6160 }, { "epoch": 1.437733208955224, "grad_norm": 0.34845527069662197, "learning_rate": 4.340744841882512e-05, "loss": 0.4677, "step": 6165 }, { "epoch": 1.4388992537313432, "grad_norm": 0.34970961524757327, "learning_rate": 4.339517195394768e-05, "loss": 0.4588, "step": 6170 }, { "epoch": 1.4400652985074627, "grad_norm": 0.3615162800299732, "learning_rate": 4.3382886035231484e-05, "loss": 0.4809, "step": 6175 }, { "epoch": 1.4412313432835822, "grad_norm": 0.31849620012464863, "learning_rate": 4.3370590669983736e-05, "loss": 0.4592, "step": 6180 }, { "epoch": 1.4423973880597014, "grad_norm": 0.3357398139997132, "learning_rate": 4.335828586551725e-05, "loss": 0.4716, "step": 6185 }, { "epoch": 1.443563432835821, "grad_norm": 0.3112798682013351, "learning_rate": 4.334597162915045e-05, "loss": 0.459, "step": 6190 }, { "epoch": 1.4447294776119404, "grad_norm": 0.3546794675494792, "learning_rate": 4.333364796820735e-05, "loss": 0.4611, "step": 6195 }, { "epoch": 1.4458955223880596, "grad_norm": 0.328212485716124, "learning_rate": 4.332131489001762e-05, "loss": 0.4829, "step": 6200 }, { "epoch": 1.447061567164179, "grad_norm": 0.3254891995018081, "learning_rate": 4.3308972401916495e-05, "loss": 0.4409, "step": 6205 }, { "epoch": 1.4482276119402986, "grad_norm": 0.3265612892352721, "learning_rate": 4.3296620511244804e-05, "loss": 0.4724, "step": 6210 }, { "epoch": 1.4493936567164178, "grad_norm": 0.3273810014969138, "learning_rate": 4.3284259225348985e-05, "loss": 0.4663, "step": 6215 }, { "epoch": 1.4505597014925373, "grad_norm": 0.3449418735589012, "learning_rate": 4.327188855158106e-05, "loss": 0.4692, "step": 6220 }, { "epoch": 1.4517257462686568, "grad_norm": 0.3540063144251308, "learning_rate": 4.325950849729862e-05, "loss": 0.4767, "step": 6225 }, { "epoch": 1.452891791044776, "grad_norm": 1.0643761025103295, "learning_rate": 4.3247119069864856e-05, "loss": 0.4571, "step": 6230 }, { "epoch": 1.4540578358208955, "grad_norm": 0.33419173990646234, "learning_rate": 4.323472027664852e-05, "loss": 0.447, "step": 6235 }, { "epoch": 1.455223880597015, "grad_norm": 0.34788092893745387, "learning_rate": 4.322231212502394e-05, "loss": 0.4597, "step": 6240 }, { "epoch": 1.4563899253731343, "grad_norm": 0.3134974285379023, "learning_rate": 4.320989462237101e-05, "loss": 0.4545, "step": 6245 }, { "epoch": 1.4575559701492538, "grad_norm": 0.35842579625615295, "learning_rate": 4.3197467776075185e-05, "loss": 0.4833, "step": 6250 }, { "epoch": 1.4587220149253732, "grad_norm": 0.33672571732190615, "learning_rate": 4.318503159352748e-05, "loss": 0.47, "step": 6255 }, { "epoch": 1.4598880597014925, "grad_norm": 0.3333232640983891, "learning_rate": 4.317258608212444e-05, "loss": 0.4645, "step": 6260 }, { "epoch": 1.461054104477612, "grad_norm": 0.3677124920971659, "learning_rate": 4.31601312492682e-05, "loss": 0.4428, "step": 6265 }, { "epoch": 1.4622201492537314, "grad_norm": 0.3579258774501656, "learning_rate": 4.3147667102366415e-05, "loss": 0.4858, "step": 6270 }, { "epoch": 1.4633861940298507, "grad_norm": 0.3429301134861556, "learning_rate": 4.313519364883227e-05, "loss": 0.4787, "step": 6275 }, { "epoch": 1.4645522388059702, "grad_norm": 0.3479253377443876, "learning_rate": 4.3122710896084504e-05, "loss": 0.467, "step": 6280 }, { "epoch": 1.4657182835820897, "grad_norm": 0.3369620809903681, "learning_rate": 4.3110218851547384e-05, "loss": 0.4636, "step": 6285 }, { "epoch": 1.466884328358209, "grad_norm": 0.33730464524968007, "learning_rate": 4.309771752265069e-05, "loss": 0.4573, "step": 6290 }, { "epoch": 1.4680503731343284, "grad_norm": 0.3218846587482649, "learning_rate": 4.308520691682974e-05, "loss": 0.4473, "step": 6295 }, { "epoch": 1.4692164179104479, "grad_norm": 0.32838855496131214, "learning_rate": 4.307268704152535e-05, "loss": 0.4604, "step": 6300 }, { "epoch": 1.4703824626865671, "grad_norm": 0.34867492982284204, "learning_rate": 4.3060157904183873e-05, "loss": 0.4791, "step": 6305 }, { "epoch": 1.4715485074626866, "grad_norm": 0.3330642008297452, "learning_rate": 4.3047619512257164e-05, "loss": 0.4431, "step": 6310 }, { "epoch": 1.472714552238806, "grad_norm": 0.33600408093566253, "learning_rate": 4.3035071873202563e-05, "loss": 0.4525, "step": 6315 }, { "epoch": 1.4738805970149254, "grad_norm": 0.3406265788599132, "learning_rate": 4.302251499448294e-05, "loss": 0.4767, "step": 6320 }, { "epoch": 1.4750466417910448, "grad_norm": 0.3429515470944637, "learning_rate": 4.3009948883566645e-05, "loss": 0.4894, "step": 6325 }, { "epoch": 1.4762126865671643, "grad_norm": 0.32648908815987504, "learning_rate": 4.29973735479275e-05, "loss": 0.4767, "step": 6330 }, { "epoch": 1.4773787313432836, "grad_norm": 0.33673546137273785, "learning_rate": 4.298478899504485e-05, "loss": 0.4604, "step": 6335 }, { "epoch": 1.478544776119403, "grad_norm": 0.33823780875850035, "learning_rate": 4.297219523240349e-05, "loss": 0.4531, "step": 6340 }, { "epoch": 1.4797108208955223, "grad_norm": 0.35119200081415775, "learning_rate": 4.2959592267493715e-05, "loss": 0.4636, "step": 6345 }, { "epoch": 1.4808768656716418, "grad_norm": 0.4322198520912854, "learning_rate": 4.2946980107811295e-05, "loss": 0.4787, "step": 6350 }, { "epoch": 1.4820429104477613, "grad_norm": 0.36083460260316264, "learning_rate": 4.2934358760857454e-05, "loss": 0.4794, "step": 6355 }, { "epoch": 1.4832089552238805, "grad_norm": 0.31173947674943203, "learning_rate": 4.292172823413887e-05, "loss": 0.4538, "step": 6360 }, { "epoch": 1.484375, "grad_norm": 0.33214255574354906, "learning_rate": 4.2909088535167714e-05, "loss": 0.46, "step": 6365 }, { "epoch": 1.4855410447761195, "grad_norm": 0.32569959491022565, "learning_rate": 4.289643967146158e-05, "loss": 0.4777, "step": 6370 }, { "epoch": 1.4867070895522387, "grad_norm": 0.3134761071205022, "learning_rate": 4.288378165054354e-05, "loss": 0.4557, "step": 6375 }, { "epoch": 1.4878731343283582, "grad_norm": 0.3418201443289579, "learning_rate": 4.28711144799421e-05, "loss": 0.4669, "step": 6380 }, { "epoch": 1.4890391791044777, "grad_norm": 0.3612542299768223, "learning_rate": 4.2858438167191185e-05, "loss": 0.4641, "step": 6385 }, { "epoch": 1.490205223880597, "grad_norm": 0.34867879366259485, "learning_rate": 4.2845752719830206e-05, "loss": 0.456, "step": 6390 }, { "epoch": 1.4913712686567164, "grad_norm": 0.3731883625822092, "learning_rate": 4.283305814540397e-05, "loss": 0.4713, "step": 6395 }, { "epoch": 1.4925373134328357, "grad_norm": 0.34935271530956713, "learning_rate": 4.282035445146272e-05, "loss": 0.4826, "step": 6400 }, { "epoch": 1.4937033582089552, "grad_norm": 0.32927278997045184, "learning_rate": 4.2807641645562134e-05, "loss": 0.472, "step": 6405 }, { "epoch": 1.4948694029850746, "grad_norm": 0.3400672819928905, "learning_rate": 4.2794919735263295e-05, "loss": 0.4582, "step": 6410 }, { "epoch": 1.496035447761194, "grad_norm": 0.3259509490685773, "learning_rate": 4.278218872813271e-05, "loss": 0.4793, "step": 6415 }, { "epoch": 1.4972014925373134, "grad_norm": 0.36147664082553604, "learning_rate": 4.276944863174229e-05, "loss": 0.4531, "step": 6420 }, { "epoch": 1.4983675373134329, "grad_norm": 0.37015743529852446, "learning_rate": 4.275669945366936e-05, "loss": 0.4712, "step": 6425 }, { "epoch": 1.4995335820895521, "grad_norm": 0.35148150922706417, "learning_rate": 4.2743941201496644e-05, "loss": 0.4593, "step": 6430 }, { "epoch": 1.5006996268656716, "grad_norm": 0.33182234228008994, "learning_rate": 4.2731173882812264e-05, "loss": 0.467, "step": 6435 }, { "epoch": 1.501865671641791, "grad_norm": 0.3407677529605748, "learning_rate": 4.271839750520972e-05, "loss": 0.4674, "step": 6440 }, { "epoch": 1.5030317164179103, "grad_norm": 0.3345446593153718, "learning_rate": 4.2705612076287907e-05, "loss": 0.4507, "step": 6445 }, { "epoch": 1.5041977611940298, "grad_norm": 0.3332154562763926, "learning_rate": 4.2692817603651134e-05, "loss": 0.4431, "step": 6450 }, { "epoch": 1.5053638059701493, "grad_norm": 0.36323268860291735, "learning_rate": 4.2680014094909035e-05, "loss": 0.4566, "step": 6455 }, { "epoch": 1.5065298507462686, "grad_norm": 0.3259349704090724, "learning_rate": 4.2667201557676673e-05, "loss": 0.4629, "step": 6460 }, { "epoch": 1.507695895522388, "grad_norm": 0.33444066240013476, "learning_rate": 4.2654379999574425e-05, "loss": 0.4557, "step": 6465 }, { "epoch": 1.5088619402985075, "grad_norm": 0.344096009710541, "learning_rate": 4.2641549428228087e-05, "loss": 0.4636, "step": 6470 }, { "epoch": 1.5100279850746268, "grad_norm": 0.33982012028637054, "learning_rate": 4.2628709851268775e-05, "loss": 0.4588, "step": 6475 }, { "epoch": 1.5111940298507462, "grad_norm": 0.32270394544416997, "learning_rate": 4.261586127633297e-05, "loss": 0.4554, "step": 6480 }, { "epoch": 1.5123600746268657, "grad_norm": 0.3252570889517615, "learning_rate": 4.2603003711062536e-05, "loss": 0.4561, "step": 6485 }, { "epoch": 1.513526119402985, "grad_norm": 0.3394768153452522, "learning_rate": 4.259013716310465e-05, "loss": 0.4636, "step": 6490 }, { "epoch": 1.5146921641791045, "grad_norm": 0.3365118701807337, "learning_rate": 4.2577261640111834e-05, "loss": 0.4519, "step": 6495 }, { "epoch": 1.515858208955224, "grad_norm": 0.32262503177912955, "learning_rate": 4.256437714974196e-05, "loss": 0.48, "step": 6500 }, { "epoch": 1.5170242537313432, "grad_norm": 0.316392457569673, "learning_rate": 4.255148369965822e-05, "loss": 0.469, "step": 6505 }, { "epoch": 1.5181902985074627, "grad_norm": 0.3490934274261248, "learning_rate": 4.253858129752916e-05, "loss": 0.465, "step": 6510 }, { "epoch": 1.5193563432835822, "grad_norm": 0.33463059646920107, "learning_rate": 4.252566995102864e-05, "loss": 0.4741, "step": 6515 }, { "epoch": 1.5205223880597014, "grad_norm": 0.3256308103198369, "learning_rate": 4.251274966783579e-05, "loss": 0.4583, "step": 6520 }, { "epoch": 1.521688432835821, "grad_norm": 0.32298219383214943, "learning_rate": 4.2499820455635154e-05, "loss": 0.4481, "step": 6525 }, { "epoch": 1.5228544776119404, "grad_norm": 0.3676236502372447, "learning_rate": 4.24868823221165e-05, "loss": 0.4703, "step": 6530 }, { "epoch": 1.5240205223880596, "grad_norm": 0.3312495080130234, "learning_rate": 4.2473935274974944e-05, "loss": 0.4657, "step": 6535 }, { "epoch": 1.525186567164179, "grad_norm": 0.30885099791103654, "learning_rate": 4.246097932191088e-05, "loss": 0.4579, "step": 6540 }, { "epoch": 1.5263526119402986, "grad_norm": 0.3341932819145299, "learning_rate": 4.2448014470630034e-05, "loss": 0.4609, "step": 6545 }, { "epoch": 1.5275186567164178, "grad_norm": 0.34122470078700223, "learning_rate": 4.2435040728843376e-05, "loss": 0.4676, "step": 6550 }, { "epoch": 1.5286847014925373, "grad_norm": 0.32811744269493354, "learning_rate": 4.2422058104267215e-05, "loss": 0.4645, "step": 6555 }, { "epoch": 1.5298507462686568, "grad_norm": 0.3547434028641149, "learning_rate": 4.2409066604623096e-05, "loss": 0.4717, "step": 6560 }, { "epoch": 1.531016791044776, "grad_norm": 0.31996972002621243, "learning_rate": 4.239606623763789e-05, "loss": 0.4623, "step": 6565 }, { "epoch": 1.5321828358208955, "grad_norm": 0.3355552932095701, "learning_rate": 4.23830570110437e-05, "loss": 0.4621, "step": 6570 }, { "epoch": 1.533348880597015, "grad_norm": 0.32800163172000013, "learning_rate": 4.237003893257791e-05, "loss": 0.4818, "step": 6575 }, { "epoch": 1.5345149253731343, "grad_norm": 0.3307796850338419, "learning_rate": 4.2357012009983185e-05, "loss": 0.486, "step": 6580 }, { "epoch": 1.5356809701492538, "grad_norm": 0.338310889951412, "learning_rate": 4.234397625100745e-05, "loss": 0.4521, "step": 6585 }, { "epoch": 1.5368470149253732, "grad_norm": 0.3647838461591258, "learning_rate": 4.2330931663403844e-05, "loss": 0.4706, "step": 6590 }, { "epoch": 1.5380130597014925, "grad_norm": 0.3300642849353027, "learning_rate": 4.231787825493081e-05, "loss": 0.4492, "step": 6595 }, { "epoch": 1.539179104477612, "grad_norm": 0.33557264157968786, "learning_rate": 4.230481603335201e-05, "loss": 0.4687, "step": 6600 }, { "epoch": 1.5403451492537314, "grad_norm": 0.3114008445874425, "learning_rate": 4.229174500643634e-05, "loss": 0.4578, "step": 6605 }, { "epoch": 1.5415111940298507, "grad_norm": 0.32152049048790204, "learning_rate": 4.227866518195797e-05, "loss": 0.4658, "step": 6610 }, { "epoch": 1.5426772388059702, "grad_norm": 0.3527059542529975, "learning_rate": 4.226557656769626e-05, "loss": 0.4844, "step": 6615 }, { "epoch": 1.5438432835820897, "grad_norm": 0.330570383716205, "learning_rate": 4.225247917143582e-05, "loss": 0.4832, "step": 6620 }, { "epoch": 1.545009328358209, "grad_norm": 0.34996640622759717, "learning_rate": 4.223937300096648e-05, "loss": 0.4753, "step": 6625 }, { "epoch": 1.5461753731343284, "grad_norm": 0.3425186961041473, "learning_rate": 4.22262580640833e-05, "loss": 0.4784, "step": 6630 }, { "epoch": 1.5473414179104479, "grad_norm": 0.3197472159330529, "learning_rate": 4.221313436858651e-05, "loss": 0.462, "step": 6635 }, { "epoch": 1.5485074626865671, "grad_norm": 0.36173561576993507, "learning_rate": 4.220000192228161e-05, "loss": 0.4864, "step": 6640 }, { "epoch": 1.5496735074626866, "grad_norm": 0.3457277757550715, "learning_rate": 4.218686073297926e-05, "loss": 0.4924, "step": 6645 }, { "epoch": 1.550839552238806, "grad_norm": 0.31799753718370405, "learning_rate": 4.217371080849535e-05, "loss": 0.4364, "step": 6650 }, { "epoch": 1.5520055970149254, "grad_norm": 0.3196897952854417, "learning_rate": 4.216055215665093e-05, "loss": 0.4506, "step": 6655 }, { "epoch": 1.5531716417910446, "grad_norm": 0.3257421590495531, "learning_rate": 4.2147384785272284e-05, "loss": 0.4553, "step": 6660 }, { "epoch": 1.5543376865671643, "grad_norm": 0.3432550994802492, "learning_rate": 4.213420870219084e-05, "loss": 0.4817, "step": 6665 }, { "epoch": 1.5555037313432836, "grad_norm": 0.34725168242420085, "learning_rate": 4.212102391524324e-05, "loss": 0.4648, "step": 6670 }, { "epoch": 1.5566697761194028, "grad_norm": 0.33122172319023835, "learning_rate": 4.210783043227129e-05, "loss": 0.457, "step": 6675 }, { "epoch": 1.5578358208955225, "grad_norm": 0.32042190315550045, "learning_rate": 4.209462826112195e-05, "loss": 0.4775, "step": 6680 }, { "epoch": 1.5590018656716418, "grad_norm": 0.34115796189591796, "learning_rate": 4.2081417409647386e-05, "loss": 0.4725, "step": 6685 }, { "epoch": 1.560167910447761, "grad_norm": 0.3349752626639639, "learning_rate": 4.2068197885704904e-05, "loss": 0.4722, "step": 6690 }, { "epoch": 1.5613339552238807, "grad_norm": 0.3450973154659737, "learning_rate": 4.205496969715696e-05, "loss": 0.482, "step": 6695 }, { "epoch": 1.5625, "grad_norm": 0.3346036864573799, "learning_rate": 4.204173285187117e-05, "loss": 0.4685, "step": 6700 }, { "epoch": 1.5636660447761193, "grad_norm": 0.34800557133729426, "learning_rate": 4.202848735772031e-05, "loss": 0.4801, "step": 6705 }, { "epoch": 1.564832089552239, "grad_norm": 0.3367084430898118, "learning_rate": 4.201523322258231e-05, "loss": 0.4274, "step": 6710 }, { "epoch": 1.5659981343283582, "grad_norm": 0.35012078531123364, "learning_rate": 4.2001970454340185e-05, "loss": 0.4779, "step": 6715 }, { "epoch": 1.5671641791044775, "grad_norm": 0.3464241011269615, "learning_rate": 4.1988699060882144e-05, "loss": 0.4474, "step": 6720 }, { "epoch": 1.5683302238805972, "grad_norm": 0.32315881126500995, "learning_rate": 4.197541905010149e-05, "loss": 0.4557, "step": 6725 }, { "epoch": 1.5694962686567164, "grad_norm": 0.33350654547882, "learning_rate": 4.196213042989668e-05, "loss": 0.4813, "step": 6730 }, { "epoch": 1.5706623134328357, "grad_norm": 0.31996360129943924, "learning_rate": 4.194883320817127e-05, "loss": 0.4796, "step": 6735 }, { "epoch": 1.5718283582089554, "grad_norm": 0.30918268023470596, "learning_rate": 4.193552739283393e-05, "loss": 0.47, "step": 6740 }, { "epoch": 1.5729944029850746, "grad_norm": 0.34011552174858767, "learning_rate": 4.192221299179845e-05, "loss": 0.4677, "step": 6745 }, { "epoch": 1.574160447761194, "grad_norm": 0.3129514549766107, "learning_rate": 4.190889001298373e-05, "loss": 0.4564, "step": 6750 }, { "epoch": 1.5753264925373134, "grad_norm": 0.3319813884486381, "learning_rate": 4.189555846431377e-05, "loss": 0.4546, "step": 6755 }, { "epoch": 1.5764925373134329, "grad_norm": 0.34061377560704664, "learning_rate": 4.188221835371766e-05, "loss": 0.4728, "step": 6760 }, { "epoch": 1.5776585820895521, "grad_norm": 0.33008510935751006, "learning_rate": 4.1868869689129584e-05, "loss": 0.4557, "step": 6765 }, { "epoch": 1.5788246268656716, "grad_norm": 0.34404215231714474, "learning_rate": 4.1855512478488816e-05, "loss": 0.4569, "step": 6770 }, { "epoch": 1.579990671641791, "grad_norm": 0.3328531164247705, "learning_rate": 4.184214672973971e-05, "loss": 0.4749, "step": 6775 }, { "epoch": 1.5811567164179103, "grad_norm": 0.3242877263909023, "learning_rate": 4.182877245083172e-05, "loss": 0.469, "step": 6780 }, { "epoch": 1.5823227611940298, "grad_norm": 0.3250921673756281, "learning_rate": 4.181538964971933e-05, "loss": 0.442, "step": 6785 }, { "epoch": 1.5834888059701493, "grad_norm": 0.33070321772700895, "learning_rate": 4.180199833436213e-05, "loss": 0.4635, "step": 6790 }, { "epoch": 1.5846548507462686, "grad_norm": 0.34575544940838887, "learning_rate": 4.178859851272475e-05, "loss": 0.491, "step": 6795 }, { "epoch": 1.585820895522388, "grad_norm": 0.35928774509106254, "learning_rate": 4.1775190192776905e-05, "loss": 0.4829, "step": 6800 }, { "epoch": 1.5869869402985075, "grad_norm": 0.32912110954208285, "learning_rate": 4.176177338249334e-05, "loss": 0.4569, "step": 6805 }, { "epoch": 1.5881529850746268, "grad_norm": 0.320790652546973, "learning_rate": 4.1748348089853864e-05, "loss": 0.4738, "step": 6810 }, { "epoch": 1.5893190298507462, "grad_norm": 0.33958660148187925, "learning_rate": 4.173491432284332e-05, "loss": 0.4888, "step": 6815 }, { "epoch": 1.5904850746268657, "grad_norm": 0.33037229113813193, "learning_rate": 4.172147208945159e-05, "loss": 0.4727, "step": 6820 }, { "epoch": 1.591651119402985, "grad_norm": 0.33796195513843025, "learning_rate": 4.170802139767362e-05, "loss": 0.4578, "step": 6825 }, { "epoch": 1.5928171641791045, "grad_norm": 0.3357550301495569, "learning_rate": 4.1694562255509354e-05, "loss": 0.4609, "step": 6830 }, { "epoch": 1.593983208955224, "grad_norm": 0.32146300171119124, "learning_rate": 4.168109467096378e-05, "loss": 0.4589, "step": 6835 }, { "epoch": 1.5951492537313432, "grad_norm": 0.33435126899728823, "learning_rate": 4.1667618652046894e-05, "loss": 0.4557, "step": 6840 }, { "epoch": 1.5963152985074627, "grad_norm": 0.3373892914826476, "learning_rate": 4.165413420677372e-05, "loss": 0.4592, "step": 6845 }, { "epoch": 1.5974813432835822, "grad_norm": 0.33192379105590525, "learning_rate": 4.164064134316428e-05, "loss": 0.5028, "step": 6850 }, { "epoch": 1.5986473880597014, "grad_norm": 0.3614568709330173, "learning_rate": 4.162714006924362e-05, "loss": 0.4897, "step": 6855 }, { "epoch": 1.599813432835821, "grad_norm": 0.3339458864487026, "learning_rate": 4.161363039304177e-05, "loss": 0.4662, "step": 6860 }, { "epoch": 1.6009794776119404, "grad_norm": 0.3420259560333856, "learning_rate": 4.160011232259378e-05, "loss": 0.4597, "step": 6865 }, { "epoch": 1.6021455223880596, "grad_norm": 0.3563996763973991, "learning_rate": 4.158658586593969e-05, "loss": 0.4618, "step": 6870 }, { "epoch": 1.603311567164179, "grad_norm": 0.3086604099147653, "learning_rate": 4.1573051031124486e-05, "loss": 0.4401, "step": 6875 }, { "epoch": 1.6044776119402986, "grad_norm": 0.3699639063420792, "learning_rate": 4.155950782619819e-05, "loss": 0.4697, "step": 6880 }, { "epoch": 1.6056436567164178, "grad_norm": 0.37077466980798357, "learning_rate": 4.1545956259215776e-05, "loss": 0.4575, "step": 6885 }, { "epoch": 1.6068097014925373, "grad_norm": 0.37397215048587934, "learning_rate": 4.153239633823721e-05, "loss": 0.4818, "step": 6890 }, { "epoch": 1.6079757462686568, "grad_norm": 0.3819042229679817, "learning_rate": 4.151882807132739e-05, "loss": 0.4834, "step": 6895 }, { "epoch": 1.609141791044776, "grad_norm": 0.3420138491704207, "learning_rate": 4.1505251466556206e-05, "loss": 0.4735, "step": 6900 }, { "epoch": 1.6103078358208955, "grad_norm": 0.34624964872120995, "learning_rate": 4.149166653199852e-05, "loss": 0.4724, "step": 6905 }, { "epoch": 1.611473880597015, "grad_norm": 0.32999263610838986, "learning_rate": 4.1478073275734105e-05, "loss": 0.4617, "step": 6910 }, { "epoch": 1.6126399253731343, "grad_norm": 0.3219821202086392, "learning_rate": 4.146447170584772e-05, "loss": 0.4409, "step": 6915 }, { "epoch": 1.6138059701492538, "grad_norm": 0.31823919385054067, "learning_rate": 4.145086183042907e-05, "loss": 0.4772, "step": 6920 }, { "epoch": 1.6149720149253732, "grad_norm": 0.3117042372128923, "learning_rate": 4.143724365757275e-05, "loss": 0.4473, "step": 6925 }, { "epoch": 1.6161380597014925, "grad_norm": 0.3250330576185892, "learning_rate": 4.142361719537838e-05, "loss": 0.4579, "step": 6930 }, { "epoch": 1.617304104477612, "grad_norm": 0.35348966460135667, "learning_rate": 4.140998245195042e-05, "loss": 0.4452, "step": 6935 }, { "epoch": 1.6184701492537314, "grad_norm": 0.35601914592858, "learning_rate": 4.13963394353983e-05, "loss": 0.4819, "step": 6940 }, { "epoch": 1.6196361940298507, "grad_norm": 0.34207907216373123, "learning_rate": 4.138268815383636e-05, "loss": 0.4659, "step": 6945 }, { "epoch": 1.6208022388059702, "grad_norm": 0.35803264576282556, "learning_rate": 4.136902861538387e-05, "loss": 0.4813, "step": 6950 }, { "epoch": 1.6219682835820897, "grad_norm": 0.3540499176018185, "learning_rate": 4.135536082816499e-05, "loss": 0.4772, "step": 6955 }, { "epoch": 1.623134328358209, "grad_norm": 0.33817971497647364, "learning_rate": 4.13416848003088e-05, "loss": 0.4526, "step": 6960 }, { "epoch": 1.6243003731343284, "grad_norm": 0.34797848678426574, "learning_rate": 4.132800053994927e-05, "loss": 0.4803, "step": 6965 }, { "epoch": 1.6254664179104479, "grad_norm": 0.33500331225773355, "learning_rate": 4.1314308055225295e-05, "loss": 0.4799, "step": 6970 }, { "epoch": 1.6266324626865671, "grad_norm": 0.34589069785663457, "learning_rate": 4.1300607354280605e-05, "loss": 0.4606, "step": 6975 }, { "epoch": 1.6277985074626866, "grad_norm": 0.3083760808773547, "learning_rate": 4.128689844526388e-05, "loss": 0.441, "step": 6980 }, { "epoch": 1.628964552238806, "grad_norm": 0.32415702677889685, "learning_rate": 4.1273181336328646e-05, "loss": 0.4725, "step": 6985 }, { "epoch": 1.6301305970149254, "grad_norm": 0.34014109508651214, "learning_rate": 4.125945603563331e-05, "loss": 0.478, "step": 6990 }, { "epoch": 1.6312966417910446, "grad_norm": 0.32538033857434384, "learning_rate": 4.124572255134115e-05, "loss": 0.4525, "step": 6995 }, { "epoch": 1.6324626865671643, "grad_norm": 0.3208811617337528, "learning_rate": 4.123198089162033e-05, "loss": 0.4505, "step": 7000 }, { "epoch": 1.6336287313432836, "grad_norm": 0.3325850879806714, "learning_rate": 4.121823106464384e-05, "loss": 0.4718, "step": 7005 }, { "epoch": 1.6347947761194028, "grad_norm": 0.3614875622567189, "learning_rate": 4.1204473078589575e-05, "loss": 0.4744, "step": 7010 }, { "epoch": 1.6359608208955225, "grad_norm": 0.3191733658518597, "learning_rate": 4.119070694164024e-05, "loss": 0.4725, "step": 7015 }, { "epoch": 1.6371268656716418, "grad_norm": 0.33466736823296434, "learning_rate": 4.117693266198342e-05, "loss": 0.4542, "step": 7020 }, { "epoch": 1.638292910447761, "grad_norm": 0.3336159641539354, "learning_rate": 4.116315024781152e-05, "loss": 0.4699, "step": 7025 }, { "epoch": 1.6394589552238807, "grad_norm": 0.36793982561955546, "learning_rate": 4.114935970732178e-05, "loss": 0.4823, "step": 7030 }, { "epoch": 1.640625, "grad_norm": 0.33120379255270743, "learning_rate": 4.113556104871631e-05, "loss": 0.4923, "step": 7035 }, { "epoch": 1.6417910447761193, "grad_norm": 0.30729104833002724, "learning_rate": 4.112175428020199e-05, "loss": 0.4435, "step": 7040 }, { "epoch": 1.642957089552239, "grad_norm": 0.34616218198056103, "learning_rate": 4.110793940999059e-05, "loss": 0.4516, "step": 7045 }, { "epoch": 1.6441231343283582, "grad_norm": 0.3354519199641666, "learning_rate": 4.1094116446298645e-05, "loss": 0.4671, "step": 7050 }, { "epoch": 1.6452891791044775, "grad_norm": 0.32236283925320086, "learning_rate": 4.108028539734753e-05, "loss": 0.4837, "step": 7055 }, { "epoch": 1.6464552238805972, "grad_norm": 0.3116041047755905, "learning_rate": 4.1066446271363426e-05, "loss": 0.4591, "step": 7060 }, { "epoch": 1.6476212686567164, "grad_norm": 0.3597117887276012, "learning_rate": 4.1052599076577306e-05, "loss": 0.482, "step": 7065 }, { "epoch": 1.6487873134328357, "grad_norm": 0.3371049984339211, "learning_rate": 4.103874382122496e-05, "loss": 0.452, "step": 7070 }, { "epoch": 1.6499533582089554, "grad_norm": 0.3342395269568629, "learning_rate": 4.1024880513546955e-05, "loss": 0.4554, "step": 7075 }, { "epoch": 1.6511194029850746, "grad_norm": 0.3334503038716055, "learning_rate": 4.1011009161788655e-05, "loss": 0.4659, "step": 7080 }, { "epoch": 1.652285447761194, "grad_norm": 0.32691988784391596, "learning_rate": 4.099712977420021e-05, "loss": 0.448, "step": 7085 }, { "epoch": 1.6534514925373134, "grad_norm": 0.3189213068437785, "learning_rate": 4.098324235903655e-05, "loss": 0.442, "step": 7090 }, { "epoch": 1.6546175373134329, "grad_norm": 0.34562830396228567, "learning_rate": 4.0969346924557374e-05, "loss": 0.4874, "step": 7095 }, { "epoch": 1.6557835820895521, "grad_norm": 0.3433525287693214, "learning_rate": 4.095544347902715e-05, "loss": 0.4757, "step": 7100 }, { "epoch": 1.6569496268656716, "grad_norm": 0.36953751999856715, "learning_rate": 4.094153203071512e-05, "loss": 0.51, "step": 7105 }, { "epoch": 1.658115671641791, "grad_norm": 0.3274988234772176, "learning_rate": 4.092761258789529e-05, "loss": 0.4571, "step": 7110 }, { "epoch": 1.6592817164179103, "grad_norm": 0.34823208824612917, "learning_rate": 4.091368515884638e-05, "loss": 0.4831, "step": 7115 }, { "epoch": 1.6604477611940298, "grad_norm": 0.32536250915613824, "learning_rate": 4.089974975185192e-05, "loss": 0.4614, "step": 7120 }, { "epoch": 1.6616138059701493, "grad_norm": 0.324795678268426, "learning_rate": 4.088580637520015e-05, "loss": 0.4551, "step": 7125 }, { "epoch": 1.6627798507462686, "grad_norm": 0.3162172035843185, "learning_rate": 4.087185503718404e-05, "loss": 0.4443, "step": 7130 }, { "epoch": 1.663945895522388, "grad_norm": 0.33952859510564576, "learning_rate": 4.0857895746101335e-05, "loss": 0.4717, "step": 7135 }, { "epoch": 1.6651119402985075, "grad_norm": 0.3075133719483971, "learning_rate": 4.084392851025447e-05, "loss": 0.4339, "step": 7140 }, { "epoch": 1.6662779850746268, "grad_norm": 0.33515643023992886, "learning_rate": 4.082995333795063e-05, "loss": 0.4632, "step": 7145 }, { "epoch": 1.6674440298507462, "grad_norm": 0.3288379676558023, "learning_rate": 4.081597023750169e-05, "loss": 0.4654, "step": 7150 }, { "epoch": 1.6686100746268657, "grad_norm": 0.30998910910125077, "learning_rate": 4.0801979217224285e-05, "loss": 0.454, "step": 7155 }, { "epoch": 1.669776119402985, "grad_norm": 0.34437967384422724, "learning_rate": 4.078798028543974e-05, "loss": 0.4729, "step": 7160 }, { "epoch": 1.6709421641791045, "grad_norm": 0.3829981647145245, "learning_rate": 4.0773973450474055e-05, "loss": 0.4737, "step": 7165 }, { "epoch": 1.672108208955224, "grad_norm": 0.3530178173945503, "learning_rate": 4.0759958720658e-05, "loss": 0.4533, "step": 7170 }, { "epoch": 1.6732742537313432, "grad_norm": 0.3179227326670143, "learning_rate": 4.074593610432695e-05, "loss": 0.4508, "step": 7175 }, { "epoch": 1.6744402985074627, "grad_norm": 0.3146630519651914, "learning_rate": 4.073190560982106e-05, "loss": 0.4597, "step": 7180 }, { "epoch": 1.6756063432835822, "grad_norm": 0.34642209234797366, "learning_rate": 4.071786724548511e-05, "loss": 0.4649, "step": 7185 }, { "epoch": 1.6767723880597014, "grad_norm": 0.3706405830369706, "learning_rate": 4.07038210196686e-05, "loss": 0.4606, "step": 7190 }, { "epoch": 1.677938432835821, "grad_norm": 0.3056476971166796, "learning_rate": 4.068976694072565e-05, "loss": 0.4578, "step": 7195 }, { "epoch": 1.6791044776119404, "grad_norm": 0.3079763421623756, "learning_rate": 4.067570501701513e-05, "loss": 0.4642, "step": 7200 }, { "epoch": 1.6802705223880596, "grad_norm": 0.3356571842787205, "learning_rate": 4.0661635256900505e-05, "loss": 0.4731, "step": 7205 }, { "epoch": 1.681436567164179, "grad_norm": 0.31406788758525345, "learning_rate": 4.064755766874993e-05, "loss": 0.4426, "step": 7210 }, { "epoch": 1.6826026119402986, "grad_norm": 0.3192951288730359, "learning_rate": 4.0633472260936224e-05, "loss": 0.4609, "step": 7215 }, { "epoch": 1.6837686567164178, "grad_norm": 0.30323038618671533, "learning_rate": 4.061937904183685e-05, "loss": 0.4495, "step": 7220 }, { "epoch": 1.6849347014925373, "grad_norm": 0.3603630192365742, "learning_rate": 4.060527801983391e-05, "loss": 0.4815, "step": 7225 }, { "epoch": 1.6861007462686568, "grad_norm": 0.3552458403395982, "learning_rate": 4.0591169203314145e-05, "loss": 0.478, "step": 7230 }, { "epoch": 1.687266791044776, "grad_norm": 0.3437733174212601, "learning_rate": 4.057705260066894e-05, "loss": 0.4813, "step": 7235 }, { "epoch": 1.6884328358208955, "grad_norm": 0.37064293513083574, "learning_rate": 4.056292822029432e-05, "loss": 0.4797, "step": 7240 }, { "epoch": 1.689598880597015, "grad_norm": 0.31622660412831927, "learning_rate": 4.05487960705909e-05, "loss": 0.4514, "step": 7245 }, { "epoch": 1.6907649253731343, "grad_norm": 0.3237729621504635, "learning_rate": 4.053465615996397e-05, "loss": 0.4735, "step": 7250 }, { "epoch": 1.6919309701492538, "grad_norm": 0.3487012541914169, "learning_rate": 4.0520508496823395e-05, "loss": 0.4722, "step": 7255 }, { "epoch": 1.6930970149253732, "grad_norm": 0.33532018875590897, "learning_rate": 4.050635308958366e-05, "loss": 0.4768, "step": 7260 }, { "epoch": 1.6942630597014925, "grad_norm": 0.3303916866276644, "learning_rate": 4.0492189946663864e-05, "loss": 0.4687, "step": 7265 }, { "epoch": 1.695429104477612, "grad_norm": 0.32806496684644687, "learning_rate": 4.047801907648769e-05, "loss": 0.4472, "step": 7270 }, { "epoch": 1.6965951492537314, "grad_norm": 0.339298332376045, "learning_rate": 4.046384048748344e-05, "loss": 0.4591, "step": 7275 }, { "epoch": 1.6977611940298507, "grad_norm": 0.32851237253811666, "learning_rate": 4.0449654188083985e-05, "loss": 0.4603, "step": 7280 }, { "epoch": 1.6989272388059702, "grad_norm": 0.34429588338857486, "learning_rate": 4.04354601867268e-05, "loss": 0.4545, "step": 7285 }, { "epoch": 1.7000932835820897, "grad_norm": 0.3522229862796038, "learning_rate": 4.042125849185394e-05, "loss": 0.4896, "step": 7290 }, { "epoch": 1.701259328358209, "grad_norm": 0.3304803396604614, "learning_rate": 4.040704911191201e-05, "loss": 0.4704, "step": 7295 }, { "epoch": 1.7024253731343284, "grad_norm": 0.31746663711386225, "learning_rate": 4.0392832055352205e-05, "loss": 0.4668, "step": 7300 }, { "epoch": 1.7035914179104479, "grad_norm": 0.312913030389991, "learning_rate": 4.0378607330630304e-05, "loss": 0.4743, "step": 7305 }, { "epoch": 1.7047574626865671, "grad_norm": 0.3225342566376156, "learning_rate": 4.036437494620661e-05, "loss": 0.4494, "step": 7310 }, { "epoch": 1.7059235074626866, "grad_norm": 0.3047737962305274, "learning_rate": 4.0350134910546e-05, "loss": 0.474, "step": 7315 }, { "epoch": 1.707089552238806, "grad_norm": 0.3272477406317157, "learning_rate": 4.033588723211793e-05, "loss": 0.4638, "step": 7320 }, { "epoch": 1.7082555970149254, "grad_norm": 0.31344877189254877, "learning_rate": 4.032163191939633e-05, "loss": 0.4781, "step": 7325 }, { "epoch": 1.7094216417910446, "grad_norm": 0.3355640428877015, "learning_rate": 4.030736898085974e-05, "loss": 0.4558, "step": 7330 }, { "epoch": 1.7105876865671643, "grad_norm": 0.3229925108692079, "learning_rate": 4.02930984249912e-05, "loss": 0.4648, "step": 7335 }, { "epoch": 1.7117537313432836, "grad_norm": 0.3339328917463594, "learning_rate": 4.02788202602783e-05, "loss": 0.4693, "step": 7340 }, { "epoch": 1.7129197761194028, "grad_norm": 0.32061837468394516, "learning_rate": 4.026453449521313e-05, "loss": 0.4707, "step": 7345 }, { "epoch": 1.7140858208955225, "grad_norm": 0.3334566989161919, "learning_rate": 4.025024113829233e-05, "loss": 0.4691, "step": 7350 }, { "epoch": 1.7152518656716418, "grad_norm": 0.3117933495011161, "learning_rate": 4.023594019801702e-05, "loss": 0.462, "step": 7355 }, { "epoch": 1.716417910447761, "grad_norm": 0.31444912546892323, "learning_rate": 4.022163168289287e-05, "loss": 0.4534, "step": 7360 }, { "epoch": 1.7175839552238807, "grad_norm": 0.33449115278274755, "learning_rate": 4.020731560143002e-05, "loss": 0.4544, "step": 7365 }, { "epoch": 1.71875, "grad_norm": 0.32373835873579954, "learning_rate": 4.019299196214315e-05, "loss": 0.4833, "step": 7370 }, { "epoch": 1.7199160447761193, "grad_norm": 0.3376817168174418, "learning_rate": 4.017866077355139e-05, "loss": 0.4646, "step": 7375 }, { "epoch": 1.721082089552239, "grad_norm": 0.323632069545875, "learning_rate": 4.016432204417839e-05, "loss": 0.4735, "step": 7380 }, { "epoch": 1.7222481343283582, "grad_norm": 0.3273976532971608, "learning_rate": 4.014997578255227e-05, "loss": 0.4599, "step": 7385 }, { "epoch": 1.7234141791044775, "grad_norm": 0.31173763021502077, "learning_rate": 4.0135621997205654e-05, "loss": 0.4332, "step": 7390 }, { "epoch": 1.7245802238805972, "grad_norm": 0.3418947089076045, "learning_rate": 4.01212606966756e-05, "loss": 0.4583, "step": 7395 }, { "epoch": 1.7257462686567164, "grad_norm": 0.33546667435232946, "learning_rate": 4.010689188950367e-05, "loss": 0.4933, "step": 7400 }, { "epoch": 1.7269123134328357, "grad_norm": 0.3431240770804782, "learning_rate": 4.009251558423588e-05, "loss": 0.4753, "step": 7405 }, { "epoch": 1.7280783582089554, "grad_norm": 0.3333671988156517, "learning_rate": 4.00781317894227e-05, "loss": 0.4683, "step": 7410 }, { "epoch": 1.7292444029850746, "grad_norm": 0.34563156857776767, "learning_rate": 4.006374051361907e-05, "loss": 0.4646, "step": 7415 }, { "epoch": 1.730410447761194, "grad_norm": 0.3190618544335192, "learning_rate": 4.004934176538436e-05, "loss": 0.4568, "step": 7420 }, { "epoch": 1.7315764925373134, "grad_norm": 0.32943852720887506, "learning_rate": 4.0034935553282396e-05, "loss": 0.4719, "step": 7425 }, { "epoch": 1.7327425373134329, "grad_norm": 0.3352232407139199, "learning_rate": 4.002052188588144e-05, "loss": 0.4705, "step": 7430 }, { "epoch": 1.7339085820895521, "grad_norm": 0.36728042330785804, "learning_rate": 4.000610077175419e-05, "loss": 0.4774, "step": 7435 }, { "epoch": 1.7350746268656716, "grad_norm": 0.3469652200713558, "learning_rate": 3.999167221947777e-05, "loss": 0.4574, "step": 7440 }, { "epoch": 1.736240671641791, "grad_norm": 0.3639547566058384, "learning_rate": 3.997723623763372e-05, "loss": 0.47, "step": 7445 }, { "epoch": 1.7374067164179103, "grad_norm": 0.3348528872265908, "learning_rate": 3.9962792834808034e-05, "loss": 0.4826, "step": 7450 }, { "epoch": 1.7385727611940298, "grad_norm": 0.33223482507466934, "learning_rate": 3.9948342019591066e-05, "loss": 0.4547, "step": 7455 }, { "epoch": 1.7397388059701493, "grad_norm": 0.3577356441829403, "learning_rate": 3.993388380057763e-05, "loss": 0.4756, "step": 7460 }, { "epoch": 1.7409048507462686, "grad_norm": 0.3314024852499628, "learning_rate": 3.9919418186366905e-05, "loss": 0.4574, "step": 7465 }, { "epoch": 1.742070895522388, "grad_norm": 0.3065927663958129, "learning_rate": 3.9904945185562484e-05, "loss": 0.4627, "step": 7470 }, { "epoch": 1.7432369402985075, "grad_norm": 0.29873041722286, "learning_rate": 3.989046480677236e-05, "loss": 0.4747, "step": 7475 }, { "epoch": 1.7444029850746268, "grad_norm": 0.3320058004590895, "learning_rate": 3.987597705860891e-05, "loss": 0.461, "step": 7480 }, { "epoch": 1.7455690298507462, "grad_norm": 0.3272721295880745, "learning_rate": 3.986148194968888e-05, "loss": 0.4703, "step": 7485 }, { "epoch": 1.7467350746268657, "grad_norm": 0.3363991185235613, "learning_rate": 3.9846979488633415e-05, "loss": 0.466, "step": 7490 }, { "epoch": 1.747901119402985, "grad_norm": 0.3401648724589398, "learning_rate": 3.9832469684068007e-05, "loss": 0.4508, "step": 7495 }, { "epoch": 1.7490671641791045, "grad_norm": 0.32673520813583895, "learning_rate": 3.9817952544622554e-05, "loss": 0.4343, "step": 7500 }, { "epoch": 1.750233208955224, "grad_norm": 0.33213225396645735, "learning_rate": 3.9803428078931276e-05, "loss": 0.5444, "step": 7505 }, { "epoch": 1.7513992537313432, "grad_norm": 0.3102967200111759, "learning_rate": 3.978889629563277e-05, "loss": 0.4492, "step": 7510 }, { "epoch": 1.7525652985074627, "grad_norm": 0.3292571258229434, "learning_rate": 3.977435720337e-05, "loss": 0.4734, "step": 7515 }, { "epoch": 1.7537313432835822, "grad_norm": 0.3192233513183503, "learning_rate": 3.9759810810790236e-05, "loss": 0.4772, "step": 7520 }, { "epoch": 1.7548973880597014, "grad_norm": 0.36079486925702725, "learning_rate": 3.9745257126545146e-05, "loss": 0.4743, "step": 7525 }, { "epoch": 1.756063432835821, "grad_norm": 0.3504091032321048, "learning_rate": 3.9730696159290656e-05, "loss": 0.4524, "step": 7530 }, { "epoch": 1.7572294776119404, "grad_norm": 0.32653255000606496, "learning_rate": 3.971612791768712e-05, "loss": 0.4497, "step": 7535 }, { "epoch": 1.7583955223880596, "grad_norm": 0.3222864892540727, "learning_rate": 3.970155241039914e-05, "loss": 0.4691, "step": 7540 }, { "epoch": 1.759561567164179, "grad_norm": 0.32822260638527023, "learning_rate": 3.968696964609568e-05, "loss": 0.4605, "step": 7545 }, { "epoch": 1.7607276119402986, "grad_norm": 0.33368821658399866, "learning_rate": 3.967237963345001e-05, "loss": 0.473, "step": 7550 }, { "epoch": 1.7618936567164178, "grad_norm": 0.3441315905330009, "learning_rate": 3.9657782381139696e-05, "loss": 0.4588, "step": 7555 }, { "epoch": 1.7630597014925373, "grad_norm": 0.3386771796818688, "learning_rate": 3.964317789784664e-05, "loss": 0.4783, "step": 7560 }, { "epoch": 1.7642257462686568, "grad_norm": 0.32364843769565194, "learning_rate": 3.962856619225703e-05, "loss": 0.4757, "step": 7565 }, { "epoch": 1.765391791044776, "grad_norm": 0.3142847596998878, "learning_rate": 3.961394727306133e-05, "loss": 0.4478, "step": 7570 }, { "epoch": 1.7665578358208955, "grad_norm": 0.3338652923785012, "learning_rate": 3.9599321148954325e-05, "loss": 0.4484, "step": 7575 }, { "epoch": 1.767723880597015, "grad_norm": 0.31338477996541986, "learning_rate": 3.958468782863508e-05, "loss": 0.4564, "step": 7580 }, { "epoch": 1.7688899253731343, "grad_norm": 0.31103805767415105, "learning_rate": 3.9570047320806916e-05, "loss": 0.4626, "step": 7585 }, { "epoch": 1.7700559701492538, "grad_norm": 0.33954791983778504, "learning_rate": 3.955539963417746e-05, "loss": 0.4544, "step": 7590 }, { "epoch": 1.7712220149253732, "grad_norm": 0.3377370902996918, "learning_rate": 3.954074477745859e-05, "loss": 0.4538, "step": 7595 }, { "epoch": 1.7723880597014925, "grad_norm": 0.33682355542096076, "learning_rate": 3.952608275936644e-05, "loss": 0.4816, "step": 7600 }, { "epoch": 1.773554104477612, "grad_norm": 0.3061278658569396, "learning_rate": 3.9511413588621435e-05, "loss": 0.4405, "step": 7605 }, { "epoch": 1.7747201492537314, "grad_norm": 0.31574220409579423, "learning_rate": 3.949673727394823e-05, "loss": 0.4548, "step": 7610 }, { "epoch": 1.7758861940298507, "grad_norm": 0.3777625297037683, "learning_rate": 3.9482053824075716e-05, "loss": 0.4639, "step": 7615 }, { "epoch": 1.7770522388059702, "grad_norm": 0.3029999766007014, "learning_rate": 3.946736324773707e-05, "loss": 0.4643, "step": 7620 }, { "epoch": 1.7782182835820897, "grad_norm": 0.3284031193787799, "learning_rate": 3.945266555366968e-05, "loss": 0.4519, "step": 7625 }, { "epoch": 1.779384328358209, "grad_norm": 0.33364532903833377, "learning_rate": 3.943796075061517e-05, "loss": 0.4647, "step": 7630 }, { "epoch": 1.7805503731343284, "grad_norm": 0.3077589950827104, "learning_rate": 3.942324884731938e-05, "loss": 0.4577, "step": 7635 }, { "epoch": 1.7817164179104479, "grad_norm": 0.3317835039136894, "learning_rate": 3.940852985253239e-05, "loss": 0.4252, "step": 7640 }, { "epoch": 1.7828824626865671, "grad_norm": 0.32259763132175034, "learning_rate": 3.9393803775008506e-05, "loss": 0.4722, "step": 7645 }, { "epoch": 1.7840485074626866, "grad_norm": 0.3417293168109983, "learning_rate": 3.937907062350622e-05, "loss": 0.4616, "step": 7650 }, { "epoch": 1.785214552238806, "grad_norm": 0.32573572772958925, "learning_rate": 3.9364330406788265e-05, "loss": 0.4643, "step": 7655 }, { "epoch": 1.7863805970149254, "grad_norm": 0.34137362535900706, "learning_rate": 3.9349583133621535e-05, "loss": 0.4605, "step": 7660 }, { "epoch": 1.7875466417910446, "grad_norm": 0.3111180946600651, "learning_rate": 3.933482881277715e-05, "loss": 0.4657, "step": 7665 }, { "epoch": 1.7887126865671643, "grad_norm": 0.3291822142229409, "learning_rate": 3.9320067453030415e-05, "loss": 0.4516, "step": 7670 }, { "epoch": 1.7898787313432836, "grad_norm": 0.34076675027870634, "learning_rate": 3.930529906316083e-05, "loss": 0.4669, "step": 7675 }, { "epoch": 1.7910447761194028, "grad_norm": 0.33220385887575243, "learning_rate": 3.9290523651952046e-05, "loss": 0.4585, "step": 7680 }, { "epoch": 1.7922108208955225, "grad_norm": 0.3597238957873434, "learning_rate": 3.927574122819193e-05, "loss": 0.451, "step": 7685 }, { "epoch": 1.7933768656716418, "grad_norm": 0.30887795215161673, "learning_rate": 3.926095180067249e-05, "loss": 0.453, "step": 7690 }, { "epoch": 1.794542910447761, "grad_norm": 0.3212052662395762, "learning_rate": 3.924615537818992e-05, "loss": 0.4713, "step": 7695 }, { "epoch": 1.7957089552238807, "grad_norm": 0.31984487566276537, "learning_rate": 3.923135196954456e-05, "loss": 0.4872, "step": 7700 }, { "epoch": 1.796875, "grad_norm": 0.32155777756878273, "learning_rate": 3.92165415835409e-05, "loss": 0.4576, "step": 7705 }, { "epoch": 1.7980410447761193, "grad_norm": 0.31698010709092744, "learning_rate": 3.92017242289876e-05, "loss": 0.4618, "step": 7710 }, { "epoch": 1.799207089552239, "grad_norm": 0.3350559563938395, "learning_rate": 3.918689991469746e-05, "loss": 0.451, "step": 7715 }, { "epoch": 1.8003731343283582, "grad_norm": 0.32581123150131286, "learning_rate": 3.9172068649487405e-05, "loss": 0.4473, "step": 7720 }, { "epoch": 1.8015391791044775, "grad_norm": 0.3499490459149918, "learning_rate": 3.91572304421785e-05, "loss": 0.464, "step": 7725 }, { "epoch": 1.8027052238805972, "grad_norm": 0.32822280432312334, "learning_rate": 3.914238530159595e-05, "loss": 0.475, "step": 7730 }, { "epoch": 1.8038712686567164, "grad_norm": 0.31935556956987293, "learning_rate": 3.9127533236569077e-05, "loss": 0.4712, "step": 7735 }, { "epoch": 1.8050373134328357, "grad_norm": 0.34873770528962084, "learning_rate": 3.9112674255931294e-05, "loss": 0.4578, "step": 7740 }, { "epoch": 1.8062033582089554, "grad_norm": 0.3455218187514769, "learning_rate": 3.909780836852019e-05, "loss": 0.4544, "step": 7745 }, { "epoch": 1.8073694029850746, "grad_norm": 0.3292054740020448, "learning_rate": 3.908293558317741e-05, "loss": 0.4512, "step": 7750 }, { "epoch": 1.808535447761194, "grad_norm": 0.3314842049495878, "learning_rate": 3.9068055908748706e-05, "loss": 0.4751, "step": 7755 }, { "epoch": 1.8097014925373134, "grad_norm": 0.30938651566350633, "learning_rate": 3.9053169354083946e-05, "loss": 0.4331, "step": 7760 }, { "epoch": 1.8108675373134329, "grad_norm": 0.32214239674628525, "learning_rate": 3.903827592803708e-05, "loss": 0.4728, "step": 7765 }, { "epoch": 1.8120335820895521, "grad_norm": 0.3043180490770527, "learning_rate": 3.9023375639466156e-05, "loss": 0.4248, "step": 7770 }, { "epoch": 1.8131996268656716, "grad_norm": 0.31143937820829876, "learning_rate": 3.900846849723328e-05, "loss": 0.4555, "step": 7775 }, { "epoch": 1.814365671641791, "grad_norm": 0.30503553696644364, "learning_rate": 3.8993554510204664e-05, "loss": 0.4604, "step": 7780 }, { "epoch": 1.8155317164179103, "grad_norm": 0.3334508226536671, "learning_rate": 3.897863368725056e-05, "loss": 0.4864, "step": 7785 }, { "epoch": 1.8166977611940298, "grad_norm": 0.34169562186157326, "learning_rate": 3.896370603724531e-05, "loss": 0.4521, "step": 7790 }, { "epoch": 1.8178638059701493, "grad_norm": 0.335094114071336, "learning_rate": 3.8948771569067305e-05, "loss": 0.4478, "step": 7795 }, { "epoch": 1.8190298507462686, "grad_norm": 0.31585481265436866, "learning_rate": 3.893383029159899e-05, "loss": 0.4549, "step": 7800 }, { "epoch": 1.820195895522388, "grad_norm": 0.32872682537120884, "learning_rate": 3.891888221372688e-05, "loss": 0.4638, "step": 7805 }, { "epoch": 1.8213619402985075, "grad_norm": 0.33282921132719095, "learning_rate": 3.89039273443415e-05, "loss": 0.4446, "step": 7810 }, { "epoch": 1.8225279850746268, "grad_norm": 0.31069348498645605, "learning_rate": 3.888896569233744e-05, "loss": 0.4558, "step": 7815 }, { "epoch": 1.8236940298507462, "grad_norm": 0.3217055889621255, "learning_rate": 3.887399726661332e-05, "loss": 0.4628, "step": 7820 }, { "epoch": 1.8248600746268657, "grad_norm": 0.33258933344430636, "learning_rate": 3.885902207607178e-05, "loss": 0.4593, "step": 7825 }, { "epoch": 1.826026119402985, "grad_norm": 0.335713408666473, "learning_rate": 3.88440401296195e-05, "loss": 0.4758, "step": 7830 }, { "epoch": 1.8271921641791045, "grad_norm": 0.346449949823291, "learning_rate": 3.8829051436167144e-05, "loss": 0.4683, "step": 7835 }, { "epoch": 1.828358208955224, "grad_norm": 0.33516753990549714, "learning_rate": 3.881405600462943e-05, "loss": 0.4564, "step": 7840 }, { "epoch": 1.8295242537313432, "grad_norm": 0.35138549675984454, "learning_rate": 3.879905384392508e-05, "loss": 0.4679, "step": 7845 }, { "epoch": 1.8306902985074627, "grad_norm": 0.3509271341125894, "learning_rate": 3.8784044962976776e-05, "loss": 0.4537, "step": 7850 }, { "epoch": 1.8318563432835822, "grad_norm": 0.3295852159389941, "learning_rate": 3.8769029370711234e-05, "loss": 0.4704, "step": 7855 }, { "epoch": 1.8330223880597014, "grad_norm": 0.3219920585928317, "learning_rate": 3.8754007076059155e-05, "loss": 0.4721, "step": 7860 }, { "epoch": 1.834188432835821, "grad_norm": 0.32036554569756065, "learning_rate": 3.873897808795522e-05, "loss": 0.463, "step": 7865 }, { "epoch": 1.8353544776119404, "grad_norm": 0.3334846957626944, "learning_rate": 3.8723942415338105e-05, "loss": 0.4729, "step": 7870 }, { "epoch": 1.8365205223880596, "grad_norm": 0.3254709598700866, "learning_rate": 3.870890006715044e-05, "loss": 0.4528, "step": 7875 }, { "epoch": 1.837686567164179, "grad_norm": 0.35499842914666335, "learning_rate": 3.869385105233884e-05, "loss": 0.4649, "step": 7880 }, { "epoch": 1.8388526119402986, "grad_norm": 0.34009809613132064, "learning_rate": 3.867879537985388e-05, "loss": 0.4697, "step": 7885 }, { "epoch": 1.8400186567164178, "grad_norm": 0.3505457636640597, "learning_rate": 3.8663733058650104e-05, "loss": 0.4872, "step": 7890 }, { "epoch": 1.8411847014925373, "grad_norm": 0.31285820850905105, "learning_rate": 3.8648664097686e-05, "loss": 0.439, "step": 7895 }, { "epoch": 1.8423507462686568, "grad_norm": 0.330351773304493, "learning_rate": 3.8633588505924e-05, "loss": 0.473, "step": 7900 }, { "epoch": 1.843516791044776, "grad_norm": 0.3278611547398983, "learning_rate": 3.861850629233051e-05, "loss": 0.4613, "step": 7905 }, { "epoch": 1.8446828358208955, "grad_norm": 0.32658142113658106, "learning_rate": 3.8603417465875816e-05, "loss": 0.4573, "step": 7910 }, { "epoch": 1.845848880597015, "grad_norm": 0.3485881700699174, "learning_rate": 3.858832203553421e-05, "loss": 0.4835, "step": 7915 }, { "epoch": 1.8470149253731343, "grad_norm": 0.3083825708830578, "learning_rate": 3.857322001028385e-05, "loss": 0.4433, "step": 7920 }, { "epoch": 1.8481809701492538, "grad_norm": 0.3503312124519673, "learning_rate": 3.855811139910686e-05, "loss": 0.4799, "step": 7925 }, { "epoch": 1.8493470149253732, "grad_norm": 0.3234996296650612, "learning_rate": 3.854299621098925e-05, "loss": 0.4585, "step": 7930 }, { "epoch": 1.8505130597014925, "grad_norm": 0.3052564357859432, "learning_rate": 3.8527874454920955e-05, "loss": 0.451, "step": 7935 }, { "epoch": 1.851679104477612, "grad_norm": 0.3742671960794288, "learning_rate": 3.851274613989582e-05, "loss": 0.4654, "step": 7940 }, { "epoch": 1.8528451492537314, "grad_norm": 0.3335244961793869, "learning_rate": 3.849761127491158e-05, "loss": 0.4474, "step": 7945 }, { "epoch": 1.8540111940298507, "grad_norm": 0.34729167900291275, "learning_rate": 3.848246986896989e-05, "loss": 0.4757, "step": 7950 }, { "epoch": 1.8551772388059702, "grad_norm": 0.32621784125350106, "learning_rate": 3.8467321931076255e-05, "loss": 0.4375, "step": 7955 }, { "epoch": 1.8563432835820897, "grad_norm": 0.34002031048416315, "learning_rate": 3.84521674702401e-05, "loss": 0.4875, "step": 7960 }, { "epoch": 1.857509328358209, "grad_norm": 0.33018372241070876, "learning_rate": 3.8437006495474716e-05, "loss": 0.451, "step": 7965 }, { "epoch": 1.8586753731343284, "grad_norm": 0.3248254976028379, "learning_rate": 3.8421839015797265e-05, "loss": 0.467, "step": 7970 }, { "epoch": 1.8598414179104479, "grad_norm": 0.31695297868444927, "learning_rate": 3.840666504022879e-05, "loss": 0.471, "step": 7975 }, { "epoch": 1.8610074626865671, "grad_norm": 0.32814182269306824, "learning_rate": 3.839148457779418e-05, "loss": 0.4777, "step": 7980 }, { "epoch": 1.8621735074626866, "grad_norm": 0.3188618300682253, "learning_rate": 3.837629763752219e-05, "loss": 0.4514, "step": 7985 }, { "epoch": 1.863339552238806, "grad_norm": 0.3418727121371228, "learning_rate": 3.8361104228445455e-05, "loss": 0.4954, "step": 7990 }, { "epoch": 1.8645055970149254, "grad_norm": 0.3348980968427248, "learning_rate": 3.834590435960041e-05, "loss": 0.4749, "step": 7995 }, { "epoch": 1.8656716417910446, "grad_norm": 0.3469916419162627, "learning_rate": 3.8330698040027345e-05, "loss": 0.4855, "step": 8000 }, { "epoch": 1.8668376865671643, "grad_norm": 0.3245130980042784, "learning_rate": 3.8315485278770423e-05, "loss": 0.4805, "step": 8005 }, { "epoch": 1.8680037313432836, "grad_norm": 0.32669433732276504, "learning_rate": 3.83002660848776e-05, "loss": 0.4685, "step": 8010 }, { "epoch": 1.8691697761194028, "grad_norm": 0.3371551045394974, "learning_rate": 3.828504046740065e-05, "loss": 0.4686, "step": 8015 }, { "epoch": 1.8703358208955225, "grad_norm": 0.3351214970857238, "learning_rate": 3.826980843539521e-05, "loss": 0.4803, "step": 8020 }, { "epoch": 1.8715018656716418, "grad_norm": 0.30946469782414815, "learning_rate": 3.82545699979207e-05, "loss": 0.439, "step": 8025 }, { "epoch": 1.872667910447761, "grad_norm": 0.33302124613759193, "learning_rate": 3.823932516404036e-05, "loss": 0.4554, "step": 8030 }, { "epoch": 1.8738339552238807, "grad_norm": 0.31750655079024775, "learning_rate": 3.822407394282123e-05, "loss": 0.4473, "step": 8035 }, { "epoch": 1.875, "grad_norm": 0.35069118730521315, "learning_rate": 3.8208816343334156e-05, "loss": 0.4746, "step": 8040 }, { "epoch": 1.8761660447761193, "grad_norm": 0.31257634947661017, "learning_rate": 3.819355237465377e-05, "loss": 0.4536, "step": 8045 }, { "epoch": 1.877332089552239, "grad_norm": 0.3203627093879648, "learning_rate": 3.81782820458585e-05, "loss": 0.4595, "step": 8050 }, { "epoch": 1.8784981343283582, "grad_norm": 0.30534528984636694, "learning_rate": 3.816300536603054e-05, "loss": 0.465, "step": 8055 }, { "epoch": 1.8796641791044775, "grad_norm": 0.3235119118193064, "learning_rate": 3.814772234425588e-05, "loss": 0.4632, "step": 8060 }, { "epoch": 1.8808302238805972, "grad_norm": 0.33739919698858545, "learning_rate": 3.813243298962428e-05, "loss": 0.4538, "step": 8065 }, { "epoch": 1.8819962686567164, "grad_norm": 0.3325386291992744, "learning_rate": 3.8117137311229255e-05, "loss": 0.4593, "step": 8070 }, { "epoch": 1.8831623134328357, "grad_norm": 0.3011728405166508, "learning_rate": 3.81018353181681e-05, "loss": 0.447, "step": 8075 }, { "epoch": 1.8843283582089554, "grad_norm": 0.3212051882269613, "learning_rate": 3.808652701954183e-05, "loss": 0.4721, "step": 8080 }, { "epoch": 1.8854944029850746, "grad_norm": 0.31184448824834854, "learning_rate": 3.807121242445526e-05, "loss": 0.4572, "step": 8085 }, { "epoch": 1.886660447761194, "grad_norm": 0.3264791983669669, "learning_rate": 3.805589154201691e-05, "loss": 0.4543, "step": 8090 }, { "epoch": 1.8878264925373134, "grad_norm": 0.3114795037694442, "learning_rate": 3.804056438133905e-05, "loss": 0.4514, "step": 8095 }, { "epoch": 1.8889925373134329, "grad_norm": 0.3189561537443261, "learning_rate": 3.80252309515377e-05, "loss": 0.453, "step": 8100 }, { "epoch": 1.8901585820895521, "grad_norm": 0.31776350942786574, "learning_rate": 3.800989126173259e-05, "loss": 0.4556, "step": 8105 }, { "epoch": 1.8913246268656716, "grad_norm": 0.32079439958398626, "learning_rate": 3.799454532104718e-05, "loss": 0.4542, "step": 8110 }, { "epoch": 1.892490671641791, "grad_norm": 0.33131265989782804, "learning_rate": 3.7979193138608646e-05, "loss": 0.4609, "step": 8115 }, { "epoch": 1.8936567164179103, "grad_norm": 0.3327910149121253, "learning_rate": 3.7963834723547866e-05, "loss": 0.4765, "step": 8120 }, { "epoch": 1.8948227611940298, "grad_norm": 0.3576971690086045, "learning_rate": 3.794847008499946e-05, "loss": 0.4605, "step": 8125 }, { "epoch": 1.8959888059701493, "grad_norm": 0.31756641923535656, "learning_rate": 3.793309923210171e-05, "loss": 0.458, "step": 8130 }, { "epoch": 1.8971548507462686, "grad_norm": 0.32523492172924273, "learning_rate": 3.791772217399661e-05, "loss": 0.4823, "step": 8135 }, { "epoch": 1.898320895522388, "grad_norm": 0.32383831258621426, "learning_rate": 3.7902338919829854e-05, "loss": 0.4596, "step": 8140 }, { "epoch": 1.8994869402985075, "grad_norm": 0.3391911198250996, "learning_rate": 3.788694947875079e-05, "loss": 0.4632, "step": 8145 }, { "epoch": 1.9006529850746268, "grad_norm": 0.30137706635382955, "learning_rate": 3.78715538599125e-05, "loss": 0.4454, "step": 8150 }, { "epoch": 1.9018190298507462, "grad_norm": 0.3459202673542257, "learning_rate": 3.7856152072471686e-05, "loss": 0.458, "step": 8155 }, { "epoch": 1.9029850746268657, "grad_norm": 0.33169973302248834, "learning_rate": 3.784074412558875e-05, "loss": 0.4385, "step": 8160 }, { "epoch": 1.904151119402985, "grad_norm": 0.34644032807933745, "learning_rate": 3.782533002842773e-05, "loss": 0.4877, "step": 8165 }, { "epoch": 1.9053171641791045, "grad_norm": 0.3368976029041088, "learning_rate": 3.7809909790156355e-05, "loss": 0.4753, "step": 8170 }, { "epoch": 1.906483208955224, "grad_norm": 0.3256484730335472, "learning_rate": 3.7794483419946e-05, "loss": 0.4467, "step": 8175 }, { "epoch": 1.9076492537313432, "grad_norm": 0.3215817140612585, "learning_rate": 3.777905092697166e-05, "loss": 0.4572, "step": 8180 }, { "epoch": 1.9088152985074627, "grad_norm": 0.3383505169421641, "learning_rate": 3.7763612320412e-05, "loss": 0.4629, "step": 8185 }, { "epoch": 1.9099813432835822, "grad_norm": 0.32069100213684426, "learning_rate": 3.77481676094493e-05, "loss": 0.4607, "step": 8190 }, { "epoch": 1.9111473880597014, "grad_norm": 0.32835821049312575, "learning_rate": 3.77327168032695e-05, "loss": 0.4694, "step": 8195 }, { "epoch": 1.912313432835821, "grad_norm": 0.3467709020619544, "learning_rate": 3.771725991106214e-05, "loss": 0.4691, "step": 8200 }, { "epoch": 1.9134794776119404, "grad_norm": 0.3306661408042111, "learning_rate": 3.770179694202038e-05, "loss": 0.4705, "step": 8205 }, { "epoch": 1.9146455223880596, "grad_norm": 0.314401468880452, "learning_rate": 3.7686327905341014e-05, "loss": 0.4563, "step": 8210 }, { "epoch": 1.915811567164179, "grad_norm": 0.31715617006599117, "learning_rate": 3.767085281022441e-05, "loss": 0.465, "step": 8215 }, { "epoch": 1.9169776119402986, "grad_norm": 0.3366229097507516, "learning_rate": 3.765537166587458e-05, "loss": 0.4617, "step": 8220 }, { "epoch": 1.9181436567164178, "grad_norm": 0.31505277674369897, "learning_rate": 3.763988448149912e-05, "loss": 0.4656, "step": 8225 }, { "epoch": 1.9193097014925373, "grad_norm": 0.3136303836644364, "learning_rate": 3.762439126630919e-05, "loss": 0.4658, "step": 8230 }, { "epoch": 1.9204757462686568, "grad_norm": 0.30999053723638365, "learning_rate": 3.7608892029519576e-05, "loss": 0.4512, "step": 8235 }, { "epoch": 1.921641791044776, "grad_norm": 0.328613573645389, "learning_rate": 3.7593386780348625e-05, "loss": 0.469, "step": 8240 }, { "epoch": 1.9228078358208955, "grad_norm": 0.36358357041271927, "learning_rate": 3.757787552801827e-05, "loss": 0.464, "step": 8245 }, { "epoch": 1.923973880597015, "grad_norm": 0.3465509791167903, "learning_rate": 3.756235828175401e-05, "loss": 0.4683, "step": 8250 }, { "epoch": 1.9251399253731343, "grad_norm": 0.3786901022074795, "learning_rate": 3.75468350507849e-05, "loss": 0.4672, "step": 8255 }, { "epoch": 1.9263059701492538, "grad_norm": 0.3385674775790514, "learning_rate": 3.753130584434357e-05, "loss": 0.4714, "step": 8260 }, { "epoch": 1.9274720149253732, "grad_norm": 0.3252345512535215, "learning_rate": 3.7515770671666175e-05, "loss": 0.4545, "step": 8265 }, { "epoch": 1.9286380597014925, "grad_norm": 0.33098644442489994, "learning_rate": 3.750022954199248e-05, "loss": 0.4873, "step": 8270 }, { "epoch": 1.929804104477612, "grad_norm": 0.33705559723146444, "learning_rate": 3.748468246456572e-05, "loss": 0.4582, "step": 8275 }, { "epoch": 1.9309701492537314, "grad_norm": 0.3312102446660308, "learning_rate": 3.7469129448632704e-05, "loss": 0.4584, "step": 8280 }, { "epoch": 1.9321361940298507, "grad_norm": 0.2985887156790144, "learning_rate": 3.7453570503443785e-05, "loss": 0.4569, "step": 8285 }, { "epoch": 1.9333022388059702, "grad_norm": 0.31981170350527494, "learning_rate": 3.743800563825283e-05, "loss": 0.4751, "step": 8290 }, { "epoch": 1.9344682835820897, "grad_norm": 0.31161099831445976, "learning_rate": 3.742243486231719e-05, "loss": 0.4641, "step": 8295 }, { "epoch": 1.935634328358209, "grad_norm": 0.3093342472847783, "learning_rate": 3.74068581848978e-05, "loss": 0.4694, "step": 8300 }, { "epoch": 1.9368003731343284, "grad_norm": 0.34209842644228594, "learning_rate": 3.7391275615259065e-05, "loss": 0.4671, "step": 8305 }, { "epoch": 1.9379664179104479, "grad_norm": 0.31631473113092634, "learning_rate": 3.737568716266888e-05, "loss": 0.4518, "step": 8310 }, { "epoch": 1.9391324626865671, "grad_norm": 0.31559245916026246, "learning_rate": 3.7360092836398686e-05, "loss": 0.458, "step": 8315 }, { "epoch": 1.9402985074626866, "grad_norm": 0.3018699419745661, "learning_rate": 3.734449264572336e-05, "loss": 0.4533, "step": 8320 }, { "epoch": 1.941464552238806, "grad_norm": 0.3242303966557626, "learning_rate": 3.7328886599921327e-05, "loss": 0.4561, "step": 8325 }, { "epoch": 1.9426305970149254, "grad_norm": 0.32202208206224575, "learning_rate": 3.7313274708274445e-05, "loss": 0.4694, "step": 8330 }, { "epoch": 1.9437966417910446, "grad_norm": 0.30990782532343536, "learning_rate": 3.729765698006808e-05, "loss": 0.4521, "step": 8335 }, { "epoch": 1.9449626865671643, "grad_norm": 0.3048640975793819, "learning_rate": 3.7282033424591043e-05, "loss": 0.4341, "step": 8340 }, { "epoch": 1.9461287313432836, "grad_norm": 0.3368657401620674, "learning_rate": 3.726640405113564e-05, "loss": 0.4547, "step": 8345 }, { "epoch": 1.9472947761194028, "grad_norm": 0.3099933955455299, "learning_rate": 3.725076886899763e-05, "loss": 0.4381, "step": 8350 }, { "epoch": 1.9484608208955225, "grad_norm": 0.3401686714270818, "learning_rate": 3.723512788747619e-05, "loss": 0.478, "step": 8355 }, { "epoch": 1.9496268656716418, "grad_norm": 0.33096268165623416, "learning_rate": 3.721948111587399e-05, "loss": 0.5073, "step": 8360 }, { "epoch": 1.950792910447761, "grad_norm": 0.3164863205631471, "learning_rate": 3.720382856349715e-05, "loss": 0.4411, "step": 8365 }, { "epoch": 1.9519589552238807, "grad_norm": 0.31624211657132456, "learning_rate": 3.718817023965519e-05, "loss": 0.4354, "step": 8370 }, { "epoch": 1.953125, "grad_norm": 0.33248567097779913, "learning_rate": 3.717250615366108e-05, "loss": 0.4831, "step": 8375 }, { "epoch": 1.9542910447761193, "grad_norm": 0.3046710717227166, "learning_rate": 3.715683631483121e-05, "loss": 0.4536, "step": 8380 }, { "epoch": 1.955457089552239, "grad_norm": 0.3083780305785333, "learning_rate": 3.714116073248542e-05, "loss": 0.4339, "step": 8385 }, { "epoch": 1.9566231343283582, "grad_norm": 0.33455165027824996, "learning_rate": 3.712547941594693e-05, "loss": 0.4683, "step": 8390 }, { "epoch": 1.9577891791044775, "grad_norm": 0.3426536588315946, "learning_rate": 3.71097923745424e-05, "loss": 0.4651, "step": 8395 }, { "epoch": 1.9589552238805972, "grad_norm": 0.3129487990268488, "learning_rate": 3.709409961760186e-05, "loss": 0.4696, "step": 8400 }, { "epoch": 1.9601212686567164, "grad_norm": 0.33330989701845404, "learning_rate": 3.707840115445877e-05, "loss": 0.4613, "step": 8405 }, { "epoch": 1.9612873134328357, "grad_norm": 0.31936463039977786, "learning_rate": 3.706269699444998e-05, "loss": 0.4529, "step": 8410 }, { "epoch": 1.9624533582089554, "grad_norm": 0.32365100398634133, "learning_rate": 3.704698714691572e-05, "loss": 0.4574, "step": 8415 }, { "epoch": 1.9636194029850746, "grad_norm": 0.3401748878620013, "learning_rate": 3.703127162119959e-05, "loss": 0.4662, "step": 8420 }, { "epoch": 1.964785447761194, "grad_norm": 0.296258052389048, "learning_rate": 3.701555042664861e-05, "loss": 0.4387, "step": 8425 }, { "epoch": 1.9659514925373134, "grad_norm": 0.3487274914463436, "learning_rate": 3.699982357261312e-05, "loss": 0.4607, "step": 8430 }, { "epoch": 1.9671175373134329, "grad_norm": 0.32753681353725883, "learning_rate": 3.6984091068446855e-05, "loss": 0.4655, "step": 8435 }, { "epoch": 1.9682835820895521, "grad_norm": 0.3257070361701552, "learning_rate": 3.69683529235069e-05, "loss": 0.4757, "step": 8440 }, { "epoch": 1.9694496268656716, "grad_norm": 0.350676554662175, "learning_rate": 3.695260914715372e-05, "loss": 0.4673, "step": 8445 }, { "epoch": 1.970615671641791, "grad_norm": 0.33132191973635894, "learning_rate": 3.693685974875109e-05, "loss": 0.4778, "step": 8450 }, { "epoch": 1.9717817164179103, "grad_norm": 0.3166516294323465, "learning_rate": 3.692110473766616e-05, "loss": 0.4779, "step": 8455 }, { "epoch": 1.9729477611940298, "grad_norm": 0.3262098693791917, "learning_rate": 3.69053441232694e-05, "loss": 0.4403, "step": 8460 }, { "epoch": 1.9741138059701493, "grad_norm": 0.35852094125486883, "learning_rate": 3.688957791493462e-05, "loss": 0.4638, "step": 8465 }, { "epoch": 1.9752798507462686, "grad_norm": 0.33695176178664865, "learning_rate": 3.6873806122038964e-05, "loss": 0.4524, "step": 8470 }, { "epoch": 1.976445895522388, "grad_norm": 0.33967072057618775, "learning_rate": 3.685802875396287e-05, "loss": 0.4879, "step": 8475 }, { "epoch": 1.9776119402985075, "grad_norm": 0.3230470335042884, "learning_rate": 3.684224582009014e-05, "loss": 0.4743, "step": 8480 }, { "epoch": 1.9787779850746268, "grad_norm": 0.32611382927003435, "learning_rate": 3.682645732980783e-05, "loss": 0.4547, "step": 8485 }, { "epoch": 1.9799440298507462, "grad_norm": 0.32027423075272554, "learning_rate": 3.6810663292506344e-05, "loss": 0.4576, "step": 8490 }, { "epoch": 1.9811100746268657, "grad_norm": 0.2998864123772078, "learning_rate": 3.6794863717579365e-05, "loss": 0.456, "step": 8495 }, { "epoch": 1.982276119402985, "grad_norm": 0.3296524390065207, "learning_rate": 3.677905861442387e-05, "loss": 0.4539, "step": 8500 }, { "epoch": 1.9834421641791045, "grad_norm": 0.320024303912779, "learning_rate": 3.676324799244014e-05, "loss": 0.4651, "step": 8505 }, { "epoch": 1.984608208955224, "grad_norm": 0.3233521363547366, "learning_rate": 3.6747431861031716e-05, "loss": 0.4612, "step": 8510 }, { "epoch": 1.9857742537313432, "grad_norm": 0.3382237291427512, "learning_rate": 3.673161022960544e-05, "loss": 0.4683, "step": 8515 }, { "epoch": 1.9869402985074627, "grad_norm": 0.32855571390165056, "learning_rate": 3.67157831075714e-05, "loss": 0.4528, "step": 8520 }, { "epoch": 1.9881063432835822, "grad_norm": 0.3331779819433144, "learning_rate": 3.6699950504342954e-05, "loss": 0.4485, "step": 8525 }, { "epoch": 1.9892723880597014, "grad_norm": 0.3056847338421327, "learning_rate": 3.6684112429336745e-05, "loss": 0.4758, "step": 8530 }, { "epoch": 1.990438432835821, "grad_norm": 0.32855409179827705, "learning_rate": 3.666826889197265e-05, "loss": 0.445, "step": 8535 }, { "epoch": 1.9916044776119404, "grad_norm": 0.3757389752274244, "learning_rate": 3.665241990167378e-05, "loss": 0.4429, "step": 8540 }, { "epoch": 1.9927705223880596, "grad_norm": 0.3212644203007692, "learning_rate": 3.663656546786653e-05, "loss": 0.4668, "step": 8545 }, { "epoch": 1.993936567164179, "grad_norm": 0.3055540187125843, "learning_rate": 3.6620705599980494e-05, "loss": 0.4392, "step": 8550 }, { "epoch": 1.9951026119402986, "grad_norm": 0.3111057920694736, "learning_rate": 3.660484030744852e-05, "loss": 0.4177, "step": 8555 }, { "epoch": 1.9962686567164178, "grad_norm": 0.3085390728261077, "learning_rate": 3.6588969599706665e-05, "loss": 0.4631, "step": 8560 }, { "epoch": 1.9974347014925373, "grad_norm": 0.3169719272451861, "learning_rate": 3.6573093486194226e-05, "loss": 0.46, "step": 8565 }, { "epoch": 1.9986007462686568, "grad_norm": 0.3489380750224144, "learning_rate": 3.655721197635371e-05, "loss": 0.4561, "step": 8570 }, { "epoch": 1.999766791044776, "grad_norm": 0.3167210678106308, "learning_rate": 3.654132507963083e-05, "loss": 0.4562, "step": 8575 }, { "epoch": 2.0009328358208953, "grad_norm": 0.3145100839078455, "learning_rate": 3.652543280547449e-05, "loss": 0.402, "step": 8580 }, { "epoch": 2.002098880597015, "grad_norm": 0.3739251067181522, "learning_rate": 3.650953516333682e-05, "loss": 0.398, "step": 8585 }, { "epoch": 2.0032649253731343, "grad_norm": 0.33030851305774417, "learning_rate": 3.6493632162673125e-05, "loss": 0.388, "step": 8590 }, { "epoch": 2.0044309701492535, "grad_norm": 0.32436345472003664, "learning_rate": 3.647772381294189e-05, "loss": 0.4005, "step": 8595 }, { "epoch": 2.0055970149253732, "grad_norm": 0.3344649766185474, "learning_rate": 3.6461810123604805e-05, "loss": 0.3856, "step": 8600 }, { "epoch": 2.0067630597014925, "grad_norm": 0.33728424503869586, "learning_rate": 3.6445891104126714e-05, "loss": 0.4365, "step": 8605 }, { "epoch": 2.0079291044776117, "grad_norm": 0.3404316093009153, "learning_rate": 3.6429966763975636e-05, "loss": 0.3964, "step": 8610 }, { "epoch": 2.0090951492537314, "grad_norm": 0.3378580339271899, "learning_rate": 3.641403711262277e-05, "loss": 0.3996, "step": 8615 }, { "epoch": 2.0102611940298507, "grad_norm": 0.34431386908644757, "learning_rate": 3.639810215954245e-05, "loss": 0.4013, "step": 8620 }, { "epoch": 2.01142723880597, "grad_norm": 0.3435701876332849, "learning_rate": 3.638216191421218e-05, "loss": 0.3887, "step": 8625 }, { "epoch": 2.0125932835820897, "grad_norm": 0.3378758988158323, "learning_rate": 3.6366216386112605e-05, "loss": 0.398, "step": 8630 }, { "epoch": 2.013759328358209, "grad_norm": 0.32610020212555607, "learning_rate": 3.635026558472752e-05, "loss": 0.4021, "step": 8635 }, { "epoch": 2.014925373134328, "grad_norm": 0.3307637385854097, "learning_rate": 3.633430951954383e-05, "loss": 0.385, "step": 8640 }, { "epoch": 2.016091417910448, "grad_norm": 0.32575664349785305, "learning_rate": 3.631834820005163e-05, "loss": 0.3881, "step": 8645 }, { "epoch": 2.017257462686567, "grad_norm": 0.31862392750078533, "learning_rate": 3.6302381635744056e-05, "loss": 0.3739, "step": 8650 }, { "epoch": 2.0184235074626864, "grad_norm": 0.3541599450516158, "learning_rate": 3.628640983611744e-05, "loss": 0.3957, "step": 8655 }, { "epoch": 2.019589552238806, "grad_norm": 0.4047590068267803, "learning_rate": 3.6270432810671176e-05, "loss": 0.4079, "step": 8660 }, { "epoch": 2.0207555970149254, "grad_norm": 0.32493835343321636, "learning_rate": 3.62544505689078e-05, "loss": 0.3984, "step": 8665 }, { "epoch": 2.0219216417910446, "grad_norm": 0.34672647439953735, "learning_rate": 3.623846312033294e-05, "loss": 0.3915, "step": 8670 }, { "epoch": 2.0230876865671643, "grad_norm": 0.3404090139625984, "learning_rate": 3.622247047445529e-05, "loss": 0.4031, "step": 8675 }, { "epoch": 2.0242537313432836, "grad_norm": 0.36206886274537775, "learning_rate": 3.6206472640786696e-05, "loss": 0.4331, "step": 8680 }, { "epoch": 2.025419776119403, "grad_norm": 0.3136468361598117, "learning_rate": 3.619046962884204e-05, "loss": 0.3877, "step": 8685 }, { "epoch": 2.0265858208955225, "grad_norm": 0.3299506757137787, "learning_rate": 3.617446144813929e-05, "loss": 0.3894, "step": 8690 }, { "epoch": 2.027751865671642, "grad_norm": 0.359656032726551, "learning_rate": 3.6158448108199515e-05, "loss": 0.3964, "step": 8695 }, { "epoch": 2.028917910447761, "grad_norm": 0.3372126161755083, "learning_rate": 3.614242961854683e-05, "loss": 0.3876, "step": 8700 }, { "epoch": 2.0300839552238807, "grad_norm": 0.37800743601174797, "learning_rate": 3.6126405988708424e-05, "loss": 0.3934, "step": 8705 }, { "epoch": 2.03125, "grad_norm": 0.3296460466656033, "learning_rate": 3.611037722821452e-05, "loss": 0.3919, "step": 8710 }, { "epoch": 2.0324160447761193, "grad_norm": 0.3577029577400965, "learning_rate": 3.609434334659842e-05, "loss": 0.387, "step": 8715 }, { "epoch": 2.033582089552239, "grad_norm": 0.33699339824825153, "learning_rate": 3.607830435339648e-05, "loss": 0.4078, "step": 8720 }, { "epoch": 2.034748134328358, "grad_norm": 0.37817467842658975, "learning_rate": 3.606226025814805e-05, "loss": 0.4156, "step": 8725 }, { "epoch": 2.0359141791044775, "grad_norm": 0.39827449754645217, "learning_rate": 3.604621107039555e-05, "loss": 0.3958, "step": 8730 }, { "epoch": 2.037080223880597, "grad_norm": 0.33064611675295624, "learning_rate": 3.6030156799684435e-05, "loss": 0.4066, "step": 8735 }, { "epoch": 2.0382462686567164, "grad_norm": 0.34358252515801474, "learning_rate": 3.601409745556315e-05, "loss": 0.4023, "step": 8740 }, { "epoch": 2.0394123134328357, "grad_norm": 0.3461697819524935, "learning_rate": 3.5998033047583194e-05, "loss": 0.3912, "step": 8745 }, { "epoch": 2.0405783582089554, "grad_norm": 0.334638657383683, "learning_rate": 3.598196358529906e-05, "loss": 0.3813, "step": 8750 }, { "epoch": 2.0417444029850746, "grad_norm": 0.3421329198507935, "learning_rate": 3.596588907826824e-05, "loss": 0.3862, "step": 8755 }, { "epoch": 2.042910447761194, "grad_norm": 0.34729731676881953, "learning_rate": 3.5949809536051235e-05, "loss": 0.4017, "step": 8760 }, { "epoch": 2.0440764925373136, "grad_norm": 0.32402886143278264, "learning_rate": 3.593372496821154e-05, "loss": 0.3999, "step": 8765 }, { "epoch": 2.045242537313433, "grad_norm": 0.3485694446078643, "learning_rate": 3.591763538431563e-05, "loss": 0.3875, "step": 8770 }, { "epoch": 2.046408582089552, "grad_norm": 0.3341445435700491, "learning_rate": 3.5901540793933e-05, "loss": 0.3863, "step": 8775 }, { "epoch": 2.047574626865672, "grad_norm": 0.34310854212308417, "learning_rate": 3.5885441206636065e-05, "loss": 0.3893, "step": 8780 }, { "epoch": 2.048740671641791, "grad_norm": 0.3595784233067295, "learning_rate": 3.586933663200026e-05, "loss": 0.4177, "step": 8785 }, { "epoch": 2.0499067164179103, "grad_norm": 0.3301877722080081, "learning_rate": 3.585322707960397e-05, "loss": 0.3946, "step": 8790 }, { "epoch": 2.05107276119403, "grad_norm": 0.32064347002971455, "learning_rate": 3.583711255902853e-05, "loss": 0.3778, "step": 8795 }, { "epoch": 2.0522388059701493, "grad_norm": 0.36183342848231537, "learning_rate": 3.5820993079858235e-05, "loss": 0.4026, "step": 8800 }, { "epoch": 2.0534048507462686, "grad_norm": 0.40243763729351273, "learning_rate": 3.580486865168034e-05, "loss": 0.4075, "step": 8805 }, { "epoch": 2.0545708955223883, "grad_norm": 0.3356995650904215, "learning_rate": 3.5788739284085044e-05, "loss": 0.4015, "step": 8810 }, { "epoch": 2.0557369402985075, "grad_norm": 0.35874221046646215, "learning_rate": 3.577260498666546e-05, "loss": 0.4047, "step": 8815 }, { "epoch": 2.0569029850746268, "grad_norm": 0.32224871797620713, "learning_rate": 3.575646576901767e-05, "loss": 0.393, "step": 8820 }, { "epoch": 2.0580690298507465, "grad_norm": 0.3144167366671361, "learning_rate": 3.5740321640740646e-05, "loss": 0.3954, "step": 8825 }, { "epoch": 2.0592350746268657, "grad_norm": 0.36127999507923353, "learning_rate": 3.57241726114363e-05, "loss": 0.4041, "step": 8830 }, { "epoch": 2.060401119402985, "grad_norm": 0.3582324707065836, "learning_rate": 3.570801869070945e-05, "loss": 0.4069, "step": 8835 }, { "epoch": 2.0615671641791047, "grad_norm": 0.3152933437571003, "learning_rate": 3.5691859888167846e-05, "loss": 0.3726, "step": 8840 }, { "epoch": 2.062733208955224, "grad_norm": 0.34062557594789267, "learning_rate": 3.5675696213422105e-05, "loss": 0.3802, "step": 8845 }, { "epoch": 2.063899253731343, "grad_norm": 0.34342544413829695, "learning_rate": 3.5659527676085774e-05, "loss": 0.3866, "step": 8850 }, { "epoch": 2.065065298507463, "grad_norm": 0.33213617451648014, "learning_rate": 3.564335428577526e-05, "loss": 0.4064, "step": 8855 }, { "epoch": 2.066231343283582, "grad_norm": 0.31122497710003183, "learning_rate": 3.56271760521099e-05, "loss": 0.38, "step": 8860 }, { "epoch": 2.0673973880597014, "grad_norm": 0.3247035766512988, "learning_rate": 3.561099298471187e-05, "loss": 0.393, "step": 8865 }, { "epoch": 2.0685634328358207, "grad_norm": 0.35980760216477076, "learning_rate": 3.559480509320625e-05, "loss": 0.4043, "step": 8870 }, { "epoch": 2.0697294776119404, "grad_norm": 0.345505798287271, "learning_rate": 3.557861238722097e-05, "loss": 0.3766, "step": 8875 }, { "epoch": 2.0708955223880596, "grad_norm": 0.33397575478315844, "learning_rate": 3.556241487638682e-05, "loss": 0.4005, "step": 8880 }, { "epoch": 2.0720615671641793, "grad_norm": 0.34144564488313844, "learning_rate": 3.554621257033749e-05, "loss": 0.4157, "step": 8885 }, { "epoch": 2.0732276119402986, "grad_norm": 0.3425041520164869, "learning_rate": 3.5530005478709446e-05, "loss": 0.4074, "step": 8890 }, { "epoch": 2.074393656716418, "grad_norm": 0.3324000766783016, "learning_rate": 3.551379361114209e-05, "loss": 0.3976, "step": 8895 }, { "epoch": 2.075559701492537, "grad_norm": 0.3336250221139361, "learning_rate": 3.549757697727759e-05, "loss": 0.3874, "step": 8900 }, { "epoch": 2.076725746268657, "grad_norm": 0.3262259512746147, "learning_rate": 3.548135558676098e-05, "loss": 0.3876, "step": 8905 }, { "epoch": 2.077891791044776, "grad_norm": 0.353796191706053, "learning_rate": 3.546512944924014e-05, "loss": 0.4093, "step": 8910 }, { "epoch": 2.0790578358208953, "grad_norm": 0.33076643233497205, "learning_rate": 3.544889857436573e-05, "loss": 0.3884, "step": 8915 }, { "epoch": 2.080223880597015, "grad_norm": 0.3392843554673036, "learning_rate": 3.5432662971791264e-05, "loss": 0.3999, "step": 8920 }, { "epoch": 2.0813899253731343, "grad_norm": 0.351654409620618, "learning_rate": 3.541642265117306e-05, "loss": 0.4009, "step": 8925 }, { "epoch": 2.0825559701492535, "grad_norm": 0.323086059014892, "learning_rate": 3.540017762217023e-05, "loss": 0.3788, "step": 8930 }, { "epoch": 2.0837220149253732, "grad_norm": 0.34201271685018747, "learning_rate": 3.5383927894444694e-05, "loss": 0.3927, "step": 8935 }, { "epoch": 2.0848880597014925, "grad_norm": 0.3164595992648572, "learning_rate": 3.5367673477661174e-05, "loss": 0.3979, "step": 8940 }, { "epoch": 2.0860541044776117, "grad_norm": 0.353067105386722, "learning_rate": 3.535141438148717e-05, "loss": 0.4043, "step": 8945 }, { "epoch": 2.0872201492537314, "grad_norm": 0.32638760846277376, "learning_rate": 3.533515061559297e-05, "loss": 0.3935, "step": 8950 }, { "epoch": 2.0883861940298507, "grad_norm": 0.33745419508434843, "learning_rate": 3.5318882189651635e-05, "loss": 0.3981, "step": 8955 }, { "epoch": 2.08955223880597, "grad_norm": 0.3363813168728781, "learning_rate": 3.5302609113339e-05, "loss": 0.3857, "step": 8960 }, { "epoch": 2.0907182835820897, "grad_norm": 0.3505872812787426, "learning_rate": 3.5286331396333675e-05, "loss": 0.3998, "step": 8965 }, { "epoch": 2.091884328358209, "grad_norm": 0.3560302700326745, "learning_rate": 3.5270049048317016e-05, "loss": 0.4038, "step": 8970 }, { "epoch": 2.093050373134328, "grad_norm": 0.3296137319524784, "learning_rate": 3.525376207897314e-05, "loss": 0.3855, "step": 8975 }, { "epoch": 2.094216417910448, "grad_norm": 0.3257709438873327, "learning_rate": 3.5237470497988905e-05, "loss": 0.3851, "step": 8980 }, { "epoch": 2.095382462686567, "grad_norm": 0.30911319531279924, "learning_rate": 3.5221174315053935e-05, "loss": 0.3838, "step": 8985 }, { "epoch": 2.0965485074626864, "grad_norm": 0.32163696411856363, "learning_rate": 3.520487353986056e-05, "loss": 0.3876, "step": 8990 }, { "epoch": 2.097714552238806, "grad_norm": 0.33223213332866475, "learning_rate": 3.518856818210387e-05, "loss": 0.3889, "step": 8995 }, { "epoch": 2.0988805970149254, "grad_norm": 0.3368633401865806, "learning_rate": 3.517225825148164e-05, "loss": 0.4188, "step": 9000 }, { "epoch": 2.1000466417910446, "grad_norm": 0.36498126923311375, "learning_rate": 3.515594375769442e-05, "loss": 0.4094, "step": 9005 }, { "epoch": 2.1012126865671643, "grad_norm": 0.3357021014397493, "learning_rate": 3.513962471044543e-05, "loss": 0.4131, "step": 9010 }, { "epoch": 2.1023787313432836, "grad_norm": 0.3413655532634832, "learning_rate": 3.512330111944062e-05, "loss": 0.4029, "step": 9015 }, { "epoch": 2.103544776119403, "grad_norm": 0.35141018831844995, "learning_rate": 3.510697299438864e-05, "loss": 0.3926, "step": 9020 }, { "epoch": 2.1047108208955225, "grad_norm": 0.3694729663325888, "learning_rate": 3.509064034500082e-05, "loss": 0.4102, "step": 9025 }, { "epoch": 2.105876865671642, "grad_norm": 0.33303225255011376, "learning_rate": 3.50743031809912e-05, "loss": 0.4045, "step": 9030 }, { "epoch": 2.107042910447761, "grad_norm": 0.3563933543982995, "learning_rate": 3.505796151207651e-05, "loss": 0.4373, "step": 9035 }, { "epoch": 2.1082089552238807, "grad_norm": 0.33690360102280603, "learning_rate": 3.504161534797612e-05, "loss": 0.3864, "step": 9040 }, { "epoch": 2.109375, "grad_norm": 0.3138880897274856, "learning_rate": 3.5025264698412126e-05, "loss": 0.4054, "step": 9045 }, { "epoch": 2.1105410447761193, "grad_norm": 0.359526925655807, "learning_rate": 3.500890957310926e-05, "loss": 0.4184, "step": 9050 }, { "epoch": 2.111707089552239, "grad_norm": 0.3357611988379197, "learning_rate": 3.4992549981794915e-05, "loss": 0.3932, "step": 9055 }, { "epoch": 2.112873134328358, "grad_norm": 0.3585510708956351, "learning_rate": 3.497618593419916e-05, "loss": 0.397, "step": 9060 }, { "epoch": 2.1140391791044775, "grad_norm": 0.35707551160540557, "learning_rate": 3.495981744005471e-05, "loss": 0.3925, "step": 9065 }, { "epoch": 2.115205223880597, "grad_norm": 0.3460622434339384, "learning_rate": 3.494344450909689e-05, "loss": 0.4276, "step": 9070 }, { "epoch": 2.1163712686567164, "grad_norm": 0.3354501658692491, "learning_rate": 3.492706715106372e-05, "loss": 0.3884, "step": 9075 }, { "epoch": 2.1175373134328357, "grad_norm": 0.33888650995437664, "learning_rate": 3.491068537569581e-05, "loss": 0.404, "step": 9080 }, { "epoch": 2.1187033582089554, "grad_norm": 0.3515085575344124, "learning_rate": 3.489429919273642e-05, "loss": 0.3982, "step": 9085 }, { "epoch": 2.1198694029850746, "grad_norm": 0.3394430153326523, "learning_rate": 3.4877908611931406e-05, "loss": 0.4146, "step": 9090 }, { "epoch": 2.121035447761194, "grad_norm": 0.32224241041683155, "learning_rate": 3.486151364302928e-05, "loss": 0.3914, "step": 9095 }, { "epoch": 2.1222014925373136, "grad_norm": 0.3755355862884108, "learning_rate": 3.484511429578113e-05, "loss": 0.3956, "step": 9100 }, { "epoch": 2.123367537313433, "grad_norm": 0.3395061826382033, "learning_rate": 3.482871057994065e-05, "loss": 0.3895, "step": 9105 }, { "epoch": 2.124533582089552, "grad_norm": 0.3187421326522484, "learning_rate": 3.481230250526416e-05, "loss": 0.395, "step": 9110 }, { "epoch": 2.125699626865672, "grad_norm": 0.32836308004681614, "learning_rate": 3.479589008151054e-05, "loss": 0.3867, "step": 9115 }, { "epoch": 2.126865671641791, "grad_norm": 0.36740210947329216, "learning_rate": 3.477947331844127e-05, "loss": 0.4013, "step": 9120 }, { "epoch": 2.1280317164179103, "grad_norm": 0.34743179920168893, "learning_rate": 3.476305222582042e-05, "loss": 0.3923, "step": 9125 }, { "epoch": 2.12919776119403, "grad_norm": 0.35436895456317047, "learning_rate": 3.4746626813414624e-05, "loss": 0.406, "step": 9130 }, { "epoch": 2.1303638059701493, "grad_norm": 0.33055851137695413, "learning_rate": 3.4730197090993084e-05, "loss": 0.3835, "step": 9135 }, { "epoch": 2.1315298507462686, "grad_norm": 0.32502673775303126, "learning_rate": 3.471376306832756e-05, "loss": 0.3786, "step": 9140 }, { "epoch": 2.1326958955223883, "grad_norm": 0.33239252261803837, "learning_rate": 3.4697324755192387e-05, "loss": 0.3877, "step": 9145 }, { "epoch": 2.1338619402985075, "grad_norm": 0.3717027978325473, "learning_rate": 3.468088216136445e-05, "loss": 0.413, "step": 9150 }, { "epoch": 2.1350279850746268, "grad_norm": 0.3536399748722503, "learning_rate": 3.466443529662317e-05, "loss": 0.4027, "step": 9155 }, { "epoch": 2.1361940298507465, "grad_norm": 0.33164638266796576, "learning_rate": 3.4647984170750506e-05, "loss": 0.3925, "step": 9160 }, { "epoch": 2.1373600746268657, "grad_norm": 0.351036727625949, "learning_rate": 3.463152879353097e-05, "loss": 0.4011, "step": 9165 }, { "epoch": 2.138526119402985, "grad_norm": 0.3344343878280186, "learning_rate": 3.4615069174751566e-05, "loss": 0.3936, "step": 9170 }, { "epoch": 2.1396921641791047, "grad_norm": 0.3284585539540017, "learning_rate": 3.459860532420186e-05, "loss": 0.3984, "step": 9175 }, { "epoch": 2.140858208955224, "grad_norm": 0.32979577914435176, "learning_rate": 3.4582137251673916e-05, "loss": 0.3819, "step": 9180 }, { "epoch": 2.142024253731343, "grad_norm": 0.346927762213058, "learning_rate": 3.456566496696232e-05, "loss": 0.406, "step": 9185 }, { "epoch": 2.143190298507463, "grad_norm": 0.3234394629275177, "learning_rate": 3.454918847986414e-05, "loss": 0.3878, "step": 9190 }, { "epoch": 2.144356343283582, "grad_norm": 0.3235265627151897, "learning_rate": 3.453270780017897e-05, "loss": 0.4144, "step": 9195 }, { "epoch": 2.1455223880597014, "grad_norm": 0.3508045257734673, "learning_rate": 3.451622293770889e-05, "loss": 0.4115, "step": 9200 }, { "epoch": 2.1466884328358207, "grad_norm": 0.3212937139834875, "learning_rate": 3.4499733902258446e-05, "loss": 0.3905, "step": 9205 }, { "epoch": 2.1478544776119404, "grad_norm": 0.34576430727001417, "learning_rate": 3.448324070363469e-05, "loss": 0.3993, "step": 9210 }, { "epoch": 2.1490205223880596, "grad_norm": 0.3617695153550067, "learning_rate": 3.446674335164716e-05, "loss": 0.4012, "step": 9215 }, { "epoch": 2.1501865671641793, "grad_norm": 0.33404261776562705, "learning_rate": 3.445024185610783e-05, "loss": 0.3844, "step": 9220 }, { "epoch": 2.1513526119402986, "grad_norm": 0.32591207490785173, "learning_rate": 3.443373622683116e-05, "loss": 0.399, "step": 9225 }, { "epoch": 2.152518656716418, "grad_norm": 0.344817492879971, "learning_rate": 3.441722647363408e-05, "loss": 0.4041, "step": 9230 }, { "epoch": 2.153684701492537, "grad_norm": 0.36815130827464715, "learning_rate": 3.440071260633594e-05, "loss": 0.4036, "step": 9235 }, { "epoch": 2.154850746268657, "grad_norm": 0.3220001451072475, "learning_rate": 3.438419463475857e-05, "loss": 0.3942, "step": 9240 }, { "epoch": 2.156016791044776, "grad_norm": 0.32337108417941435, "learning_rate": 3.436767256872621e-05, "loss": 0.4066, "step": 9245 }, { "epoch": 2.1571828358208953, "grad_norm": 0.33426588397013024, "learning_rate": 3.435114641806557e-05, "loss": 0.3942, "step": 9250 }, { "epoch": 2.158348880597015, "grad_norm": 0.3371874695263784, "learning_rate": 3.433461619260575e-05, "loss": 0.4081, "step": 9255 }, { "epoch": 2.1595149253731343, "grad_norm": 0.34791760129153243, "learning_rate": 3.43180819021783e-05, "loss": 0.4131, "step": 9260 }, { "epoch": 2.1606809701492535, "grad_norm": 0.342867541625907, "learning_rate": 3.4301543556617206e-05, "loss": 0.4001, "step": 9265 }, { "epoch": 2.1618470149253732, "grad_norm": 0.3652891668487797, "learning_rate": 3.428500116575881e-05, "loss": 0.4084, "step": 9270 }, { "epoch": 2.1630130597014925, "grad_norm": 0.33482197584024415, "learning_rate": 3.42684547394419e-05, "loss": 0.3982, "step": 9275 }, { "epoch": 2.1641791044776117, "grad_norm": 0.3103298482949906, "learning_rate": 3.425190428750767e-05, "loss": 0.3843, "step": 9280 }, { "epoch": 2.1653451492537314, "grad_norm": 0.32023976082837036, "learning_rate": 3.423534981979968e-05, "loss": 0.3807, "step": 9285 }, { "epoch": 2.1665111940298507, "grad_norm": 0.3375338948737711, "learning_rate": 3.4218791346163894e-05, "loss": 0.406, "step": 9290 }, { "epoch": 2.16767723880597, "grad_norm": 0.3391712756288744, "learning_rate": 3.420222887644866e-05, "loss": 0.4014, "step": 9295 }, { "epoch": 2.1688432835820897, "grad_norm": 0.3264185092818876, "learning_rate": 3.41856624205047e-05, "loss": 0.4011, "step": 9300 }, { "epoch": 2.170009328358209, "grad_norm": 0.3625198753398349, "learning_rate": 3.4169091988185106e-05, "loss": 0.4163, "step": 9305 }, { "epoch": 2.171175373134328, "grad_norm": 0.3466832342198577, "learning_rate": 3.415251758934534e-05, "loss": 0.4123, "step": 9310 }, { "epoch": 2.172341417910448, "grad_norm": 0.3274812401812581, "learning_rate": 3.413593923384321e-05, "loss": 0.3967, "step": 9315 }, { "epoch": 2.173507462686567, "grad_norm": 0.3290803952497196, "learning_rate": 3.4119356931538894e-05, "loss": 0.3828, "step": 9320 }, { "epoch": 2.1746735074626864, "grad_norm": 0.3727908774190808, "learning_rate": 3.410277069229491e-05, "loss": 0.4023, "step": 9325 }, { "epoch": 2.175839552238806, "grad_norm": 0.3168087450759355, "learning_rate": 3.408618052597611e-05, "loss": 0.3939, "step": 9330 }, { "epoch": 2.1770055970149254, "grad_norm": 0.3472772751440761, "learning_rate": 3.4069586442449684e-05, "loss": 0.41, "step": 9335 }, { "epoch": 2.1781716417910446, "grad_norm": 0.34129902684683067, "learning_rate": 3.405298845158518e-05, "loss": 0.3956, "step": 9340 }, { "epoch": 2.1793376865671643, "grad_norm": 0.34556466209948566, "learning_rate": 3.403638656325442e-05, "loss": 0.3915, "step": 9345 }, { "epoch": 2.1805037313432836, "grad_norm": 0.3350484982276217, "learning_rate": 3.4019780787331586e-05, "loss": 0.4078, "step": 9350 }, { "epoch": 2.181669776119403, "grad_norm": 0.35006707466476816, "learning_rate": 3.4003171133693154e-05, "loss": 0.4107, "step": 9355 }, { "epoch": 2.1828358208955225, "grad_norm": 0.34231880530649994, "learning_rate": 3.3986557612217904e-05, "loss": 0.3896, "step": 9360 }, { "epoch": 2.184001865671642, "grad_norm": 0.34337827685609007, "learning_rate": 3.396994023278693e-05, "loss": 0.4086, "step": 9365 }, { "epoch": 2.185167910447761, "grad_norm": 0.3434139017017427, "learning_rate": 3.3953319005283606e-05, "loss": 0.3945, "step": 9370 }, { "epoch": 2.1863339552238807, "grad_norm": 0.3263391562178993, "learning_rate": 3.393669393959361e-05, "loss": 0.407, "step": 9375 }, { "epoch": 2.1875, "grad_norm": 0.35678583131187686, "learning_rate": 3.392006504560487e-05, "loss": 0.4079, "step": 9380 }, { "epoch": 2.1886660447761193, "grad_norm": 0.35338585088865887, "learning_rate": 3.390343233320764e-05, "loss": 0.3887, "step": 9385 }, { "epoch": 2.189832089552239, "grad_norm": 0.34589020653771424, "learning_rate": 3.388679581229441e-05, "loss": 0.404, "step": 9390 }, { "epoch": 2.190998134328358, "grad_norm": 0.34053259727139085, "learning_rate": 3.3870155492759936e-05, "loss": 0.3934, "step": 9395 }, { "epoch": 2.1921641791044775, "grad_norm": 0.3437211990982055, "learning_rate": 3.3853511384501256e-05, "loss": 0.3875, "step": 9400 }, { "epoch": 2.193330223880597, "grad_norm": 0.3264115450415562, "learning_rate": 3.3836863497417645e-05, "loss": 0.3912, "step": 9405 }, { "epoch": 2.1944962686567164, "grad_norm": 0.33324491583888916, "learning_rate": 3.382021184141062e-05, "loss": 0.3827, "step": 9410 }, { "epoch": 2.1956623134328357, "grad_norm": 0.3512413108243904, "learning_rate": 3.3803556426383954e-05, "loss": 0.3853, "step": 9415 }, { "epoch": 2.1968283582089554, "grad_norm": 0.3429981627700293, "learning_rate": 3.378689726224364e-05, "loss": 0.4035, "step": 9420 }, { "epoch": 2.1979944029850746, "grad_norm": 0.33736698789697045, "learning_rate": 3.3770234358897926e-05, "loss": 0.3964, "step": 9425 }, { "epoch": 2.199160447761194, "grad_norm": 0.3567326263315446, "learning_rate": 3.3753567726257255e-05, "loss": 0.4037, "step": 9430 }, { "epoch": 2.2003264925373136, "grad_norm": 0.3468438640866795, "learning_rate": 3.373689737423431e-05, "loss": 0.3989, "step": 9435 }, { "epoch": 2.201492537313433, "grad_norm": 0.33211326307873107, "learning_rate": 3.372022331274397e-05, "loss": 0.401, "step": 9440 }, { "epoch": 2.202658582089552, "grad_norm": 0.3263460463032437, "learning_rate": 3.3703545551703326e-05, "loss": 0.3953, "step": 9445 }, { "epoch": 2.203824626865672, "grad_norm": 0.3094804604925941, "learning_rate": 3.368686410103167e-05, "loss": 0.3878, "step": 9450 }, { "epoch": 2.204990671641791, "grad_norm": 0.3312175611581137, "learning_rate": 3.367017897065051e-05, "loss": 0.3713, "step": 9455 }, { "epoch": 2.2061567164179103, "grad_norm": 0.3311897369406946, "learning_rate": 3.3653490170483485e-05, "loss": 0.3948, "step": 9460 }, { "epoch": 2.20732276119403, "grad_norm": 0.3395134715793302, "learning_rate": 3.363679771045648e-05, "loss": 0.3856, "step": 9465 }, { "epoch": 2.2084888059701493, "grad_norm": 0.31358378364835615, "learning_rate": 3.3620101600497526e-05, "loss": 0.3981, "step": 9470 }, { "epoch": 2.2096548507462686, "grad_norm": 0.32807405832671016, "learning_rate": 3.360340185053683e-05, "loss": 0.3768, "step": 9475 }, { "epoch": 2.2108208955223883, "grad_norm": 0.3477184959298971, "learning_rate": 3.358669847050676e-05, "loss": 0.4165, "step": 9480 }, { "epoch": 2.2119869402985075, "grad_norm": 0.32566328783386406, "learning_rate": 3.356999147034184e-05, "loss": 0.3902, "step": 9485 }, { "epoch": 2.2131529850746268, "grad_norm": 0.3331101687492885, "learning_rate": 3.355328085997876e-05, "loss": 0.3905, "step": 9490 }, { "epoch": 2.2143190298507465, "grad_norm": 0.3425856257792634, "learning_rate": 3.3536566649356356e-05, "loss": 0.3923, "step": 9495 }, { "epoch": 2.2154850746268657, "grad_norm": 0.33919056245823614, "learning_rate": 3.351984884841558e-05, "loss": 0.3984, "step": 9500 }, { "epoch": 2.216651119402985, "grad_norm": 0.3544595341566174, "learning_rate": 3.350312746709956e-05, "loss": 0.4016, "step": 9505 }, { "epoch": 2.2178171641791047, "grad_norm": 0.3292091593354357, "learning_rate": 3.348640251535352e-05, "loss": 0.4102, "step": 9510 }, { "epoch": 2.218983208955224, "grad_norm": 0.3485922388731389, "learning_rate": 3.346967400312482e-05, "loss": 0.4208, "step": 9515 }, { "epoch": 2.220149253731343, "grad_norm": 0.32920454376985353, "learning_rate": 3.3452941940362946e-05, "loss": 0.4029, "step": 9520 }, { "epoch": 2.221315298507463, "grad_norm": 0.33808076152472294, "learning_rate": 3.343620633701948e-05, "loss": 0.3867, "step": 9525 }, { "epoch": 2.222481343283582, "grad_norm": 0.3576606952663629, "learning_rate": 3.341946720304812e-05, "loss": 0.404, "step": 9530 }, { "epoch": 2.2236473880597014, "grad_norm": 0.31820374339825297, "learning_rate": 3.340272454840466e-05, "loss": 0.4155, "step": 9535 }, { "epoch": 2.2248134328358207, "grad_norm": 0.30741877370719306, "learning_rate": 3.3385978383046996e-05, "loss": 0.3751, "step": 9540 }, { "epoch": 2.2259794776119404, "grad_norm": 0.35617692539846185, "learning_rate": 3.336922871693509e-05, "loss": 0.4088, "step": 9545 }, { "epoch": 2.2271455223880596, "grad_norm": 0.35323849990030076, "learning_rate": 3.335247556003101e-05, "loss": 0.4076, "step": 9550 }, { "epoch": 2.2283115671641793, "grad_norm": 0.35950126223781953, "learning_rate": 3.33357189222989e-05, "loss": 0.3997, "step": 9555 }, { "epoch": 2.2294776119402986, "grad_norm": 0.3304510599891847, "learning_rate": 3.331895881370495e-05, "loss": 0.3898, "step": 9560 }, { "epoch": 2.230643656716418, "grad_norm": 0.34137056131587445, "learning_rate": 3.3302195244217435e-05, "loss": 0.4042, "step": 9565 }, { "epoch": 2.231809701492537, "grad_norm": 0.34319746372534804, "learning_rate": 3.32854282238067e-05, "loss": 0.4064, "step": 9570 }, { "epoch": 2.232975746268657, "grad_norm": 0.3383452593319318, "learning_rate": 3.326865776244509e-05, "loss": 0.4051, "step": 9575 }, { "epoch": 2.234141791044776, "grad_norm": 0.3131809638477496, "learning_rate": 3.3251883870107066e-05, "loss": 0.386, "step": 9580 }, { "epoch": 2.2353078358208953, "grad_norm": 0.33853554060608537, "learning_rate": 3.323510655676906e-05, "loss": 0.3966, "step": 9585 }, { "epoch": 2.236473880597015, "grad_norm": 0.36488474678062477, "learning_rate": 3.3218325832409616e-05, "loss": 0.4282, "step": 9590 }, { "epoch": 2.2376399253731343, "grad_norm": 0.3474887152647101, "learning_rate": 3.320154170700925e-05, "loss": 0.3923, "step": 9595 }, { "epoch": 2.2388059701492535, "grad_norm": 0.340132436894961, "learning_rate": 3.3184754190550506e-05, "loss": 0.3976, "step": 9600 }, { "epoch": 2.2399720149253732, "grad_norm": 0.35033706963579114, "learning_rate": 3.316796329301796e-05, "loss": 0.4187, "step": 9605 }, { "epoch": 2.2411380597014925, "grad_norm": 0.31748039738442313, "learning_rate": 3.31511690243982e-05, "loss": 0.4008, "step": 9610 }, { "epoch": 2.2423041044776117, "grad_norm": 0.32418023288216874, "learning_rate": 3.3134371394679806e-05, "loss": 0.3962, "step": 9615 }, { "epoch": 2.2434701492537314, "grad_norm": 0.3481031422132577, "learning_rate": 3.3117570413853373e-05, "loss": 0.4048, "step": 9620 }, { "epoch": 2.2446361940298507, "grad_norm": 0.3533859620834045, "learning_rate": 3.3100766091911464e-05, "loss": 0.4183, "step": 9625 }, { "epoch": 2.24580223880597, "grad_norm": 0.322320201186157, "learning_rate": 3.308395843884866e-05, "loss": 0.3839, "step": 9630 }, { "epoch": 2.2469682835820897, "grad_norm": 0.3439397217590057, "learning_rate": 3.30671474646615e-05, "loss": 0.3992, "step": 9635 }, { "epoch": 2.248134328358209, "grad_norm": 0.34683716299409695, "learning_rate": 3.305033317934852e-05, "loss": 0.4004, "step": 9640 }, { "epoch": 2.249300373134328, "grad_norm": 0.3572421224821308, "learning_rate": 3.30335155929102e-05, "loss": 0.4011, "step": 9645 }, { "epoch": 2.250466417910448, "grad_norm": 0.32862119253173727, "learning_rate": 3.301669471534899e-05, "loss": 0.3959, "step": 9650 }, { "epoch": 2.251632462686567, "grad_norm": 0.33086323543403445, "learning_rate": 3.299987055666932e-05, "loss": 0.4069, "step": 9655 }, { "epoch": 2.2527985074626864, "grad_norm": 0.3373749666544192, "learning_rate": 3.298304312687754e-05, "loss": 0.4003, "step": 9660 }, { "epoch": 2.253964552238806, "grad_norm": 0.3321302407645913, "learning_rate": 3.2966212435981975e-05, "loss": 0.4021, "step": 9665 }, { "epoch": 2.2551305970149254, "grad_norm": 0.3616873156900573, "learning_rate": 3.2949378493992854e-05, "loss": 0.4113, "step": 9670 }, { "epoch": 2.2562966417910446, "grad_norm": 0.3510310352513335, "learning_rate": 3.293254131092238e-05, "loss": 0.3978, "step": 9675 }, { "epoch": 2.2574626865671643, "grad_norm": 0.34763794392877756, "learning_rate": 3.2915700896784655e-05, "loss": 0.4025, "step": 9680 }, { "epoch": 2.2586287313432836, "grad_norm": 0.3547637725037518, "learning_rate": 3.28988572615957e-05, "loss": 0.409, "step": 9685 }, { "epoch": 2.259794776119403, "grad_norm": 0.3326050672795035, "learning_rate": 3.288201041537348e-05, "loss": 0.4035, "step": 9690 }, { "epoch": 2.2609608208955225, "grad_norm": 0.35370414029326885, "learning_rate": 3.286516036813785e-05, "loss": 0.4073, "step": 9695 }, { "epoch": 2.262126865671642, "grad_norm": 0.33248837327294756, "learning_rate": 3.284830712991057e-05, "loss": 0.4089, "step": 9700 }, { "epoch": 2.263292910447761, "grad_norm": 0.32410294453878474, "learning_rate": 3.28314507107153e-05, "loss": 0.3995, "step": 9705 }, { "epoch": 2.2644589552238807, "grad_norm": 0.33391170367556255, "learning_rate": 3.281459112057759e-05, "loss": 0.3857, "step": 9710 }, { "epoch": 2.265625, "grad_norm": 0.35798626988928706, "learning_rate": 3.2797728369524875e-05, "loss": 0.3893, "step": 9715 }, { "epoch": 2.2667910447761193, "grad_norm": 0.3526529318202589, "learning_rate": 3.2780862467586486e-05, "loss": 0.3987, "step": 9720 }, { "epoch": 2.267957089552239, "grad_norm": 0.3782225159728585, "learning_rate": 3.2763993424793604e-05, "loss": 0.4166, "step": 9725 }, { "epoch": 2.269123134328358, "grad_norm": 0.35139966768672615, "learning_rate": 3.2747121251179294e-05, "loss": 0.392, "step": 9730 }, { "epoch": 2.2702891791044775, "grad_norm": 0.3584291853868186, "learning_rate": 3.273024595677846e-05, "loss": 0.4138, "step": 9735 }, { "epoch": 2.271455223880597, "grad_norm": 0.34507563866710766, "learning_rate": 3.271336755162792e-05, "loss": 0.3864, "step": 9740 }, { "epoch": 2.2726212686567164, "grad_norm": 0.3710731561050386, "learning_rate": 3.269648604576625e-05, "loss": 0.4159, "step": 9745 }, { "epoch": 2.2737873134328357, "grad_norm": 0.354054117064921, "learning_rate": 3.267960144923397e-05, "loss": 0.4014, "step": 9750 }, { "epoch": 2.2749533582089554, "grad_norm": 0.3495485682998862, "learning_rate": 3.266271377207335e-05, "loss": 0.4154, "step": 9755 }, { "epoch": 2.2761194029850746, "grad_norm": 0.35625687553791074, "learning_rate": 3.264582302432856e-05, "loss": 0.3961, "step": 9760 }, { "epoch": 2.277285447761194, "grad_norm": 0.3515612044904509, "learning_rate": 3.262892921604556e-05, "loss": 0.388, "step": 9765 }, { "epoch": 2.2784514925373136, "grad_norm": 0.33953461012900515, "learning_rate": 3.261203235727214e-05, "loss": 0.3935, "step": 9770 }, { "epoch": 2.279617537313433, "grad_norm": 0.32714035351633375, "learning_rate": 3.259513245805791e-05, "loss": 0.3937, "step": 9775 }, { "epoch": 2.280783582089552, "grad_norm": 0.36485978305336336, "learning_rate": 3.2578229528454266e-05, "loss": 0.4222, "step": 9780 }, { "epoch": 2.281949626865672, "grad_norm": 0.3377764941517442, "learning_rate": 3.256132357851445e-05, "loss": 0.4027, "step": 9785 }, { "epoch": 2.283115671641791, "grad_norm": 0.34529439774227333, "learning_rate": 3.254441461829344e-05, "loss": 0.426, "step": 9790 }, { "epoch": 2.2842817164179103, "grad_norm": 0.32594908872759104, "learning_rate": 3.252750265784806e-05, "loss": 0.4017, "step": 9795 }, { "epoch": 2.28544776119403, "grad_norm": 0.3334206547051278, "learning_rate": 3.251058770723688e-05, "loss": 0.3859, "step": 9800 }, { "epoch": 2.2866138059701493, "grad_norm": 0.3277007088336736, "learning_rate": 3.249366977652028e-05, "loss": 0.3842, "step": 9805 }, { "epoch": 2.2877798507462686, "grad_norm": 0.34033250680525806, "learning_rate": 3.247674887576038e-05, "loss": 0.3945, "step": 9810 }, { "epoch": 2.2889458955223883, "grad_norm": 0.34437650469522774, "learning_rate": 3.24598250150211e-05, "loss": 0.3943, "step": 9815 }, { "epoch": 2.2901119402985075, "grad_norm": 0.3316019832531891, "learning_rate": 3.2442898204368086e-05, "loss": 0.4009, "step": 9820 }, { "epoch": 2.2912779850746268, "grad_norm": 0.3400697007911317, "learning_rate": 3.242596845386878e-05, "loss": 0.4052, "step": 9825 }, { "epoch": 2.2924440298507465, "grad_norm": 0.3372049827411618, "learning_rate": 3.240903577359232e-05, "loss": 0.4138, "step": 9830 }, { "epoch": 2.2936100746268657, "grad_norm": 0.3359693174749889, "learning_rate": 3.239210017360963e-05, "loss": 0.3929, "step": 9835 }, { "epoch": 2.294776119402985, "grad_norm": 0.3419201946144107, "learning_rate": 3.237516166399336e-05, "loss": 0.4041, "step": 9840 }, { "epoch": 2.2959421641791042, "grad_norm": 0.344251547989701, "learning_rate": 3.2358220254817874e-05, "loss": 0.393, "step": 9845 }, { "epoch": 2.297108208955224, "grad_norm": 0.3452504852266753, "learning_rate": 3.234127595615927e-05, "loss": 0.3845, "step": 9850 }, { "epoch": 2.298274253731343, "grad_norm": 0.346391228822272, "learning_rate": 3.232432877809538e-05, "loss": 0.4132, "step": 9855 }, { "epoch": 2.299440298507463, "grad_norm": 0.34399752821195706, "learning_rate": 3.230737873070574e-05, "loss": 0.4211, "step": 9860 }, { "epoch": 2.300606343283582, "grad_norm": 0.31554100422299874, "learning_rate": 3.229042582407157e-05, "loss": 0.4232, "step": 9865 }, { "epoch": 2.3017723880597014, "grad_norm": 0.3470044349125867, "learning_rate": 3.2273470068275816e-05, "loss": 0.3889, "step": 9870 }, { "epoch": 2.3029384328358207, "grad_norm": 0.3576250171073377, "learning_rate": 3.225651147340312e-05, "loss": 0.4068, "step": 9875 }, { "epoch": 2.3041044776119404, "grad_norm": 0.33846918522103525, "learning_rate": 3.223955004953979e-05, "loss": 0.4041, "step": 9880 }, { "epoch": 2.3052705223880596, "grad_norm": 0.33351911581021004, "learning_rate": 3.222258580677385e-05, "loss": 0.4118, "step": 9885 }, { "epoch": 2.3064365671641793, "grad_norm": 0.33271982300791403, "learning_rate": 3.220561875519495e-05, "loss": 0.3925, "step": 9890 }, { "epoch": 2.3076026119402986, "grad_norm": 0.3381145215479696, "learning_rate": 3.218864890489446e-05, "loss": 0.387, "step": 9895 }, { "epoch": 2.308768656716418, "grad_norm": 0.32310480672472286, "learning_rate": 3.2171676265965415e-05, "loss": 0.3948, "step": 9900 }, { "epoch": 2.309934701492537, "grad_norm": 0.3189292949217755, "learning_rate": 3.2154700848502454e-05, "loss": 0.402, "step": 9905 }, { "epoch": 2.311100746268657, "grad_norm": 0.31740468766170504, "learning_rate": 3.2137722662601934e-05, "loss": 0.3799, "step": 9910 }, { "epoch": 2.312266791044776, "grad_norm": 0.35015890204685557, "learning_rate": 3.212074171836181e-05, "loss": 0.4188, "step": 9915 }, { "epoch": 2.3134328358208958, "grad_norm": 0.32065110431934984, "learning_rate": 3.21037580258817e-05, "loss": 0.3726, "step": 9920 }, { "epoch": 2.314598880597015, "grad_norm": 0.3322442176622855, "learning_rate": 3.208677159526287e-05, "loss": 0.3967, "step": 9925 }, { "epoch": 2.3157649253731343, "grad_norm": 0.32783982981262094, "learning_rate": 3.206978243660817e-05, "loss": 0.3906, "step": 9930 }, { "epoch": 2.3169309701492535, "grad_norm": 0.32141862450794195, "learning_rate": 3.205279056002212e-05, "loss": 0.3816, "step": 9935 }, { "epoch": 2.3180970149253732, "grad_norm": 0.3331699519161977, "learning_rate": 3.203579597561082e-05, "loss": 0.3759, "step": 9940 }, { "epoch": 2.3192630597014925, "grad_norm": 0.37151609855873413, "learning_rate": 3.2018798693482015e-05, "loss": 0.3915, "step": 9945 }, { "epoch": 2.3204291044776117, "grad_norm": 0.3185923320139489, "learning_rate": 3.200179872374503e-05, "loss": 0.3916, "step": 9950 }, { "epoch": 2.3215951492537314, "grad_norm": 0.33891971937758, "learning_rate": 3.198479607651079e-05, "loss": 0.4028, "step": 9955 }, { "epoch": 2.3227611940298507, "grad_norm": 0.34374813820010514, "learning_rate": 3.1967790761891826e-05, "loss": 0.396, "step": 9960 }, { "epoch": 2.32392723880597, "grad_norm": 0.3279161974190552, "learning_rate": 3.1950782790002236e-05, "loss": 0.3876, "step": 9965 }, { "epoch": 2.3250932835820897, "grad_norm": 0.32730251279290634, "learning_rate": 3.1933772170957716e-05, "loss": 0.4025, "step": 9970 }, { "epoch": 2.326259328358209, "grad_norm": 0.3267150501264816, "learning_rate": 3.191675891487554e-05, "loss": 0.3877, "step": 9975 }, { "epoch": 2.327425373134328, "grad_norm": 0.34114222456197135, "learning_rate": 3.189974303187452e-05, "loss": 0.3899, "step": 9980 }, { "epoch": 2.328591417910448, "grad_norm": 0.32092975001225715, "learning_rate": 3.188272453207507e-05, "loss": 0.3781, "step": 9985 }, { "epoch": 2.329757462686567, "grad_norm": 0.3920402189328833, "learning_rate": 3.186570342559912e-05, "loss": 0.3985, "step": 9990 }, { "epoch": 2.3309235074626864, "grad_norm": 0.33039051930391516, "learning_rate": 3.184867972257019e-05, "loss": 0.3819, "step": 9995 }, { "epoch": 2.332089552238806, "grad_norm": 0.31857875199394975, "learning_rate": 3.1831653433113317e-05, "loss": 0.4033, "step": 10000 }, { "epoch": 2.3332555970149254, "grad_norm": 0.3476619668416482, "learning_rate": 3.1814624567355087e-05, "loss": 0.4156, "step": 10005 }, { "epoch": 2.3344216417910446, "grad_norm": 0.34466914526162007, "learning_rate": 3.179759313542362e-05, "loss": 0.4056, "step": 10010 }, { "epoch": 2.3355876865671643, "grad_norm": 0.319779769106911, "learning_rate": 3.1780559147448554e-05, "loss": 0.3944, "step": 10015 }, { "epoch": 2.3367537313432836, "grad_norm": 0.334340407521021, "learning_rate": 3.176352261356105e-05, "loss": 0.3991, "step": 10020 }, { "epoch": 2.337919776119403, "grad_norm": 0.3230954959705112, "learning_rate": 3.17464835438938e-05, "loss": 0.3871, "step": 10025 }, { "epoch": 2.3390858208955225, "grad_norm": 0.3230271726785597, "learning_rate": 3.172944194858096e-05, "loss": 0.375, "step": 10030 }, { "epoch": 2.340251865671642, "grad_norm": 0.3605855995169774, "learning_rate": 3.171239783775825e-05, "loss": 0.4095, "step": 10035 }, { "epoch": 2.341417910447761, "grad_norm": 0.3426358748076512, "learning_rate": 3.169535122156283e-05, "loss": 0.4063, "step": 10040 }, { "epoch": 2.3425839552238807, "grad_norm": 0.33561767090770944, "learning_rate": 3.167830211013338e-05, "loss": 0.4034, "step": 10045 }, { "epoch": 2.34375, "grad_norm": 0.3375095406977493, "learning_rate": 3.166125051361007e-05, "loss": 0.3778, "step": 10050 }, { "epoch": 2.3449160447761193, "grad_norm": 0.34540683433433367, "learning_rate": 3.164419644213451e-05, "loss": 0.3931, "step": 10055 }, { "epoch": 2.346082089552239, "grad_norm": 0.3511959265083776, "learning_rate": 3.162713990584983e-05, "loss": 0.405, "step": 10060 }, { "epoch": 2.347248134328358, "grad_norm": 0.331903263476702, "learning_rate": 3.1610080914900604e-05, "loss": 0.4143, "step": 10065 }, { "epoch": 2.3484141791044775, "grad_norm": 0.32982825480967737, "learning_rate": 3.159301947943285e-05, "loss": 0.3965, "step": 10070 }, { "epoch": 2.349580223880597, "grad_norm": 0.32711616689698986, "learning_rate": 3.157595560959407e-05, "loss": 0.3998, "step": 10075 }, { "epoch": 2.3507462686567164, "grad_norm": 0.35263199055232375, "learning_rate": 3.155888931553319e-05, "loss": 0.4079, "step": 10080 }, { "epoch": 2.3519123134328357, "grad_norm": 0.3569333029191892, "learning_rate": 3.154182060740058e-05, "loss": 0.3995, "step": 10085 }, { "epoch": 2.3530783582089554, "grad_norm": 0.33012474548979087, "learning_rate": 3.152474949534808e-05, "loss": 0.3969, "step": 10090 }, { "epoch": 2.3542444029850746, "grad_norm": 0.342346282324755, "learning_rate": 3.1507675989528915e-05, "loss": 0.3905, "step": 10095 }, { "epoch": 2.355410447761194, "grad_norm": 0.31500124987977707, "learning_rate": 3.1490600100097746e-05, "loss": 0.384, "step": 10100 }, { "epoch": 2.3565764925373136, "grad_norm": 0.33836453332412225, "learning_rate": 3.147352183721067e-05, "loss": 0.3856, "step": 10105 }, { "epoch": 2.357742537313433, "grad_norm": 0.35235644603826194, "learning_rate": 3.145644121102517e-05, "loss": 0.409, "step": 10110 }, { "epoch": 2.358908582089552, "grad_norm": 0.3402250310578, "learning_rate": 3.1439358231700165e-05, "loss": 0.3959, "step": 10115 }, { "epoch": 2.360074626865672, "grad_norm": 0.31665747129753874, "learning_rate": 3.142227290939595e-05, "loss": 0.406, "step": 10120 }, { "epoch": 2.361240671641791, "grad_norm": 0.3227769955380264, "learning_rate": 3.14051852542742e-05, "loss": 0.3836, "step": 10125 }, { "epoch": 2.3624067164179103, "grad_norm": 0.33119680460644846, "learning_rate": 3.1388095276498013e-05, "loss": 0.4183, "step": 10130 }, { "epoch": 2.36357276119403, "grad_norm": 0.3314793712422934, "learning_rate": 3.1371002986231855e-05, "loss": 0.384, "step": 10135 }, { "epoch": 2.3647388059701493, "grad_norm": 0.32789918586111844, "learning_rate": 3.1353908393641574e-05, "loss": 0.4062, "step": 10140 }, { "epoch": 2.3659048507462686, "grad_norm": 0.3257571322072482, "learning_rate": 3.133681150889434e-05, "loss": 0.3933, "step": 10145 }, { "epoch": 2.3670708955223883, "grad_norm": 0.33231083324624255, "learning_rate": 3.131971234215877e-05, "loss": 0.4031, "step": 10150 }, { "epoch": 2.3682369402985075, "grad_norm": 0.32134342411653466, "learning_rate": 3.1302610903604775e-05, "loss": 0.3977, "step": 10155 }, { "epoch": 2.3694029850746268, "grad_norm": 0.3459994188256327, "learning_rate": 3.128550720340362e-05, "loss": 0.4047, "step": 10160 }, { "epoch": 2.3705690298507465, "grad_norm": 0.34509381119124644, "learning_rate": 3.126840125172795e-05, "loss": 0.3998, "step": 10165 }, { "epoch": 2.3717350746268657, "grad_norm": 0.3119082455680392, "learning_rate": 3.125129305875172e-05, "loss": 0.4072, "step": 10170 }, { "epoch": 2.372901119402985, "grad_norm": 0.34586817402009273, "learning_rate": 3.1234182634650234e-05, "loss": 0.4088, "step": 10175 }, { "epoch": 2.3740671641791042, "grad_norm": 0.3345551711929769, "learning_rate": 3.1217069989600097e-05, "loss": 0.4122, "step": 10180 }, { "epoch": 2.375233208955224, "grad_norm": 0.3203957634261716, "learning_rate": 3.119995513377928e-05, "loss": 0.3928, "step": 10185 }, { "epoch": 2.376399253731343, "grad_norm": 0.3596092995549471, "learning_rate": 3.118283807736703e-05, "loss": 0.4136, "step": 10190 }, { "epoch": 2.377565298507463, "grad_norm": 0.3331826192089746, "learning_rate": 3.1165718830543914e-05, "loss": 0.4113, "step": 10195 }, { "epoch": 2.378731343283582, "grad_norm": 0.35171452204060677, "learning_rate": 3.1148597403491816e-05, "loss": 0.4183, "step": 10200 }, { "epoch": 2.3798973880597014, "grad_norm": 0.3284178824184799, "learning_rate": 3.1131473806393876e-05, "loss": 0.4003, "step": 10205 }, { "epoch": 2.3810634328358207, "grad_norm": 0.32613482390213105, "learning_rate": 3.1114348049434583e-05, "loss": 0.3988, "step": 10210 }, { "epoch": 2.3822294776119404, "grad_norm": 0.3308846819271684, "learning_rate": 3.109722014279967e-05, "loss": 0.3892, "step": 10215 }, { "epoch": 2.3833955223880596, "grad_norm": 0.3285079817418403, "learning_rate": 3.108009009667615e-05, "loss": 0.379, "step": 10220 }, { "epoch": 2.3845615671641793, "grad_norm": 0.34480978638571164, "learning_rate": 3.106295792125233e-05, "loss": 0.4196, "step": 10225 }, { "epoch": 2.3857276119402986, "grad_norm": 1.8144949097090854, "learning_rate": 3.104582362671778e-05, "loss": 0.417, "step": 10230 }, { "epoch": 2.386893656716418, "grad_norm": 0.3526169863481771, "learning_rate": 3.102868722326328e-05, "loss": 0.3867, "step": 10235 }, { "epoch": 2.388059701492537, "grad_norm": 0.32196962856243905, "learning_rate": 3.1011548721080955e-05, "loss": 0.3958, "step": 10240 }, { "epoch": 2.389225746268657, "grad_norm": 0.32620353229048504, "learning_rate": 3.099440813036411e-05, "loss": 0.4074, "step": 10245 }, { "epoch": 2.390391791044776, "grad_norm": 0.3250314883921857, "learning_rate": 3.097726546130729e-05, "loss": 0.3851, "step": 10250 }, { "epoch": 2.3915578358208958, "grad_norm": 0.3202608673120686, "learning_rate": 3.096012072410633e-05, "loss": 0.4024, "step": 10255 }, { "epoch": 2.392723880597015, "grad_norm": 0.32410091788452705, "learning_rate": 3.094297392895825e-05, "loss": 0.3995, "step": 10260 }, { "epoch": 2.3938899253731343, "grad_norm": 0.3577726965191297, "learning_rate": 3.0925825086061295e-05, "loss": 0.4185, "step": 10265 }, { "epoch": 2.3950559701492535, "grad_norm": 0.3584149177157239, "learning_rate": 3.090867420561495e-05, "loss": 0.4111, "step": 10270 }, { "epoch": 2.3962220149253732, "grad_norm": 0.36685961686960566, "learning_rate": 3.0891521297819906e-05, "loss": 0.4171, "step": 10275 }, { "epoch": 2.3973880597014925, "grad_norm": 0.3391626954307687, "learning_rate": 3.0874366372878036e-05, "loss": 0.4219, "step": 10280 }, { "epoch": 2.3985541044776117, "grad_norm": 0.3364139323696701, "learning_rate": 3.085720944099246e-05, "loss": 0.3884, "step": 10285 }, { "epoch": 2.3997201492537314, "grad_norm": 0.3459153137425684, "learning_rate": 3.0840050512367444e-05, "loss": 0.3768, "step": 10290 }, { "epoch": 2.4008861940298507, "grad_norm": 0.3554510164309164, "learning_rate": 3.082288959720845e-05, "loss": 0.3954, "step": 10295 }, { "epoch": 2.40205223880597, "grad_norm": 0.5307041269625768, "learning_rate": 3.0805726705722156e-05, "loss": 0.405, "step": 10300 }, { "epoch": 2.4032182835820897, "grad_norm": 0.31334252629145287, "learning_rate": 3.078856184811638e-05, "loss": 0.3797, "step": 10305 }, { "epoch": 2.404384328358209, "grad_norm": 0.31746455307462684, "learning_rate": 3.077139503460012e-05, "loss": 0.4075, "step": 10310 }, { "epoch": 2.405550373134328, "grad_norm": 0.35179501384941514, "learning_rate": 3.0754226275383546e-05, "loss": 0.4108, "step": 10315 }, { "epoch": 2.406716417910448, "grad_norm": 0.32612692754775957, "learning_rate": 3.073705558067797e-05, "loss": 0.4048, "step": 10320 }, { "epoch": 2.407882462686567, "grad_norm": 0.36856468182483, "learning_rate": 3.071988296069586e-05, "loss": 0.4091, "step": 10325 }, { "epoch": 2.4090485074626864, "grad_norm": 0.33215179272632817, "learning_rate": 3.070270842565084e-05, "loss": 0.4033, "step": 10330 }, { "epoch": 2.410214552238806, "grad_norm": 0.3529031418620512, "learning_rate": 3.068553198575767e-05, "loss": 0.4014, "step": 10335 }, { "epoch": 2.4113805970149254, "grad_norm": 0.352076797924759, "learning_rate": 3.0668353651232226e-05, "loss": 0.399, "step": 10340 }, { "epoch": 2.4125466417910446, "grad_norm": 0.32273091432723616, "learning_rate": 3.065117343229153e-05, "loss": 0.385, "step": 10345 }, { "epoch": 2.4137126865671643, "grad_norm": 0.3336054787241504, "learning_rate": 3.063399133915371e-05, "loss": 0.3986, "step": 10350 }, { "epoch": 2.4148787313432836, "grad_norm": 0.33073092455074937, "learning_rate": 3.0616807382038016e-05, "loss": 0.379, "step": 10355 }, { "epoch": 2.416044776119403, "grad_norm": 0.3251397381722083, "learning_rate": 3.059962157116481e-05, "loss": 0.3833, "step": 10360 }, { "epoch": 2.4172108208955225, "grad_norm": 0.3413498059059611, "learning_rate": 3.058243391675557e-05, "loss": 0.4056, "step": 10365 }, { "epoch": 2.418376865671642, "grad_norm": 0.35337941213086, "learning_rate": 3.056524442903282e-05, "loss": 0.406, "step": 10370 }, { "epoch": 2.419542910447761, "grad_norm": 0.34151696255758757, "learning_rate": 3.054805311822023e-05, "loss": 0.4164, "step": 10375 }, { "epoch": 2.4207089552238807, "grad_norm": 0.3319208887764344, "learning_rate": 3.053085999454254e-05, "loss": 0.3897, "step": 10380 }, { "epoch": 2.421875, "grad_norm": 0.3520706047391738, "learning_rate": 3.051366506822554e-05, "loss": 0.4153, "step": 10385 }, { "epoch": 2.4230410447761193, "grad_norm": 0.31789560104516634, "learning_rate": 3.0496468349496115e-05, "loss": 0.3986, "step": 10390 }, { "epoch": 2.424207089552239, "grad_norm": 0.3597622154424394, "learning_rate": 3.047926984858223e-05, "loss": 0.4146, "step": 10395 }, { "epoch": 2.425373134328358, "grad_norm": 0.3332932526410318, "learning_rate": 3.046206957571288e-05, "loss": 0.3975, "step": 10400 }, { "epoch": 2.4265391791044775, "grad_norm": 0.33580940086004796, "learning_rate": 3.0444867541118145e-05, "loss": 0.4031, "step": 10405 }, { "epoch": 2.427705223880597, "grad_norm": 0.32027555259339785, "learning_rate": 3.0427663755029108e-05, "loss": 0.3869, "step": 10410 }, { "epoch": 2.4288712686567164, "grad_norm": 0.322247385919075, "learning_rate": 3.0410458227677934e-05, "loss": 0.4043, "step": 10415 }, { "epoch": 2.4300373134328357, "grad_norm": 0.3644410852880657, "learning_rate": 3.0393250969297826e-05, "loss": 0.399, "step": 10420 }, { "epoch": 2.4312033582089554, "grad_norm": 0.33834561753799774, "learning_rate": 3.0376041990122983e-05, "loss": 0.4092, "step": 10425 }, { "epoch": 2.4323694029850746, "grad_norm": 0.32506434525029393, "learning_rate": 3.0358831300388657e-05, "loss": 0.4118, "step": 10430 }, { "epoch": 2.433535447761194, "grad_norm": 0.34663468361506883, "learning_rate": 3.0341618910331093e-05, "loss": 0.3959, "step": 10435 }, { "epoch": 2.4347014925373136, "grad_norm": 0.3398568910309269, "learning_rate": 3.0324404830187564e-05, "loss": 0.4018, "step": 10440 }, { "epoch": 2.435867537313433, "grad_norm": 0.3353124579925197, "learning_rate": 3.0307189070196358e-05, "loss": 0.3997, "step": 10445 }, { "epoch": 2.437033582089552, "grad_norm": 0.35212485306323965, "learning_rate": 3.0289971640596737e-05, "loss": 0.3849, "step": 10450 }, { "epoch": 2.438199626865672, "grad_norm": 0.33020930114740726, "learning_rate": 3.0272752551628975e-05, "loss": 0.3882, "step": 10455 }, { "epoch": 2.439365671641791, "grad_norm": 0.3418629569518309, "learning_rate": 3.0255531813534322e-05, "loss": 0.3901, "step": 10460 }, { "epoch": 2.4405317164179103, "grad_norm": 0.3467397178053059, "learning_rate": 3.0238309436555e-05, "loss": 0.408, "step": 10465 }, { "epoch": 2.44169776119403, "grad_norm": 0.3325376652620171, "learning_rate": 3.022108543093425e-05, "loss": 0.4018, "step": 10470 }, { "epoch": 2.4428638059701493, "grad_norm": 0.3527808062363941, "learning_rate": 3.020385980691621e-05, "loss": 0.4078, "step": 10475 }, { "epoch": 2.4440298507462686, "grad_norm": 0.3377371410844193, "learning_rate": 3.0186632574746055e-05, "loss": 0.3948, "step": 10480 }, { "epoch": 2.4451958955223883, "grad_norm": 0.34172950471986, "learning_rate": 3.016940374466986e-05, "loss": 0.3941, "step": 10485 }, { "epoch": 2.4463619402985075, "grad_norm": 0.32838751269242406, "learning_rate": 3.0152173326934692e-05, "loss": 0.4048, "step": 10490 }, { "epoch": 2.4475279850746268, "grad_norm": 0.31714363182431027, "learning_rate": 3.0134941331788525e-05, "loss": 0.3934, "step": 10495 }, { "epoch": 2.4486940298507465, "grad_norm": 0.31738577198258777, "learning_rate": 3.0117707769480285e-05, "loss": 0.3772, "step": 10500 }, { "epoch": 2.4498600746268657, "grad_norm": 0.3109178049328986, "learning_rate": 3.0100472650259866e-05, "loss": 0.3825, "step": 10505 }, { "epoch": 2.451026119402985, "grad_norm": 0.32192043035374324, "learning_rate": 3.008323598437802e-05, "loss": 0.4201, "step": 10510 }, { "epoch": 2.4521921641791042, "grad_norm": 0.33672228524159903, "learning_rate": 3.006599778208647e-05, "loss": 0.3952, "step": 10515 }, { "epoch": 2.453358208955224, "grad_norm": 0.34256596840833037, "learning_rate": 3.0048758053637844e-05, "loss": 0.391, "step": 10520 }, { "epoch": 2.454524253731343, "grad_norm": 0.3390966293245535, "learning_rate": 3.0031516809285658e-05, "loss": 0.4009, "step": 10525 }, { "epoch": 2.455690298507463, "grad_norm": 0.322208018320275, "learning_rate": 3.001427405928435e-05, "loss": 0.3977, "step": 10530 }, { "epoch": 2.456856343283582, "grad_norm": 0.33096617816553986, "learning_rate": 2.999702981388925e-05, "loss": 0.4029, "step": 10535 }, { "epoch": 2.4580223880597014, "grad_norm": 0.32913530116371414, "learning_rate": 2.9979784083356567e-05, "loss": 0.4121, "step": 10540 }, { "epoch": 2.4591884328358207, "grad_norm": 0.3324314228830138, "learning_rate": 2.996253687794341e-05, "loss": 0.4184, "step": 10545 }, { "epoch": 2.4603544776119404, "grad_norm": 0.344369600694389, "learning_rate": 2.994528820790774e-05, "loss": 0.393, "step": 10550 }, { "epoch": 2.4615205223880596, "grad_norm": 0.35216412788337015, "learning_rate": 2.9928038083508415e-05, "loss": 0.3957, "step": 10555 }, { "epoch": 2.4626865671641793, "grad_norm": 0.3236085499304529, "learning_rate": 2.9910786515005146e-05, "loss": 0.3933, "step": 10560 }, { "epoch": 2.4638526119402986, "grad_norm": 0.33114051183143334, "learning_rate": 2.9893533512658507e-05, "loss": 0.395, "step": 10565 }, { "epoch": 2.465018656716418, "grad_norm": 0.32588187811400243, "learning_rate": 2.987627908672992e-05, "loss": 0.399, "step": 10570 }, { "epoch": 2.466184701492537, "grad_norm": 0.31675091471450684, "learning_rate": 2.9859023247481644e-05, "loss": 0.4059, "step": 10575 }, { "epoch": 2.467350746268657, "grad_norm": 0.34012484971873264, "learning_rate": 2.9841766005176808e-05, "loss": 0.3979, "step": 10580 }, { "epoch": 2.468516791044776, "grad_norm": 0.32363133426231616, "learning_rate": 2.982450737007935e-05, "loss": 0.4072, "step": 10585 }, { "epoch": 2.4696828358208958, "grad_norm": 0.31620916149814543, "learning_rate": 2.9807247352454055e-05, "loss": 0.3875, "step": 10590 }, { "epoch": 2.470848880597015, "grad_norm": 0.3531241878315411, "learning_rate": 2.9789985962566503e-05, "loss": 0.4097, "step": 10595 }, { "epoch": 2.4720149253731343, "grad_norm": 0.34753890425407236, "learning_rate": 2.977272321068311e-05, "loss": 0.397, "step": 10600 }, { "epoch": 2.4731809701492535, "grad_norm": 0.32675131126217766, "learning_rate": 2.975545910707111e-05, "loss": 0.3778, "step": 10605 }, { "epoch": 2.4743470149253732, "grad_norm": 0.3765561934154416, "learning_rate": 2.9738193661998526e-05, "loss": 0.4077, "step": 10610 }, { "epoch": 2.4755130597014925, "grad_norm": 0.3358992863592674, "learning_rate": 2.9720926885734167e-05, "loss": 0.3992, "step": 10615 }, { "epoch": 2.4766791044776117, "grad_norm": 0.32565241368168574, "learning_rate": 2.9703658788547674e-05, "loss": 0.3795, "step": 10620 }, { "epoch": 2.4778451492537314, "grad_norm": 0.3255668406361268, "learning_rate": 2.968638938070942e-05, "loss": 0.4102, "step": 10625 }, { "epoch": 2.4790111940298507, "grad_norm": 0.3255638278906826, "learning_rate": 2.9669118672490627e-05, "loss": 0.3916, "step": 10630 }, { "epoch": 2.48017723880597, "grad_norm": 0.34354907528716916, "learning_rate": 2.9651846674163208e-05, "loss": 0.4237, "step": 10635 }, { "epoch": 2.4813432835820897, "grad_norm": 0.3288688933933861, "learning_rate": 2.9634573395999916e-05, "loss": 0.4254, "step": 10640 }, { "epoch": 2.482509328358209, "grad_norm": 0.3403301878304212, "learning_rate": 2.9617298848274223e-05, "loss": 0.381, "step": 10645 }, { "epoch": 2.483675373134328, "grad_norm": 0.31440854403841334, "learning_rate": 2.9600023041260355e-05, "loss": 0.3835, "step": 10650 }, { "epoch": 2.484841417910448, "grad_norm": 0.3521502758854088, "learning_rate": 2.9582745985233312e-05, "loss": 0.402, "step": 10655 }, { "epoch": 2.486007462686567, "grad_norm": 0.3452838350317882, "learning_rate": 2.9565467690468834e-05, "loss": 0.3938, "step": 10660 }, { "epoch": 2.4871735074626864, "grad_norm": 0.3161399712571175, "learning_rate": 2.9548188167243372e-05, "loss": 0.3829, "step": 10665 }, { "epoch": 2.488339552238806, "grad_norm": 0.32358393403248104, "learning_rate": 2.953090742583413e-05, "loss": 0.4056, "step": 10670 }, { "epoch": 2.4895055970149254, "grad_norm": 0.3217085579518673, "learning_rate": 2.951362547651903e-05, "loss": 0.4123, "step": 10675 }, { "epoch": 2.4906716417910446, "grad_norm": 0.33999354835093465, "learning_rate": 2.949634232957671e-05, "loss": 0.4114, "step": 10680 }, { "epoch": 2.4918376865671643, "grad_norm": 0.3346261826072672, "learning_rate": 2.9479057995286528e-05, "loss": 0.4058, "step": 10685 }, { "epoch": 2.4930037313432836, "grad_norm": 0.35037223712728494, "learning_rate": 2.9461772483928547e-05, "loss": 0.3912, "step": 10690 }, { "epoch": 2.494169776119403, "grad_norm": 0.3224419644716262, "learning_rate": 2.944448580578351e-05, "loss": 0.4016, "step": 10695 }, { "epoch": 2.4953358208955225, "grad_norm": 0.3409582170593146, "learning_rate": 2.9427197971132886e-05, "loss": 0.3966, "step": 10700 }, { "epoch": 2.496501865671642, "grad_norm": 0.32065587569769405, "learning_rate": 2.9409908990258812e-05, "loss": 0.3997, "step": 10705 }, { "epoch": 2.497667910447761, "grad_norm": 0.31995324751754234, "learning_rate": 2.9392618873444112e-05, "loss": 0.4037, "step": 10710 }, { "epoch": 2.4988339552238807, "grad_norm": 0.3498064138015017, "learning_rate": 2.937532763097227e-05, "loss": 0.4141, "step": 10715 }, { "epoch": 2.5, "grad_norm": 0.3384621293378265, "learning_rate": 2.9358035273127483e-05, "loss": 0.3816, "step": 10720 }, { "epoch": 2.5011660447761193, "grad_norm": 0.34214690141611453, "learning_rate": 2.934074181019455e-05, "loss": 0.3886, "step": 10725 }, { "epoch": 2.502332089552239, "grad_norm": 0.33582287074367945, "learning_rate": 2.9323447252458986e-05, "loss": 0.3956, "step": 10730 }, { "epoch": 2.503498134328358, "grad_norm": 0.3594059753627121, "learning_rate": 2.9306151610206916e-05, "loss": 0.4089, "step": 10735 }, { "epoch": 2.5046641791044775, "grad_norm": 0.35619172065572746, "learning_rate": 2.9288854893725128e-05, "loss": 0.3992, "step": 10740 }, { "epoch": 2.505830223880597, "grad_norm": 0.3268673798294333, "learning_rate": 2.9271557113301047e-05, "loss": 0.3965, "step": 10745 }, { "epoch": 2.5069962686567164, "grad_norm": 0.3355143444336588, "learning_rate": 2.9254258279222724e-05, "loss": 0.3999, "step": 10750 }, { "epoch": 2.5081623134328357, "grad_norm": 0.32549204242638863, "learning_rate": 2.9236958401778854e-05, "loss": 0.3998, "step": 10755 }, { "epoch": 2.5093283582089554, "grad_norm": 0.34416651578044116, "learning_rate": 2.921965749125873e-05, "loss": 0.4179, "step": 10760 }, { "epoch": 2.5104944029850746, "grad_norm": 0.3605924045910323, "learning_rate": 2.920235555795227e-05, "loss": 0.3957, "step": 10765 }, { "epoch": 2.511660447761194, "grad_norm": 0.34790625946822856, "learning_rate": 2.9185052612150004e-05, "loss": 0.3936, "step": 10770 }, { "epoch": 2.5128264925373136, "grad_norm": 0.32786396747662133, "learning_rate": 2.9167748664143067e-05, "loss": 0.3968, "step": 10775 }, { "epoch": 2.513992537313433, "grad_norm": 0.4410945382251137, "learning_rate": 2.9150443724223174e-05, "loss": 0.3867, "step": 10780 }, { "epoch": 2.515158582089552, "grad_norm": 0.33370436734201, "learning_rate": 2.9133137802682646e-05, "loss": 0.3982, "step": 10785 }, { "epoch": 2.5163246268656714, "grad_norm": 0.32308352973684296, "learning_rate": 2.9115830909814374e-05, "loss": 0.4055, "step": 10790 }, { "epoch": 2.517490671641791, "grad_norm": 0.3091339519323906, "learning_rate": 2.909852305591184e-05, "loss": 0.3917, "step": 10795 }, { "epoch": 2.5186567164179103, "grad_norm": 0.3476782282482589, "learning_rate": 2.9081214251269095e-05, "loss": 0.4129, "step": 10800 }, { "epoch": 2.51982276119403, "grad_norm": 0.3343136586693371, "learning_rate": 2.9063904506180746e-05, "loss": 0.3969, "step": 10805 }, { "epoch": 2.5209888059701493, "grad_norm": 0.33647997245022454, "learning_rate": 2.904659383094197e-05, "loss": 0.4127, "step": 10810 }, { "epoch": 2.5221548507462686, "grad_norm": 0.3597715766647747, "learning_rate": 2.902928223584848e-05, "loss": 0.3866, "step": 10815 }, { "epoch": 2.523320895522388, "grad_norm": 0.3325219298612197, "learning_rate": 2.9011969731196565e-05, "loss": 0.3959, "step": 10820 }, { "epoch": 2.5244869402985075, "grad_norm": 0.32310679698430905, "learning_rate": 2.8994656327283036e-05, "loss": 0.3814, "step": 10825 }, { "epoch": 2.5256529850746268, "grad_norm": 0.3261197381788878, "learning_rate": 2.897734203440524e-05, "loss": 0.3921, "step": 10830 }, { "epoch": 2.5268190298507465, "grad_norm": 0.33835951605396203, "learning_rate": 2.8960026862861057e-05, "loss": 0.4002, "step": 10835 }, { "epoch": 2.5279850746268657, "grad_norm": 0.33359168610897627, "learning_rate": 2.894271082294887e-05, "loss": 0.4054, "step": 10840 }, { "epoch": 2.529151119402985, "grad_norm": 0.30907345255010304, "learning_rate": 2.8925393924967615e-05, "loss": 0.3803, "step": 10845 }, { "epoch": 2.5303171641791042, "grad_norm": 0.3252368104046308, "learning_rate": 2.8908076179216715e-05, "loss": 0.3909, "step": 10850 }, { "epoch": 2.531483208955224, "grad_norm": 0.3292986540454282, "learning_rate": 2.88907575959961e-05, "loss": 0.4068, "step": 10855 }, { "epoch": 2.532649253731343, "grad_norm": 0.32859313146762636, "learning_rate": 2.8873438185606194e-05, "loss": 0.3959, "step": 10860 }, { "epoch": 2.533815298507463, "grad_norm": 0.33746489817943187, "learning_rate": 2.8856117958347923e-05, "loss": 0.4025, "step": 10865 }, { "epoch": 2.534981343283582, "grad_norm": 0.3153155014164979, "learning_rate": 2.8838796924522694e-05, "loss": 0.3906, "step": 10870 }, { "epoch": 2.5361473880597014, "grad_norm": 0.33923991365508954, "learning_rate": 2.8821475094432393e-05, "loss": 0.4081, "step": 10875 }, { "epoch": 2.5373134328358207, "grad_norm": 0.3186874297831969, "learning_rate": 2.8804152478379377e-05, "loss": 0.3999, "step": 10880 }, { "epoch": 2.5384794776119404, "grad_norm": 0.31733276566443414, "learning_rate": 2.8786829086666483e-05, "loss": 0.3825, "step": 10885 }, { "epoch": 2.5396455223880596, "grad_norm": 0.33228644986673356, "learning_rate": 2.8769504929596986e-05, "loss": 0.4013, "step": 10890 }, { "epoch": 2.5408115671641793, "grad_norm": 0.3366748932350541, "learning_rate": 2.8752180017474646e-05, "loss": 0.4124, "step": 10895 }, { "epoch": 2.5419776119402986, "grad_norm": 0.32734712036855396, "learning_rate": 2.8734854360603646e-05, "loss": 0.408, "step": 10900 }, { "epoch": 2.543143656716418, "grad_norm": 0.3181618803084983, "learning_rate": 2.8717527969288632e-05, "loss": 0.3816, "step": 10905 }, { "epoch": 2.544309701492537, "grad_norm": 0.3361742979419798, "learning_rate": 2.870020085383466e-05, "loss": 0.4011, "step": 10910 }, { "epoch": 2.545475746268657, "grad_norm": 0.3324159163938773, "learning_rate": 2.868287302454725e-05, "loss": 0.4101, "step": 10915 }, { "epoch": 2.546641791044776, "grad_norm": 0.3277822050735822, "learning_rate": 2.8665544491732315e-05, "loss": 0.4003, "step": 10920 }, { "epoch": 2.5478078358208958, "grad_norm": 0.3724442747185561, "learning_rate": 2.8648215265696227e-05, "loss": 0.4073, "step": 10925 }, { "epoch": 2.548973880597015, "grad_norm": 0.32836599322795856, "learning_rate": 2.8630885356745716e-05, "loss": 0.41, "step": 10930 }, { "epoch": 2.5501399253731343, "grad_norm": 0.33133400987168266, "learning_rate": 2.8613554775187962e-05, "loss": 0.38, "step": 10935 }, { "epoch": 2.5513059701492535, "grad_norm": 0.3448970138977121, "learning_rate": 2.859622353133054e-05, "loss": 0.4016, "step": 10940 }, { "epoch": 2.5524720149253732, "grad_norm": 0.35279193257714214, "learning_rate": 2.8578891635481387e-05, "loss": 0.4081, "step": 10945 }, { "epoch": 2.5536380597014925, "grad_norm": 0.3243935245847756, "learning_rate": 2.8561559097948863e-05, "loss": 0.4039, "step": 10950 }, { "epoch": 2.554804104477612, "grad_norm": 0.3248898066941752, "learning_rate": 2.8544225929041697e-05, "loss": 0.3902, "step": 10955 }, { "epoch": 2.5559701492537314, "grad_norm": 0.32629310243655124, "learning_rate": 2.852689213906899e-05, "loss": 0.4117, "step": 10960 }, { "epoch": 2.5571361940298507, "grad_norm": 0.3150391771824354, "learning_rate": 2.850955773834022e-05, "loss": 0.3839, "step": 10965 }, { "epoch": 2.55830223880597, "grad_norm": 0.31956479026051104, "learning_rate": 2.849222273716522e-05, "loss": 0.3843, "step": 10970 }, { "epoch": 2.5594682835820897, "grad_norm": 0.3367837851435414, "learning_rate": 2.8474887145854183e-05, "loss": 0.4005, "step": 10975 }, { "epoch": 2.560634328358209, "grad_norm": 0.32271229350305947, "learning_rate": 2.8457550974717655e-05, "loss": 0.4051, "step": 10980 }, { "epoch": 2.5618003731343286, "grad_norm": 0.3491479293926865, "learning_rate": 2.8440214234066524e-05, "loss": 0.3888, "step": 10985 }, { "epoch": 2.562966417910448, "grad_norm": 0.3681124864150189, "learning_rate": 2.8422876934212027e-05, "loss": 0.4028, "step": 10990 }, { "epoch": 2.564132462686567, "grad_norm": 0.35561386881539514, "learning_rate": 2.8405539085465717e-05, "loss": 0.415, "step": 10995 }, { "epoch": 2.5652985074626864, "grad_norm": 0.34146255984065516, "learning_rate": 2.8388200698139484e-05, "loss": 0.4051, "step": 11000 }, { "epoch": 2.566464552238806, "grad_norm": 0.34126450314668866, "learning_rate": 2.8370861782545537e-05, "loss": 0.3842, "step": 11005 }, { "epoch": 2.5676305970149254, "grad_norm": 0.33846587831377495, "learning_rate": 2.8353522348996388e-05, "loss": 0.4034, "step": 11010 }, { "epoch": 2.5687966417910446, "grad_norm": 0.33056921381933013, "learning_rate": 2.8336182407804886e-05, "loss": 0.4055, "step": 11015 }, { "epoch": 2.5699626865671643, "grad_norm": 0.31770192838128103, "learning_rate": 2.8318841969284145e-05, "loss": 0.3969, "step": 11020 }, { "epoch": 2.5711287313432836, "grad_norm": 0.3422252535274209, "learning_rate": 2.8301501043747608e-05, "loss": 0.3995, "step": 11025 }, { "epoch": 2.572294776119403, "grad_norm": 0.33408660146302854, "learning_rate": 2.8284159641508972e-05, "loss": 0.3928, "step": 11030 }, { "epoch": 2.5734608208955225, "grad_norm": 0.3056766347046411, "learning_rate": 2.826681777288226e-05, "loss": 0.3916, "step": 11035 }, { "epoch": 2.574626865671642, "grad_norm": 0.33128605438275327, "learning_rate": 2.824947544818175e-05, "loss": 0.4001, "step": 11040 }, { "epoch": 2.575792910447761, "grad_norm": 0.3184401376927514, "learning_rate": 2.8232132677721972e-05, "loss": 0.3925, "step": 11045 }, { "epoch": 2.5769589552238807, "grad_norm": 0.3524473188642416, "learning_rate": 2.8214789471817754e-05, "loss": 0.3978, "step": 11050 }, { "epoch": 2.578125, "grad_norm": 0.3258655834426742, "learning_rate": 2.819744584078417e-05, "loss": 0.4147, "step": 11055 }, { "epoch": 2.5792910447761193, "grad_norm": 0.3258583122709031, "learning_rate": 2.8180101794936542e-05, "loss": 0.4042, "step": 11060 }, { "epoch": 2.580457089552239, "grad_norm": 0.33982337808957086, "learning_rate": 2.8162757344590445e-05, "loss": 0.4083, "step": 11065 }, { "epoch": 2.581623134328358, "grad_norm": 0.32406290550327665, "learning_rate": 2.8145412500061702e-05, "loss": 0.3791, "step": 11070 }, { "epoch": 2.5827891791044775, "grad_norm": 0.3119583147853536, "learning_rate": 2.812806727166635e-05, "loss": 0.4092, "step": 11075 }, { "epoch": 2.583955223880597, "grad_norm": 0.3196499398717737, "learning_rate": 2.8110721669720663e-05, "loss": 0.3857, "step": 11080 }, { "epoch": 2.5851212686567164, "grad_norm": 0.32554825492053696, "learning_rate": 2.8093375704541158e-05, "loss": 0.3845, "step": 11085 }, { "epoch": 2.5862873134328357, "grad_norm": 0.38251568384629714, "learning_rate": 2.8076029386444524e-05, "loss": 0.3994, "step": 11090 }, { "epoch": 2.5874533582089554, "grad_norm": 0.3199323482279115, "learning_rate": 2.805868272574771e-05, "loss": 0.3858, "step": 11095 }, { "epoch": 2.5886194029850746, "grad_norm": 0.34292229990871886, "learning_rate": 2.804133573276783e-05, "loss": 0.4276, "step": 11100 }, { "epoch": 2.589785447761194, "grad_norm": 0.32460126706512665, "learning_rate": 2.8023988417822222e-05, "loss": 0.3921, "step": 11105 }, { "epoch": 2.5909514925373136, "grad_norm": 0.32353862849431725, "learning_rate": 2.800664079122839e-05, "loss": 0.4138, "step": 11110 }, { "epoch": 2.592117537313433, "grad_norm": 0.33458661213733015, "learning_rate": 2.7989292863304045e-05, "loss": 0.3962, "step": 11115 }, { "epoch": 2.593283582089552, "grad_norm": 0.36347875244808336, "learning_rate": 2.7971944644367066e-05, "loss": 0.4038, "step": 11120 }, { "epoch": 2.5944496268656714, "grad_norm": 0.316560823284915, "learning_rate": 2.7954596144735512e-05, "loss": 0.3906, "step": 11125 }, { "epoch": 2.595615671641791, "grad_norm": 0.3498918106403532, "learning_rate": 2.79372473747276e-05, "loss": 0.3973, "step": 11130 }, { "epoch": 2.5967817164179103, "grad_norm": 0.3469956114538937, "learning_rate": 2.7919898344661723e-05, "loss": 0.4115, "step": 11135 }, { "epoch": 2.59794776119403, "grad_norm": 0.34961336786912744, "learning_rate": 2.7902549064856405e-05, "loss": 0.3979, "step": 11140 }, { "epoch": 2.5991138059701493, "grad_norm": 0.3294875112462785, "learning_rate": 2.7885199545630343e-05, "loss": 0.4141, "step": 11145 }, { "epoch": 2.6002798507462686, "grad_norm": 0.321734211773879, "learning_rate": 2.7867849797302357e-05, "loss": 0.4062, "step": 11150 }, { "epoch": 2.601445895522388, "grad_norm": 0.3571711501997368, "learning_rate": 2.785049983019143e-05, "loss": 0.4036, "step": 11155 }, { "epoch": 2.6026119402985075, "grad_norm": 0.34531340164768715, "learning_rate": 2.7833149654616637e-05, "loss": 0.4019, "step": 11160 }, { "epoch": 2.6037779850746268, "grad_norm": 0.3255306297638485, "learning_rate": 2.7815799280897202e-05, "loss": 0.388, "step": 11165 }, { "epoch": 2.6049440298507465, "grad_norm": 0.3481015414142927, "learning_rate": 2.7798448719352467e-05, "loss": 0.4003, "step": 11170 }, { "epoch": 2.6061100746268657, "grad_norm": 0.31532168438580027, "learning_rate": 2.7781097980301878e-05, "loss": 0.3791, "step": 11175 }, { "epoch": 2.607276119402985, "grad_norm": 0.3763784149688322, "learning_rate": 2.7763747074065e-05, "loss": 0.4038, "step": 11180 }, { "epoch": 2.6084421641791042, "grad_norm": 0.31892940667405195, "learning_rate": 2.7746396010961462e-05, "loss": 0.3865, "step": 11185 }, { "epoch": 2.609608208955224, "grad_norm": 0.3557532967762879, "learning_rate": 2.7729044801311032e-05, "loss": 0.3985, "step": 11190 }, { "epoch": 2.610774253731343, "grad_norm": 0.3360995728639955, "learning_rate": 2.7711693455433534e-05, "loss": 0.3877, "step": 11195 }, { "epoch": 2.611940298507463, "grad_norm": 0.3491493918558091, "learning_rate": 2.7694341983648884e-05, "loss": 0.4065, "step": 11200 }, { "epoch": 2.613106343283582, "grad_norm": 0.34104265174432297, "learning_rate": 2.7676990396277085e-05, "loss": 0.4191, "step": 11205 }, { "epoch": 2.6142723880597014, "grad_norm": 0.3343339198378052, "learning_rate": 2.7659638703638173e-05, "loss": 0.4, "step": 11210 }, { "epoch": 2.6154384328358207, "grad_norm": 0.3677754643152509, "learning_rate": 2.764228691605229e-05, "loss": 0.4079, "step": 11215 }, { "epoch": 2.6166044776119404, "grad_norm": 0.31721255439706836, "learning_rate": 2.76249350438396e-05, "loss": 0.3866, "step": 11220 }, { "epoch": 2.6177705223880596, "grad_norm": 0.32511204163275603, "learning_rate": 2.7607583097320345e-05, "loss": 0.4018, "step": 11225 }, { "epoch": 2.6189365671641793, "grad_norm": 0.3644919707231653, "learning_rate": 2.7590231086814782e-05, "loss": 0.395, "step": 11230 }, { "epoch": 2.6201026119402986, "grad_norm": 0.3186136651250511, "learning_rate": 2.7572879022643228e-05, "loss": 0.3832, "step": 11235 }, { "epoch": 2.621268656716418, "grad_norm": 0.3396535926877513, "learning_rate": 2.7555526915126033e-05, "loss": 0.3882, "step": 11240 }, { "epoch": 2.622434701492537, "grad_norm": 0.3162307242754689, "learning_rate": 2.7538174774583552e-05, "loss": 0.3704, "step": 11245 }, { "epoch": 2.623600746268657, "grad_norm": 0.33208467844912376, "learning_rate": 2.7520822611336176e-05, "loss": 0.4047, "step": 11250 }, { "epoch": 2.624766791044776, "grad_norm": 0.32975040235303565, "learning_rate": 2.7503470435704322e-05, "loss": 0.3803, "step": 11255 }, { "epoch": 2.6259328358208958, "grad_norm": 0.32702536417309136, "learning_rate": 2.7486118258008374e-05, "loss": 0.3966, "step": 11260 }, { "epoch": 2.627098880597015, "grad_norm": 0.32603898335853465, "learning_rate": 2.746876608856876e-05, "loss": 0.3937, "step": 11265 }, { "epoch": 2.6282649253731343, "grad_norm": 0.32761402145244256, "learning_rate": 2.7451413937705878e-05, "loss": 0.3789, "step": 11270 }, { "epoch": 2.6294309701492535, "grad_norm": 0.3258060309573296, "learning_rate": 2.743406181574012e-05, "loss": 0.4005, "step": 11275 }, { "epoch": 2.6305970149253732, "grad_norm": 0.3492841677010907, "learning_rate": 2.7416709732991863e-05, "loss": 0.4047, "step": 11280 }, { "epoch": 2.6317630597014925, "grad_norm": 0.350072312532316, "learning_rate": 2.7399357699781477e-05, "loss": 0.4096, "step": 11285 }, { "epoch": 2.632929104477612, "grad_norm": 0.30080685045019406, "learning_rate": 2.7382005726429256e-05, "loss": 0.382, "step": 11290 }, { "epoch": 2.6340951492537314, "grad_norm": 0.33943411424315834, "learning_rate": 2.736465382325551e-05, "loss": 0.3991, "step": 11295 }, { "epoch": 2.6352611940298507, "grad_norm": 0.33228527249028883, "learning_rate": 2.7347302000580475e-05, "loss": 0.4053, "step": 11300 }, { "epoch": 2.63642723880597, "grad_norm": 0.40510683583736945, "learning_rate": 2.7329950268724358e-05, "loss": 0.3939, "step": 11305 }, { "epoch": 2.6375932835820897, "grad_norm": 0.3317850795628135, "learning_rate": 2.7312598638007308e-05, "loss": 0.391, "step": 11310 }, { "epoch": 2.638759328358209, "grad_norm": 0.34005872267908127, "learning_rate": 2.7295247118749395e-05, "loss": 0.4119, "step": 11315 }, { "epoch": 2.6399253731343286, "grad_norm": 0.3534443990767683, "learning_rate": 2.727789572127064e-05, "loss": 0.4069, "step": 11320 }, { "epoch": 2.641091417910448, "grad_norm": 0.3235925758605551, "learning_rate": 2.7260544455890996e-05, "loss": 0.3915, "step": 11325 }, { "epoch": 2.642257462686567, "grad_norm": 0.30904204184094364, "learning_rate": 2.724319333293033e-05, "loss": 0.388, "step": 11330 }, { "epoch": 2.6434235074626864, "grad_norm": 0.32952150866860735, "learning_rate": 2.7225842362708427e-05, "loss": 0.3904, "step": 11335 }, { "epoch": 2.644589552238806, "grad_norm": 0.32308363663222794, "learning_rate": 2.7208491555544964e-05, "loss": 0.3914, "step": 11340 }, { "epoch": 2.6457555970149254, "grad_norm": 0.320582004158062, "learning_rate": 2.7191140921759546e-05, "loss": 0.4147, "step": 11345 }, { "epoch": 2.6469216417910446, "grad_norm": 0.3142131125665813, "learning_rate": 2.7173790471671662e-05, "loss": 0.3907, "step": 11350 }, { "epoch": 2.6480876865671643, "grad_norm": 0.3613758839320645, "learning_rate": 2.7156440215600703e-05, "loss": 0.3989, "step": 11355 }, { "epoch": 2.6492537313432836, "grad_norm": 0.3268037278177219, "learning_rate": 2.7139090163865932e-05, "loss": 0.4028, "step": 11360 }, { "epoch": 2.650419776119403, "grad_norm": 0.34418869429663473, "learning_rate": 2.712174032678648e-05, "loss": 0.4006, "step": 11365 }, { "epoch": 2.6515858208955225, "grad_norm": 0.34411292220486966, "learning_rate": 2.7104390714681393e-05, "loss": 0.4064, "step": 11370 }, { "epoch": 2.652751865671642, "grad_norm": 0.31855509220399103, "learning_rate": 2.7087041337869522e-05, "loss": 0.4084, "step": 11375 }, { "epoch": 2.653917910447761, "grad_norm": 0.39000246313061576, "learning_rate": 2.7069692206669633e-05, "loss": 0.4004, "step": 11380 }, { "epoch": 2.6550839552238807, "grad_norm": 0.3520022397137946, "learning_rate": 2.7052343331400322e-05, "loss": 0.4011, "step": 11385 }, { "epoch": 2.65625, "grad_norm": 0.31755249099356897, "learning_rate": 2.7034994722380036e-05, "loss": 0.4028, "step": 11390 }, { "epoch": 2.6574160447761193, "grad_norm": 0.3261361077644442, "learning_rate": 2.701764638992705e-05, "loss": 0.3844, "step": 11395 }, { "epoch": 2.658582089552239, "grad_norm": 0.3537403079466135, "learning_rate": 2.7000298344359494e-05, "loss": 0.3916, "step": 11400 }, { "epoch": 2.659748134328358, "grad_norm": 0.32627948445058835, "learning_rate": 2.6982950595995315e-05, "loss": 0.4105, "step": 11405 }, { "epoch": 2.6609141791044775, "grad_norm": 0.3389767530228438, "learning_rate": 2.6965603155152302e-05, "loss": 0.3882, "step": 11410 }, { "epoch": 2.662080223880597, "grad_norm": 0.3322576814889697, "learning_rate": 2.6948256032148052e-05, "loss": 0.4068, "step": 11415 }, { "epoch": 2.6632462686567164, "grad_norm": 0.3426517578214566, "learning_rate": 2.6930909237299934e-05, "loss": 0.4031, "step": 11420 }, { "epoch": 2.6644123134328357, "grad_norm": 0.32799301808104175, "learning_rate": 2.691356278092519e-05, "loss": 0.3885, "step": 11425 }, { "epoch": 2.6655783582089554, "grad_norm": 0.3233374574722326, "learning_rate": 2.6896216673340814e-05, "loss": 0.4049, "step": 11430 }, { "epoch": 2.6667444029850746, "grad_norm": 0.3131013971210666, "learning_rate": 2.687887092486361e-05, "loss": 0.3908, "step": 11435 }, { "epoch": 2.667910447761194, "grad_norm": 0.34802518428559803, "learning_rate": 2.686152554581016e-05, "loss": 0.4375, "step": 11440 }, { "epoch": 2.6690764925373136, "grad_norm": 0.3352028409611966, "learning_rate": 2.6844180546496833e-05, "loss": 0.3954, "step": 11445 }, { "epoch": 2.670242537313433, "grad_norm": 0.34116821999145536, "learning_rate": 2.682683593723977e-05, "loss": 0.4029, "step": 11450 }, { "epoch": 2.671408582089552, "grad_norm": 0.31866260815082276, "learning_rate": 2.680949172835487e-05, "loss": 0.3812, "step": 11455 }, { "epoch": 2.6725746268656714, "grad_norm": 0.3071887351168522, "learning_rate": 2.6792147930157812e-05, "loss": 0.4016, "step": 11460 }, { "epoch": 2.673740671641791, "grad_norm": 0.35257332177523143, "learning_rate": 2.6774804552964034e-05, "loss": 0.4032, "step": 11465 }, { "epoch": 2.6749067164179103, "grad_norm": 0.3303925221970838, "learning_rate": 2.6757461607088692e-05, "loss": 0.4003, "step": 11470 }, { "epoch": 2.67607276119403, "grad_norm": 0.32544472846675615, "learning_rate": 2.6740119102846707e-05, "loss": 0.4123, "step": 11475 }, { "epoch": 2.6772388059701493, "grad_norm": 0.33468831151919165, "learning_rate": 2.6722777050552737e-05, "loss": 0.4063, "step": 11480 }, { "epoch": 2.6784048507462686, "grad_norm": 0.33652628557284775, "learning_rate": 2.6705435460521177e-05, "loss": 0.4002, "step": 11485 }, { "epoch": 2.679570895522388, "grad_norm": 0.327341712433127, "learning_rate": 2.668809434306615e-05, "loss": 0.4015, "step": 11490 }, { "epoch": 2.6807369402985075, "grad_norm": 0.3122521221151579, "learning_rate": 2.6670753708501454e-05, "loss": 0.3952, "step": 11495 }, { "epoch": 2.6819029850746268, "grad_norm": 0.4599505403758927, "learning_rate": 2.6653413567140668e-05, "loss": 0.4086, "step": 11500 }, { "epoch": 2.6830690298507465, "grad_norm": 0.3389549123423084, "learning_rate": 2.6636073929297018e-05, "loss": 0.3937, "step": 11505 }, { "epoch": 2.6842350746268657, "grad_norm": 0.35693314172674717, "learning_rate": 2.661873480528347e-05, "loss": 0.39, "step": 11510 }, { "epoch": 2.685401119402985, "grad_norm": 0.34909238230751594, "learning_rate": 2.660139620541267e-05, "loss": 0.4118, "step": 11515 }, { "epoch": 2.6865671641791042, "grad_norm": 0.3398210153849959, "learning_rate": 2.6584058139996942e-05, "loss": 0.4042, "step": 11520 }, { "epoch": 2.687733208955224, "grad_norm": 0.33475580925794657, "learning_rate": 2.656672061934831e-05, "loss": 0.3982, "step": 11525 }, { "epoch": 2.688899253731343, "grad_norm": 0.31449648136065284, "learning_rate": 2.654938365377847e-05, "loss": 0.3907, "step": 11530 }, { "epoch": 2.690065298507463, "grad_norm": 0.34235536529933425, "learning_rate": 2.6532047253598776e-05, "loss": 0.4042, "step": 11535 }, { "epoch": 2.691231343283582, "grad_norm": 0.3391407943414869, "learning_rate": 2.651471142912026e-05, "loss": 0.3932, "step": 11540 }, { "epoch": 2.6923973880597014, "grad_norm": 0.33897338575542646, "learning_rate": 2.6497376190653607e-05, "loss": 0.3924, "step": 11545 }, { "epoch": 2.6935634328358207, "grad_norm": 0.3291808731993279, "learning_rate": 2.6480041548509137e-05, "loss": 0.3923, "step": 11550 }, { "epoch": 2.6947294776119404, "grad_norm": 0.32127289283311683, "learning_rate": 2.6462707512996847e-05, "loss": 0.3873, "step": 11555 }, { "epoch": 2.6958955223880596, "grad_norm": 0.3193042644399082, "learning_rate": 2.644537409442635e-05, "loss": 0.3831, "step": 11560 }, { "epoch": 2.6970615671641793, "grad_norm": 0.3330610435175032, "learning_rate": 2.642804130310691e-05, "loss": 0.3932, "step": 11565 }, { "epoch": 2.6982276119402986, "grad_norm": 0.3256751410441129, "learning_rate": 2.6410709149347385e-05, "loss": 0.398, "step": 11570 }, { "epoch": 2.699393656716418, "grad_norm": 0.34017827821819047, "learning_rate": 2.6393377643456284e-05, "loss": 0.4003, "step": 11575 }, { "epoch": 2.700559701492537, "grad_norm": 0.3422711435713736, "learning_rate": 2.6376046795741733e-05, "loss": 0.3884, "step": 11580 }, { "epoch": 2.701725746268657, "grad_norm": 0.3557609765443232, "learning_rate": 2.6358716616511446e-05, "loss": 0.3985, "step": 11585 }, { "epoch": 2.702891791044776, "grad_norm": 0.3294102070107835, "learning_rate": 2.6341387116072763e-05, "loss": 0.3767, "step": 11590 }, { "epoch": 2.7040578358208958, "grad_norm": 0.3476481571980942, "learning_rate": 2.6324058304732574e-05, "loss": 0.3897, "step": 11595 }, { "epoch": 2.705223880597015, "grad_norm": 0.3180216864445747, "learning_rate": 2.630673019279742e-05, "loss": 0.3939, "step": 11600 }, { "epoch": 2.7063899253731343, "grad_norm": 0.3447200668148722, "learning_rate": 2.6289402790573392e-05, "loss": 0.3945, "step": 11605 }, { "epoch": 2.7075559701492535, "grad_norm": 0.3169899581728641, "learning_rate": 2.6272076108366163e-05, "loss": 0.3975, "step": 11610 }, { "epoch": 2.7087220149253732, "grad_norm": 0.35037596538415655, "learning_rate": 2.6254750156480973e-05, "loss": 0.4174, "step": 11615 }, { "epoch": 2.7098880597014925, "grad_norm": 0.3295914190684624, "learning_rate": 2.623742494522264e-05, "loss": 0.3884, "step": 11620 }, { "epoch": 2.711054104477612, "grad_norm": 0.3216119293129583, "learning_rate": 2.6220100484895527e-05, "loss": 0.3957, "step": 11625 }, { "epoch": 2.7122201492537314, "grad_norm": 0.3322297246793832, "learning_rate": 2.620277678580358e-05, "loss": 0.4197, "step": 11630 }, { "epoch": 2.7133861940298507, "grad_norm": 0.34768686891971406, "learning_rate": 2.6185453858250242e-05, "loss": 0.409, "step": 11635 }, { "epoch": 2.71455223880597, "grad_norm": 0.334971956806543, "learning_rate": 2.616813171253855e-05, "loss": 0.3836, "step": 11640 }, { "epoch": 2.7157182835820897, "grad_norm": 0.34111962346610814, "learning_rate": 2.615081035897104e-05, "loss": 0.3807, "step": 11645 }, { "epoch": 2.716884328358209, "grad_norm": 0.3195901275859157, "learning_rate": 2.6133489807849786e-05, "loss": 0.3797, "step": 11650 }, { "epoch": 2.7180503731343286, "grad_norm": 0.32013670350785867, "learning_rate": 2.6116170069476397e-05, "loss": 0.3788, "step": 11655 }, { "epoch": 2.719216417910448, "grad_norm": 0.47011368906524376, "learning_rate": 2.609885115415198e-05, "loss": 0.3962, "step": 11660 }, { "epoch": 2.720382462686567, "grad_norm": 0.32982538837259384, "learning_rate": 2.6081533072177183e-05, "loss": 0.4108, "step": 11665 }, { "epoch": 2.7215485074626864, "grad_norm": 0.35363158156051727, "learning_rate": 2.6064215833852113e-05, "loss": 0.3957, "step": 11670 }, { "epoch": 2.722714552238806, "grad_norm": 0.3168259313377346, "learning_rate": 2.6046899449476397e-05, "loss": 0.4056, "step": 11675 }, { "epoch": 2.7238805970149254, "grad_norm": 0.31529162330079885, "learning_rate": 2.602958392934917e-05, "loss": 0.3823, "step": 11680 }, { "epoch": 2.7250466417910446, "grad_norm": 0.34805762786353617, "learning_rate": 2.601226928376904e-05, "loss": 0.4105, "step": 11685 }, { "epoch": 2.7262126865671643, "grad_norm": 0.3304407121033633, "learning_rate": 2.5994955523034098e-05, "loss": 0.3743, "step": 11690 }, { "epoch": 2.7273787313432836, "grad_norm": 0.3513517133228261, "learning_rate": 2.5977642657441893e-05, "loss": 0.4106, "step": 11695 }, { "epoch": 2.728544776119403, "grad_norm": 0.3403043760631628, "learning_rate": 2.5960330697289447e-05, "loss": 0.4063, "step": 11700 }, { "epoch": 2.7297108208955225, "grad_norm": 0.31497780385390123, "learning_rate": 2.5943019652873267e-05, "loss": 0.4014, "step": 11705 }, { "epoch": 2.730876865671642, "grad_norm": 0.3049103415259645, "learning_rate": 2.5925709534489295e-05, "loss": 0.371, "step": 11710 }, { "epoch": 2.732042910447761, "grad_norm": 0.3444127101710014, "learning_rate": 2.5908400352432927e-05, "loss": 0.3947, "step": 11715 }, { "epoch": 2.7332089552238807, "grad_norm": 0.3337247336672335, "learning_rate": 2.589109211699899e-05, "loss": 0.4007, "step": 11720 }, { "epoch": 2.734375, "grad_norm": 0.32567021716465516, "learning_rate": 2.5873784838481762e-05, "loss": 0.3925, "step": 11725 }, { "epoch": 2.7355410447761193, "grad_norm": 0.3260034023143692, "learning_rate": 2.5856478527174955e-05, "loss": 0.3907, "step": 11730 }, { "epoch": 2.736707089552239, "grad_norm": 0.3481198984833565, "learning_rate": 2.5839173193371697e-05, "loss": 0.4046, "step": 11735 }, { "epoch": 2.737873134328358, "grad_norm": 0.33180708473431747, "learning_rate": 2.5821868847364534e-05, "loss": 0.3944, "step": 11740 }, { "epoch": 2.7390391791044775, "grad_norm": 0.37912682432222555, "learning_rate": 2.5804565499445437e-05, "loss": 0.4046, "step": 11745 }, { "epoch": 2.740205223880597, "grad_norm": 0.34226470716209806, "learning_rate": 2.578726315990576e-05, "loss": 0.3985, "step": 11750 }, { "epoch": 2.7413712686567164, "grad_norm": 0.32678430005638676, "learning_rate": 2.5769961839036277e-05, "loss": 0.3896, "step": 11755 }, { "epoch": 2.7425373134328357, "grad_norm": 0.33792757870111595, "learning_rate": 2.575266154712715e-05, "loss": 0.4046, "step": 11760 }, { "epoch": 2.7437033582089554, "grad_norm": 0.33741924695677405, "learning_rate": 2.5735362294467928e-05, "loss": 0.435, "step": 11765 }, { "epoch": 2.7448694029850746, "grad_norm": 0.33921663121838525, "learning_rate": 2.571806409134756e-05, "loss": 0.3997, "step": 11770 }, { "epoch": 2.746035447761194, "grad_norm": 0.31464119160756665, "learning_rate": 2.570076694805432e-05, "loss": 0.4003, "step": 11775 }, { "epoch": 2.7472014925373136, "grad_norm": 0.3544414565417629, "learning_rate": 2.5683470874875913e-05, "loss": 0.412, "step": 11780 }, { "epoch": 2.748367537313433, "grad_norm": 0.3167308552952958, "learning_rate": 2.566617588209937e-05, "loss": 0.3772, "step": 11785 }, { "epoch": 2.749533582089552, "grad_norm": 0.3311358398906818, "learning_rate": 2.564888198001109e-05, "loss": 0.4004, "step": 11790 }, { "epoch": 2.7506996268656714, "grad_norm": 0.3212483813375164, "learning_rate": 2.563158917889683e-05, "loss": 0.4057, "step": 11795 }, { "epoch": 2.751865671641791, "grad_norm": 0.3479030879680861, "learning_rate": 2.5614297489041673e-05, "loss": 0.4092, "step": 11800 }, { "epoch": 2.7530317164179103, "grad_norm": 0.33158240582876686, "learning_rate": 2.559700692073006e-05, "loss": 0.3842, "step": 11805 }, { "epoch": 2.75419776119403, "grad_norm": 0.3286502900384725, "learning_rate": 2.5579717484245756e-05, "loss": 0.372, "step": 11810 }, { "epoch": 2.7553638059701493, "grad_norm": 0.3454572183227963, "learning_rate": 2.556242918987185e-05, "loss": 0.4046, "step": 11815 }, { "epoch": 2.7565298507462686, "grad_norm": 0.315371977694258, "learning_rate": 2.554514204789078e-05, "loss": 0.4007, "step": 11820 }, { "epoch": 2.757695895522388, "grad_norm": 0.3151912960465065, "learning_rate": 2.5527856068584244e-05, "loss": 0.3869, "step": 11825 }, { "epoch": 2.7588619402985075, "grad_norm": 0.3422704418232128, "learning_rate": 2.551057126223329e-05, "loss": 0.4056, "step": 11830 }, { "epoch": 2.7600279850746268, "grad_norm": 0.3284554562452858, "learning_rate": 2.5493287639118265e-05, "loss": 0.4066, "step": 11835 }, { "epoch": 2.7611940298507465, "grad_norm": 0.3115128050029284, "learning_rate": 2.54760052095188e-05, "loss": 0.3908, "step": 11840 }, { "epoch": 2.7623600746268657, "grad_norm": 0.3423318131846034, "learning_rate": 2.545872398371383e-05, "loss": 0.4091, "step": 11845 }, { "epoch": 2.763526119402985, "grad_norm": 0.3192794044388495, "learning_rate": 2.544144397198155e-05, "loss": 0.3883, "step": 11850 }, { "epoch": 2.7646921641791042, "grad_norm": 0.3298892515466483, "learning_rate": 2.5424165184599457e-05, "loss": 0.4, "step": 11855 }, { "epoch": 2.765858208955224, "grad_norm": 0.33594465530904233, "learning_rate": 2.5406887631844312e-05, "loss": 0.4043, "step": 11860 }, { "epoch": 2.767024253731343, "grad_norm": 0.34840887315760244, "learning_rate": 2.5389611323992134e-05, "loss": 0.3991, "step": 11865 }, { "epoch": 2.768190298507463, "grad_norm": 0.3659025813828882, "learning_rate": 2.5372336271318225e-05, "loss": 0.4036, "step": 11870 }, { "epoch": 2.769356343283582, "grad_norm": 0.3292366496998942, "learning_rate": 2.5355062484097103e-05, "loss": 0.4026, "step": 11875 }, { "epoch": 2.7705223880597014, "grad_norm": 0.3789236833035916, "learning_rate": 2.5337789972602566e-05, "loss": 0.397, "step": 11880 }, { "epoch": 2.7716884328358207, "grad_norm": 0.3193885477040352, "learning_rate": 2.5320518747107646e-05, "loss": 0.4056, "step": 11885 }, { "epoch": 2.7728544776119404, "grad_norm": 0.3452125973954222, "learning_rate": 2.530324881788459e-05, "loss": 0.4155, "step": 11890 }, { "epoch": 2.7740205223880596, "grad_norm": 0.32392224808503434, "learning_rate": 2.5285980195204906e-05, "loss": 0.3839, "step": 11895 }, { "epoch": 2.7751865671641793, "grad_norm": 0.3140824518357596, "learning_rate": 2.5268712889339296e-05, "loss": 0.3889, "step": 11900 }, { "epoch": 2.7763526119402986, "grad_norm": 0.34023526978279334, "learning_rate": 2.5251446910557704e-05, "loss": 0.4018, "step": 11905 }, { "epoch": 2.777518656716418, "grad_norm": 0.3163431386329222, "learning_rate": 2.5234182269129253e-05, "loss": 0.3931, "step": 11910 }, { "epoch": 2.778684701492537, "grad_norm": 0.33335623057536234, "learning_rate": 2.5216918975322303e-05, "loss": 0.3951, "step": 11915 }, { "epoch": 2.779850746268657, "grad_norm": 0.3358972294222299, "learning_rate": 2.519965703940441e-05, "loss": 0.3848, "step": 11920 }, { "epoch": 2.781016791044776, "grad_norm": 0.33874256012388476, "learning_rate": 2.5182396471642287e-05, "loss": 0.4076, "step": 11925 }, { "epoch": 2.7821828358208958, "grad_norm": 0.34139399168943446, "learning_rate": 2.5165137282301877e-05, "loss": 0.3993, "step": 11930 }, { "epoch": 2.783348880597015, "grad_norm": 0.3175169310964827, "learning_rate": 2.5147879481648266e-05, "loss": 0.3871, "step": 11935 }, { "epoch": 2.7845149253731343, "grad_norm": 0.32703895942532285, "learning_rate": 2.5130623079945754e-05, "loss": 0.3873, "step": 11940 }, { "epoch": 2.7856809701492535, "grad_norm": 0.3147451559140355, "learning_rate": 2.511336808745778e-05, "loss": 0.3761, "step": 11945 }, { "epoch": 2.7868470149253732, "grad_norm": 0.31512001651088684, "learning_rate": 2.5096114514446934e-05, "loss": 0.3929, "step": 11950 }, { "epoch": 2.7880130597014925, "grad_norm": 0.3431873108310587, "learning_rate": 2.5078862371175e-05, "loss": 0.4068, "step": 11955 }, { "epoch": 2.789179104477612, "grad_norm": 0.31343137159498574, "learning_rate": 2.5061611667902878e-05, "loss": 0.395, "step": 11960 }, { "epoch": 2.7903451492537314, "grad_norm": 0.3516000367218387, "learning_rate": 2.504436241489064e-05, "loss": 0.3799, "step": 11965 }, { "epoch": 2.7915111940298507, "grad_norm": 0.35051456582328566, "learning_rate": 2.5027114622397473e-05, "loss": 0.4243, "step": 11970 }, { "epoch": 2.79267723880597, "grad_norm": 0.3315836718030881, "learning_rate": 2.50098683006817e-05, "loss": 0.4206, "step": 11975 }, { "epoch": 2.7938432835820897, "grad_norm": 0.3141947828655213, "learning_rate": 2.4992623460000763e-05, "loss": 0.3982, "step": 11980 }, { "epoch": 2.795009328358209, "grad_norm": 0.3471648042578197, "learning_rate": 2.497538011061125e-05, "loss": 0.3947, "step": 11985 }, { "epoch": 2.7961753731343286, "grad_norm": 0.34129598811920847, "learning_rate": 2.495813826276884e-05, "loss": 0.4008, "step": 11990 }, { "epoch": 2.797341417910448, "grad_norm": 0.3455030171856007, "learning_rate": 2.4940897926728314e-05, "loss": 0.3918, "step": 11995 }, { "epoch": 2.798507462686567, "grad_norm": 0.34742618232423944, "learning_rate": 2.4923659112743576e-05, "loss": 0.3866, "step": 12000 }, { "epoch": 2.7996735074626864, "grad_norm": 0.3362327819759096, "learning_rate": 2.490642183106759e-05, "loss": 0.3953, "step": 12005 }, { "epoch": 2.800839552238806, "grad_norm": 0.3551627915697855, "learning_rate": 2.4889186091952444e-05, "loss": 0.4209, "step": 12010 }, { "epoch": 2.8020055970149254, "grad_norm": 0.3575109929114036, "learning_rate": 2.48719519056493e-05, "loss": 0.3932, "step": 12015 }, { "epoch": 2.8031716417910446, "grad_norm": 0.33500516760610627, "learning_rate": 2.485471928240839e-05, "loss": 0.4008, "step": 12020 }, { "epoch": 2.8043376865671643, "grad_norm": 0.3216137890554855, "learning_rate": 2.4837488232479005e-05, "loss": 0.3753, "step": 12025 }, { "epoch": 2.8055037313432836, "grad_norm": 0.3295186381540003, "learning_rate": 2.4820258766109515e-05, "loss": 0.4048, "step": 12030 }, { "epoch": 2.806669776119403, "grad_norm": 0.3199776912817911, "learning_rate": 2.4803030893547357e-05, "loss": 0.4088, "step": 12035 }, { "epoch": 2.8078358208955225, "grad_norm": 0.3404215529681031, "learning_rate": 2.4785804625039005e-05, "loss": 0.3856, "step": 12040 }, { "epoch": 2.809001865671642, "grad_norm": 0.3225427308190754, "learning_rate": 2.4768579970829985e-05, "loss": 0.4016, "step": 12045 }, { "epoch": 2.810167910447761, "grad_norm": 0.3278613222509628, "learning_rate": 2.4751356941164855e-05, "loss": 0.4032, "step": 12050 }, { "epoch": 2.8113339552238807, "grad_norm": 0.3336860498574663, "learning_rate": 2.4734135546287208e-05, "loss": 0.3937, "step": 12055 }, { "epoch": 2.8125, "grad_norm": 0.35002787355868725, "learning_rate": 2.4716915796439678e-05, "loss": 0.4071, "step": 12060 }, { "epoch": 2.8136660447761193, "grad_norm": 0.36621963458977336, "learning_rate": 2.4699697701863916e-05, "loss": 0.413, "step": 12065 }, { "epoch": 2.814832089552239, "grad_norm": 0.32839128654034394, "learning_rate": 2.4682481272800572e-05, "loss": 0.3838, "step": 12070 }, { "epoch": 2.815998134328358, "grad_norm": 0.35798691442553493, "learning_rate": 2.4665266519489328e-05, "loss": 0.3981, "step": 12075 }, { "epoch": 2.8171641791044775, "grad_norm": 0.3140940363462768, "learning_rate": 2.4648053452168857e-05, "loss": 0.3884, "step": 12080 }, { "epoch": 2.818330223880597, "grad_norm": 0.324076070136119, "learning_rate": 2.463084208107682e-05, "loss": 0.3963, "step": 12085 }, { "epoch": 2.8194962686567164, "grad_norm": 0.3330385699374476, "learning_rate": 2.4613632416449893e-05, "loss": 0.3858, "step": 12090 }, { "epoch": 2.8206623134328357, "grad_norm": 0.31894826227742207, "learning_rate": 2.4596424468523728e-05, "loss": 0.3992, "step": 12095 }, { "epoch": 2.8218283582089554, "grad_norm": 0.3192765644082724, "learning_rate": 2.4579218247532947e-05, "loss": 0.3828, "step": 12100 }, { "epoch": 2.8229944029850746, "grad_norm": 0.346843156119822, "learning_rate": 2.4562013763711145e-05, "loss": 0.4018, "step": 12105 }, { "epoch": 2.824160447761194, "grad_norm": 0.3221140934657376, "learning_rate": 2.4544811027290893e-05, "loss": 0.3958, "step": 12110 }, { "epoch": 2.8253264925373136, "grad_norm": 0.3367028071026674, "learning_rate": 2.452761004850371e-05, "loss": 0.4, "step": 12115 }, { "epoch": 2.826492537313433, "grad_norm": 0.33845487651179035, "learning_rate": 2.4510410837580106e-05, "loss": 0.3895, "step": 12120 }, { "epoch": 2.827658582089552, "grad_norm": 0.33026623127228605, "learning_rate": 2.4493213404749493e-05, "loss": 0.3869, "step": 12125 }, { "epoch": 2.8288246268656714, "grad_norm": 0.31860952569546147, "learning_rate": 2.447601776024024e-05, "loss": 0.3625, "step": 12130 }, { "epoch": 2.829990671641791, "grad_norm": 0.32578484248594275, "learning_rate": 2.4458823914279662e-05, "loss": 0.3938, "step": 12135 }, { "epoch": 2.8311567164179103, "grad_norm": 0.31136953954015467, "learning_rate": 2.4441631877093995e-05, "loss": 0.3861, "step": 12140 }, { "epoch": 2.83232276119403, "grad_norm": 0.32698424639935214, "learning_rate": 2.442444165890842e-05, "loss": 0.411, "step": 12145 }, { "epoch": 2.8334888059701493, "grad_norm": 0.3406999308412676, "learning_rate": 2.4407253269947006e-05, "loss": 0.4088, "step": 12150 }, { "epoch": 2.8346548507462686, "grad_norm": 0.3480968262746312, "learning_rate": 2.4390066720432746e-05, "loss": 0.42, "step": 12155 }, { "epoch": 2.835820895522388, "grad_norm": 0.3270986452676917, "learning_rate": 2.437288202058755e-05, "loss": 0.3996, "step": 12160 }, { "epoch": 2.8369869402985075, "grad_norm": 0.362320092080957, "learning_rate": 2.4355699180632207e-05, "loss": 0.4108, "step": 12165 }, { "epoch": 2.8381529850746268, "grad_norm": 0.3530087344249372, "learning_rate": 2.4338518210786416e-05, "loss": 0.412, "step": 12170 }, { "epoch": 2.8393190298507465, "grad_norm": 0.32787449927078066, "learning_rate": 2.4321339121268766e-05, "loss": 0.4057, "step": 12175 }, { "epoch": 2.8404850746268657, "grad_norm": 0.31801229519548774, "learning_rate": 2.430416192229672e-05, "loss": 0.4014, "step": 12180 }, { "epoch": 2.841651119402985, "grad_norm": 0.3165809465068147, "learning_rate": 2.42869866240866e-05, "loss": 0.3957, "step": 12185 }, { "epoch": 2.8428171641791042, "grad_norm": 0.33666006077250604, "learning_rate": 2.4269813236853632e-05, "loss": 0.3957, "step": 12190 }, { "epoch": 2.843983208955224, "grad_norm": 0.3261104756662522, "learning_rate": 2.4252641770811886e-05, "loss": 0.3803, "step": 12195 }, { "epoch": 2.845149253731343, "grad_norm": 0.32277494507890464, "learning_rate": 2.423547223617429e-05, "loss": 0.4117, "step": 12200 }, { "epoch": 2.846315298507463, "grad_norm": 0.3406342653932348, "learning_rate": 2.4218304643152617e-05, "loss": 0.3917, "step": 12205 }, { "epoch": 2.847481343283582, "grad_norm": 0.34454687543498197, "learning_rate": 2.42011390019575e-05, "loss": 0.3952, "step": 12210 }, { "epoch": 2.8486473880597014, "grad_norm": 0.32368511001968725, "learning_rate": 2.4183975322798407e-05, "loss": 0.3792, "step": 12215 }, { "epoch": 2.8498134328358207, "grad_norm": 0.3267063901512848, "learning_rate": 2.4166813615883625e-05, "loss": 0.3917, "step": 12220 }, { "epoch": 2.8509794776119404, "grad_norm": 0.3126605529469473, "learning_rate": 2.4149653891420304e-05, "loss": 0.396, "step": 12225 }, { "epoch": 2.8521455223880596, "grad_norm": 0.3394317315293222, "learning_rate": 2.4132496159614366e-05, "loss": 0.4072, "step": 12230 }, { "epoch": 2.8533115671641793, "grad_norm": 0.30186806585986314, "learning_rate": 2.4115340430670574e-05, "loss": 0.3815, "step": 12235 }, { "epoch": 2.8544776119402986, "grad_norm": 0.3381417026019833, "learning_rate": 2.4098186714792504e-05, "loss": 0.4056, "step": 12240 }, { "epoch": 2.855643656716418, "grad_norm": 0.3279298294433154, "learning_rate": 2.408103502218253e-05, "loss": 0.412, "step": 12245 }, { "epoch": 2.856809701492537, "grad_norm": 0.3324818038432356, "learning_rate": 2.4063885363041822e-05, "loss": 0.4107, "step": 12250 }, { "epoch": 2.857975746268657, "grad_norm": 0.34387584154562095, "learning_rate": 2.4046737747570326e-05, "loss": 0.4024, "step": 12255 }, { "epoch": 2.859141791044776, "grad_norm": 0.3295931675094607, "learning_rate": 2.4029592185966804e-05, "loss": 0.3976, "step": 12260 }, { "epoch": 2.8603078358208958, "grad_norm": 0.32557698565481685, "learning_rate": 2.4012448688428768e-05, "loss": 0.3839, "step": 12265 }, { "epoch": 2.861473880597015, "grad_norm": 0.3271252750956059, "learning_rate": 2.399530726515251e-05, "loss": 0.4011, "step": 12270 }, { "epoch": 2.8626399253731343, "grad_norm": 0.3462489088571486, "learning_rate": 2.397816792633311e-05, "loss": 0.3965, "step": 12275 }, { "epoch": 2.8638059701492535, "grad_norm": 0.3263109507849424, "learning_rate": 2.396103068216437e-05, "loss": 0.3985, "step": 12280 }, { "epoch": 2.8649720149253732, "grad_norm": 0.3158163998008106, "learning_rate": 2.3943895542838868e-05, "loss": 0.3927, "step": 12285 }, { "epoch": 2.8661380597014925, "grad_norm": 0.3210667559741464, "learning_rate": 2.3926762518547928e-05, "loss": 0.3769, "step": 12290 }, { "epoch": 2.867304104477612, "grad_norm": 0.3491386828131719, "learning_rate": 2.3909631619481626e-05, "loss": 0.4006, "step": 12295 }, { "epoch": 2.8684701492537314, "grad_norm": 0.3489130761037985, "learning_rate": 2.3892502855828762e-05, "loss": 0.3987, "step": 12300 }, { "epoch": 2.8696361940298507, "grad_norm": 0.33401463597130787, "learning_rate": 2.387537623777686e-05, "loss": 0.4054, "step": 12305 }, { "epoch": 2.87080223880597, "grad_norm": 0.3300099766666651, "learning_rate": 2.3858251775512176e-05, "loss": 0.4049, "step": 12310 }, { "epoch": 2.8719682835820897, "grad_norm": 0.3626893710625864, "learning_rate": 2.384112947921968e-05, "loss": 0.4107, "step": 12315 }, { "epoch": 2.873134328358209, "grad_norm": 0.31283099905413925, "learning_rate": 2.3824009359083073e-05, "loss": 0.3841, "step": 12320 }, { "epoch": 2.8743003731343286, "grad_norm": 0.32022021248826466, "learning_rate": 2.380689142528474e-05, "loss": 0.404, "step": 12325 }, { "epoch": 2.875466417910448, "grad_norm": 0.34838027513671244, "learning_rate": 2.378977568800576e-05, "loss": 0.409, "step": 12330 }, { "epoch": 2.876632462686567, "grad_norm": 0.33376525286127284, "learning_rate": 2.3772662157425925e-05, "loss": 0.4148, "step": 12335 }, { "epoch": 2.8777985074626864, "grad_norm": 0.32721898306258324, "learning_rate": 2.375555084372371e-05, "loss": 0.3961, "step": 12340 }, { "epoch": 2.878964552238806, "grad_norm": 0.3293576017768263, "learning_rate": 2.3738441757076268e-05, "loss": 0.4084, "step": 12345 }, { "epoch": 2.8801305970149254, "grad_norm": 0.3285917296102872, "learning_rate": 2.3721334907659424e-05, "loss": 0.3966, "step": 12350 }, { "epoch": 2.8812966417910446, "grad_norm": 0.3231103566001575, "learning_rate": 2.370423030564768e-05, "loss": 0.3822, "step": 12355 }, { "epoch": 2.8824626865671643, "grad_norm": 0.3291106258171459, "learning_rate": 2.368712796121419e-05, "loss": 0.3806, "step": 12360 }, { "epoch": 2.8836287313432836, "grad_norm": 0.3173687904309937, "learning_rate": 2.367002788453077e-05, "loss": 0.3898, "step": 12365 }, { "epoch": 2.884794776119403, "grad_norm": 0.4311246072339703, "learning_rate": 2.3652930085767904e-05, "loss": 0.3952, "step": 12370 }, { "epoch": 2.8859608208955225, "grad_norm": 0.32551932885966506, "learning_rate": 2.3635834575094705e-05, "loss": 0.3914, "step": 12375 }, { "epoch": 2.887126865671642, "grad_norm": 0.3262781870283349, "learning_rate": 2.3618741362678915e-05, "loss": 0.3928, "step": 12380 }, { "epoch": 2.888292910447761, "grad_norm": 0.33000097131434913, "learning_rate": 2.360165045868693e-05, "loss": 0.3875, "step": 12385 }, { "epoch": 2.8894589552238807, "grad_norm": 0.31939590414550273, "learning_rate": 2.358456187328376e-05, "loss": 0.3876, "step": 12390 }, { "epoch": 2.890625, "grad_norm": 0.31138347582004905, "learning_rate": 2.3567475616633046e-05, "loss": 0.3728, "step": 12395 }, { "epoch": 2.8917910447761193, "grad_norm": 0.33682073854992745, "learning_rate": 2.355039169889704e-05, "loss": 0.394, "step": 12400 }, { "epoch": 2.892957089552239, "grad_norm": 0.3309306353993416, "learning_rate": 2.3533310130236592e-05, "loss": 0.3945, "step": 12405 }, { "epoch": 2.894123134328358, "grad_norm": 0.32849882157788585, "learning_rate": 2.3516230920811166e-05, "loss": 0.3931, "step": 12410 }, { "epoch": 2.8952891791044775, "grad_norm": 0.3273734084049458, "learning_rate": 2.3499154080778823e-05, "loss": 0.3834, "step": 12415 }, { "epoch": 2.896455223880597, "grad_norm": 0.3053087709724216, "learning_rate": 2.3482079620296223e-05, "loss": 0.3837, "step": 12420 }, { "epoch": 2.8976212686567164, "grad_norm": 0.3190287803135248, "learning_rate": 2.3465007549518576e-05, "loss": 0.3885, "step": 12425 }, { "epoch": 2.8987873134328357, "grad_norm": 0.31900171760544377, "learning_rate": 2.3447937878599725e-05, "loss": 0.4102, "step": 12430 }, { "epoch": 2.8999533582089554, "grad_norm": 0.32566002677049954, "learning_rate": 2.343087061769203e-05, "loss": 0.4016, "step": 12435 }, { "epoch": 2.9011194029850746, "grad_norm": 0.3333006017751379, "learning_rate": 2.3413805776946453e-05, "loss": 0.4047, "step": 12440 }, { "epoch": 2.902285447761194, "grad_norm": 0.325557748685479, "learning_rate": 2.3396743366512508e-05, "loss": 0.407, "step": 12445 }, { "epoch": 2.9034514925373136, "grad_norm": 0.34387362835910973, "learning_rate": 2.337968339653826e-05, "loss": 0.3915, "step": 12450 }, { "epoch": 2.904617537313433, "grad_norm": 0.3254231469793663, "learning_rate": 2.3362625877170336e-05, "loss": 0.4012, "step": 12455 }, { "epoch": 2.905783582089552, "grad_norm": 0.31883782984302567, "learning_rate": 2.3345570818553874e-05, "loss": 0.3995, "step": 12460 }, { "epoch": 2.9069496268656714, "grad_norm": 0.3322127939426441, "learning_rate": 2.3328518230832587e-05, "loss": 0.3991, "step": 12465 }, { "epoch": 2.908115671641791, "grad_norm": 0.31989111332781434, "learning_rate": 2.331146812414869e-05, "loss": 0.3837, "step": 12470 }, { "epoch": 2.9092817164179103, "grad_norm": 0.33879282085594653, "learning_rate": 2.329442050864293e-05, "loss": 0.4053, "step": 12475 }, { "epoch": 2.91044776119403, "grad_norm": 0.3543375690832978, "learning_rate": 2.3277375394454594e-05, "loss": 0.4041, "step": 12480 }, { "epoch": 2.9116138059701493, "grad_norm": 0.31999931712441865, "learning_rate": 2.326033279172144e-05, "loss": 0.3966, "step": 12485 }, { "epoch": 2.9127798507462686, "grad_norm": 0.32076392516037394, "learning_rate": 2.324329271057976e-05, "loss": 0.397, "step": 12490 }, { "epoch": 2.913945895522388, "grad_norm": 0.3247086331298665, "learning_rate": 2.322625516116435e-05, "loss": 0.3946, "step": 12495 }, { "epoch": 2.9151119402985075, "grad_norm": 0.3281782323495906, "learning_rate": 2.3209220153608486e-05, "loss": 0.4137, "step": 12500 }, { "epoch": 2.9162779850746268, "grad_norm": 0.3372712945851879, "learning_rate": 2.3192187698043944e-05, "loss": 0.3975, "step": 12505 }, { "epoch": 2.9174440298507465, "grad_norm": 0.37702107211387365, "learning_rate": 2.3175157804600954e-05, "loss": 0.4143, "step": 12510 }, { "epoch": 2.9186100746268657, "grad_norm": 0.310618204430658, "learning_rate": 2.3158130483408262e-05, "loss": 0.3862, "step": 12515 }, { "epoch": 2.919776119402985, "grad_norm": 0.3346469519772596, "learning_rate": 2.3141105744593065e-05, "loss": 0.3826, "step": 12520 }, { "epoch": 2.9209421641791042, "grad_norm": 0.3206023141307392, "learning_rate": 2.3124083598281022e-05, "loss": 0.3803, "step": 12525 }, { "epoch": 2.922108208955224, "grad_norm": 0.3310996156602885, "learning_rate": 2.310706405459625e-05, "loss": 0.3753, "step": 12530 }, { "epoch": 2.923274253731343, "grad_norm": 0.31499057177592116, "learning_rate": 2.3090047123661324e-05, "loss": 0.3889, "step": 12535 }, { "epoch": 2.924440298507463, "grad_norm": 0.30367718744586075, "learning_rate": 2.3073032815597263e-05, "loss": 0.3849, "step": 12540 }, { "epoch": 2.925606343283582, "grad_norm": 0.31983114832533827, "learning_rate": 2.3056021140523516e-05, "loss": 0.3924, "step": 12545 }, { "epoch": 2.9267723880597014, "grad_norm": 0.31463552150541285, "learning_rate": 2.3039012108557982e-05, "loss": 0.3762, "step": 12550 }, { "epoch": 2.9279384328358207, "grad_norm": 0.31553299625090286, "learning_rate": 2.3022005729817e-05, "loss": 0.3854, "step": 12555 }, { "epoch": 2.9291044776119404, "grad_norm": 0.32415412932787335, "learning_rate": 2.3005002014415274e-05, "loss": 0.406, "step": 12560 }, { "epoch": 2.9302705223880596, "grad_norm": 0.35082667598873624, "learning_rate": 2.2988000972465978e-05, "loss": 0.4063, "step": 12565 }, { "epoch": 2.9314365671641793, "grad_norm": 0.34850139125998236, "learning_rate": 2.297100261408069e-05, "loss": 0.4172, "step": 12570 }, { "epoch": 2.9326026119402986, "grad_norm": 0.3226814126905502, "learning_rate": 2.295400694936937e-05, "loss": 0.3868, "step": 12575 }, { "epoch": 2.933768656716418, "grad_norm": 0.321302545691153, "learning_rate": 2.2937013988440405e-05, "loss": 0.4168, "step": 12580 }, { "epoch": 2.934934701492537, "grad_norm": 0.3323426345500244, "learning_rate": 2.2920023741400533e-05, "loss": 0.4041, "step": 12585 }, { "epoch": 2.936100746268657, "grad_norm": 0.3278335493510715, "learning_rate": 2.2903036218354912e-05, "loss": 0.3833, "step": 12590 }, { "epoch": 2.937266791044776, "grad_norm": 0.3109046277534067, "learning_rate": 2.288605142940707e-05, "loss": 0.3929, "step": 12595 }, { "epoch": 2.9384328358208958, "grad_norm": 0.3470122662986262, "learning_rate": 2.2869069384658908e-05, "loss": 0.4113, "step": 12600 }, { "epoch": 2.939598880597015, "grad_norm": 0.33151628305856545, "learning_rate": 2.2852090094210698e-05, "loss": 0.4082, "step": 12605 }, { "epoch": 2.9407649253731343, "grad_norm": 0.31165230323487975, "learning_rate": 2.283511356816106e-05, "loss": 0.4003, "step": 12610 }, { "epoch": 2.9419309701492535, "grad_norm": 0.30091752240182185, "learning_rate": 2.2818139816607e-05, "loss": 0.3699, "step": 12615 }, { "epoch": 2.9430970149253732, "grad_norm": 0.3510837756513753, "learning_rate": 2.280116884964383e-05, "loss": 0.3971, "step": 12620 }, { "epoch": 2.9442630597014925, "grad_norm": 0.3570534670145446, "learning_rate": 2.2784200677365242e-05, "loss": 0.4046, "step": 12625 }, { "epoch": 2.945429104477612, "grad_norm": 0.3462519120583106, "learning_rate": 2.276723530986327e-05, "loss": 0.4083, "step": 12630 }, { "epoch": 2.9465951492537314, "grad_norm": 0.3155521881776295, "learning_rate": 2.2750272757228235e-05, "loss": 0.3898, "step": 12635 }, { "epoch": 2.9477611940298507, "grad_norm": 0.3302676200847098, "learning_rate": 2.273331302954883e-05, "loss": 0.4021, "step": 12640 }, { "epoch": 2.94892723880597, "grad_norm": 0.31531950202628056, "learning_rate": 2.271635613691205e-05, "loss": 0.3854, "step": 12645 }, { "epoch": 2.9500932835820897, "grad_norm": 0.32218333777070857, "learning_rate": 2.26994020894032e-05, "loss": 0.4051, "step": 12650 }, { "epoch": 2.951259328358209, "grad_norm": 0.33339429502269663, "learning_rate": 2.2682450897105905e-05, "loss": 0.3943, "step": 12655 }, { "epoch": 2.9524253731343286, "grad_norm": 0.33577421197015894, "learning_rate": 2.266550257010207e-05, "loss": 0.3907, "step": 12660 }, { "epoch": 2.953591417910448, "grad_norm": 0.33660080490133865, "learning_rate": 2.2648557118471918e-05, "loss": 0.3891, "step": 12665 }, { "epoch": 2.954757462686567, "grad_norm": 0.3365030888007477, "learning_rate": 2.2631614552293963e-05, "loss": 0.3946, "step": 12670 }, { "epoch": 2.9559235074626864, "grad_norm": 0.32265600033551056, "learning_rate": 2.2614674881644974e-05, "loss": 0.3942, "step": 12675 }, { "epoch": 2.957089552238806, "grad_norm": 0.33902639321595024, "learning_rate": 2.2597738116600048e-05, "loss": 0.3942, "step": 12680 }, { "epoch": 2.9582555970149254, "grad_norm": 0.3300590753897082, "learning_rate": 2.2580804267232484e-05, "loss": 0.3876, "step": 12685 }, { "epoch": 2.9594216417910446, "grad_norm": 0.32527692274528264, "learning_rate": 2.2563873343613916e-05, "loss": 0.4014, "step": 12690 }, { "epoch": 2.9605876865671643, "grad_norm": 0.320066216570011, "learning_rate": 2.2546945355814196e-05, "loss": 0.3916, "step": 12695 }, { "epoch": 2.9617537313432836, "grad_norm": 0.34400766536803673, "learning_rate": 2.2530020313901446e-05, "loss": 0.4014, "step": 12700 }, { "epoch": 2.962919776119403, "grad_norm": 0.32742320925968044, "learning_rate": 2.2513098227942032e-05, "loss": 0.3826, "step": 12705 }, { "epoch": 2.9640858208955225, "grad_norm": 0.32644568730014994, "learning_rate": 2.249617910800056e-05, "loss": 0.3909, "step": 12710 }, { "epoch": 2.965251865671642, "grad_norm": 0.32990731992647065, "learning_rate": 2.2479262964139863e-05, "loss": 0.3964, "step": 12715 }, { "epoch": 2.966417910447761, "grad_norm": 0.3321003374534492, "learning_rate": 2.2462349806421035e-05, "loss": 0.3916, "step": 12720 }, { "epoch": 2.9675839552238807, "grad_norm": 0.32661592271770257, "learning_rate": 2.244543964490336e-05, "loss": 0.3847, "step": 12725 }, { "epoch": 2.96875, "grad_norm": 0.32407147265046626, "learning_rate": 2.2428532489644368e-05, "loss": 0.3937, "step": 12730 }, { "epoch": 2.9699160447761193, "grad_norm": 0.351299879969938, "learning_rate": 2.2411628350699766e-05, "loss": 0.4001, "step": 12735 }, { "epoch": 2.971082089552239, "grad_norm": 0.3420923695091976, "learning_rate": 2.2394727238123497e-05, "loss": 0.3976, "step": 12740 }, { "epoch": 2.972248134328358, "grad_norm": 0.335331945197588, "learning_rate": 2.23778291619677e-05, "loss": 0.4026, "step": 12745 }, { "epoch": 2.9734141791044775, "grad_norm": 0.33402675220345296, "learning_rate": 2.236093413228269e-05, "loss": 0.4088, "step": 12750 }, { "epoch": 2.974580223880597, "grad_norm": 0.3473916621893903, "learning_rate": 2.2344042159117006e-05, "loss": 0.4159, "step": 12755 }, { "epoch": 2.9757462686567164, "grad_norm": 0.31715982641176704, "learning_rate": 2.2327153252517323e-05, "loss": 0.3887, "step": 12760 }, { "epoch": 2.9769123134328357, "grad_norm": 0.3302423283046164, "learning_rate": 2.2310267422528523e-05, "loss": 0.3945, "step": 12765 }, { "epoch": 2.9780783582089554, "grad_norm": 0.3697981083704603, "learning_rate": 2.2293384679193645e-05, "loss": 0.4099, "step": 12770 }, { "epoch": 2.9792444029850746, "grad_norm": 0.32605460793354085, "learning_rate": 2.2276505032553912e-05, "loss": 0.3994, "step": 12775 }, { "epoch": 2.980410447761194, "grad_norm": 0.3375878303957479, "learning_rate": 2.2259628492648676e-05, "loss": 0.4165, "step": 12780 }, { "epoch": 2.9815764925373136, "grad_norm": 0.3289720125129972, "learning_rate": 2.224275506951547e-05, "loss": 0.3867, "step": 12785 }, { "epoch": 2.982742537313433, "grad_norm": 0.39439488786969035, "learning_rate": 2.2225884773189936e-05, "loss": 0.4027, "step": 12790 }, { "epoch": 2.983908582089552, "grad_norm": 0.3167173881527927, "learning_rate": 2.2209017613705908e-05, "loss": 0.388, "step": 12795 }, { "epoch": 2.9850746268656714, "grad_norm": 0.32674021695452415, "learning_rate": 2.2192153601095293e-05, "loss": 0.3981, "step": 12800 }, { "epoch": 2.986240671641791, "grad_norm": 0.3220894508303537, "learning_rate": 2.2175292745388186e-05, "loss": 0.3843, "step": 12805 }, { "epoch": 2.9874067164179103, "grad_norm": 0.32963873894719864, "learning_rate": 2.2158435056612775e-05, "loss": 0.4062, "step": 12810 }, { "epoch": 2.98857276119403, "grad_norm": 0.3183732730239878, "learning_rate": 2.2141580544795353e-05, "loss": 0.3832, "step": 12815 }, { "epoch": 2.9897388059701493, "grad_norm": 0.36222438252167755, "learning_rate": 2.2124729219960343e-05, "loss": 0.3861, "step": 12820 }, { "epoch": 2.9909048507462686, "grad_norm": 0.344714460761376, "learning_rate": 2.2107881092130266e-05, "loss": 0.4102, "step": 12825 }, { "epoch": 2.992070895522388, "grad_norm": 0.3350653668106831, "learning_rate": 2.2091036171325754e-05, "loss": 0.3987, "step": 12830 }, { "epoch": 2.9932369402985075, "grad_norm": 0.3319939448431505, "learning_rate": 2.2074194467565514e-05, "loss": 0.3676, "step": 12835 }, { "epoch": 2.9944029850746268, "grad_norm": 0.32887622190198984, "learning_rate": 2.2057355990866328e-05, "loss": 0.3873, "step": 12840 }, { "epoch": 2.9955690298507465, "grad_norm": 0.32805591693365205, "learning_rate": 2.2040520751243094e-05, "loss": 0.4024, "step": 12845 }, { "epoch": 2.9967350746268657, "grad_norm": 0.33828172614612584, "learning_rate": 2.2023688758708767e-05, "loss": 0.4147, "step": 12850 }, { "epoch": 2.997901119402985, "grad_norm": 0.34288695869143143, "learning_rate": 2.2006860023274363e-05, "loss": 0.3981, "step": 12855 }, { "epoch": 2.9990671641791042, "grad_norm": 0.3339388809033874, "learning_rate": 2.199003455494898e-05, "loss": 0.4067, "step": 12860 }, { "epoch": 3.000233208955224, "grad_norm": 0.31480415784330323, "learning_rate": 2.1973212363739747e-05, "loss": 0.3647, "step": 12865 }, { "epoch": 3.001399253731343, "grad_norm": 0.3415285396744332, "learning_rate": 2.1956393459651864e-05, "loss": 0.3147, "step": 12870 }, { "epoch": 3.002565298507463, "grad_norm": 0.4424315410720294, "learning_rate": 2.1939577852688576e-05, "loss": 0.3333, "step": 12875 }, { "epoch": 3.003731343283582, "grad_norm": 0.3576318000100144, "learning_rate": 2.1922765552851155e-05, "loss": 0.325, "step": 12880 }, { "epoch": 3.0048973880597014, "grad_norm": 0.37283361438841267, "learning_rate": 2.190595657013892e-05, "loss": 0.3426, "step": 12885 }, { "epoch": 3.006063432835821, "grad_norm": 0.3266554273224183, "learning_rate": 2.1889150914549195e-05, "loss": 0.32, "step": 12890 }, { "epoch": 3.0072294776119404, "grad_norm": 0.3739654519269057, "learning_rate": 2.1872348596077348e-05, "loss": 0.3256, "step": 12895 }, { "epoch": 3.0083955223880596, "grad_norm": 0.3711708858583731, "learning_rate": 2.1855549624716755e-05, "loss": 0.3373, "step": 12900 }, { "epoch": 3.009561567164179, "grad_norm": 0.3469551419431101, "learning_rate": 2.1838754010458796e-05, "loss": 0.3399, "step": 12905 }, { "epoch": 3.0107276119402986, "grad_norm": 0.3544564289921535, "learning_rate": 2.182196176329287e-05, "loss": 0.3246, "step": 12910 }, { "epoch": 3.011893656716418, "grad_norm": 0.3769777201124636, "learning_rate": 2.1805172893206342e-05, "loss": 0.3297, "step": 12915 }, { "epoch": 3.013059701492537, "grad_norm": 0.34419023106242463, "learning_rate": 2.1788387410184603e-05, "loss": 0.3127, "step": 12920 }, { "epoch": 3.014225746268657, "grad_norm": 0.3656702030299312, "learning_rate": 2.177160532421101e-05, "loss": 0.329, "step": 12925 }, { "epoch": 3.015391791044776, "grad_norm": 0.35183086668261326, "learning_rate": 2.1754826645266895e-05, "loss": 0.3194, "step": 12930 }, { "epoch": 3.0165578358208953, "grad_norm": 0.37377436097190053, "learning_rate": 2.1738051383331598e-05, "loss": 0.3312, "step": 12935 }, { "epoch": 3.017723880597015, "grad_norm": 0.3537758265486952, "learning_rate": 2.172127954838238e-05, "loss": 0.3337, "step": 12940 }, { "epoch": 3.0188899253731343, "grad_norm": 0.3657259262696127, "learning_rate": 2.1704511150394486e-05, "loss": 0.3251, "step": 12945 }, { "epoch": 3.0200559701492535, "grad_norm": 0.352442228486061, "learning_rate": 2.1687746199341118e-05, "loss": 0.3139, "step": 12950 }, { "epoch": 3.0212220149253732, "grad_norm": 0.34137337379741683, "learning_rate": 2.167098470519344e-05, "loss": 0.3193, "step": 12955 }, { "epoch": 3.0223880597014925, "grad_norm": 0.35424670829336125, "learning_rate": 2.165422667792053e-05, "loss": 0.3253, "step": 12960 }, { "epoch": 3.0235541044776117, "grad_norm": 0.3417579101815207, "learning_rate": 2.1637472127489427e-05, "loss": 0.3284, "step": 12965 }, { "epoch": 3.0247201492537314, "grad_norm": 0.3731817006318227, "learning_rate": 2.162072106386509e-05, "loss": 0.3305, "step": 12970 }, { "epoch": 3.0258861940298507, "grad_norm": 0.3567048213584788, "learning_rate": 2.1603973497010417e-05, "loss": 0.3235, "step": 12975 }, { "epoch": 3.02705223880597, "grad_norm": 0.37368030600959284, "learning_rate": 2.158722943688621e-05, "loss": 0.3304, "step": 12980 }, { "epoch": 3.0282182835820897, "grad_norm": 0.3511157140059816, "learning_rate": 2.1570488893451203e-05, "loss": 0.3317, "step": 12985 }, { "epoch": 3.029384328358209, "grad_norm": 0.3811281961954354, "learning_rate": 2.1553751876662014e-05, "loss": 0.3314, "step": 12990 }, { "epoch": 3.030550373134328, "grad_norm": 0.3574592911407229, "learning_rate": 2.1537018396473195e-05, "loss": 0.3272, "step": 12995 }, { "epoch": 3.031716417910448, "grad_norm": 0.369797368557322, "learning_rate": 2.1520288462837175e-05, "loss": 0.3351, "step": 13000 }, { "epoch": 3.032882462686567, "grad_norm": 0.3582951305801054, "learning_rate": 2.1503562085704265e-05, "loss": 0.3179, "step": 13005 }, { "epoch": 3.0340485074626864, "grad_norm": 0.3790297047857973, "learning_rate": 2.148683927502269e-05, "loss": 0.3338, "step": 13010 }, { "epoch": 3.035214552238806, "grad_norm": 0.35676889097550846, "learning_rate": 2.147012004073853e-05, "loss": 0.3428, "step": 13015 }, { "epoch": 3.0363805970149254, "grad_norm": 0.3333520567798483, "learning_rate": 2.1453404392795735e-05, "loss": 0.3155, "step": 13020 }, { "epoch": 3.0375466417910446, "grad_norm": 0.3714709979317058, "learning_rate": 2.143669234113614e-05, "loss": 0.3156, "step": 13025 }, { "epoch": 3.0387126865671643, "grad_norm": 0.3710941423544791, "learning_rate": 2.1419983895699437e-05, "loss": 0.3385, "step": 13030 }, { "epoch": 3.0398787313432836, "grad_norm": 0.36832576543170387, "learning_rate": 2.1403279066423166e-05, "loss": 0.3314, "step": 13035 }, { "epoch": 3.041044776119403, "grad_norm": 0.3669444577828884, "learning_rate": 2.1386577863242708e-05, "loss": 0.3377, "step": 13040 }, { "epoch": 3.0422108208955225, "grad_norm": 0.37173717143067986, "learning_rate": 2.136988029609131e-05, "loss": 0.333, "step": 13045 }, { "epoch": 3.043376865671642, "grad_norm": 0.34211398039343427, "learning_rate": 2.135318637490004e-05, "loss": 0.3222, "step": 13050 }, { "epoch": 3.044542910447761, "grad_norm": 0.3488277941545808, "learning_rate": 2.1336496109597804e-05, "loss": 0.3187, "step": 13055 }, { "epoch": 3.0457089552238807, "grad_norm": 0.3521515864556318, "learning_rate": 2.131980951011134e-05, "loss": 0.325, "step": 13060 }, { "epoch": 3.046875, "grad_norm": 0.37878699689573786, "learning_rate": 2.1303126586365175e-05, "loss": 0.3321, "step": 13065 }, { "epoch": 3.0480410447761193, "grad_norm": 0.36530897021315384, "learning_rate": 2.1286447348281695e-05, "loss": 0.3225, "step": 13070 }, { "epoch": 3.049207089552239, "grad_norm": 0.3917845682734505, "learning_rate": 2.126977180578106e-05, "loss": 0.3389, "step": 13075 }, { "epoch": 3.050373134328358, "grad_norm": 0.34326636319182824, "learning_rate": 2.1253099968781237e-05, "loss": 0.3263, "step": 13080 }, { "epoch": 3.0515391791044775, "grad_norm": 0.357514903894581, "learning_rate": 2.1236431847198017e-05, "loss": 0.3252, "step": 13085 }, { "epoch": 3.052705223880597, "grad_norm": 0.36977408550720303, "learning_rate": 2.1219767450944938e-05, "loss": 0.3336, "step": 13090 }, { "epoch": 3.0538712686567164, "grad_norm": 0.3978530847172167, "learning_rate": 2.1203106789933352e-05, "loss": 0.3318, "step": 13095 }, { "epoch": 3.0550373134328357, "grad_norm": 0.3854667714673586, "learning_rate": 2.1186449874072385e-05, "loss": 0.3326, "step": 13100 }, { "epoch": 3.0562033582089554, "grad_norm": 0.3973655043830843, "learning_rate": 2.116979671326892e-05, "loss": 0.337, "step": 13105 }, { "epoch": 3.0573694029850746, "grad_norm": 0.3453069582811405, "learning_rate": 2.115314731742764e-05, "loss": 0.3147, "step": 13110 }, { "epoch": 3.058535447761194, "grad_norm": 0.3528439692816455, "learning_rate": 2.1136501696450943e-05, "loss": 0.3366, "step": 13115 }, { "epoch": 3.0597014925373136, "grad_norm": 0.4015670467809074, "learning_rate": 2.1119859860239023e-05, "loss": 0.3229, "step": 13120 }, { "epoch": 3.060867537313433, "grad_norm": 0.348396813775293, "learning_rate": 2.1103221818689794e-05, "loss": 0.3155, "step": 13125 }, { "epoch": 3.062033582089552, "grad_norm": 0.3655626156571847, "learning_rate": 2.108658758169893e-05, "loss": 0.3222, "step": 13130 }, { "epoch": 3.063199626865672, "grad_norm": 0.3933887786176633, "learning_rate": 2.1069957159159848e-05, "loss": 0.3288, "step": 13135 }, { "epoch": 3.064365671641791, "grad_norm": 0.37811653007737567, "learning_rate": 2.105333056096367e-05, "loss": 0.3263, "step": 13140 }, { "epoch": 3.0655317164179103, "grad_norm": 0.38015189790286996, "learning_rate": 2.1036707796999267e-05, "loss": 0.3174, "step": 13145 }, { "epoch": 3.06669776119403, "grad_norm": 0.36729503197373997, "learning_rate": 2.1020088877153215e-05, "loss": 0.3205, "step": 13150 }, { "epoch": 3.0678638059701493, "grad_norm": 0.38074724946063593, "learning_rate": 2.100347381130982e-05, "loss": 0.3256, "step": 13155 }, { "epoch": 3.0690298507462686, "grad_norm": 0.3733607784160199, "learning_rate": 2.0986862609351077e-05, "loss": 0.3317, "step": 13160 }, { "epoch": 3.0701958955223883, "grad_norm": 0.420440733695549, "learning_rate": 2.09702552811567e-05, "loss": 0.338, "step": 13165 }, { "epoch": 3.0713619402985075, "grad_norm": 0.3821363536784176, "learning_rate": 2.0953651836604083e-05, "loss": 0.3314, "step": 13170 }, { "epoch": 3.0725279850746268, "grad_norm": 0.35020554029833856, "learning_rate": 2.093705228556832e-05, "loss": 0.3294, "step": 13175 }, { "epoch": 3.0736940298507465, "grad_norm": 0.3915225549020453, "learning_rate": 2.0920456637922194e-05, "loss": 0.3219, "step": 13180 }, { "epoch": 3.0748600746268657, "grad_norm": 0.3792672546786865, "learning_rate": 2.0903864903536147e-05, "loss": 0.3337, "step": 13185 }, { "epoch": 3.076026119402985, "grad_norm": 0.382816038533477, "learning_rate": 2.088727709227833e-05, "loss": 0.3239, "step": 13190 }, { "epoch": 3.0771921641791047, "grad_norm": 0.36956181558759693, "learning_rate": 2.087069321401451e-05, "loss": 0.3203, "step": 13195 }, { "epoch": 3.078358208955224, "grad_norm": 0.36696730415738354, "learning_rate": 2.085411327860815e-05, "loss": 0.342, "step": 13200 }, { "epoch": 3.079524253731343, "grad_norm": 0.3479574920746276, "learning_rate": 2.083753729592037e-05, "loss": 0.3317, "step": 13205 }, { "epoch": 3.080690298507463, "grad_norm": 0.3555964171988262, "learning_rate": 2.0820965275809913e-05, "loss": 0.3218, "step": 13210 }, { "epoch": 3.081856343283582, "grad_norm": 0.3786019049736542, "learning_rate": 2.0804397228133205e-05, "loss": 0.332, "step": 13215 }, { "epoch": 3.0830223880597014, "grad_norm": 0.3531465412610589, "learning_rate": 2.0787833162744257e-05, "loss": 0.3244, "step": 13220 }, { "epoch": 3.0841884328358207, "grad_norm": 0.3603836800371123, "learning_rate": 2.077127308949476e-05, "loss": 0.3491, "step": 13225 }, { "epoch": 3.0853544776119404, "grad_norm": 0.3456946918483755, "learning_rate": 2.0754717018234003e-05, "loss": 0.3201, "step": 13230 }, { "epoch": 3.0865205223880596, "grad_norm": 0.3566261178734796, "learning_rate": 2.0738164958808905e-05, "loss": 0.322, "step": 13235 }, { "epoch": 3.0876865671641793, "grad_norm": 0.40131521266042497, "learning_rate": 2.072161692106399e-05, "loss": 0.3576, "step": 13240 }, { "epoch": 3.0888526119402986, "grad_norm": 0.3693815359775984, "learning_rate": 2.0705072914841407e-05, "loss": 0.3178, "step": 13245 }, { "epoch": 3.090018656716418, "grad_norm": 0.39108908279940313, "learning_rate": 2.0688532949980882e-05, "loss": 0.3375, "step": 13250 }, { "epoch": 3.091184701492537, "grad_norm": 0.3589102967846014, "learning_rate": 2.0671997036319763e-05, "loss": 0.3088, "step": 13255 }, { "epoch": 3.092350746268657, "grad_norm": 0.384997470366685, "learning_rate": 2.0655465183692972e-05, "loss": 0.3265, "step": 13260 }, { "epoch": 3.093516791044776, "grad_norm": 0.35614145612588777, "learning_rate": 2.063893740193304e-05, "loss": 0.3365, "step": 13265 }, { "epoch": 3.0946828358208953, "grad_norm": 0.3682250788133163, "learning_rate": 2.0622413700870026e-05, "loss": 0.3444, "step": 13270 }, { "epoch": 3.095848880597015, "grad_norm": 0.3612428170736634, "learning_rate": 2.0605894090331607e-05, "loss": 0.3322, "step": 13275 }, { "epoch": 3.0970149253731343, "grad_norm": 0.36197909380447313, "learning_rate": 2.0589378580143016e-05, "loss": 0.3336, "step": 13280 }, { "epoch": 3.0981809701492535, "grad_norm": 0.3606849185646576, "learning_rate": 2.057286718012705e-05, "loss": 0.3266, "step": 13285 }, { "epoch": 3.0993470149253732, "grad_norm": 0.3946123516420972, "learning_rate": 2.0556359900104054e-05, "loss": 0.3356, "step": 13290 }, { "epoch": 3.1005130597014925, "grad_norm": 0.3670960041616496, "learning_rate": 2.0539856749891918e-05, "loss": 0.3257, "step": 13295 }, { "epoch": 3.1016791044776117, "grad_norm": 0.3694155072794769, "learning_rate": 2.0523357739306087e-05, "loss": 0.3399, "step": 13300 }, { "epoch": 3.1028451492537314, "grad_norm": 0.3759644766045218, "learning_rate": 2.050686287815954e-05, "loss": 0.3426, "step": 13305 }, { "epoch": 3.1040111940298507, "grad_norm": 0.3752222434874275, "learning_rate": 2.049037217626279e-05, "loss": 0.3398, "step": 13310 }, { "epoch": 3.10517723880597, "grad_norm": 0.3740020967603994, "learning_rate": 2.0473885643423885e-05, "loss": 0.3271, "step": 13315 }, { "epoch": 3.1063432835820897, "grad_norm": 0.34731913525042724, "learning_rate": 2.0457403289448353e-05, "loss": 0.3313, "step": 13320 }, { "epoch": 3.107509328358209, "grad_norm": 0.3592863698837627, "learning_rate": 2.0440925124139286e-05, "loss": 0.336, "step": 13325 }, { "epoch": 3.108675373134328, "grad_norm": 0.36920519073043123, "learning_rate": 2.0424451157297264e-05, "loss": 0.3206, "step": 13330 }, { "epoch": 3.109841417910448, "grad_norm": 0.3744224947470079, "learning_rate": 2.040798139872037e-05, "loss": 0.3262, "step": 13335 }, { "epoch": 3.111007462686567, "grad_norm": 0.3597885708249331, "learning_rate": 2.0391515858204184e-05, "loss": 0.3245, "step": 13340 }, { "epoch": 3.1121735074626864, "grad_norm": 0.3731047113659824, "learning_rate": 2.0375054545541776e-05, "loss": 0.3266, "step": 13345 }, { "epoch": 3.113339552238806, "grad_norm": 0.36423501844030926, "learning_rate": 2.0358597470523706e-05, "loss": 0.3351, "step": 13350 }, { "epoch": 3.1145055970149254, "grad_norm": 0.3794624930929981, "learning_rate": 2.034214464293801e-05, "loss": 0.3256, "step": 13355 }, { "epoch": 3.1156716417910446, "grad_norm": 0.34443932446845915, "learning_rate": 2.0325696072570195e-05, "loss": 0.3249, "step": 13360 }, { "epoch": 3.1168376865671643, "grad_norm": 0.3759457556449976, "learning_rate": 2.0309251769203252e-05, "loss": 0.3231, "step": 13365 }, { "epoch": 3.1180037313432836, "grad_norm": 0.3745091535962183, "learning_rate": 2.0292811742617607e-05, "loss": 0.3327, "step": 13370 }, { "epoch": 3.119169776119403, "grad_norm": 0.41148298835812236, "learning_rate": 2.0276376002591164e-05, "loss": 0.3351, "step": 13375 }, { "epoch": 3.1203358208955225, "grad_norm": 0.3903153953591576, "learning_rate": 2.0259944558899274e-05, "loss": 0.3311, "step": 13380 }, { "epoch": 3.121501865671642, "grad_norm": 0.39583155887190763, "learning_rate": 2.0243517421314727e-05, "loss": 0.3307, "step": 13385 }, { "epoch": 3.122667910447761, "grad_norm": 0.3540602512876891, "learning_rate": 2.022709459960776e-05, "loss": 0.333, "step": 13390 }, { "epoch": 3.1238339552238807, "grad_norm": 0.349006419179924, "learning_rate": 2.0210676103546028e-05, "loss": 0.3305, "step": 13395 }, { "epoch": 3.125, "grad_norm": 0.35353284890354925, "learning_rate": 2.0194261942894628e-05, "loss": 0.3267, "step": 13400 }, { "epoch": 3.1261660447761193, "grad_norm": 0.3684268525794442, "learning_rate": 2.0177852127416063e-05, "loss": 0.3294, "step": 13405 }, { "epoch": 3.127332089552239, "grad_norm": 0.3876907997700918, "learning_rate": 2.016144666687029e-05, "loss": 0.3324, "step": 13410 }, { "epoch": 3.128498134328358, "grad_norm": 0.34843344551205363, "learning_rate": 2.0145045571014614e-05, "loss": 0.3048, "step": 13415 }, { "epoch": 3.1296641791044775, "grad_norm": 0.3739906165940147, "learning_rate": 2.0128648849603798e-05, "loss": 0.3296, "step": 13420 }, { "epoch": 3.130830223880597, "grad_norm": 0.369790686747009, "learning_rate": 2.0112256512389976e-05, "loss": 0.3247, "step": 13425 }, { "epoch": 3.1319962686567164, "grad_norm": 0.4027240855788343, "learning_rate": 2.009586856912269e-05, "loss": 0.3233, "step": 13430 }, { "epoch": 3.1331623134328357, "grad_norm": 0.3746132905195428, "learning_rate": 2.0079485029548838e-05, "loss": 0.3286, "step": 13435 }, { "epoch": 3.1343283582089554, "grad_norm": 0.37255110913703593, "learning_rate": 2.006310590341276e-05, "loss": 0.3203, "step": 13440 }, { "epoch": 3.1354944029850746, "grad_norm": 0.3971067568722652, "learning_rate": 2.0046731200456097e-05, "loss": 0.3438, "step": 13445 }, { "epoch": 3.136660447761194, "grad_norm": 0.3763188179431443, "learning_rate": 2.00303609304179e-05, "loss": 0.3373, "step": 13450 }, { "epoch": 3.1378264925373136, "grad_norm": 0.354426564175202, "learning_rate": 2.0013995103034594e-05, "loss": 0.3386, "step": 13455 }, { "epoch": 3.138992537313433, "grad_norm": 0.3503828579014741, "learning_rate": 1.9997633728039933e-05, "loss": 0.3233, "step": 13460 }, { "epoch": 3.140158582089552, "grad_norm": 0.3647250356826776, "learning_rate": 1.9981276815165046e-05, "loss": 0.3281, "step": 13465 }, { "epoch": 3.141324626865672, "grad_norm": 0.3430909864290912, "learning_rate": 1.996492437413838e-05, "loss": 0.3175, "step": 13470 }, { "epoch": 3.142490671641791, "grad_norm": 0.35006325560037993, "learning_rate": 1.994857641468575e-05, "loss": 0.3365, "step": 13475 }, { "epoch": 3.1436567164179103, "grad_norm": 0.36103268334665484, "learning_rate": 1.99322329465303e-05, "loss": 0.3163, "step": 13480 }, { "epoch": 3.14482276119403, "grad_norm": 0.39489082780521373, "learning_rate": 1.9915893979392492e-05, "loss": 0.3421, "step": 13485 }, { "epoch": 3.1459888059701493, "grad_norm": 0.371311627980263, "learning_rate": 1.989955952299012e-05, "loss": 0.3235, "step": 13490 }, { "epoch": 3.1471548507462686, "grad_norm": 0.38942356459182614, "learning_rate": 1.9883229587038287e-05, "loss": 0.328, "step": 13495 }, { "epoch": 3.1483208955223883, "grad_norm": 0.37027465368557755, "learning_rate": 1.986690418124942e-05, "loss": 0.334, "step": 13500 }, { "epoch": 3.1494869402985075, "grad_norm": 0.3749228281321373, "learning_rate": 1.9850583315333242e-05, "loss": 0.3288, "step": 13505 }, { "epoch": 3.1506529850746268, "grad_norm": 0.3605921528491555, "learning_rate": 1.983426699899677e-05, "loss": 0.3235, "step": 13510 }, { "epoch": 3.1518190298507465, "grad_norm": 0.366589498485244, "learning_rate": 1.9817955241944335e-05, "loss": 0.3331, "step": 13515 }, { "epoch": 3.1529850746268657, "grad_norm": 0.3722581231316989, "learning_rate": 1.9801648053877548e-05, "loss": 0.3325, "step": 13520 }, { "epoch": 3.154151119402985, "grad_norm": 0.3624795413286155, "learning_rate": 1.978534544449528e-05, "loss": 0.3062, "step": 13525 }, { "epoch": 3.1553171641791047, "grad_norm": 0.3702713213461391, "learning_rate": 1.9769047423493707e-05, "loss": 0.3318, "step": 13530 }, { "epoch": 3.156483208955224, "grad_norm": 0.3854460558686183, "learning_rate": 1.975275400056627e-05, "loss": 0.3413, "step": 13535 }, { "epoch": 3.157649253731343, "grad_norm": 0.35754585019696894, "learning_rate": 1.9736465185403675e-05, "loss": 0.3174, "step": 13540 }, { "epoch": 3.158815298507463, "grad_norm": 0.35173201839919066, "learning_rate": 1.9720180987693888e-05, "loss": 0.3359, "step": 13545 }, { "epoch": 3.159981343283582, "grad_norm": 0.3679370893478662, "learning_rate": 1.9703901417122106e-05, "loss": 0.331, "step": 13550 }, { "epoch": 3.1611473880597014, "grad_norm": 0.3535033269618664, "learning_rate": 1.968762648337081e-05, "loss": 0.3173, "step": 13555 }, { "epoch": 3.1623134328358207, "grad_norm": 0.38482761857804487, "learning_rate": 1.96713561961197e-05, "loss": 0.3436, "step": 13560 }, { "epoch": 3.1634794776119404, "grad_norm": 0.37373056008546923, "learning_rate": 1.9655090565045718e-05, "loss": 0.3415, "step": 13565 }, { "epoch": 3.1646455223880596, "grad_norm": 0.352634120020291, "learning_rate": 1.9638829599823056e-05, "loss": 0.328, "step": 13570 }, { "epoch": 3.1658115671641793, "grad_norm": 0.3719863213467069, "learning_rate": 1.9622573310123082e-05, "loss": 0.34, "step": 13575 }, { "epoch": 3.1669776119402986, "grad_norm": 0.3775757810642568, "learning_rate": 1.9606321705614427e-05, "loss": 0.3407, "step": 13580 }, { "epoch": 3.168143656716418, "grad_norm": 0.3517808938109586, "learning_rate": 1.9590074795962925e-05, "loss": 0.3173, "step": 13585 }, { "epoch": 3.169309701492537, "grad_norm": 0.3673636906191626, "learning_rate": 1.957383259083162e-05, "loss": 0.3378, "step": 13590 }, { "epoch": 3.170475746268657, "grad_norm": 0.3773964702378201, "learning_rate": 1.955759509988075e-05, "loss": 0.331, "step": 13595 }, { "epoch": 3.171641791044776, "grad_norm": 0.36583522899610693, "learning_rate": 1.9541362332767737e-05, "loss": 0.3374, "step": 13600 }, { "epoch": 3.1728078358208953, "grad_norm": 0.35882066116368944, "learning_rate": 1.952513429914723e-05, "loss": 0.3386, "step": 13605 }, { "epoch": 3.173973880597015, "grad_norm": 0.3522475379983531, "learning_rate": 1.950891100867102e-05, "loss": 0.3319, "step": 13610 }, { "epoch": 3.1751399253731343, "grad_norm": 0.3380721645634281, "learning_rate": 1.9492692470988115e-05, "loss": 0.3172, "step": 13615 }, { "epoch": 3.1763059701492535, "grad_norm": 0.3638642550275009, "learning_rate": 1.9476478695744683e-05, "loss": 0.326, "step": 13620 }, { "epoch": 3.1774720149253732, "grad_norm": 0.34490985996230167, "learning_rate": 1.9460269692584034e-05, "loss": 0.311, "step": 13625 }, { "epoch": 3.1786380597014925, "grad_norm": 0.3293630559420069, "learning_rate": 1.944406547114667e-05, "loss": 0.3175, "step": 13630 }, { "epoch": 3.1798041044776117, "grad_norm": 0.36214515194537317, "learning_rate": 1.9427866041070254e-05, "loss": 0.3425, "step": 13635 }, { "epoch": 3.1809701492537314, "grad_norm": 0.4074765117307699, "learning_rate": 1.9411671411989568e-05, "loss": 0.3347, "step": 13640 }, { "epoch": 3.1821361940298507, "grad_norm": 0.3690374728241686, "learning_rate": 1.9395481593536575e-05, "loss": 0.3376, "step": 13645 }, { "epoch": 3.18330223880597, "grad_norm": 0.3598588307706936, "learning_rate": 1.937929659534034e-05, "loss": 0.3135, "step": 13650 }, { "epoch": 3.1844682835820897, "grad_norm": 0.37614810654671105, "learning_rate": 1.9363116427027084e-05, "loss": 0.3171, "step": 13655 }, { "epoch": 3.185634328358209, "grad_norm": 0.3652088800983655, "learning_rate": 1.9346941098220157e-05, "loss": 0.3451, "step": 13660 }, { "epoch": 3.186800373134328, "grad_norm": 0.37026072957166567, "learning_rate": 1.933077061854002e-05, "loss": 0.337, "step": 13665 }, { "epoch": 3.187966417910448, "grad_norm": 0.3674339134285864, "learning_rate": 1.931460499760426e-05, "loss": 0.3209, "step": 13670 }, { "epoch": 3.189132462686567, "grad_norm": 0.3787792050932168, "learning_rate": 1.929844424502755e-05, "loss": 0.3329, "step": 13675 }, { "epoch": 3.1902985074626864, "grad_norm": 0.3464872434870928, "learning_rate": 1.9282288370421708e-05, "loss": 0.3274, "step": 13680 }, { "epoch": 3.191464552238806, "grad_norm": 0.35433237253231414, "learning_rate": 1.9266137383395626e-05, "loss": 0.3194, "step": 13685 }, { "epoch": 3.1926305970149254, "grad_norm": 0.36665862738024346, "learning_rate": 1.9249991293555276e-05, "loss": 0.3229, "step": 13690 }, { "epoch": 3.1937966417910446, "grad_norm": 0.35818228941602226, "learning_rate": 1.9233850110503748e-05, "loss": 0.308, "step": 13695 }, { "epoch": 3.1949626865671643, "grad_norm": 0.3818773569995412, "learning_rate": 1.9217713843841195e-05, "loss": 0.3242, "step": 13700 }, { "epoch": 3.1961287313432836, "grad_norm": 0.3686870097052765, "learning_rate": 1.9201582503164845e-05, "loss": 0.3335, "step": 13705 }, { "epoch": 3.197294776119403, "grad_norm": 0.39478332509122016, "learning_rate": 1.9185456098068998e-05, "loss": 0.3191, "step": 13710 }, { "epoch": 3.1984608208955225, "grad_norm": 0.4369403040320393, "learning_rate": 1.9169334638145037e-05, "loss": 0.3074, "step": 13715 }, { "epoch": 3.199626865671642, "grad_norm": 0.36890272116814343, "learning_rate": 1.9153218132981375e-05, "loss": 0.33, "step": 13720 }, { "epoch": 3.200792910447761, "grad_norm": 0.3692134632105836, "learning_rate": 1.9137106592163495e-05, "loss": 0.3273, "step": 13725 }, { "epoch": 3.2019589552238807, "grad_norm": 0.33604994736255905, "learning_rate": 1.912100002527392e-05, "loss": 0.323, "step": 13730 }, { "epoch": 3.203125, "grad_norm": 0.38695669374062913, "learning_rate": 1.9104898441892222e-05, "loss": 0.3373, "step": 13735 }, { "epoch": 3.2042910447761193, "grad_norm": 0.37765927910733466, "learning_rate": 1.9088801851595008e-05, "loss": 0.3259, "step": 13740 }, { "epoch": 3.205457089552239, "grad_norm": 0.35284623104031987, "learning_rate": 1.907271026395592e-05, "loss": 0.3056, "step": 13745 }, { "epoch": 3.206623134328358, "grad_norm": 0.3600030937462303, "learning_rate": 1.9056623688545588e-05, "loss": 0.3377, "step": 13750 }, { "epoch": 3.2077891791044775, "grad_norm": 0.35080121298899186, "learning_rate": 1.9040542134931715e-05, "loss": 0.3265, "step": 13755 }, { "epoch": 3.208955223880597, "grad_norm": 0.3678894513431948, "learning_rate": 1.9024465612678993e-05, "loss": 0.3368, "step": 13760 }, { "epoch": 3.2101212686567164, "grad_norm": 0.37001400720834304, "learning_rate": 1.900839413134911e-05, "loss": 0.3402, "step": 13765 }, { "epoch": 3.2112873134328357, "grad_norm": 0.3684583095084399, "learning_rate": 1.8992327700500772e-05, "loss": 0.3279, "step": 13770 }, { "epoch": 3.2124533582089554, "grad_norm": 0.3395411991593244, "learning_rate": 1.897626632968968e-05, "loss": 0.3115, "step": 13775 }, { "epoch": 3.2136194029850746, "grad_norm": 0.36901019403122015, "learning_rate": 1.8960210028468512e-05, "loss": 0.3372, "step": 13780 }, { "epoch": 3.214785447761194, "grad_norm": 0.37786966826385215, "learning_rate": 1.8944158806386942e-05, "loss": 0.3278, "step": 13785 }, { "epoch": 3.2159514925373136, "grad_norm": 0.3484072803507474, "learning_rate": 1.8928112672991626e-05, "loss": 0.3227, "step": 13790 }, { "epoch": 3.217117537313433, "grad_norm": 0.3813917097052821, "learning_rate": 1.8912071637826196e-05, "loss": 0.3182, "step": 13795 }, { "epoch": 3.218283582089552, "grad_norm": 0.40392844230661523, "learning_rate": 1.8896035710431225e-05, "loss": 0.3535, "step": 13800 }, { "epoch": 3.219449626865672, "grad_norm": 0.3711458054470723, "learning_rate": 1.8880004900344283e-05, "loss": 0.3377, "step": 13805 }, { "epoch": 3.220615671641791, "grad_norm": 0.3489947468412329, "learning_rate": 1.8863979217099874e-05, "loss": 0.3422, "step": 13810 }, { "epoch": 3.2217817164179103, "grad_norm": 0.4039440749796976, "learning_rate": 1.8847958670229465e-05, "loss": 0.3308, "step": 13815 }, { "epoch": 3.22294776119403, "grad_norm": 0.35899557282466393, "learning_rate": 1.8831943269261467e-05, "loss": 0.3309, "step": 13820 }, { "epoch": 3.2241138059701493, "grad_norm": 0.35455566040198766, "learning_rate": 1.8815933023721206e-05, "loss": 0.3344, "step": 13825 }, { "epoch": 3.2252798507462686, "grad_norm": 0.3588685079349673, "learning_rate": 1.8799927943130986e-05, "loss": 0.3329, "step": 13830 }, { "epoch": 3.2264458955223883, "grad_norm": 0.35914473477906345, "learning_rate": 1.878392803701e-05, "loss": 0.3316, "step": 13835 }, { "epoch": 3.2276119402985075, "grad_norm": 0.3668416379119767, "learning_rate": 1.8767933314874382e-05, "loss": 0.3403, "step": 13840 }, { "epoch": 3.2287779850746268, "grad_norm": 0.36205186453945915, "learning_rate": 1.875194378623718e-05, "loss": 0.3425, "step": 13845 }, { "epoch": 3.2299440298507465, "grad_norm": 0.3760203722085514, "learning_rate": 1.8735959460608364e-05, "loss": 0.3423, "step": 13850 }, { "epoch": 3.2311100746268657, "grad_norm": 0.37574713190429826, "learning_rate": 1.871998034749478e-05, "loss": 0.3337, "step": 13855 }, { "epoch": 3.232276119402985, "grad_norm": 0.3630560486674643, "learning_rate": 1.8704006456400202e-05, "loss": 0.3449, "step": 13860 }, { "epoch": 3.2334421641791047, "grad_norm": 0.38382066549657823, "learning_rate": 1.8688037796825285e-05, "loss": 0.3317, "step": 13865 }, { "epoch": 3.234608208955224, "grad_norm": 0.36720684573288553, "learning_rate": 1.8672074378267573e-05, "loss": 0.3285, "step": 13870 }, { "epoch": 3.235774253731343, "grad_norm": 0.3637815439353751, "learning_rate": 1.8656116210221502e-05, "loss": 0.3313, "step": 13875 }, { "epoch": 3.236940298507463, "grad_norm": 0.35665712066317706, "learning_rate": 1.8640163302178377e-05, "loss": 0.3395, "step": 13880 }, { "epoch": 3.238106343283582, "grad_norm": 0.37234806881985544, "learning_rate": 1.8624215663626365e-05, "loss": 0.3274, "step": 13885 }, { "epoch": 3.2392723880597014, "grad_norm": 0.35997104573000205, "learning_rate": 1.8608273304050515e-05, "loss": 0.3452, "step": 13890 }, { "epoch": 3.2404384328358207, "grad_norm": 0.35646213305760127, "learning_rate": 1.859233623293274e-05, "loss": 0.3272, "step": 13895 }, { "epoch": 3.2416044776119404, "grad_norm": 0.3717220228017601, "learning_rate": 1.8576404459751796e-05, "loss": 0.3344, "step": 13900 }, { "epoch": 3.2427705223880596, "grad_norm": 0.39226176760700243, "learning_rate": 1.8560477993983284e-05, "loss": 0.3373, "step": 13905 }, { "epoch": 3.2439365671641793, "grad_norm": 0.3708386290612459, "learning_rate": 1.8544556845099657e-05, "loss": 0.3304, "step": 13910 }, { "epoch": 3.2451026119402986, "grad_norm": 0.3632392546028572, "learning_rate": 1.8528641022570202e-05, "loss": 0.3355, "step": 13915 }, { "epoch": 3.246268656716418, "grad_norm": 0.3662832048203213, "learning_rate": 1.851273053586105e-05, "loss": 0.3252, "step": 13920 }, { "epoch": 3.247434701492537, "grad_norm": 0.3599674150678043, "learning_rate": 1.8496825394435146e-05, "loss": 0.3366, "step": 13925 }, { "epoch": 3.248600746268657, "grad_norm": 0.38699543809394027, "learning_rate": 1.8480925607752248e-05, "loss": 0.3378, "step": 13930 }, { "epoch": 3.249766791044776, "grad_norm": 0.3904572616104557, "learning_rate": 1.8465031185268943e-05, "loss": 0.3481, "step": 13935 }, { "epoch": 3.2509328358208958, "grad_norm": 0.35402925173137356, "learning_rate": 1.8449142136438628e-05, "loss": 0.3236, "step": 13940 }, { "epoch": 3.252098880597015, "grad_norm": 0.36346030275637714, "learning_rate": 1.84332584707115e-05, "loss": 0.3374, "step": 13945 }, { "epoch": 3.2532649253731343, "grad_norm": 0.37169783700709885, "learning_rate": 1.8417380197534558e-05, "loss": 0.3308, "step": 13950 }, { "epoch": 3.2544309701492535, "grad_norm": 0.3460841700376648, "learning_rate": 1.8401507326351575e-05, "loss": 0.3326, "step": 13955 }, { "epoch": 3.2555970149253732, "grad_norm": 0.3530027735330055, "learning_rate": 1.8385639866603144e-05, "loss": 0.3353, "step": 13960 }, { "epoch": 3.2567630597014925, "grad_norm": 0.37620468595081824, "learning_rate": 1.836977782772661e-05, "loss": 0.3357, "step": 13965 }, { "epoch": 3.2579291044776117, "grad_norm": 0.3784865153952274, "learning_rate": 1.8353921219156102e-05, "loss": 0.3381, "step": 13970 }, { "epoch": 3.2590951492537314, "grad_norm": 0.3667850525620458, "learning_rate": 1.8338070050322544e-05, "loss": 0.3329, "step": 13975 }, { "epoch": 3.2602611940298507, "grad_norm": 0.3596704409553893, "learning_rate": 1.8322224330653576e-05, "loss": 0.3328, "step": 13980 }, { "epoch": 3.26142723880597, "grad_norm": 0.36071354726025173, "learning_rate": 1.830638406957364e-05, "loss": 0.332, "step": 13985 }, { "epoch": 3.2625932835820897, "grad_norm": 0.38004524875147305, "learning_rate": 1.8290549276503915e-05, "loss": 0.342, "step": 13990 }, { "epoch": 3.263759328358209, "grad_norm": 0.35262133682833335, "learning_rate": 1.8274719960862325e-05, "loss": 0.3354, "step": 13995 }, { "epoch": 3.264925373134328, "grad_norm": 0.3668641053269362, "learning_rate": 1.825889613206355e-05, "loss": 0.3286, "step": 14000 }, { "epoch": 3.266091417910448, "grad_norm": 0.37265888531935687, "learning_rate": 1.824307779951898e-05, "loss": 0.3212, "step": 14005 }, { "epoch": 3.267257462686567, "grad_norm": 0.34176171445637105, "learning_rate": 1.8227264972636758e-05, "loss": 0.3134, "step": 14010 }, { "epoch": 3.2684235074626864, "grad_norm": 0.37134680617034843, "learning_rate": 1.821145766082176e-05, "loss": 0.3366, "step": 14015 }, { "epoch": 3.269589552238806, "grad_norm": 0.3605056947908637, "learning_rate": 1.8195655873475554e-05, "loss": 0.346, "step": 14020 }, { "epoch": 3.2707555970149254, "grad_norm": 0.36325907950139397, "learning_rate": 1.8179859619996448e-05, "loss": 0.3271, "step": 14025 }, { "epoch": 3.2719216417910446, "grad_norm": 0.354427230774899, "learning_rate": 1.8164068909779437e-05, "loss": 0.3309, "step": 14030 }, { "epoch": 3.2730876865671643, "grad_norm": 0.40211130887509683, "learning_rate": 1.814828375221623e-05, "loss": 0.336, "step": 14035 }, { "epoch": 3.2742537313432836, "grad_norm": 0.3669552473450502, "learning_rate": 1.8132504156695245e-05, "loss": 0.3413, "step": 14040 }, { "epoch": 3.275419776119403, "grad_norm": 0.35626016714325476, "learning_rate": 1.8116730132601565e-05, "loss": 0.3337, "step": 14045 }, { "epoch": 3.2765858208955225, "grad_norm": 0.36791701144465083, "learning_rate": 1.8100961689317003e-05, "loss": 0.3265, "step": 14050 }, { "epoch": 3.277751865671642, "grad_norm": 0.3641957621004843, "learning_rate": 1.808519883621999e-05, "loss": 0.319, "step": 14055 }, { "epoch": 3.278917910447761, "grad_norm": 0.36880246236354486, "learning_rate": 1.806944158268568e-05, "loss": 0.3309, "step": 14060 }, { "epoch": 3.2800839552238807, "grad_norm": 0.3459063020550023, "learning_rate": 1.805368993808589e-05, "loss": 0.321, "step": 14065 }, { "epoch": 3.28125, "grad_norm": 0.35941020884127856, "learning_rate": 1.803794391178908e-05, "loss": 0.3347, "step": 14070 }, { "epoch": 3.2824160447761193, "grad_norm": 0.33881347076600704, "learning_rate": 1.8022203513160406e-05, "loss": 0.3275, "step": 14075 }, { "epoch": 3.283582089552239, "grad_norm": 0.36013411302573745, "learning_rate": 1.8006468751561628e-05, "loss": 0.3299, "step": 14080 }, { "epoch": 3.284748134328358, "grad_norm": 0.3605260553175594, "learning_rate": 1.7990739636351188e-05, "loss": 0.329, "step": 14085 }, { "epoch": 3.2859141791044775, "grad_norm": 0.3379452521638893, "learning_rate": 1.797501617688417e-05, "loss": 0.3124, "step": 14090 }, { "epoch": 3.287080223880597, "grad_norm": 0.34755508575552396, "learning_rate": 1.795929838251227e-05, "loss": 0.3293, "step": 14095 }, { "epoch": 3.2882462686567164, "grad_norm": 0.36215468157393715, "learning_rate": 1.7943586262583846e-05, "loss": 0.3346, "step": 14100 }, { "epoch": 3.2894123134328357, "grad_norm": 0.35680337811185614, "learning_rate": 1.7927879826443844e-05, "loss": 0.3395, "step": 14105 }, { "epoch": 3.2905783582089554, "grad_norm": 0.37236710880652174, "learning_rate": 1.791217908343386e-05, "loss": 0.3352, "step": 14110 }, { "epoch": 3.2917444029850746, "grad_norm": 0.3748908586072519, "learning_rate": 1.78964840428921e-05, "loss": 0.3328, "step": 14115 }, { "epoch": 3.292910447761194, "grad_norm": 0.3696905878621036, "learning_rate": 1.7880794714153366e-05, "loss": 0.3277, "step": 14120 }, { "epoch": 3.2940764925373136, "grad_norm": 0.3925177280795479, "learning_rate": 1.786511110654907e-05, "loss": 0.3359, "step": 14125 }, { "epoch": 3.295242537313433, "grad_norm": 0.3573280326662546, "learning_rate": 1.784943322940722e-05, "loss": 0.3326, "step": 14130 }, { "epoch": 3.296408582089552, "grad_norm": 0.36097554010255833, "learning_rate": 1.7833761092052415e-05, "loss": 0.3288, "step": 14135 }, { "epoch": 3.297574626865672, "grad_norm": 0.3395814953744775, "learning_rate": 1.7818094703805837e-05, "loss": 0.3206, "step": 14140 }, { "epoch": 3.298740671641791, "grad_norm": 0.3846515758263603, "learning_rate": 1.780243407398527e-05, "loss": 0.3327, "step": 14145 }, { "epoch": 3.2999067164179103, "grad_norm": 0.3686113299692695, "learning_rate": 1.7786779211905048e-05, "loss": 0.3516, "step": 14150 }, { "epoch": 3.30107276119403, "grad_norm": 0.37897026986716464, "learning_rate": 1.7771130126876068e-05, "loss": 0.3365, "step": 14155 }, { "epoch": 3.3022388059701493, "grad_norm": 0.37926004560560306, "learning_rate": 1.775548682820582e-05, "loss": 0.3267, "step": 14160 }, { "epoch": 3.3034048507462686, "grad_norm": 0.3532865663444608, "learning_rate": 1.7739849325198334e-05, "loss": 0.318, "step": 14165 }, { "epoch": 3.3045708955223883, "grad_norm": 0.375951212959595, "learning_rate": 1.7724217627154204e-05, "loss": 0.3153, "step": 14170 }, { "epoch": 3.3057369402985075, "grad_norm": 0.3892275901278885, "learning_rate": 1.7708591743370555e-05, "loss": 0.3423, "step": 14175 }, { "epoch": 3.3069029850746268, "grad_norm": 0.352642314062309, "learning_rate": 1.7692971683141063e-05, "loss": 0.3203, "step": 14180 }, { "epoch": 3.3080690298507465, "grad_norm": 0.3608644544656739, "learning_rate": 1.7677357455755954e-05, "loss": 0.3149, "step": 14185 }, { "epoch": 3.3092350746268657, "grad_norm": 0.37383509533418063, "learning_rate": 1.766174907050196e-05, "loss": 0.3326, "step": 14190 }, { "epoch": 3.310401119402985, "grad_norm": 0.35408016367951, "learning_rate": 1.764614653666235e-05, "loss": 0.3354, "step": 14195 }, { "epoch": 3.3115671641791042, "grad_norm": 0.4073773632420507, "learning_rate": 1.7630549863516914e-05, "loss": 0.3315, "step": 14200 }, { "epoch": 3.312733208955224, "grad_norm": 0.36025295017030395, "learning_rate": 1.7614959060341968e-05, "loss": 0.3269, "step": 14205 }, { "epoch": 3.313899253731343, "grad_norm": 0.36990534903869327, "learning_rate": 1.75993741364103e-05, "loss": 0.3233, "step": 14210 }, { "epoch": 3.315065298507463, "grad_norm": 0.3549222879114956, "learning_rate": 1.7583795100991246e-05, "loss": 0.3324, "step": 14215 }, { "epoch": 3.316231343283582, "grad_norm": 0.35963908710499526, "learning_rate": 1.7568221963350605e-05, "loss": 0.3214, "step": 14220 }, { "epoch": 3.3173973880597014, "grad_norm": 0.37972969127051454, "learning_rate": 1.755265473275069e-05, "loss": 0.3447, "step": 14225 }, { "epoch": 3.3185634328358207, "grad_norm": 0.37379510903005386, "learning_rate": 1.7537093418450294e-05, "loss": 0.3335, "step": 14230 }, { "epoch": 3.3197294776119404, "grad_norm": 0.3712018006629852, "learning_rate": 1.7521538029704682e-05, "loss": 0.3302, "step": 14235 }, { "epoch": 3.3208955223880596, "grad_norm": 0.3499709201063348, "learning_rate": 1.750598857576561e-05, "loss": 0.3331, "step": 14240 }, { "epoch": 3.3220615671641793, "grad_norm": 0.34800044920430206, "learning_rate": 1.749044506588129e-05, "loss": 0.324, "step": 14245 }, { "epoch": 3.3232276119402986, "grad_norm": 0.3755558003097278, "learning_rate": 1.7474907509296412e-05, "loss": 0.3388, "step": 14250 }, { "epoch": 3.324393656716418, "grad_norm": 0.38141193719906546, "learning_rate": 1.7459375915252123e-05, "loss": 0.347, "step": 14255 }, { "epoch": 3.325559701492537, "grad_norm": 0.4026915264621621, "learning_rate": 1.7443850292986007e-05, "loss": 0.3266, "step": 14260 }, { "epoch": 3.326725746268657, "grad_norm": 0.3627612966851823, "learning_rate": 1.742833065173212e-05, "loss": 0.3362, "step": 14265 }, { "epoch": 3.327891791044776, "grad_norm": 0.3754425378380735, "learning_rate": 1.7412817000720937e-05, "loss": 0.3488, "step": 14270 }, { "epoch": 3.3290578358208958, "grad_norm": 0.35308948266583345, "learning_rate": 1.7397309349179393e-05, "loss": 0.3157, "step": 14275 }, { "epoch": 3.330223880597015, "grad_norm": 0.363506854612107, "learning_rate": 1.738180770633085e-05, "loss": 0.335, "step": 14280 }, { "epoch": 3.3313899253731343, "grad_norm": 0.36242269890457657, "learning_rate": 1.7366312081395075e-05, "loss": 0.342, "step": 14285 }, { "epoch": 3.3325559701492535, "grad_norm": 0.3699237198400984, "learning_rate": 1.7350822483588277e-05, "loss": 0.3402, "step": 14290 }, { "epoch": 3.3337220149253732, "grad_norm": 0.3705774296824844, "learning_rate": 1.7335338922123076e-05, "loss": 0.3261, "step": 14295 }, { "epoch": 3.3348880597014925, "grad_norm": 0.3595502541334972, "learning_rate": 1.7319861406208504e-05, "loss": 0.3191, "step": 14300 }, { "epoch": 3.3360541044776117, "grad_norm": 0.3647020771322686, "learning_rate": 1.7304389945050004e-05, "loss": 0.3314, "step": 14305 }, { "epoch": 3.3372201492537314, "grad_norm": 0.37062679555831696, "learning_rate": 1.728892454784938e-05, "loss": 0.3338, "step": 14310 }, { "epoch": 3.3383861940298507, "grad_norm": 0.37268020664483076, "learning_rate": 1.7273465223804876e-05, "loss": 0.3324, "step": 14315 }, { "epoch": 3.33955223880597, "grad_norm": 0.3651408537647329, "learning_rate": 1.7258011982111094e-05, "loss": 0.329, "step": 14320 }, { "epoch": 3.3407182835820897, "grad_norm": 0.351578371283549, "learning_rate": 1.7242564831959045e-05, "loss": 0.3321, "step": 14325 }, { "epoch": 3.341884328358209, "grad_norm": 0.35842571012922647, "learning_rate": 1.72271237825361e-05, "loss": 0.3188, "step": 14330 }, { "epoch": 3.343050373134328, "grad_norm": 0.3719815758057306, "learning_rate": 1.7211688843025987e-05, "loss": 0.3315, "step": 14335 }, { "epoch": 3.344216417910448, "grad_norm": 0.3492881141302099, "learning_rate": 1.7196260022608828e-05, "loss": 0.3322, "step": 14340 }, { "epoch": 3.345382462686567, "grad_norm": 0.3467557284343145, "learning_rate": 1.7180837330461093e-05, "loss": 0.3301, "step": 14345 }, { "epoch": 3.3465485074626864, "grad_norm": 0.3490059049398232, "learning_rate": 1.716542077575561e-05, "loss": 0.3403, "step": 14350 }, { "epoch": 3.347714552238806, "grad_norm": 0.3637037891945392, "learning_rate": 1.7150010367661546e-05, "loss": 0.3421, "step": 14355 }, { "epoch": 3.3488805970149254, "grad_norm": 0.3819779828956117, "learning_rate": 1.7134606115344427e-05, "loss": 0.3379, "step": 14360 }, { "epoch": 3.3500466417910446, "grad_norm": 0.3480677049117983, "learning_rate": 1.7119208027966116e-05, "loss": 0.3146, "step": 14365 }, { "epoch": 3.3512126865671643, "grad_norm": 0.3588782011005385, "learning_rate": 1.710381611468479e-05, "loss": 0.3308, "step": 14370 }, { "epoch": 3.3523787313432836, "grad_norm": 0.36499620650935155, "learning_rate": 1.7088430384654984e-05, "loss": 0.333, "step": 14375 }, { "epoch": 3.353544776119403, "grad_norm": 0.34008861921924666, "learning_rate": 1.7073050847027537e-05, "loss": 0.3198, "step": 14380 }, { "epoch": 3.3547108208955225, "grad_norm": 0.3565514773656197, "learning_rate": 1.7057677510949598e-05, "loss": 0.3331, "step": 14385 }, { "epoch": 3.355876865671642, "grad_norm": 0.3611844622144962, "learning_rate": 1.704231038556465e-05, "loss": 0.3339, "step": 14390 }, { "epoch": 3.357042910447761, "grad_norm": 0.3899193141653559, "learning_rate": 1.702694948001246e-05, "loss": 0.3434, "step": 14395 }, { "epoch": 3.3582089552238807, "grad_norm": 0.3725681162182271, "learning_rate": 1.701159480342911e-05, "loss": 0.3278, "step": 14400 }, { "epoch": 3.359375, "grad_norm": 0.36021694811227956, "learning_rate": 1.6996246364946985e-05, "loss": 0.3244, "step": 14405 }, { "epoch": 3.3605410447761193, "grad_norm": 0.3569267961249343, "learning_rate": 1.6980904173694727e-05, "loss": 0.3345, "step": 14410 }, { "epoch": 3.361707089552239, "grad_norm": 0.35999107607516434, "learning_rate": 1.69655682387973e-05, "loss": 0.3326, "step": 14415 }, { "epoch": 3.362873134328358, "grad_norm": 0.3780637462882897, "learning_rate": 1.695023856937591e-05, "loss": 0.3358, "step": 14420 }, { "epoch": 3.3640391791044775, "grad_norm": 0.3686828628333255, "learning_rate": 1.6934915174548073e-05, "loss": 0.3273, "step": 14425 }, { "epoch": 3.365205223880597, "grad_norm": 0.3279985473768487, "learning_rate": 1.691959806342756e-05, "loss": 0.3301, "step": 14430 }, { "epoch": 3.3663712686567164, "grad_norm": 0.35865782560224624, "learning_rate": 1.690428724512439e-05, "loss": 0.3384, "step": 14435 }, { "epoch": 3.3675373134328357, "grad_norm": 0.3557517735777206, "learning_rate": 1.688898272874485e-05, "loss": 0.3114, "step": 14440 }, { "epoch": 3.3687033582089554, "grad_norm": 0.40655515559650046, "learning_rate": 1.6873684523391487e-05, "loss": 0.3285, "step": 14445 }, { "epoch": 3.3698694029850746, "grad_norm": 0.3690029448888416, "learning_rate": 1.685839263816308e-05, "loss": 0.3355, "step": 14450 }, { "epoch": 3.371035447761194, "grad_norm": 0.34192650742954583, "learning_rate": 1.6843107082154675e-05, "loss": 0.3196, "step": 14455 }, { "epoch": 3.3722014925373136, "grad_norm": 0.36763451883757187, "learning_rate": 1.68278278644575e-05, "loss": 0.3354, "step": 14460 }, { "epoch": 3.373367537313433, "grad_norm": 0.37204877793428426, "learning_rate": 1.6812554994159073e-05, "loss": 0.337, "step": 14465 }, { "epoch": 3.374533582089552, "grad_norm": 0.3783992424160964, "learning_rate": 1.679728848034311e-05, "loss": 0.3528, "step": 14470 }, { "epoch": 3.375699626865672, "grad_norm": 0.38689274924933764, "learning_rate": 1.678202833208954e-05, "loss": 0.3365, "step": 14475 }, { "epoch": 3.376865671641791, "grad_norm": 0.3841655946617686, "learning_rate": 1.6766774558474523e-05, "loss": 0.3312, "step": 14480 }, { "epoch": 3.3780317164179103, "grad_norm": 0.3593124300222711, "learning_rate": 1.675152716857041e-05, "loss": 0.3264, "step": 14485 }, { "epoch": 3.37919776119403, "grad_norm": 0.3486205976615116, "learning_rate": 1.6736286171445763e-05, "loss": 0.3254, "step": 14490 }, { "epoch": 3.3803638059701493, "grad_norm": 0.3540705212510821, "learning_rate": 1.672105157616535e-05, "loss": 0.3173, "step": 14495 }, { "epoch": 3.3815298507462686, "grad_norm": 0.3752893939402229, "learning_rate": 1.670582339179012e-05, "loss": 0.3419, "step": 14500 }, { "epoch": 3.3826958955223883, "grad_norm": 0.36620118750667346, "learning_rate": 1.669060162737722e-05, "loss": 0.3332, "step": 14505 }, { "epoch": 3.3838619402985075, "grad_norm": 0.34932427427599044, "learning_rate": 1.667538629197996e-05, "loss": 0.3279, "step": 14510 }, { "epoch": 3.3850279850746268, "grad_norm": 0.3775048502619263, "learning_rate": 1.666017739464784e-05, "loss": 0.3449, "step": 14515 }, { "epoch": 3.3861940298507465, "grad_norm": 0.3945710780938654, "learning_rate": 1.664497494442654e-05, "loss": 0.3357, "step": 14520 }, { "epoch": 3.3873600746268657, "grad_norm": 0.3668755523994854, "learning_rate": 1.6629778950357883e-05, "loss": 0.3339, "step": 14525 }, { "epoch": 3.388526119402985, "grad_norm": 0.3681631742381257, "learning_rate": 1.6614589421479876e-05, "loss": 0.3278, "step": 14530 }, { "epoch": 3.3896921641791042, "grad_norm": 0.38126690337022373, "learning_rate": 1.6599406366826648e-05, "loss": 0.3387, "step": 14535 }, { "epoch": 3.390858208955224, "grad_norm": 0.3805957320880095, "learning_rate": 1.6584229795428514e-05, "loss": 0.3412, "step": 14540 }, { "epoch": 3.392024253731343, "grad_norm": 0.3455529416980486, "learning_rate": 1.656905971631192e-05, "loss": 0.3278, "step": 14545 }, { "epoch": 3.393190298507463, "grad_norm": 0.3666042009298596, "learning_rate": 1.655389613849943e-05, "loss": 0.3305, "step": 14550 }, { "epoch": 3.394356343283582, "grad_norm": 0.3562036770529133, "learning_rate": 1.653873907100977e-05, "loss": 0.3271, "step": 14555 }, { "epoch": 3.3955223880597014, "grad_norm": 0.351897081751981, "learning_rate": 1.6523588522857784e-05, "loss": 0.3261, "step": 14560 }, { "epoch": 3.3966884328358207, "grad_norm": 0.3711425099386066, "learning_rate": 1.6508444503054432e-05, "loss": 0.3299, "step": 14565 }, { "epoch": 3.3978544776119404, "grad_norm": 0.3688138996565365, "learning_rate": 1.6493307020606796e-05, "loss": 0.3348, "step": 14570 }, { "epoch": 3.3990205223880596, "grad_norm": 0.36755301611238034, "learning_rate": 1.647817608451807e-05, "loss": 0.3416, "step": 14575 }, { "epoch": 3.4001865671641793, "grad_norm": 0.37049370498932566, "learning_rate": 1.6463051703787557e-05, "loss": 0.334, "step": 14580 }, { "epoch": 3.4013526119402986, "grad_norm": 0.3516671218129951, "learning_rate": 1.644793388741067e-05, "loss": 0.3265, "step": 14585 }, { "epoch": 3.402518656716418, "grad_norm": 0.3601317188695524, "learning_rate": 1.6432822644378888e-05, "loss": 0.332, "step": 14590 }, { "epoch": 3.403684701492537, "grad_norm": 0.36753741291718534, "learning_rate": 1.64177179836798e-05, "loss": 0.3289, "step": 14595 }, { "epoch": 3.404850746268657, "grad_norm": 0.35386825988075743, "learning_rate": 1.6402619914297087e-05, "loss": 0.3389, "step": 14600 }, { "epoch": 3.406016791044776, "grad_norm": 0.3667412175060357, "learning_rate": 1.6387528445210497e-05, "loss": 0.3307, "step": 14605 }, { "epoch": 3.4071828358208958, "grad_norm": 0.4038785901545273, "learning_rate": 1.6372443585395875e-05, "loss": 0.3265, "step": 14610 }, { "epoch": 3.408348880597015, "grad_norm": 0.3550492244513922, "learning_rate": 1.6357365343825088e-05, "loss": 0.3177, "step": 14615 }, { "epoch": 3.4095149253731343, "grad_norm": 0.3613251240825706, "learning_rate": 1.634229372946611e-05, "loss": 0.3288, "step": 14620 }, { "epoch": 3.4106809701492535, "grad_norm": 0.36542787637025925, "learning_rate": 1.632722875128296e-05, "loss": 0.3361, "step": 14625 }, { "epoch": 3.4118470149253732, "grad_norm": 0.35927686580214985, "learning_rate": 1.6312170418235705e-05, "loss": 0.3517, "step": 14630 }, { "epoch": 3.4130130597014925, "grad_norm": 0.35177798901385426, "learning_rate": 1.6297118739280483e-05, "loss": 0.3277, "step": 14635 }, { "epoch": 3.4141791044776117, "grad_norm": 0.3452142196151901, "learning_rate": 1.6282073723369427e-05, "loss": 0.3231, "step": 14640 }, { "epoch": 3.4153451492537314, "grad_norm": 0.3680101886199691, "learning_rate": 1.6267035379450744e-05, "loss": 0.321, "step": 14645 }, { "epoch": 3.4165111940298507, "grad_norm": 0.35988407445514975, "learning_rate": 1.625200371646867e-05, "loss": 0.327, "step": 14650 }, { "epoch": 3.41767723880597, "grad_norm": 0.37105421667334054, "learning_rate": 1.6236978743363464e-05, "loss": 0.3309, "step": 14655 }, { "epoch": 3.4188432835820897, "grad_norm": 0.3798918900036447, "learning_rate": 1.622196046907141e-05, "loss": 0.3392, "step": 14660 }, { "epoch": 3.420009328358209, "grad_norm": 0.3446234975186816, "learning_rate": 1.6206948902524783e-05, "loss": 0.3126, "step": 14665 }, { "epoch": 3.421175373134328, "grad_norm": 0.358649061311015, "learning_rate": 1.619194405265189e-05, "loss": 0.3277, "step": 14670 }, { "epoch": 3.422341417910448, "grad_norm": 0.3493127914792498, "learning_rate": 1.617694592837705e-05, "loss": 0.3162, "step": 14675 }, { "epoch": 3.423507462686567, "grad_norm": 0.3720671187559675, "learning_rate": 1.616195453862057e-05, "loss": 0.3185, "step": 14680 }, { "epoch": 3.4246735074626864, "grad_norm": 0.3829346481868671, "learning_rate": 1.614696989229876e-05, "loss": 0.3313, "step": 14685 }, { "epoch": 3.425839552238806, "grad_norm": 0.36026012633648186, "learning_rate": 1.6131991998323893e-05, "loss": 0.3159, "step": 14690 }, { "epoch": 3.4270055970149254, "grad_norm": 0.364732664708826, "learning_rate": 1.611702086560426e-05, "loss": 0.3321, "step": 14695 }, { "epoch": 3.4281716417910446, "grad_norm": 0.3733393386423135, "learning_rate": 1.6102056503044115e-05, "loss": 0.3243, "step": 14700 }, { "epoch": 3.4293376865671643, "grad_norm": 0.36131975570955943, "learning_rate": 1.6087098919543696e-05, "loss": 0.3292, "step": 14705 }, { "epoch": 3.4305037313432836, "grad_norm": 0.35939910986677887, "learning_rate": 1.6072148123999182e-05, "loss": 0.3394, "step": 14710 }, { "epoch": 3.431669776119403, "grad_norm": 0.3447149523770174, "learning_rate": 1.605720412530274e-05, "loss": 0.3312, "step": 14715 }, { "epoch": 3.4328358208955225, "grad_norm": 0.42216881380869536, "learning_rate": 1.6042266932342498e-05, "loss": 0.3379, "step": 14720 }, { "epoch": 3.434001865671642, "grad_norm": 0.3607212441952194, "learning_rate": 1.6027336554002512e-05, "loss": 0.3359, "step": 14725 }, { "epoch": 3.435167910447761, "grad_norm": 0.38130759153704746, "learning_rate": 1.60124129991628e-05, "loss": 0.3257, "step": 14730 }, { "epoch": 3.4363339552238807, "grad_norm": 0.33928036201086126, "learning_rate": 1.599749627669933e-05, "loss": 0.3157, "step": 14735 }, { "epoch": 3.4375, "grad_norm": 0.36133861610151197, "learning_rate": 1.5982586395483983e-05, "loss": 0.3357, "step": 14740 }, { "epoch": 3.4386660447761193, "grad_norm": 0.35291428627048993, "learning_rate": 1.5967683364384595e-05, "loss": 0.3238, "step": 14745 }, { "epoch": 3.439832089552239, "grad_norm": 0.3682635737404115, "learning_rate": 1.595278719226491e-05, "loss": 0.329, "step": 14750 }, { "epoch": 3.440998134328358, "grad_norm": 0.35753795407437755, "learning_rate": 1.5937897887984605e-05, "loss": 0.3387, "step": 14755 }, { "epoch": 3.4421641791044775, "grad_norm": 0.36141855960515507, "learning_rate": 1.5923015460399277e-05, "loss": 0.3375, "step": 14760 }, { "epoch": 3.443330223880597, "grad_norm": 0.36290759516276055, "learning_rate": 1.59081399183604e-05, "loss": 0.333, "step": 14765 }, { "epoch": 3.4444962686567164, "grad_norm": 0.3498827290295448, "learning_rate": 1.589327127071539e-05, "loss": 0.3405, "step": 14770 }, { "epoch": 3.4456623134328357, "grad_norm": 0.3596383278188398, "learning_rate": 1.587840952630755e-05, "loss": 0.3203, "step": 14775 }, { "epoch": 3.4468283582089554, "grad_norm": 0.3664597378303038, "learning_rate": 1.5863554693976065e-05, "loss": 0.3392, "step": 14780 }, { "epoch": 3.4479944029850746, "grad_norm": 0.37031740975092664, "learning_rate": 1.584870678255604e-05, "loss": 0.3306, "step": 14785 }, { "epoch": 3.449160447761194, "grad_norm": 0.34070326780599375, "learning_rate": 1.5833865800878422e-05, "loss": 0.3355, "step": 14790 }, { "epoch": 3.4503264925373136, "grad_norm": 0.36613699523160836, "learning_rate": 1.5819031757770064e-05, "loss": 0.336, "step": 14795 }, { "epoch": 3.451492537313433, "grad_norm": 0.36801957707677047, "learning_rate": 1.580420466205369e-05, "loss": 0.3344, "step": 14800 }, { "epoch": 3.452658582089552, "grad_norm": 0.3858393377866959, "learning_rate": 1.5789384522547888e-05, "loss": 0.3338, "step": 14805 }, { "epoch": 3.453824626865672, "grad_norm": 0.3503525325976741, "learning_rate": 1.577457134806711e-05, "loss": 0.3108, "step": 14810 }, { "epoch": 3.454990671641791, "grad_norm": 0.36296555377155126, "learning_rate": 1.5759765147421658e-05, "loss": 0.3296, "step": 14815 }, { "epoch": 3.4561567164179103, "grad_norm": 0.37633455287935386, "learning_rate": 1.5744965929417693e-05, "loss": 0.3206, "step": 14820 }, { "epoch": 3.45732276119403, "grad_norm": 0.35934618176243993, "learning_rate": 1.573017370285722e-05, "loss": 0.319, "step": 14825 }, { "epoch": 3.4584888059701493, "grad_norm": 0.3653365171269483, "learning_rate": 1.57153884765381e-05, "loss": 0.3343, "step": 14830 }, { "epoch": 3.4596548507462686, "grad_norm": 0.3769542476376287, "learning_rate": 1.5700610259254018e-05, "loss": 0.3351, "step": 14835 }, { "epoch": 3.4608208955223883, "grad_norm": 0.3669040001478793, "learning_rate": 1.5685839059794476e-05, "loss": 0.3368, "step": 14840 }, { "epoch": 3.4619869402985075, "grad_norm": 0.35911218432643416, "learning_rate": 1.5671074886944823e-05, "loss": 0.323, "step": 14845 }, { "epoch": 3.4631529850746268, "grad_norm": 0.3523318873019422, "learning_rate": 1.5656317749486225e-05, "loss": 0.3286, "step": 14850 }, { "epoch": 3.4643190298507465, "grad_norm": 0.37422689672709303, "learning_rate": 1.5641567656195664e-05, "loss": 0.3324, "step": 14855 }, { "epoch": 3.4654850746268657, "grad_norm": 0.38766511982005936, "learning_rate": 1.562682461584594e-05, "loss": 0.3503, "step": 14860 }, { "epoch": 3.466651119402985, "grad_norm": 0.35505242648315294, "learning_rate": 1.561208863720562e-05, "loss": 0.3299, "step": 14865 }, { "epoch": 3.4678171641791042, "grad_norm": 0.3611680837469734, "learning_rate": 1.559735972903912e-05, "loss": 0.3274, "step": 14870 }, { "epoch": 3.468983208955224, "grad_norm": 0.34972098864033974, "learning_rate": 1.5582637900106622e-05, "loss": 0.339, "step": 14875 }, { "epoch": 3.470149253731343, "grad_norm": 0.34847286933699, "learning_rate": 1.5567923159164108e-05, "loss": 0.3343, "step": 14880 }, { "epoch": 3.471315298507463, "grad_norm": 0.36757621767472476, "learning_rate": 1.555321551496335e-05, "loss": 0.3425, "step": 14885 }, { "epoch": 3.472481343283582, "grad_norm": 0.3793889665690012, "learning_rate": 1.553851497625187e-05, "loss": 0.3483, "step": 14890 }, { "epoch": 3.4736473880597014, "grad_norm": 0.3711182127309298, "learning_rate": 1.5523821551773006e-05, "loss": 0.3325, "step": 14895 }, { "epoch": 3.4748134328358207, "grad_norm": 0.36533654574408975, "learning_rate": 1.5509135250265835e-05, "loss": 0.3284, "step": 14900 }, { "epoch": 3.4759794776119404, "grad_norm": 0.37700089004250487, "learning_rate": 1.5494456080465198e-05, "loss": 0.3178, "step": 14905 }, { "epoch": 3.4771455223880596, "grad_norm": 0.36278063063044996, "learning_rate": 1.547978405110171e-05, "loss": 0.3312, "step": 14910 }, { "epoch": 3.4783115671641793, "grad_norm": 0.3493431734159773, "learning_rate": 1.5465119170901742e-05, "loss": 0.334, "step": 14915 }, { "epoch": 3.4794776119402986, "grad_norm": 0.3655611583068637, "learning_rate": 1.545046144858738e-05, "loss": 0.3389, "step": 14920 }, { "epoch": 3.480643656716418, "grad_norm": 0.3805565658243217, "learning_rate": 1.543581089287649e-05, "loss": 0.3273, "step": 14925 }, { "epoch": 3.481809701492537, "grad_norm": 0.3627125661528815, "learning_rate": 1.5421167512482655e-05, "loss": 0.3422, "step": 14930 }, { "epoch": 3.482975746268657, "grad_norm": 0.3678410497823603, "learning_rate": 1.5406531316115197e-05, "loss": 0.3257, "step": 14935 }, { "epoch": 3.484141791044776, "grad_norm": 0.3654476965843805, "learning_rate": 1.539190231247917e-05, "loss": 0.3241, "step": 14940 }, { "epoch": 3.4853078358208958, "grad_norm": 0.3530117766823089, "learning_rate": 1.5377280510275342e-05, "loss": 0.3183, "step": 14945 }, { "epoch": 3.486473880597015, "grad_norm": 0.36856292967098087, "learning_rate": 1.5362665918200193e-05, "loss": 0.3299, "step": 14950 }, { "epoch": 3.4876399253731343, "grad_norm": 0.3686830865840385, "learning_rate": 1.534805854494593e-05, "loss": 0.3297, "step": 14955 }, { "epoch": 3.4888059701492535, "grad_norm": 0.3494305535754593, "learning_rate": 1.533345839920045e-05, "loss": 0.3263, "step": 14960 }, { "epoch": 3.4899720149253732, "grad_norm": 0.3753474242982273, "learning_rate": 1.5318865489647383e-05, "loss": 0.3263, "step": 14965 }, { "epoch": 3.4911380597014925, "grad_norm": 0.36408345590552543, "learning_rate": 1.5304279824966e-05, "loss": 0.3363, "step": 14970 }, { "epoch": 3.4923041044776117, "grad_norm": 0.37622496051105103, "learning_rate": 1.528970141383131e-05, "loss": 0.3534, "step": 14975 }, { "epoch": 3.4934701492537314, "grad_norm": 0.3794392803979275, "learning_rate": 1.5275130264913994e-05, "loss": 0.3351, "step": 14980 }, { "epoch": 3.4946361940298507, "grad_norm": 0.3859595842099221, "learning_rate": 1.5260566386880413e-05, "loss": 0.3314, "step": 14985 }, { "epoch": 3.49580223880597, "grad_norm": 0.34602364135429065, "learning_rate": 1.5246009788392606e-05, "loss": 0.3209, "step": 14990 }, { "epoch": 3.4969682835820897, "grad_norm": 0.3816376672551873, "learning_rate": 1.5231460478108268e-05, "loss": 0.3357, "step": 14995 }, { "epoch": 3.498134328358209, "grad_norm": 0.3717461736813935, "learning_rate": 1.5216918464680776e-05, "loss": 0.3324, "step": 15000 }, { "epoch": 3.499300373134328, "grad_norm": 0.33178416765381397, "learning_rate": 1.520238375675917e-05, "loss": 0.3181, "step": 15005 }, { "epoch": 3.500466417910448, "grad_norm": 0.3903633798359936, "learning_rate": 1.5187856362988123e-05, "loss": 0.3407, "step": 15010 }, { "epoch": 3.501632462686567, "grad_norm": 0.3367713124282168, "learning_rate": 1.5173336292007994e-05, "loss": 0.338, "step": 15015 }, { "epoch": 3.5027985074626864, "grad_norm": 0.3544053714592569, "learning_rate": 1.5158823552454737e-05, "loss": 0.328, "step": 15020 }, { "epoch": 3.503964552238806, "grad_norm": 0.35941095782907484, "learning_rate": 1.5144318152959985e-05, "loss": 0.3404, "step": 15025 }, { "epoch": 3.5051305970149254, "grad_norm": 0.35440758227539454, "learning_rate": 1.5129820102151e-05, "loss": 0.3297, "step": 15030 }, { "epoch": 3.5062966417910446, "grad_norm": 0.3591423084644365, "learning_rate": 1.5115329408650658e-05, "loss": 0.3252, "step": 15035 }, { "epoch": 3.5074626865671643, "grad_norm": 0.3622317501692589, "learning_rate": 1.5100846081077479e-05, "loss": 0.3259, "step": 15040 }, { "epoch": 3.5086287313432836, "grad_norm": 0.3758616159252475, "learning_rate": 1.5086370128045574e-05, "loss": 0.3331, "step": 15045 }, { "epoch": 3.509794776119403, "grad_norm": 0.3474888575260031, "learning_rate": 1.5071901558164692e-05, "loss": 0.3435, "step": 15050 }, { "epoch": 3.5109608208955225, "grad_norm": 0.35899589947351973, "learning_rate": 1.5057440380040184e-05, "loss": 0.3183, "step": 15055 }, { "epoch": 3.512126865671642, "grad_norm": 0.38714229369622266, "learning_rate": 1.5042986602273017e-05, "loss": 0.3395, "step": 15060 }, { "epoch": 3.513292910447761, "grad_norm": 0.3657904168301437, "learning_rate": 1.502854023345972e-05, "loss": 0.34, "step": 15065 }, { "epoch": 3.5144589552238807, "grad_norm": 0.38156129937719535, "learning_rate": 1.5014101282192452e-05, "loss": 0.325, "step": 15070 }, { "epoch": 3.515625, "grad_norm": 0.371536919613815, "learning_rate": 1.4999669757058956e-05, "loss": 0.3461, "step": 15075 }, { "epoch": 3.5167910447761193, "grad_norm": 0.35176434893868985, "learning_rate": 1.498524566664253e-05, "loss": 0.3142, "step": 15080 }, { "epoch": 3.517957089552239, "grad_norm": 0.33867449955543255, "learning_rate": 1.4970829019522083e-05, "loss": 0.312, "step": 15085 }, { "epoch": 3.519123134328358, "grad_norm": 0.3640566878166673, "learning_rate": 1.4956419824272083e-05, "loss": 0.3373, "step": 15090 }, { "epoch": 3.5202891791044775, "grad_norm": 0.3868816339197881, "learning_rate": 1.4942018089462567e-05, "loss": 0.3332, "step": 15095 }, { "epoch": 3.521455223880597, "grad_norm": 0.3680703668899198, "learning_rate": 1.4927623823659126e-05, "loss": 0.3226, "step": 15100 }, { "epoch": 3.5226212686567164, "grad_norm": 0.3745327814610825, "learning_rate": 1.4913237035422926e-05, "loss": 0.3312, "step": 15105 }, { "epoch": 3.5237873134328357, "grad_norm": 0.3615791358363849, "learning_rate": 1.4898857733310673e-05, "loss": 0.3322, "step": 15110 }, { "epoch": 3.5249533582089554, "grad_norm": 0.38463221506641937, "learning_rate": 1.4884485925874634e-05, "loss": 0.3372, "step": 15115 }, { "epoch": 3.5261194029850746, "grad_norm": 0.4251087903244426, "learning_rate": 1.4870121621662594e-05, "loss": 0.3422, "step": 15120 }, { "epoch": 3.527285447761194, "grad_norm": 0.38294678196384374, "learning_rate": 1.4855764829217894e-05, "loss": 0.3285, "step": 15125 }, { "epoch": 3.5284514925373136, "grad_norm": 0.3728894580708877, "learning_rate": 1.4841415557079413e-05, "loss": 0.3304, "step": 15130 }, { "epoch": 3.529617537313433, "grad_norm": 0.3578395421103677, "learning_rate": 1.482707381378154e-05, "loss": 0.3469, "step": 15135 }, { "epoch": 3.530783582089552, "grad_norm": 0.3685878381954686, "learning_rate": 1.4812739607854199e-05, "loss": 0.3351, "step": 15140 }, { "epoch": 3.5319496268656714, "grad_norm": 0.33687964250194624, "learning_rate": 1.479841294782282e-05, "loss": 0.3138, "step": 15145 }, { "epoch": 3.533115671641791, "grad_norm": 0.362544293885602, "learning_rate": 1.4784093842208351e-05, "loss": 0.3316, "step": 15150 }, { "epoch": 3.5342817164179103, "grad_norm": 0.3548155459638462, "learning_rate": 1.4769782299527252e-05, "loss": 0.3386, "step": 15155 }, { "epoch": 3.53544776119403, "grad_norm": 0.359730220407117, "learning_rate": 1.4755478328291476e-05, "loss": 0.3228, "step": 15160 }, { "epoch": 3.5366138059701493, "grad_norm": 0.3670487320757613, "learning_rate": 1.4741181937008485e-05, "loss": 0.3143, "step": 15165 }, { "epoch": 3.5377798507462686, "grad_norm": 0.3607450050322974, "learning_rate": 1.4726893134181214e-05, "loss": 0.3259, "step": 15170 }, { "epoch": 3.538945895522388, "grad_norm": 0.3534104666076903, "learning_rate": 1.4712611928308095e-05, "loss": 0.3348, "step": 15175 }, { "epoch": 3.5401119402985075, "grad_norm": 0.37555662273732066, "learning_rate": 1.4698338327883044e-05, "loss": 0.3283, "step": 15180 }, { "epoch": 3.5412779850746268, "grad_norm": 0.3586619929521861, "learning_rate": 1.4684072341395454e-05, "loss": 0.3267, "step": 15185 }, { "epoch": 3.5424440298507465, "grad_norm": 0.3739340903164566, "learning_rate": 1.4669813977330193e-05, "loss": 0.3187, "step": 15190 }, { "epoch": 3.5436100746268657, "grad_norm": 0.3765370086078261, "learning_rate": 1.4655563244167572e-05, "loss": 0.3175, "step": 15195 }, { "epoch": 3.544776119402985, "grad_norm": 0.361939433787893, "learning_rate": 1.4641320150383391e-05, "loss": 0.3279, "step": 15200 }, { "epoch": 3.5459421641791042, "grad_norm": 0.33771998888651455, "learning_rate": 1.4627084704448895e-05, "loss": 0.3279, "step": 15205 }, { "epoch": 3.547108208955224, "grad_norm": 0.3679615023683131, "learning_rate": 1.461285691483078e-05, "loss": 0.3289, "step": 15210 }, { "epoch": 3.548274253731343, "grad_norm": 0.35695586140050806, "learning_rate": 1.4598636789991199e-05, "loss": 0.3318, "step": 15215 }, { "epoch": 3.549440298507463, "grad_norm": 0.3470314610034076, "learning_rate": 1.458442433838772e-05, "loss": 0.3303, "step": 15220 }, { "epoch": 3.550606343283582, "grad_norm": 0.35117993739354486, "learning_rate": 1.4570219568473376e-05, "loss": 0.3244, "step": 15225 }, { "epoch": 3.5517723880597014, "grad_norm": 0.35359037370879304, "learning_rate": 1.4556022488696614e-05, "loss": 0.3231, "step": 15230 }, { "epoch": 3.5529384328358207, "grad_norm": 0.36067931917478974, "learning_rate": 1.4541833107501312e-05, "loss": 0.3332, "step": 15235 }, { "epoch": 3.5541044776119404, "grad_norm": 0.3711918788865033, "learning_rate": 1.4527651433326786e-05, "loss": 0.334, "step": 15240 }, { "epoch": 3.5552705223880596, "grad_norm": 0.3638690629653978, "learning_rate": 1.4513477474607729e-05, "loss": 0.3253, "step": 15245 }, { "epoch": 3.5564365671641793, "grad_norm": 0.34811799023017653, "learning_rate": 1.4499311239774277e-05, "loss": 0.3351, "step": 15250 }, { "epoch": 3.5576026119402986, "grad_norm": 0.37106387121399587, "learning_rate": 1.4485152737251972e-05, "loss": 0.339, "step": 15255 }, { "epoch": 3.558768656716418, "grad_norm": 0.361600670652692, "learning_rate": 1.4471001975461735e-05, "loss": 0.3438, "step": 15260 }, { "epoch": 3.559934701492537, "grad_norm": 0.3637800703363995, "learning_rate": 1.4456858962819897e-05, "loss": 0.3368, "step": 15265 }, { "epoch": 3.561100746268657, "grad_norm": 0.3796780001513974, "learning_rate": 1.4442723707738199e-05, "loss": 0.328, "step": 15270 }, { "epoch": 3.562266791044776, "grad_norm": 0.36835448631835566, "learning_rate": 1.4428596218623722e-05, "loss": 0.3448, "step": 15275 }, { "epoch": 3.5634328358208958, "grad_norm": 0.3577276120644672, "learning_rate": 1.4414476503878968e-05, "loss": 0.3372, "step": 15280 }, { "epoch": 3.564598880597015, "grad_norm": 0.3613767996407675, "learning_rate": 1.4400364571901803e-05, "loss": 0.3307, "step": 15285 }, { "epoch": 3.5657649253731343, "grad_norm": 0.3677552637842849, "learning_rate": 1.4386260431085457e-05, "loss": 0.338, "step": 15290 }, { "epoch": 3.5669309701492535, "grad_norm": 0.356908452833887, "learning_rate": 1.4372164089818546e-05, "loss": 0.3427, "step": 15295 }, { "epoch": 3.5680970149253732, "grad_norm": 0.36206965948095843, "learning_rate": 1.4358075556485016e-05, "loss": 0.3376, "step": 15300 }, { "epoch": 3.5692630597014925, "grad_norm": 0.3624327624630281, "learning_rate": 1.4343994839464192e-05, "loss": 0.3206, "step": 15305 }, { "epoch": 3.570429104477612, "grad_norm": 0.38006948309906924, "learning_rate": 1.4329921947130748e-05, "loss": 0.3212, "step": 15310 }, { "epoch": 3.5715951492537314, "grad_norm": 0.38480622442127016, "learning_rate": 1.43158568878547e-05, "loss": 0.3241, "step": 15315 }, { "epoch": 3.5727611940298507, "grad_norm": 0.34484025763709397, "learning_rate": 1.430179967000141e-05, "loss": 0.3299, "step": 15320 }, { "epoch": 3.57392723880597, "grad_norm": 0.3499694142962118, "learning_rate": 1.4287750301931557e-05, "loss": 0.3321, "step": 15325 }, { "epoch": 3.5750932835820897, "grad_norm": 0.3398013700826958, "learning_rate": 1.4273708792001182e-05, "loss": 0.3316, "step": 15330 }, { "epoch": 3.576259328358209, "grad_norm": 0.3760552529788722, "learning_rate": 1.4259675148561627e-05, "loss": 0.3492, "step": 15335 }, { "epoch": 3.5774253731343286, "grad_norm": 0.38859124118217825, "learning_rate": 1.424564937995957e-05, "loss": 0.3479, "step": 15340 }, { "epoch": 3.578591417910448, "grad_norm": 0.35942793969687326, "learning_rate": 1.4231631494537007e-05, "loss": 0.3322, "step": 15345 }, { "epoch": 3.579757462686567, "grad_norm": 0.36194333425110714, "learning_rate": 1.4217621500631222e-05, "loss": 0.3354, "step": 15350 }, { "epoch": 3.5809235074626864, "grad_norm": 0.3459759448708475, "learning_rate": 1.4203619406574833e-05, "loss": 0.3231, "step": 15355 }, { "epoch": 3.582089552238806, "grad_norm": 0.3589072630519182, "learning_rate": 1.4189625220695746e-05, "loss": 0.3348, "step": 15360 }, { "epoch": 3.5832555970149254, "grad_norm": 0.3677150312874064, "learning_rate": 1.4175638951317166e-05, "loss": 0.3326, "step": 15365 }, { "epoch": 3.5844216417910446, "grad_norm": 0.3579009984342167, "learning_rate": 1.41616606067576e-05, "loss": 0.3427, "step": 15370 }, { "epoch": 3.5855876865671643, "grad_norm": 0.411591501910976, "learning_rate": 1.4147690195330815e-05, "loss": 0.3271, "step": 15375 }, { "epoch": 3.5867537313432836, "grad_norm": 0.38433124951876196, "learning_rate": 1.413372772534588e-05, "loss": 0.3272, "step": 15380 }, { "epoch": 3.587919776119403, "grad_norm": 0.3634833176942538, "learning_rate": 1.411977320510714e-05, "loss": 0.3432, "step": 15385 }, { "epoch": 3.5890858208955225, "grad_norm": 0.3668720363682665, "learning_rate": 1.410582664291421e-05, "loss": 0.3276, "step": 15390 }, { "epoch": 3.590251865671642, "grad_norm": 0.3638320437184362, "learning_rate": 1.4091888047061974e-05, "loss": 0.3343, "step": 15395 }, { "epoch": 3.591417910447761, "grad_norm": 0.3524707695867182, "learning_rate": 1.4077957425840563e-05, "loss": 0.3213, "step": 15400 }, { "epoch": 3.5925839552238807, "grad_norm": 0.3621898628153211, "learning_rate": 1.406403478753538e-05, "loss": 0.3194, "step": 15405 }, { "epoch": 3.59375, "grad_norm": 0.3590742712912799, "learning_rate": 1.4050120140427081e-05, "loss": 0.3261, "step": 15410 }, { "epoch": 3.5949160447761193, "grad_norm": 0.3450912029468673, "learning_rate": 1.4036213492791561e-05, "loss": 0.3256, "step": 15415 }, { "epoch": 3.596082089552239, "grad_norm": 0.35088638446113596, "learning_rate": 1.4022314852899968e-05, "loss": 0.3297, "step": 15420 }, { "epoch": 3.597248134328358, "grad_norm": 0.3472203270848133, "learning_rate": 1.4008424229018668e-05, "loss": 0.337, "step": 15425 }, { "epoch": 3.5984141791044775, "grad_norm": 0.3589158404874403, "learning_rate": 1.3994541629409275e-05, "loss": 0.3109, "step": 15430 }, { "epoch": 3.599580223880597, "grad_norm": 0.36760153151119807, "learning_rate": 1.398066706232864e-05, "loss": 0.3336, "step": 15435 }, { "epoch": 3.6007462686567164, "grad_norm": 0.3960640249827398, "learning_rate": 1.3966800536028802e-05, "loss": 0.3493, "step": 15440 }, { "epoch": 3.6019123134328357, "grad_norm": 0.3460082205141992, "learning_rate": 1.395294205875705e-05, "loss": 0.3, "step": 15445 }, { "epoch": 3.6030783582089554, "grad_norm": 0.36391806609117766, "learning_rate": 1.3939091638755882e-05, "loss": 0.3232, "step": 15450 }, { "epoch": 3.6042444029850746, "grad_norm": 0.36817682739749436, "learning_rate": 1.3925249284262984e-05, "loss": 0.3389, "step": 15455 }, { "epoch": 3.605410447761194, "grad_norm": 0.3970887012135433, "learning_rate": 1.3911415003511258e-05, "loss": 0.3331, "step": 15460 }, { "epoch": 3.6065764925373136, "grad_norm": 0.3598222387254086, "learning_rate": 1.3897588804728818e-05, "loss": 0.3395, "step": 15465 }, { "epoch": 3.607742537313433, "grad_norm": 0.39486262469076977, "learning_rate": 1.3883770696138946e-05, "loss": 0.3502, "step": 15470 }, { "epoch": 3.608908582089552, "grad_norm": 0.35882559754039134, "learning_rate": 1.3869960685960118e-05, "loss": 0.3341, "step": 15475 }, { "epoch": 3.6100746268656714, "grad_norm": 0.36429890692316313, "learning_rate": 1.3856158782406007e-05, "loss": 0.3272, "step": 15480 }, { "epoch": 3.611240671641791, "grad_norm": 0.35308271094610133, "learning_rate": 1.3842364993685453e-05, "loss": 0.3314, "step": 15485 }, { "epoch": 3.6124067164179103, "grad_norm": 0.33863993816585075, "learning_rate": 1.3828579328002473e-05, "loss": 0.3325, "step": 15490 }, { "epoch": 3.61357276119403, "grad_norm": 0.363113691762699, "learning_rate": 1.3814801793556264e-05, "loss": 0.3276, "step": 15495 }, { "epoch": 3.6147388059701493, "grad_norm": 0.3417010384915563, "learning_rate": 1.3801032398541153e-05, "loss": 0.3263, "step": 15500 }, { "epoch": 3.6159048507462686, "grad_norm": 0.35067492562279406, "learning_rate": 1.3787271151146658e-05, "loss": 0.3253, "step": 15505 }, { "epoch": 3.617070895522388, "grad_norm": 0.3608403842297616, "learning_rate": 1.3773518059557445e-05, "loss": 0.3314, "step": 15510 }, { "epoch": 3.6182369402985075, "grad_norm": 0.3371418126128464, "learning_rate": 1.3759773131953321e-05, "loss": 0.3263, "step": 15515 }, { "epoch": 3.6194029850746268, "grad_norm": 0.3573099675524598, "learning_rate": 1.3746036376509252e-05, "loss": 0.3277, "step": 15520 }, { "epoch": 3.6205690298507465, "grad_norm": 0.3757913562251166, "learning_rate": 1.3732307801395322e-05, "loss": 0.3222, "step": 15525 }, { "epoch": 3.6217350746268657, "grad_norm": 0.3903498408572106, "learning_rate": 1.3718587414776756e-05, "loss": 0.3253, "step": 15530 }, { "epoch": 3.622901119402985, "grad_norm": 0.373151775416442, "learning_rate": 1.3704875224813928e-05, "loss": 0.344, "step": 15535 }, { "epoch": 3.6240671641791042, "grad_norm": 0.3573542232540521, "learning_rate": 1.3691171239662315e-05, "loss": 0.3349, "step": 15540 }, { "epoch": 3.625233208955224, "grad_norm": 0.35338449471090716, "learning_rate": 1.3677475467472534e-05, "loss": 0.3384, "step": 15545 }, { "epoch": 3.626399253731343, "grad_norm": 0.37427275029898005, "learning_rate": 1.366378791639028e-05, "loss": 0.3157, "step": 15550 }, { "epoch": 3.627565298507463, "grad_norm": 0.38765675807836114, "learning_rate": 1.3650108594556404e-05, "loss": 0.345, "step": 15555 }, { "epoch": 3.628731343283582, "grad_norm": 0.3533464330030186, "learning_rate": 1.3636437510106836e-05, "loss": 0.3269, "step": 15560 }, { "epoch": 3.6298973880597014, "grad_norm": 0.3417358657942666, "learning_rate": 1.362277467117261e-05, "loss": 0.3244, "step": 15565 }, { "epoch": 3.6310634328358207, "grad_norm": 0.3715215147670424, "learning_rate": 1.3609120085879872e-05, "loss": 0.3395, "step": 15570 }, { "epoch": 3.6322294776119404, "grad_norm": 0.35037124144158394, "learning_rate": 1.3595473762349825e-05, "loss": 0.3173, "step": 15575 }, { "epoch": 3.6333955223880596, "grad_norm": 0.35952911712874575, "learning_rate": 1.3581835708698796e-05, "loss": 0.3454, "step": 15580 }, { "epoch": 3.6345615671641793, "grad_norm": 0.37880670504603425, "learning_rate": 1.3568205933038164e-05, "loss": 0.3403, "step": 15585 }, { "epoch": 3.6357276119402986, "grad_norm": 0.35633675803326076, "learning_rate": 1.3554584443474405e-05, "loss": 0.333, "step": 15590 }, { "epoch": 3.636893656716418, "grad_norm": 0.34864687126539257, "learning_rate": 1.3540971248109063e-05, "loss": 0.317, "step": 15595 }, { "epoch": 3.638059701492537, "grad_norm": 0.3613190677360087, "learning_rate": 1.352736635503873e-05, "loss": 0.3223, "step": 15600 }, { "epoch": 3.639225746268657, "grad_norm": 0.3493317201849097, "learning_rate": 1.3513769772355083e-05, "loss": 0.328, "step": 15605 }, { "epoch": 3.640391791044776, "grad_norm": 0.36571371883536613, "learning_rate": 1.3500181508144855e-05, "loss": 0.3287, "step": 15610 }, { "epoch": 3.6415578358208958, "grad_norm": 0.3535576982310634, "learning_rate": 1.3486601570489809e-05, "loss": 0.3372, "step": 15615 }, { "epoch": 3.642723880597015, "grad_norm": 0.34118310103297367, "learning_rate": 1.3473029967466779e-05, "loss": 0.3278, "step": 15620 }, { "epoch": 3.6438899253731343, "grad_norm": 0.38546871733126326, "learning_rate": 1.3459466707147644e-05, "loss": 0.3391, "step": 15625 }, { "epoch": 3.6450559701492535, "grad_norm": 0.37287157064773213, "learning_rate": 1.3445911797599293e-05, "loss": 0.3387, "step": 15630 }, { "epoch": 3.6462220149253732, "grad_norm": 0.36069452698571924, "learning_rate": 1.3432365246883671e-05, "loss": 0.3313, "step": 15635 }, { "epoch": 3.6473880597014925, "grad_norm": 0.3605317083094569, "learning_rate": 1.3418827063057754e-05, "loss": 0.3184, "step": 15640 }, { "epoch": 3.648554104477612, "grad_norm": 0.35748846974735504, "learning_rate": 1.3405297254173532e-05, "loss": 0.3337, "step": 15645 }, { "epoch": 3.6497201492537314, "grad_norm": 0.42971535301545016, "learning_rate": 1.3391775828278023e-05, "loss": 0.3193, "step": 15650 }, { "epoch": 3.6508861940298507, "grad_norm": 0.3833379583017852, "learning_rate": 1.3378262793413237e-05, "loss": 0.3307, "step": 15655 }, { "epoch": 3.65205223880597, "grad_norm": 0.36711703155663405, "learning_rate": 1.3364758157616219e-05, "loss": 0.3193, "step": 15660 }, { "epoch": 3.6532182835820897, "grad_norm": 0.40412254990119817, "learning_rate": 1.3351261928919007e-05, "loss": 0.3557, "step": 15665 }, { "epoch": 3.654384328358209, "grad_norm": 0.3453863264301336, "learning_rate": 1.3337774115348639e-05, "loss": 0.3169, "step": 15670 }, { "epoch": 3.6555503731343286, "grad_norm": 0.35121559249784134, "learning_rate": 1.3324294724927155e-05, "loss": 0.309, "step": 15675 }, { "epoch": 3.656716417910448, "grad_norm": 0.3930320470487703, "learning_rate": 1.3310823765671571e-05, "loss": 0.3465, "step": 15680 }, { "epoch": 3.657882462686567, "grad_norm": 0.35668151675568066, "learning_rate": 1.32973612455939e-05, "loss": 0.3269, "step": 15685 }, { "epoch": 3.6590485074626864, "grad_norm": 0.41385254819495504, "learning_rate": 1.3283907172701135e-05, "loss": 0.3423, "step": 15690 }, { "epoch": 3.660214552238806, "grad_norm": 0.3705111186390367, "learning_rate": 1.3270461554995243e-05, "loss": 0.3237, "step": 15695 }, { "epoch": 3.6613805970149254, "grad_norm": 0.36339732325505775, "learning_rate": 1.3257024400473162e-05, "loss": 0.3204, "step": 15700 }, { "epoch": 3.6625466417910446, "grad_norm": 0.3860933890392395, "learning_rate": 1.3243595717126792e-05, "loss": 0.3197, "step": 15705 }, { "epoch": 3.6637126865671643, "grad_norm": 0.3520029127768971, "learning_rate": 1.3230175512943e-05, "loss": 0.318, "step": 15710 }, { "epoch": 3.6648787313432836, "grad_norm": 0.34843853869679225, "learning_rate": 1.3216763795903608e-05, "loss": 0.3147, "step": 15715 }, { "epoch": 3.666044776119403, "grad_norm": 0.36568601479642543, "learning_rate": 1.3203360573985394e-05, "loss": 0.3323, "step": 15720 }, { "epoch": 3.6672108208955225, "grad_norm": 0.3542524262376503, "learning_rate": 1.3189965855160088e-05, "loss": 0.3172, "step": 15725 }, { "epoch": 3.668376865671642, "grad_norm": 0.36937279614828306, "learning_rate": 1.3176579647394338e-05, "loss": 0.3225, "step": 15730 }, { "epoch": 3.669542910447761, "grad_norm": 0.36723823203508654, "learning_rate": 1.3163201958649757e-05, "loss": 0.3244, "step": 15735 }, { "epoch": 3.6707089552238807, "grad_norm": 0.39009194370239136, "learning_rate": 1.314983279688288e-05, "loss": 0.329, "step": 15740 }, { "epoch": 3.671875, "grad_norm": 0.34495145793866044, "learning_rate": 1.3136472170045171e-05, "loss": 0.3367, "step": 15745 }, { "epoch": 3.6730410447761193, "grad_norm": 0.3759980412237486, "learning_rate": 1.3123120086083026e-05, "loss": 0.3303, "step": 15750 }, { "epoch": 3.674207089552239, "grad_norm": 0.382340831760479, "learning_rate": 1.3109776552937742e-05, "loss": 0.3374, "step": 15755 }, { "epoch": 3.675373134328358, "grad_norm": 0.3767650864889917, "learning_rate": 1.3096441578545544e-05, "loss": 0.3368, "step": 15760 }, { "epoch": 3.6765391791044775, "grad_norm": 0.3589395179024901, "learning_rate": 1.308311517083756e-05, "loss": 0.3263, "step": 15765 }, { "epoch": 3.677705223880597, "grad_norm": 0.37011970417314655, "learning_rate": 1.306979733773983e-05, "loss": 0.3298, "step": 15770 }, { "epoch": 3.6788712686567164, "grad_norm": 0.353542681220899, "learning_rate": 1.3056488087173302e-05, "loss": 0.3264, "step": 15775 }, { "epoch": 3.6800373134328357, "grad_norm": 0.3554306940645688, "learning_rate": 1.3043187427053788e-05, "loss": 0.3228, "step": 15780 }, { "epoch": 3.6812033582089554, "grad_norm": 0.36167627559379484, "learning_rate": 1.3029895365292018e-05, "loss": 0.3357, "step": 15785 }, { "epoch": 3.6823694029850746, "grad_norm": 0.36949305735522564, "learning_rate": 1.3016611909793613e-05, "loss": 0.3346, "step": 15790 }, { "epoch": 3.683535447761194, "grad_norm": 0.34689154298138003, "learning_rate": 1.3003337068459037e-05, "loss": 0.3222, "step": 15795 }, { "epoch": 3.6847014925373136, "grad_norm": 0.3565317678427899, "learning_rate": 1.2990070849183678e-05, "loss": 0.3395, "step": 15800 }, { "epoch": 3.685867537313433, "grad_norm": 0.34608924921795814, "learning_rate": 1.2976813259857773e-05, "loss": 0.3159, "step": 15805 }, { "epoch": 3.687033582089552, "grad_norm": 0.36119933659445547, "learning_rate": 1.2963564308366416e-05, "loss": 0.3329, "step": 15810 }, { "epoch": 3.6881996268656714, "grad_norm": 0.3777273397001088, "learning_rate": 1.295032400258958e-05, "loss": 0.3437, "step": 15815 }, { "epoch": 3.689365671641791, "grad_norm": 0.36791880504170366, "learning_rate": 1.2937092350402097e-05, "loss": 0.3274, "step": 15820 }, { "epoch": 3.6905317164179103, "grad_norm": 0.3650755251064847, "learning_rate": 1.2923869359673646e-05, "loss": 0.335, "step": 15825 }, { "epoch": 3.69169776119403, "grad_norm": 0.3518973143656496, "learning_rate": 1.2910655038268749e-05, "loss": 0.3229, "step": 15830 }, { "epoch": 3.6928638059701493, "grad_norm": 0.35874834227369784, "learning_rate": 1.2897449394046773e-05, "loss": 0.3326, "step": 15835 }, { "epoch": 3.6940298507462686, "grad_norm": 0.35772673982122344, "learning_rate": 1.2884252434861938e-05, "loss": 0.3148, "step": 15840 }, { "epoch": 3.695195895522388, "grad_norm": 0.3625737095208401, "learning_rate": 1.2871064168563291e-05, "loss": 0.3218, "step": 15845 }, { "epoch": 3.6963619402985075, "grad_norm": 0.36284227434342214, "learning_rate": 1.2857884602994706e-05, "loss": 0.33, "step": 15850 }, { "epoch": 3.6975279850746268, "grad_norm": 0.39173976995634996, "learning_rate": 1.2844713745994871e-05, "loss": 0.3414, "step": 15855 }, { "epoch": 3.6986940298507465, "grad_norm": 0.3815036748658974, "learning_rate": 1.2831551605397321e-05, "loss": 0.3295, "step": 15860 }, { "epoch": 3.6998600746268657, "grad_norm": 0.35444807700065517, "learning_rate": 1.2818398189030383e-05, "loss": 0.3137, "step": 15865 }, { "epoch": 3.701026119402985, "grad_norm": 0.3524164632489728, "learning_rate": 1.2805253504717213e-05, "loss": 0.3244, "step": 15870 }, { "epoch": 3.7021921641791042, "grad_norm": 0.3783306387762941, "learning_rate": 1.2792117560275766e-05, "loss": 0.3336, "step": 15875 }, { "epoch": 3.703358208955224, "grad_norm": 0.37788785199661423, "learning_rate": 1.2778990363518785e-05, "loss": 0.3377, "step": 15880 }, { "epoch": 3.704524253731343, "grad_norm": 0.38682248210669895, "learning_rate": 1.2765871922253835e-05, "loss": 0.3418, "step": 15885 }, { "epoch": 3.705690298507463, "grad_norm": 0.36074312860848057, "learning_rate": 1.2752762244283255e-05, "loss": 0.3426, "step": 15890 }, { "epoch": 3.706856343283582, "grad_norm": 0.3610923629832659, "learning_rate": 1.2739661337404185e-05, "loss": 0.333, "step": 15895 }, { "epoch": 3.7080223880597014, "grad_norm": 0.3665839019632394, "learning_rate": 1.2726569209408545e-05, "loss": 0.3212, "step": 15900 }, { "epoch": 3.7091884328358207, "grad_norm": 0.36768178961895553, "learning_rate": 1.2713485868083014e-05, "loss": 0.3409, "step": 15905 }, { "epoch": 3.7103544776119404, "grad_norm": 0.3849882925587323, "learning_rate": 1.2700411321209078e-05, "loss": 0.3311, "step": 15910 }, { "epoch": 3.7115205223880596, "grad_norm": 0.37112550059375615, "learning_rate": 1.2687345576562965e-05, "loss": 0.3433, "step": 15915 }, { "epoch": 3.7126865671641793, "grad_norm": 0.34565291867874093, "learning_rate": 1.2674288641915688e-05, "loss": 0.3118, "step": 15920 }, { "epoch": 3.7138526119402986, "grad_norm": 0.37740723081110616, "learning_rate": 1.2661240525033016e-05, "loss": 0.3371, "step": 15925 }, { "epoch": 3.715018656716418, "grad_norm": 0.3655720709530991, "learning_rate": 1.264820123367545e-05, "loss": 0.3408, "step": 15930 }, { "epoch": 3.716184701492537, "grad_norm": 0.3616587077451775, "learning_rate": 1.2635170775598271e-05, "loss": 0.3221, "step": 15935 }, { "epoch": 3.717350746268657, "grad_norm": 0.37889750941780176, "learning_rate": 1.26221491585515e-05, "loss": 0.3298, "step": 15940 }, { "epoch": 3.718516791044776, "grad_norm": 0.41774288823596173, "learning_rate": 1.2609136390279886e-05, "loss": 0.3328, "step": 15945 }, { "epoch": 3.7196828358208958, "grad_norm": 0.3780901090472747, "learning_rate": 1.2596132478522938e-05, "loss": 0.3375, "step": 15950 }, { "epoch": 3.720848880597015, "grad_norm": 0.3557886701922332, "learning_rate": 1.258313743101487e-05, "loss": 0.3282, "step": 15955 }, { "epoch": 3.7220149253731343, "grad_norm": 0.3581564980288118, "learning_rate": 1.2570151255484639e-05, "loss": 0.3329, "step": 15960 }, { "epoch": 3.7231809701492535, "grad_norm": 0.36383138312350627, "learning_rate": 1.2557173959655932e-05, "loss": 0.3251, "step": 15965 }, { "epoch": 3.7243470149253732, "grad_norm": 0.3675587335921228, "learning_rate": 1.2544205551247148e-05, "loss": 0.3313, "step": 15970 }, { "epoch": 3.7255130597014925, "grad_norm": 0.35992698182812694, "learning_rate": 1.253124603797139e-05, "loss": 0.3217, "step": 15975 }, { "epoch": 3.726679104477612, "grad_norm": 0.36245092199613566, "learning_rate": 1.251829542753648e-05, "loss": 0.3327, "step": 15980 }, { "epoch": 3.7278451492537314, "grad_norm": 0.36628656492309, "learning_rate": 1.2505353727644958e-05, "loss": 0.3277, "step": 15985 }, { "epoch": 3.7290111940298507, "grad_norm": 0.3764789038085975, "learning_rate": 1.249242094599404e-05, "loss": 0.3411, "step": 15990 }, { "epoch": 3.73017723880597, "grad_norm": 0.3551201730830275, "learning_rate": 1.2479497090275643e-05, "loss": 0.3267, "step": 15995 }, { "epoch": 3.7313432835820897, "grad_norm": 0.37070614924990003, "learning_rate": 1.246658216817639e-05, "loss": 0.3459, "step": 16000 }, { "epoch": 3.732509328358209, "grad_norm": 0.38709253175875064, "learning_rate": 1.2453676187377588e-05, "loss": 0.3208, "step": 16005 }, { "epoch": 3.7336753731343286, "grad_norm": 0.3655769446848934, "learning_rate": 1.2440779155555202e-05, "loss": 0.329, "step": 16010 }, { "epoch": 3.734841417910448, "grad_norm": 0.33993270878017057, "learning_rate": 1.24278910803799e-05, "loss": 0.3296, "step": 16015 }, { "epoch": 3.736007462686567, "grad_norm": 0.36181420058602254, "learning_rate": 1.2415011969517016e-05, "loss": 0.3464, "step": 16020 }, { "epoch": 3.7371735074626864, "grad_norm": 0.3663382714754168, "learning_rate": 1.2402141830626547e-05, "loss": 0.3276, "step": 16025 }, { "epoch": 3.738339552238806, "grad_norm": 0.3866828361307577, "learning_rate": 1.2389280671363175e-05, "loss": 0.3224, "step": 16030 }, { "epoch": 3.7395055970149254, "grad_norm": 0.36179016855606844, "learning_rate": 1.2376428499376201e-05, "loss": 0.3252, "step": 16035 }, { "epoch": 3.7406716417910446, "grad_norm": 0.3524771337224453, "learning_rate": 1.2363585322309615e-05, "loss": 0.3273, "step": 16040 }, { "epoch": 3.7418376865671643, "grad_norm": 0.35460626714970217, "learning_rate": 1.2350751147802047e-05, "loss": 0.3315, "step": 16045 }, { "epoch": 3.7430037313432836, "grad_norm": 0.3697576134505472, "learning_rate": 1.2337925983486768e-05, "loss": 0.3229, "step": 16050 }, { "epoch": 3.744169776119403, "grad_norm": 0.3510243176952665, "learning_rate": 1.2325109836991703e-05, "loss": 0.3233, "step": 16055 }, { "epoch": 3.7453358208955225, "grad_norm": 0.3606194166769512, "learning_rate": 1.2312302715939394e-05, "loss": 0.327, "step": 16060 }, { "epoch": 3.746501865671642, "grad_norm": 0.3766537672012198, "learning_rate": 1.2299504627947029e-05, "loss": 0.3225, "step": 16065 }, { "epoch": 3.747667910447761, "grad_norm": 0.3601936128603242, "learning_rate": 1.2286715580626418e-05, "loss": 0.3367, "step": 16070 }, { "epoch": 3.7488339552238807, "grad_norm": 0.36101305872400397, "learning_rate": 1.2273935581584e-05, "loss": 0.3334, "step": 16075 }, { "epoch": 3.75, "grad_norm": 0.3606697833578396, "learning_rate": 1.2261164638420832e-05, "loss": 0.3201, "step": 16080 }, { "epoch": 3.7511660447761193, "grad_norm": 0.36056632999574745, "learning_rate": 1.2248402758732568e-05, "loss": 0.3292, "step": 16085 }, { "epoch": 3.752332089552239, "grad_norm": 0.3570610193262246, "learning_rate": 1.2235649950109492e-05, "loss": 0.3354, "step": 16090 }, { "epoch": 3.753498134328358, "grad_norm": 0.3569585132664504, "learning_rate": 1.222290622013649e-05, "loss": 0.3258, "step": 16095 }, { "epoch": 3.7546641791044775, "grad_norm": 0.38364762652881995, "learning_rate": 1.2210171576393037e-05, "loss": 0.343, "step": 16100 }, { "epoch": 3.755830223880597, "grad_norm": 0.3619653811600626, "learning_rate": 1.2197446026453219e-05, "loss": 0.3295, "step": 16105 }, { "epoch": 3.7569962686567164, "grad_norm": 0.360349124413514, "learning_rate": 1.2184729577885695e-05, "loss": 0.3502, "step": 16110 }, { "epoch": 3.7581623134328357, "grad_norm": 0.37133569103651815, "learning_rate": 1.2172022238253727e-05, "loss": 0.327, "step": 16115 }, { "epoch": 3.7593283582089554, "grad_norm": 0.3792583674612013, "learning_rate": 1.2159324015115148e-05, "loss": 0.3421, "step": 16120 }, { "epoch": 3.7604944029850746, "grad_norm": 0.36459360623121073, "learning_rate": 1.2146634916022383e-05, "loss": 0.3453, "step": 16125 }, { "epoch": 3.761660447761194, "grad_norm": 0.3839162151450258, "learning_rate": 1.2133954948522423e-05, "loss": 0.3321, "step": 16130 }, { "epoch": 3.7628264925373136, "grad_norm": 0.358302189283745, "learning_rate": 1.2121284120156812e-05, "loss": 0.3243, "step": 16135 }, { "epoch": 3.763992537313433, "grad_norm": 0.36844151802635366, "learning_rate": 1.210862243846168e-05, "loss": 0.3294, "step": 16140 }, { "epoch": 3.765158582089552, "grad_norm": 0.37835947628768996, "learning_rate": 1.209596991096772e-05, "loss": 0.3324, "step": 16145 }, { "epoch": 3.7663246268656714, "grad_norm": 0.3985861036005318, "learning_rate": 1.2083326545200154e-05, "loss": 0.3464, "step": 16150 }, { "epoch": 3.767490671641791, "grad_norm": 0.3860859803367432, "learning_rate": 1.2070692348678776e-05, "loss": 0.3297, "step": 16155 }, { "epoch": 3.7686567164179103, "grad_norm": 0.3720051991308574, "learning_rate": 1.205806732891793e-05, "loss": 0.3236, "step": 16160 }, { "epoch": 3.76982276119403, "grad_norm": 0.357634493214289, "learning_rate": 1.2045451493426483e-05, "loss": 0.3293, "step": 16165 }, { "epoch": 3.7709888059701493, "grad_norm": 0.3478269872758963, "learning_rate": 1.2032844849707853e-05, "loss": 0.3133, "step": 16170 }, { "epoch": 3.7721548507462686, "grad_norm": 0.3721615636676857, "learning_rate": 1.202024740525999e-05, "loss": 0.3353, "step": 16175 }, { "epoch": 3.773320895522388, "grad_norm": 0.3429143624732521, "learning_rate": 1.2007659167575377e-05, "loss": 0.3142, "step": 16180 }, { "epoch": 3.7744869402985075, "grad_norm": 0.35599678689392994, "learning_rate": 1.1995080144141004e-05, "loss": 0.3247, "step": 16185 }, { "epoch": 3.7756529850746268, "grad_norm": 0.36873530185602066, "learning_rate": 1.1982510342438395e-05, "loss": 0.3289, "step": 16190 }, { "epoch": 3.7768190298507465, "grad_norm": 0.34625541721057895, "learning_rate": 1.1969949769943587e-05, "loss": 0.3184, "step": 16195 }, { "epoch": 3.7779850746268657, "grad_norm": 0.3757368747073787, "learning_rate": 1.195739843412713e-05, "loss": 0.3274, "step": 16200 }, { "epoch": 3.779151119402985, "grad_norm": 0.3673938662320257, "learning_rate": 1.1944856342454078e-05, "loss": 0.3309, "step": 16205 }, { "epoch": 3.7803171641791042, "grad_norm": 0.37229335411824105, "learning_rate": 1.1932323502383978e-05, "loss": 0.329, "step": 16210 }, { "epoch": 3.781483208955224, "grad_norm": 0.38713291988033305, "learning_rate": 1.1919799921370888e-05, "loss": 0.333, "step": 16215 }, { "epoch": 3.782649253731343, "grad_norm": 0.3637043924495309, "learning_rate": 1.1907285606863351e-05, "loss": 0.3226, "step": 16220 }, { "epoch": 3.783815298507463, "grad_norm": 0.3474172831422929, "learning_rate": 1.1894780566304406e-05, "loss": 0.3314, "step": 16225 }, { "epoch": 3.784981343283582, "grad_norm": 0.36606885899211933, "learning_rate": 1.1882284807131576e-05, "loss": 0.3375, "step": 16230 }, { "epoch": 3.7861473880597014, "grad_norm": 0.3662270801385073, "learning_rate": 1.1869798336776845e-05, "loss": 0.3257, "step": 16235 }, { "epoch": 3.7873134328358207, "grad_norm": 0.35687393795731764, "learning_rate": 1.1857321162666692e-05, "loss": 0.3295, "step": 16240 }, { "epoch": 3.7884794776119404, "grad_norm": 0.3431823692654768, "learning_rate": 1.1844853292222066e-05, "loss": 0.3318, "step": 16245 }, { "epoch": 3.7896455223880596, "grad_norm": 0.32490377735150194, "learning_rate": 1.1832394732858377e-05, "loss": 0.3114, "step": 16250 }, { "epoch": 3.7908115671641793, "grad_norm": 0.3743464264848783, "learning_rate": 1.1819945491985504e-05, "loss": 0.3513, "step": 16255 }, { "epoch": 3.7919776119402986, "grad_norm": 0.37000283732742667, "learning_rate": 1.1807505577007765e-05, "loss": 0.3303, "step": 16260 }, { "epoch": 3.793143656716418, "grad_norm": 0.3928333375430292, "learning_rate": 1.179507499532395e-05, "loss": 0.3459, "step": 16265 }, { "epoch": 3.794309701492537, "grad_norm": 0.3536567995936902, "learning_rate": 1.1782653754327295e-05, "loss": 0.3365, "step": 16270 }, { "epoch": 3.795475746268657, "grad_norm": 0.3625995531356676, "learning_rate": 1.1770241861405475e-05, "loss": 0.3329, "step": 16275 }, { "epoch": 3.796641791044776, "grad_norm": 0.38334973081198714, "learning_rate": 1.1757839323940616e-05, "loss": 0.3325, "step": 16280 }, { "epoch": 3.7978078358208958, "grad_norm": 0.34318316196978654, "learning_rate": 1.1745446149309257e-05, "loss": 0.326, "step": 16285 }, { "epoch": 3.798973880597015, "grad_norm": 0.3501659069979656, "learning_rate": 1.1733062344882396e-05, "loss": 0.3428, "step": 16290 }, { "epoch": 3.8001399253731343, "grad_norm": 0.3467977209951471, "learning_rate": 1.1720687918025434e-05, "loss": 0.3259, "step": 16295 }, { "epoch": 3.8013059701492535, "grad_norm": 0.3628241490196815, "learning_rate": 1.1708322876098215e-05, "loss": 0.3284, "step": 16300 }, { "epoch": 3.8024720149253732, "grad_norm": 0.34558505589509175, "learning_rate": 1.1695967226454996e-05, "loss": 0.3331, "step": 16305 }, { "epoch": 3.8036380597014925, "grad_norm": 0.36797179727050766, "learning_rate": 1.1683620976444426e-05, "loss": 0.3382, "step": 16310 }, { "epoch": 3.804804104477612, "grad_norm": 0.34973063617918587, "learning_rate": 1.1671284133409592e-05, "loss": 0.3321, "step": 16315 }, { "epoch": 3.8059701492537314, "grad_norm": 0.3450380100052671, "learning_rate": 1.1658956704687974e-05, "loss": 0.3257, "step": 16320 }, { "epoch": 3.8071361940298507, "grad_norm": 0.3710667274629841, "learning_rate": 1.1646638697611462e-05, "loss": 0.3464, "step": 16325 }, { "epoch": 3.80830223880597, "grad_norm": 0.3729847127011631, "learning_rate": 1.1634330119506317e-05, "loss": 0.3402, "step": 16330 }, { "epoch": 3.8094682835820897, "grad_norm": 0.35671892051640974, "learning_rate": 1.1622030977693221e-05, "loss": 0.3253, "step": 16335 }, { "epoch": 3.810634328358209, "grad_norm": 0.36766966266855905, "learning_rate": 1.1609741279487236e-05, "loss": 0.3491, "step": 16340 }, { "epoch": 3.8118003731343286, "grad_norm": 0.3835837834330343, "learning_rate": 1.1597461032197788e-05, "loss": 0.3219, "step": 16345 }, { "epoch": 3.812966417910448, "grad_norm": 0.3650261367830838, "learning_rate": 1.1585190243128707e-05, "loss": 0.3342, "step": 16350 }, { "epoch": 3.814132462686567, "grad_norm": 0.37029816407186616, "learning_rate": 1.1572928919578186e-05, "loss": 0.3174, "step": 16355 }, { "epoch": 3.8152985074626864, "grad_norm": 0.36217941719615987, "learning_rate": 1.15606770688388e-05, "loss": 0.3289, "step": 16360 }, { "epoch": 3.816464552238806, "grad_norm": 0.36013492606117087, "learning_rate": 1.154843469819746e-05, "loss": 0.3203, "step": 16365 }, { "epoch": 3.8176305970149254, "grad_norm": 0.3864651132792706, "learning_rate": 1.1536201814935473e-05, "loss": 0.3164, "step": 16370 }, { "epoch": 3.8187966417910446, "grad_norm": 0.35905870938331036, "learning_rate": 1.152397842632848e-05, "loss": 0.3278, "step": 16375 }, { "epoch": 3.8199626865671643, "grad_norm": 0.3649833454823798, "learning_rate": 1.1511764539646494e-05, "loss": 0.3351, "step": 16380 }, { "epoch": 3.8211287313432836, "grad_norm": 0.37236860547218065, "learning_rate": 1.1499560162153866e-05, "loss": 0.324, "step": 16385 }, { "epoch": 3.822294776119403, "grad_norm": 0.37017362807070964, "learning_rate": 1.1487365301109281e-05, "loss": 0.338, "step": 16390 }, { "epoch": 3.8234608208955225, "grad_norm": 0.35689226631205556, "learning_rate": 1.147517996376578e-05, "loss": 0.326, "step": 16395 }, { "epoch": 3.824626865671642, "grad_norm": 0.3628218576176336, "learning_rate": 1.1463004157370735e-05, "loss": 0.32, "step": 16400 }, { "epoch": 3.825792910447761, "grad_norm": 0.3578471571354439, "learning_rate": 1.1450837889165852e-05, "loss": 0.3315, "step": 16405 }, { "epoch": 3.8269589552238807, "grad_norm": 0.3848440765017413, "learning_rate": 1.1438681166387162e-05, "loss": 0.3512, "step": 16410 }, { "epoch": 3.828125, "grad_norm": 0.3571541370538188, "learning_rate": 1.1426533996265008e-05, "loss": 0.3242, "step": 16415 }, { "epoch": 3.8292910447761193, "grad_norm": 0.3418990113679379, "learning_rate": 1.1414396386024064e-05, "loss": 0.3209, "step": 16420 }, { "epoch": 3.830457089552239, "grad_norm": 0.33975118139796545, "learning_rate": 1.140226834288332e-05, "loss": 0.3358, "step": 16425 }, { "epoch": 3.831623134328358, "grad_norm": 0.33976282322002593, "learning_rate": 1.1390149874056065e-05, "loss": 0.3429, "step": 16430 }, { "epoch": 3.8327891791044775, "grad_norm": 0.34259215704067036, "learning_rate": 1.1378040986749912e-05, "loss": 0.3282, "step": 16435 }, { "epoch": 3.833955223880597, "grad_norm": 0.3688992447689341, "learning_rate": 1.1365941688166747e-05, "loss": 0.3359, "step": 16440 }, { "epoch": 3.8351212686567164, "grad_norm": 0.36270140916201354, "learning_rate": 1.1353851985502777e-05, "loss": 0.3314, "step": 16445 }, { "epoch": 3.8362873134328357, "grad_norm": 0.344950401242646, "learning_rate": 1.134177188594849e-05, "loss": 0.3303, "step": 16450 }, { "epoch": 3.8374533582089554, "grad_norm": 0.36785239477332343, "learning_rate": 1.1329701396688669e-05, "loss": 0.3516, "step": 16455 }, { "epoch": 3.8386194029850746, "grad_norm": 0.38894096599593125, "learning_rate": 1.1317640524902383e-05, "loss": 0.3384, "step": 16460 }, { "epoch": 3.839785447761194, "grad_norm": 0.36061789289459095, "learning_rate": 1.1305589277762965e-05, "loss": 0.3385, "step": 16465 }, { "epoch": 3.8409514925373136, "grad_norm": 0.36253449553220574, "learning_rate": 1.129354766243804e-05, "loss": 0.3184, "step": 16470 }, { "epoch": 3.842117537313433, "grad_norm": 0.37415484991471054, "learning_rate": 1.1281515686089497e-05, "loss": 0.3161, "step": 16475 }, { "epoch": 3.843283582089552, "grad_norm": 0.4816301426672368, "learning_rate": 1.1269493355873498e-05, "loss": 0.3343, "step": 16480 }, { "epoch": 3.8444496268656714, "grad_norm": 0.3447241723165985, "learning_rate": 1.1257480678940469e-05, "loss": 0.3168, "step": 16485 }, { "epoch": 3.845615671641791, "grad_norm": 0.34973382369570744, "learning_rate": 1.1245477662435076e-05, "loss": 0.3167, "step": 16490 }, { "epoch": 3.8467817164179103, "grad_norm": 0.35499020069663667, "learning_rate": 1.123348431349626e-05, "loss": 0.3287, "step": 16495 }, { "epoch": 3.84794776119403, "grad_norm": 0.36365685080246757, "learning_rate": 1.1221500639257204e-05, "loss": 0.3285, "step": 16500 }, { "epoch": 3.8491138059701493, "grad_norm": 0.3643265001488134, "learning_rate": 1.1209526646845346e-05, "loss": 0.3374, "step": 16505 }, { "epoch": 3.8502798507462686, "grad_norm": 0.3673120965924131, "learning_rate": 1.1197562343382341e-05, "loss": 0.3283, "step": 16510 }, { "epoch": 3.851445895522388, "grad_norm": 0.35885175027255184, "learning_rate": 1.118560773598411e-05, "loss": 0.3242, "step": 16515 }, { "epoch": 3.8526119402985075, "grad_norm": 0.3722870845157102, "learning_rate": 1.1173662831760798e-05, "loss": 0.3436, "step": 16520 }, { "epoch": 3.8537779850746268, "grad_norm": 0.3632231569664917, "learning_rate": 1.1161727637816762e-05, "loss": 0.3339, "step": 16525 }, { "epoch": 3.8549440298507465, "grad_norm": 0.35643835047349254, "learning_rate": 1.1149802161250607e-05, "loss": 0.3343, "step": 16530 }, { "epoch": 3.8561100746268657, "grad_norm": 0.362687010349951, "learning_rate": 1.1137886409155158e-05, "loss": 0.3254, "step": 16535 }, { "epoch": 3.857276119402985, "grad_norm": 0.38194800139357077, "learning_rate": 1.1125980388617425e-05, "loss": 0.3464, "step": 16540 }, { "epoch": 3.8584421641791042, "grad_norm": 0.37894934633642446, "learning_rate": 1.1114084106718667e-05, "loss": 0.3354, "step": 16545 }, { "epoch": 3.859608208955224, "grad_norm": 0.3505124849219401, "learning_rate": 1.1102197570534334e-05, "loss": 0.3312, "step": 16550 }, { "epoch": 3.860774253731343, "grad_norm": 0.3756841556190372, "learning_rate": 1.1090320787134085e-05, "loss": 0.3332, "step": 16555 }, { "epoch": 3.861940298507463, "grad_norm": 0.36218154459593793, "learning_rate": 1.1078453763581776e-05, "loss": 0.3237, "step": 16560 }, { "epoch": 3.863106343283582, "grad_norm": 0.3822273203480594, "learning_rate": 1.1066596506935447e-05, "loss": 0.3286, "step": 16565 }, { "epoch": 3.8642723880597014, "grad_norm": 0.3670961602710075, "learning_rate": 1.1054749024247348e-05, "loss": 0.3312, "step": 16570 }, { "epoch": 3.8654384328358207, "grad_norm": 0.35519039241521144, "learning_rate": 1.1042911322563903e-05, "loss": 0.3213, "step": 16575 }, { "epoch": 3.8666044776119404, "grad_norm": 0.36159604405217605, "learning_rate": 1.103108340892573e-05, "loss": 0.3437, "step": 16580 }, { "epoch": 3.8677705223880596, "grad_norm": 0.36936968190453584, "learning_rate": 1.1019265290367616e-05, "loss": 0.3329, "step": 16585 }, { "epoch": 3.8689365671641793, "grad_norm": 0.34637316043003397, "learning_rate": 1.100745697391852e-05, "loss": 0.3095, "step": 16590 }, { "epoch": 3.8701026119402986, "grad_norm": 0.3584097955671261, "learning_rate": 1.099565846660158e-05, "loss": 0.3254, "step": 16595 }, { "epoch": 3.871268656716418, "grad_norm": 0.36626316083434135, "learning_rate": 1.0983869775434091e-05, "loss": 0.3384, "step": 16600 }, { "epoch": 3.872434701492537, "grad_norm": 0.3433672160631697, "learning_rate": 1.097209090742752e-05, "loss": 0.3154, "step": 16605 }, { "epoch": 3.873600746268657, "grad_norm": 0.36423084520484045, "learning_rate": 1.096032186958749e-05, "loss": 0.3298, "step": 16610 }, { "epoch": 3.874766791044776, "grad_norm": 0.369634546025693, "learning_rate": 1.0948562668913763e-05, "loss": 0.3319, "step": 16615 }, { "epoch": 3.8759328358208958, "grad_norm": 0.3581641937544491, "learning_rate": 1.0936813312400263e-05, "loss": 0.3448, "step": 16620 }, { "epoch": 3.877098880597015, "grad_norm": 0.3447436422193593, "learning_rate": 1.092507380703506e-05, "loss": 0.3361, "step": 16625 }, { "epoch": 3.8782649253731343, "grad_norm": 0.3642745484519943, "learning_rate": 1.091334415980036e-05, "loss": 0.3167, "step": 16630 }, { "epoch": 3.8794309701492535, "grad_norm": 0.35772294652482234, "learning_rate": 1.0901624377672513e-05, "loss": 0.3161, "step": 16635 }, { "epoch": 3.8805970149253732, "grad_norm": 0.35915974216675167, "learning_rate": 1.0889914467621986e-05, "loss": 0.3372, "step": 16640 }, { "epoch": 3.8817630597014925, "grad_norm": 0.35121936939843745, "learning_rate": 1.0878214436613387e-05, "loss": 0.3234, "step": 16645 }, { "epoch": 3.882929104477612, "grad_norm": 0.35372520150892794, "learning_rate": 1.0866524291605452e-05, "loss": 0.3323, "step": 16650 }, { "epoch": 3.8840951492537314, "grad_norm": 0.3466279598128762, "learning_rate": 1.0854844039551023e-05, "loss": 0.3326, "step": 16655 }, { "epoch": 3.8852611940298507, "grad_norm": 0.3580657133906849, "learning_rate": 1.0843173687397079e-05, "loss": 0.3216, "step": 16660 }, { "epoch": 3.88642723880597, "grad_norm": 0.36608447027340973, "learning_rate": 1.0831513242084681e-05, "loss": 0.3398, "step": 16665 }, { "epoch": 3.8875932835820897, "grad_norm": 0.3518514817912954, "learning_rate": 1.0819862710549025e-05, "loss": 0.3211, "step": 16670 }, { "epoch": 3.888759328358209, "grad_norm": 0.3780953494971563, "learning_rate": 1.0808222099719396e-05, "loss": 0.3238, "step": 16675 }, { "epoch": 3.8899253731343286, "grad_norm": 0.34541646400879394, "learning_rate": 1.0796591416519192e-05, "loss": 0.3244, "step": 16680 }, { "epoch": 3.891091417910448, "grad_norm": 0.3710284220371676, "learning_rate": 1.0784970667865882e-05, "loss": 0.3321, "step": 16685 }, { "epoch": 3.892257462686567, "grad_norm": 0.3494964492431871, "learning_rate": 1.0773359860671054e-05, "loss": 0.3256, "step": 16690 }, { "epoch": 3.8934235074626864, "grad_norm": 0.3959199222512189, "learning_rate": 1.0761759001840371e-05, "loss": 0.339, "step": 16695 }, { "epoch": 3.894589552238806, "grad_norm": 0.3682203271829885, "learning_rate": 1.0750168098273569e-05, "loss": 0.3258, "step": 16700 }, { "epoch": 3.8957555970149254, "grad_norm": 0.36868967803369485, "learning_rate": 1.073858715686448e-05, "loss": 0.3446, "step": 16705 }, { "epoch": 3.8969216417910446, "grad_norm": 0.38765007689293923, "learning_rate": 1.0727016184501e-05, "loss": 0.3419, "step": 16710 }, { "epoch": 3.8980876865671643, "grad_norm": 0.36932697842412554, "learning_rate": 1.0715455188065112e-05, "loss": 0.3262, "step": 16715 }, { "epoch": 3.8992537313432836, "grad_norm": 0.350074039553313, "learning_rate": 1.0703904174432836e-05, "loss": 0.3417, "step": 16720 }, { "epoch": 3.900419776119403, "grad_norm": 0.3695761815222959, "learning_rate": 1.069236315047428e-05, "loss": 0.344, "step": 16725 }, { "epoch": 3.9015858208955225, "grad_norm": 0.3730850744905737, "learning_rate": 1.0680832123053603e-05, "loss": 0.3328, "step": 16730 }, { "epoch": 3.902751865671642, "grad_norm": 0.3766990919413252, "learning_rate": 1.0669311099029014e-05, "loss": 0.3539, "step": 16735 }, { "epoch": 3.903917910447761, "grad_norm": 0.37727879408409504, "learning_rate": 1.0657800085252789e-05, "loss": 0.3434, "step": 16740 }, { "epoch": 3.9050839552238807, "grad_norm": 0.38143995339382847, "learning_rate": 1.064629908857122e-05, "loss": 0.3311, "step": 16745 }, { "epoch": 3.90625, "grad_norm": 0.34587440774360995, "learning_rate": 1.0634808115824668e-05, "loss": 0.3353, "step": 16750 }, { "epoch": 3.9074160447761193, "grad_norm": 0.3496412727509482, "learning_rate": 1.062332717384752e-05, "loss": 0.3436, "step": 16755 }, { "epoch": 3.908582089552239, "grad_norm": 0.40704435544810874, "learning_rate": 1.0611856269468203e-05, "loss": 0.3451, "step": 16760 }, { "epoch": 3.909748134328358, "grad_norm": 0.3655776721725987, "learning_rate": 1.0600395409509177e-05, "loss": 0.3385, "step": 16765 }, { "epoch": 3.9109141791044775, "grad_norm": 0.3569723384561409, "learning_rate": 1.0588944600786907e-05, "loss": 0.3056, "step": 16770 }, { "epoch": 3.912080223880597, "grad_norm": 0.3677148317383257, "learning_rate": 1.0577503850111903e-05, "loss": 0.3172, "step": 16775 }, { "epoch": 3.9132462686567164, "grad_norm": 0.3679093419193611, "learning_rate": 1.0566073164288687e-05, "loss": 0.3393, "step": 16780 }, { "epoch": 3.9144123134328357, "grad_norm": 0.34795094404353005, "learning_rate": 1.0554652550115788e-05, "loss": 0.3246, "step": 16785 }, { "epoch": 3.9155783582089554, "grad_norm": 0.37333639101376587, "learning_rate": 1.0543242014385758e-05, "loss": 0.3101, "step": 16790 }, { "epoch": 3.9167444029850746, "grad_norm": 0.36861435656773384, "learning_rate": 1.0531841563885134e-05, "loss": 0.3241, "step": 16795 }, { "epoch": 3.917910447761194, "grad_norm": 0.3542847891851535, "learning_rate": 1.052045120539447e-05, "loss": 0.3246, "step": 16800 }, { "epoch": 3.9190764925373136, "grad_norm": 0.3726980335684039, "learning_rate": 1.050907094568832e-05, "loss": 0.3548, "step": 16805 }, { "epoch": 3.920242537313433, "grad_norm": 0.3719498959276835, "learning_rate": 1.0497700791535221e-05, "loss": 0.3345, "step": 16810 }, { "epoch": 3.921408582089552, "grad_norm": 0.3777979555885601, "learning_rate": 1.0486340749697716e-05, "loss": 0.3507, "step": 16815 }, { "epoch": 3.9225746268656714, "grad_norm": 0.352674645064866, "learning_rate": 1.0474990826932301e-05, "loss": 0.33, "step": 16820 }, { "epoch": 3.923740671641791, "grad_norm": 0.35508721372875535, "learning_rate": 1.0463651029989492e-05, "loss": 0.3431, "step": 16825 }, { "epoch": 3.9249067164179103, "grad_norm": 0.41400108346055425, "learning_rate": 1.0452321365613758e-05, "loss": 0.3654, "step": 16830 }, { "epoch": 3.92607276119403, "grad_norm": 0.3598885605631565, "learning_rate": 1.0441001840543548e-05, "loss": 0.3387, "step": 16835 }, { "epoch": 3.9272388059701493, "grad_norm": 0.39152450463817323, "learning_rate": 1.0429692461511298e-05, "loss": 0.3262, "step": 16840 }, { "epoch": 3.9284048507462686, "grad_norm": 0.36601038358208293, "learning_rate": 1.041839323524337e-05, "loss": 0.3324, "step": 16845 }, { "epoch": 3.929570895522388, "grad_norm": 0.35711964271930985, "learning_rate": 1.0407104168460116e-05, "loss": 0.3293, "step": 16850 }, { "epoch": 3.9307369402985075, "grad_norm": 0.33704013731851895, "learning_rate": 1.0395825267875846e-05, "loss": 0.3301, "step": 16855 }, { "epoch": 3.9319029850746268, "grad_norm": 0.3412339094158753, "learning_rate": 1.0384556540198825e-05, "loss": 0.3257, "step": 16860 }, { "epoch": 3.9330690298507465, "grad_norm": 0.3487698052040988, "learning_rate": 1.0373297992131242e-05, "loss": 0.3242, "step": 16865 }, { "epoch": 3.9342350746268657, "grad_norm": 0.3398157030416527, "learning_rate": 1.0362049630369259e-05, "loss": 0.3136, "step": 16870 }, { "epoch": 3.935401119402985, "grad_norm": 0.34943895508449124, "learning_rate": 1.0350811461602974e-05, "loss": 0.3341, "step": 16875 }, { "epoch": 3.9365671641791042, "grad_norm": 0.34294387406591686, "learning_rate": 1.033958349251641e-05, "loss": 0.3284, "step": 16880 }, { "epoch": 3.937733208955224, "grad_norm": 0.38092359504875767, "learning_rate": 1.0328365729787536e-05, "loss": 0.3376, "step": 16885 }, { "epoch": 3.938899253731343, "grad_norm": 0.39175322808446295, "learning_rate": 1.0317158180088254e-05, "loss": 0.3427, "step": 16890 }, { "epoch": 3.940065298507463, "grad_norm": 0.3642002771961428, "learning_rate": 1.0305960850084373e-05, "loss": 0.3311, "step": 16895 }, { "epoch": 3.941231343283582, "grad_norm": 0.4044208411505232, "learning_rate": 1.0294773746435638e-05, "loss": 0.3531, "step": 16900 }, { "epoch": 3.9423973880597014, "grad_norm": 0.3835140241070223, "learning_rate": 1.0283596875795718e-05, "loss": 0.346, "step": 16905 }, { "epoch": 3.9435634328358207, "grad_norm": 0.3570930841402473, "learning_rate": 1.0272430244812175e-05, "loss": 0.3411, "step": 16910 }, { "epoch": 3.9447294776119404, "grad_norm": 0.3360779667826955, "learning_rate": 1.0261273860126514e-05, "loss": 0.3252, "step": 16915 }, { "epoch": 3.9458955223880596, "grad_norm": 0.3637844641033224, "learning_rate": 1.0250127728374098e-05, "loss": 0.322, "step": 16920 }, { "epoch": 3.9470615671641793, "grad_norm": 0.370577876806029, "learning_rate": 1.023899185618423e-05, "loss": 0.3219, "step": 16925 }, { "epoch": 3.9482276119402986, "grad_norm": 0.37931647342956126, "learning_rate": 1.0227866250180105e-05, "loss": 0.3302, "step": 16930 }, { "epoch": 3.949393656716418, "grad_norm": 0.3518036227852333, "learning_rate": 1.02167509169788e-05, "loss": 0.3291, "step": 16935 }, { "epoch": 3.950559701492537, "grad_norm": 0.3523764144241545, "learning_rate": 1.02056458631913e-05, "loss": 0.3423, "step": 16940 }, { "epoch": 3.951725746268657, "grad_norm": 0.3593927237913662, "learning_rate": 1.0194551095422447e-05, "loss": 0.3388, "step": 16945 }, { "epoch": 3.952891791044776, "grad_norm": 0.3664733667070302, "learning_rate": 1.0183466620270996e-05, "loss": 0.3357, "step": 16950 }, { "epoch": 3.9540578358208958, "grad_norm": 0.3487621273512764, "learning_rate": 1.0172392444329561e-05, "loss": 0.3325, "step": 16955 }, { "epoch": 3.955223880597015, "grad_norm": 0.3641172967334953, "learning_rate": 1.0161328574184645e-05, "loss": 0.3241, "step": 16960 }, { "epoch": 3.9563899253731343, "grad_norm": 0.3767742238835195, "learning_rate": 1.0150275016416613e-05, "loss": 0.3205, "step": 16965 }, { "epoch": 3.9575559701492535, "grad_norm": 0.3389958994897356, "learning_rate": 1.0139231777599689e-05, "loss": 0.3405, "step": 16970 }, { "epoch": 3.9587220149253732, "grad_norm": 0.37157193171540465, "learning_rate": 1.0128198864301976e-05, "loss": 0.3334, "step": 16975 }, { "epoch": 3.9598880597014925, "grad_norm": 0.3458020925348863, "learning_rate": 1.0117176283085419e-05, "loss": 0.3257, "step": 16980 }, { "epoch": 3.961054104477612, "grad_norm": 0.36863265137328216, "learning_rate": 1.0106164040505835e-05, "loss": 0.3364, "step": 16985 }, { "epoch": 3.9622201492537314, "grad_norm": 0.3808840293256123, "learning_rate": 1.009516214311289e-05, "loss": 0.3403, "step": 16990 }, { "epoch": 3.9633861940298507, "grad_norm": 0.35766163428437153, "learning_rate": 1.0084170597450073e-05, "loss": 0.3224, "step": 16995 }, { "epoch": 3.96455223880597, "grad_norm": 0.3469163213748774, "learning_rate": 1.0073189410054742e-05, "loss": 0.3327, "step": 17000 }, { "epoch": 3.9657182835820897, "grad_norm": 0.3607061724727493, "learning_rate": 1.0062218587458085e-05, "loss": 0.3211, "step": 17005 }, { "epoch": 3.966884328358209, "grad_norm": 0.36323009345054685, "learning_rate": 1.0051258136185132e-05, "loss": 0.3293, "step": 17010 }, { "epoch": 3.9680503731343286, "grad_norm": 0.36402290316596186, "learning_rate": 1.0040308062754738e-05, "loss": 0.3341, "step": 17015 }, { "epoch": 3.969216417910448, "grad_norm": 0.38681597015686653, "learning_rate": 1.0029368373679583e-05, "loss": 0.3323, "step": 17020 }, { "epoch": 3.970382462686567, "grad_norm": 0.34623211602067594, "learning_rate": 1.001843907546617e-05, "loss": 0.3175, "step": 17025 }, { "epoch": 3.9715485074626864, "grad_norm": 0.3499523440752185, "learning_rate": 1.0007520174614836e-05, "loss": 0.3237, "step": 17030 }, { "epoch": 3.972714552238806, "grad_norm": 0.3605004792956158, "learning_rate": 9.996611677619719e-06, "loss": 0.3266, "step": 17035 }, { "epoch": 3.9738805970149254, "grad_norm": 0.35267364742934776, "learning_rate": 9.98571359096878e-06, "loss": 0.3244, "step": 17040 }, { "epoch": 3.9750466417910446, "grad_norm": 0.3708483035028362, "learning_rate": 9.97482592114378e-06, "loss": 0.3422, "step": 17045 }, { "epoch": 3.9762126865671643, "grad_norm": 0.3648063731433488, "learning_rate": 9.96394867462028e-06, "loss": 0.3392, "step": 17050 }, { "epoch": 3.9773787313432836, "grad_norm": 0.3458415816651768, "learning_rate": 9.953081857867665e-06, "loss": 0.3268, "step": 17055 }, { "epoch": 3.978544776119403, "grad_norm": 0.38164120855475553, "learning_rate": 9.94222547734909e-06, "loss": 0.3112, "step": 17060 }, { "epoch": 3.9797108208955225, "grad_norm": 0.3684841691014534, "learning_rate": 9.93137953952151e-06, "loss": 0.3326, "step": 17065 }, { "epoch": 3.980876865671642, "grad_norm": 0.37748506035923324, "learning_rate": 9.92054405083569e-06, "loss": 0.3333, "step": 17070 }, { "epoch": 3.982042910447761, "grad_norm": 0.35884362385574936, "learning_rate": 9.90971901773614e-06, "loss": 0.3334, "step": 17075 }, { "epoch": 3.9832089552238807, "grad_norm": 0.349232885564346, "learning_rate": 9.898904446661188e-06, "loss": 0.3244, "step": 17080 }, { "epoch": 3.984375, "grad_norm": 0.3803573479542466, "learning_rate": 9.888100344042926e-06, "loss": 0.3293, "step": 17085 }, { "epoch": 3.9855410447761193, "grad_norm": 0.36835281448664914, "learning_rate": 9.87730671630722e-06, "loss": 0.3189, "step": 17090 }, { "epoch": 3.986707089552239, "grad_norm": 0.3633115584459684, "learning_rate": 9.866523569873708e-06, "loss": 0.3342, "step": 17095 }, { "epoch": 3.987873134328358, "grad_norm": 0.355961031124995, "learning_rate": 9.855750911155784e-06, "loss": 0.3202, "step": 17100 }, { "epoch": 3.9890391791044775, "grad_norm": 0.36014989217850457, "learning_rate": 9.844988746560615e-06, "loss": 0.3189, "step": 17105 }, { "epoch": 3.990205223880597, "grad_norm": 0.37534210089790565, "learning_rate": 9.834237082489126e-06, "loss": 0.3255, "step": 17110 }, { "epoch": 3.9913712686567164, "grad_norm": 0.3688245738811151, "learning_rate": 9.823495925335995e-06, "loss": 0.3284, "step": 17115 }, { "epoch": 3.9925373134328357, "grad_norm": 0.37833927184877747, "learning_rate": 9.812765281489655e-06, "loss": 0.3496, "step": 17120 }, { "epoch": 3.9937033582089554, "grad_norm": 0.3692126708342929, "learning_rate": 9.802045157332269e-06, "loss": 0.3269, "step": 17125 }, { "epoch": 3.9948694029850746, "grad_norm": 0.39539934417641975, "learning_rate": 9.79133555923976e-06, "loss": 0.3285, "step": 17130 }, { "epoch": 3.996035447761194, "grad_norm": 0.37023818018411236, "learning_rate": 9.780636493581797e-06, "loss": 0.3383, "step": 17135 }, { "epoch": 3.9972014925373136, "grad_norm": 0.36515860019550433, "learning_rate": 9.76994796672176e-06, "loss": 0.3269, "step": 17140 }, { "epoch": 3.998367537313433, "grad_norm": 0.38032836933336583, "learning_rate": 9.759269985016786e-06, "loss": 0.3435, "step": 17145 }, { "epoch": 3.999533582089552, "grad_norm": 0.34346026570957655, "learning_rate": 9.748602554817721e-06, "loss": 0.3335, "step": 17150 }, { "epoch": 4.000699626865671, "grad_norm": 0.34961007074318995, "learning_rate": 9.737945682469145e-06, "loss": 0.2843, "step": 17155 }, { "epoch": 4.001865671641791, "grad_norm": 0.3595797350406245, "learning_rate": 9.72729937430936e-06, "loss": 0.2724, "step": 17160 }, { "epoch": 4.003031716417911, "grad_norm": 0.3757878590900649, "learning_rate": 9.716663636670375e-06, "loss": 0.2671, "step": 17165 }, { "epoch": 4.00419776119403, "grad_norm": 0.4232679939550836, "learning_rate": 9.706038475877938e-06, "loss": 0.2768, "step": 17170 }, { "epoch": 4.005363805970149, "grad_norm": 0.3853101808137293, "learning_rate": 9.69542389825146e-06, "loss": 0.278, "step": 17175 }, { "epoch": 4.0065298507462686, "grad_norm": 0.38065011179529434, "learning_rate": 9.6848199101041e-06, "loss": 0.2743, "step": 17180 }, { "epoch": 4.007695895522388, "grad_norm": 0.37438551006463544, "learning_rate": 9.674226517742705e-06, "loss": 0.2639, "step": 17185 }, { "epoch": 4.008861940298507, "grad_norm": 0.37899651523451733, "learning_rate": 9.66364372746781e-06, "loss": 0.2697, "step": 17190 }, { "epoch": 4.010027985074627, "grad_norm": 0.4365087875180718, "learning_rate": 9.653071545573667e-06, "loss": 0.2803, "step": 17195 }, { "epoch": 4.0111940298507465, "grad_norm": 0.3721769349613592, "learning_rate": 9.64250997834819e-06, "loss": 0.2696, "step": 17200 }, { "epoch": 4.012360074626866, "grad_norm": 0.37729886286105113, "learning_rate": 9.631959032072997e-06, "loss": 0.2667, "step": 17205 }, { "epoch": 4.013526119402985, "grad_norm": 0.3903095843544387, "learning_rate": 9.621418713023389e-06, "loss": 0.2736, "step": 17210 }, { "epoch": 4.014692164179104, "grad_norm": 0.38610021114229004, "learning_rate": 9.61088902746835e-06, "loss": 0.2657, "step": 17215 }, { "epoch": 4.0158582089552235, "grad_norm": 0.3667713690500915, "learning_rate": 9.60036998167052e-06, "loss": 0.2486, "step": 17220 }, { "epoch": 4.017024253731344, "grad_norm": 0.40451194763613085, "learning_rate": 9.589861581886232e-06, "loss": 0.2895, "step": 17225 }, { "epoch": 4.018190298507463, "grad_norm": 0.38119312707674224, "learning_rate": 9.579363834365484e-06, "loss": 0.2578, "step": 17230 }, { "epoch": 4.019356343283582, "grad_norm": 0.40688848855625515, "learning_rate": 9.568876745351919e-06, "loss": 0.2736, "step": 17235 }, { "epoch": 4.020522388059701, "grad_norm": 0.3778376679826051, "learning_rate": 9.558400321082863e-06, "loss": 0.2754, "step": 17240 }, { "epoch": 4.021688432835821, "grad_norm": 0.3899220307228565, "learning_rate": 9.547934567789302e-06, "loss": 0.2799, "step": 17245 }, { "epoch": 4.02285447761194, "grad_norm": 0.4087913552661455, "learning_rate": 9.537479491695845e-06, "loss": 0.2848, "step": 17250 }, { "epoch": 4.02402052238806, "grad_norm": 0.38211948963355963, "learning_rate": 9.527035099020784e-06, "loss": 0.2715, "step": 17255 }, { "epoch": 4.025186567164179, "grad_norm": 0.39987617273591775, "learning_rate": 9.516601395976038e-06, "loss": 0.2812, "step": 17260 }, { "epoch": 4.026352611940299, "grad_norm": 0.3746875540320509, "learning_rate": 9.506178388767176e-06, "loss": 0.2702, "step": 17265 }, { "epoch": 4.027518656716418, "grad_norm": 0.39832317079454277, "learning_rate": 9.495766083593407e-06, "loss": 0.2637, "step": 17270 }, { "epoch": 4.028684701492537, "grad_norm": 0.41295342388894385, "learning_rate": 9.485364486647561e-06, "loss": 0.2731, "step": 17275 }, { "epoch": 4.029850746268656, "grad_norm": 0.40496088844307826, "learning_rate": 9.474973604116112e-06, "loss": 0.2705, "step": 17280 }, { "epoch": 4.0310167910447765, "grad_norm": 0.38663525321521314, "learning_rate": 9.464593442179162e-06, "loss": 0.27, "step": 17285 }, { "epoch": 4.032182835820896, "grad_norm": 0.4299620384176467, "learning_rate": 9.454224007010428e-06, "loss": 0.3039, "step": 17290 }, { "epoch": 4.033348880597015, "grad_norm": 0.3728216932152512, "learning_rate": 9.443865304777266e-06, "loss": 0.2649, "step": 17295 }, { "epoch": 4.034514925373134, "grad_norm": 0.3899351037140187, "learning_rate": 9.433517341640621e-06, "loss": 0.2695, "step": 17300 }, { "epoch": 4.0356809701492535, "grad_norm": 0.4000363248231797, "learning_rate": 9.423180123755064e-06, "loss": 0.2696, "step": 17305 }, { "epoch": 4.036847014925373, "grad_norm": 0.41445350192630664, "learning_rate": 9.41285365726878e-06, "loss": 0.2743, "step": 17310 }, { "epoch": 4.038013059701493, "grad_norm": 0.41177140790199745, "learning_rate": 9.40253794832356e-06, "loss": 0.2861, "step": 17315 }, { "epoch": 4.039179104477612, "grad_norm": 0.39093016206560616, "learning_rate": 9.39223300305479e-06, "loss": 0.2695, "step": 17320 }, { "epoch": 4.0403451492537314, "grad_norm": 0.43359088740355056, "learning_rate": 9.381938827591447e-06, "loss": 0.2828, "step": 17325 }, { "epoch": 4.041511194029851, "grad_norm": 0.3861700543791944, "learning_rate": 9.371655428056122e-06, "loss": 0.2701, "step": 17330 }, { "epoch": 4.04267723880597, "grad_norm": 0.39946561197399155, "learning_rate": 9.361382810564984e-06, "loss": 0.2752, "step": 17335 }, { "epoch": 4.043843283582089, "grad_norm": 0.38194852995606676, "learning_rate": 9.351120981227788e-06, "loss": 0.2748, "step": 17340 }, { "epoch": 4.045009328358209, "grad_norm": 0.37606470497937694, "learning_rate": 9.34086994614789e-06, "loss": 0.2704, "step": 17345 }, { "epoch": 4.046175373134329, "grad_norm": 0.38728957201754266, "learning_rate": 9.330629711422196e-06, "loss": 0.2714, "step": 17350 }, { "epoch": 4.047341417910448, "grad_norm": 0.41514043457818217, "learning_rate": 9.320400283141208e-06, "loss": 0.2843, "step": 17355 }, { "epoch": 4.048507462686567, "grad_norm": 0.4022222643818692, "learning_rate": 9.310181667389003e-06, "loss": 0.279, "step": 17360 }, { "epoch": 4.049673507462686, "grad_norm": 0.38707845189965684, "learning_rate": 9.299973870243222e-06, "loss": 0.2662, "step": 17365 }, { "epoch": 4.050839552238806, "grad_norm": 0.380700902845872, "learning_rate": 9.289776897775074e-06, "loss": 0.2764, "step": 17370 }, { "epoch": 4.052005597014926, "grad_norm": 0.38072165881104764, "learning_rate": 9.279590756049316e-06, "loss": 0.2707, "step": 17375 }, { "epoch": 4.053171641791045, "grad_norm": 0.3980632672807575, "learning_rate": 9.269415451124283e-06, "loss": 0.2724, "step": 17380 }, { "epoch": 4.054337686567164, "grad_norm": 0.39328929956905995, "learning_rate": 9.25925098905185e-06, "loss": 0.2647, "step": 17385 }, { "epoch": 4.055503731343284, "grad_norm": 0.3783499102829366, "learning_rate": 9.249097375877458e-06, "loss": 0.265, "step": 17390 }, { "epoch": 4.056669776119403, "grad_norm": 0.3944809621780304, "learning_rate": 9.23895461764009e-06, "loss": 0.2648, "step": 17395 }, { "epoch": 4.057835820895522, "grad_norm": 0.4155514147390988, "learning_rate": 9.22882272037225e-06, "loss": 0.2806, "step": 17400 }, { "epoch": 4.059001865671642, "grad_norm": 0.40146981171808716, "learning_rate": 9.218701690100017e-06, "loss": 0.2842, "step": 17405 }, { "epoch": 4.0601679104477615, "grad_norm": 0.42473378469312006, "learning_rate": 9.208591532842995e-06, "loss": 0.2695, "step": 17410 }, { "epoch": 4.061333955223881, "grad_norm": 0.4132395156270794, "learning_rate": 9.198492254614302e-06, "loss": 0.2754, "step": 17415 }, { "epoch": 4.0625, "grad_norm": 0.38857253688506155, "learning_rate": 9.188403861420615e-06, "loss": 0.2864, "step": 17420 }, { "epoch": 4.063666044776119, "grad_norm": 0.3858487720074873, "learning_rate": 9.178326359262124e-06, "loss": 0.275, "step": 17425 }, { "epoch": 4.0648320895522385, "grad_norm": 0.37561090884474024, "learning_rate": 9.16825975413253e-06, "loss": 0.2817, "step": 17430 }, { "epoch": 4.065998134328359, "grad_norm": 0.40132068910283547, "learning_rate": 9.158204052019069e-06, "loss": 0.2781, "step": 17435 }, { "epoch": 4.067164179104478, "grad_norm": 0.39343345573457644, "learning_rate": 9.148159258902488e-06, "loss": 0.2761, "step": 17440 }, { "epoch": 4.068330223880597, "grad_norm": 0.3989119946989191, "learning_rate": 9.138125380757046e-06, "loss": 0.2675, "step": 17445 }, { "epoch": 4.069496268656716, "grad_norm": 0.41008492201275865, "learning_rate": 9.128102423550511e-06, "loss": 0.2699, "step": 17450 }, { "epoch": 4.070662313432836, "grad_norm": 0.42224674882948077, "learning_rate": 9.118090393244147e-06, "loss": 0.2849, "step": 17455 }, { "epoch": 4.071828358208955, "grad_norm": 0.41417025410187047, "learning_rate": 9.108089295792726e-06, "loss": 0.2791, "step": 17460 }, { "epoch": 4.072994402985074, "grad_norm": 0.38992394241051687, "learning_rate": 9.098099137144522e-06, "loss": 0.2682, "step": 17465 }, { "epoch": 4.074160447761194, "grad_norm": 0.4002085567371614, "learning_rate": 9.088119923241295e-06, "loss": 0.2848, "step": 17470 }, { "epoch": 4.075326492537314, "grad_norm": 0.4008502225666991, "learning_rate": 9.07815166001831e-06, "loss": 0.2786, "step": 17475 }, { "epoch": 4.076492537313433, "grad_norm": 0.4198649960906934, "learning_rate": 9.068194353404288e-06, "loss": 0.2836, "step": 17480 }, { "epoch": 4.077658582089552, "grad_norm": 0.38919466382635604, "learning_rate": 9.058248009321464e-06, "loss": 0.2759, "step": 17485 }, { "epoch": 4.078824626865671, "grad_norm": 0.4040898916629864, "learning_rate": 9.04831263368554e-06, "loss": 0.2639, "step": 17490 }, { "epoch": 4.079990671641791, "grad_norm": 0.4061639016868157, "learning_rate": 9.038388232405699e-06, "loss": 0.2686, "step": 17495 }, { "epoch": 4.081156716417911, "grad_norm": 0.38873144956384365, "learning_rate": 9.028474811384597e-06, "loss": 0.2739, "step": 17500 }, { "epoch": 4.08232276119403, "grad_norm": 0.3907113401610684, "learning_rate": 9.01857237651835e-06, "loss": 0.2719, "step": 17505 }, { "epoch": 4.083488805970149, "grad_norm": 0.43037381088785187, "learning_rate": 9.008680933696545e-06, "loss": 0.2903, "step": 17510 }, { "epoch": 4.0846548507462686, "grad_norm": 0.4153131214146846, "learning_rate": 8.998800488802239e-06, "loss": 0.2802, "step": 17515 }, { "epoch": 4.085820895522388, "grad_norm": 0.4254650531632462, "learning_rate": 8.98893104771194e-06, "loss": 0.2836, "step": 17520 }, { "epoch": 4.086986940298507, "grad_norm": 0.41021442918425227, "learning_rate": 8.979072616295616e-06, "loss": 0.2776, "step": 17525 }, { "epoch": 4.088152985074627, "grad_norm": 0.4199890447183592, "learning_rate": 8.969225200416678e-06, "loss": 0.2781, "step": 17530 }, { "epoch": 4.0893190298507465, "grad_norm": 0.4224494686795398, "learning_rate": 8.959388805931993e-06, "loss": 0.2829, "step": 17535 }, { "epoch": 4.090485074626866, "grad_norm": 0.3965658671662444, "learning_rate": 8.94956343869187e-06, "loss": 0.2775, "step": 17540 }, { "epoch": 4.091651119402985, "grad_norm": 0.3845067592761444, "learning_rate": 8.939749104540065e-06, "loss": 0.2519, "step": 17545 }, { "epoch": 4.092817164179104, "grad_norm": 0.42278234806318643, "learning_rate": 8.929945809313773e-06, "loss": 0.2732, "step": 17550 }, { "epoch": 4.0939832089552235, "grad_norm": 0.40257751752804294, "learning_rate": 8.9201535588436e-06, "loss": 0.2657, "step": 17555 }, { "epoch": 4.095149253731344, "grad_norm": 0.3905140218344296, "learning_rate": 8.910372358953614e-06, "loss": 0.2659, "step": 17560 }, { "epoch": 4.096315298507463, "grad_norm": 0.3970352823295486, "learning_rate": 8.900602215461297e-06, "loss": 0.2608, "step": 17565 }, { "epoch": 4.097481343283582, "grad_norm": 0.4211184463964193, "learning_rate": 8.890843134177555e-06, "loss": 0.2796, "step": 17570 }, { "epoch": 4.098647388059701, "grad_norm": 0.4148157751306683, "learning_rate": 8.881095120906716e-06, "loss": 0.2755, "step": 17575 }, { "epoch": 4.099813432835821, "grad_norm": 0.3834600079099796, "learning_rate": 8.871358181446519e-06, "loss": 0.2617, "step": 17580 }, { "epoch": 4.10097947761194, "grad_norm": 0.3839034265657398, "learning_rate": 8.861632321588126e-06, "loss": 0.2679, "step": 17585 }, { "epoch": 4.10214552238806, "grad_norm": 0.3763802424840373, "learning_rate": 8.851917547116111e-06, "loss": 0.2804, "step": 17590 }, { "epoch": 4.103311567164179, "grad_norm": 0.4062580262689815, "learning_rate": 8.842213863808439e-06, "loss": 0.2905, "step": 17595 }, { "epoch": 4.104477611940299, "grad_norm": 0.39543484011614194, "learning_rate": 8.83252127743649e-06, "loss": 0.2653, "step": 17600 }, { "epoch": 4.105643656716418, "grad_norm": 0.3950151746012995, "learning_rate": 8.822839793765056e-06, "loss": 0.2776, "step": 17605 }, { "epoch": 4.106809701492537, "grad_norm": 0.42042659879200683, "learning_rate": 8.813169418552294e-06, "loss": 0.2794, "step": 17610 }, { "epoch": 4.107975746268656, "grad_norm": 0.4085981463591945, "learning_rate": 8.803510157549785e-06, "loss": 0.2793, "step": 17615 }, { "epoch": 4.1091417910447765, "grad_norm": 0.3962803355524766, "learning_rate": 8.793862016502477e-06, "loss": 0.271, "step": 17620 }, { "epoch": 4.110307835820896, "grad_norm": 0.4046974830505113, "learning_rate": 8.78422500114873e-06, "loss": 0.2687, "step": 17625 }, { "epoch": 4.111473880597015, "grad_norm": 0.409597263803634, "learning_rate": 8.774599117220254e-06, "loss": 0.2855, "step": 17630 }, { "epoch": 4.112639925373134, "grad_norm": 0.416872029353015, "learning_rate": 8.764984370442166e-06, "loss": 0.2819, "step": 17635 }, { "epoch": 4.1138059701492535, "grad_norm": 0.3727123254002717, "learning_rate": 8.755380766532945e-06, "loss": 0.2634, "step": 17640 }, { "epoch": 4.114972014925373, "grad_norm": 0.38964046030172467, "learning_rate": 8.745788311204444e-06, "loss": 0.2758, "step": 17645 }, { "epoch": 4.116138059701493, "grad_norm": 0.41981137533908836, "learning_rate": 8.736207010161899e-06, "loss": 0.2806, "step": 17650 }, { "epoch": 4.117304104477612, "grad_norm": 0.39914056124124697, "learning_rate": 8.726636869103884e-06, "loss": 0.2712, "step": 17655 }, { "epoch": 4.1184701492537314, "grad_norm": 0.3914937415659602, "learning_rate": 8.71707789372236e-06, "loss": 0.2678, "step": 17660 }, { "epoch": 4.119636194029851, "grad_norm": 0.42513322775911844, "learning_rate": 8.70753008970264e-06, "loss": 0.2737, "step": 17665 }, { "epoch": 4.12080223880597, "grad_norm": 0.3925064576087041, "learning_rate": 8.697993462723392e-06, "loss": 0.2845, "step": 17670 }, { "epoch": 4.121968283582089, "grad_norm": 0.4068642934868272, "learning_rate": 8.688468018456639e-06, "loss": 0.2617, "step": 17675 }, { "epoch": 4.123134328358209, "grad_norm": 0.39727368108579314, "learning_rate": 8.678953762567739e-06, "loss": 0.2743, "step": 17680 }, { "epoch": 4.124300373134329, "grad_norm": 0.3928667850605243, "learning_rate": 8.669450700715414e-06, "loss": 0.2697, "step": 17685 }, { "epoch": 4.125466417910448, "grad_norm": 0.4019362126336018, "learning_rate": 8.659958838551722e-06, "loss": 0.2643, "step": 17690 }, { "epoch": 4.126632462686567, "grad_norm": 0.4272157250003413, "learning_rate": 8.650478181722055e-06, "loss": 0.2757, "step": 17695 }, { "epoch": 4.127798507462686, "grad_norm": 0.4214354240713484, "learning_rate": 8.641008735865153e-06, "loss": 0.2791, "step": 17700 }, { "epoch": 4.128964552238806, "grad_norm": 0.40725072931202166, "learning_rate": 8.631550506613062e-06, "loss": 0.2729, "step": 17705 }, { "epoch": 4.130130597014926, "grad_norm": 0.3779088159590965, "learning_rate": 8.62210349959119e-06, "loss": 0.2689, "step": 17710 }, { "epoch": 4.131296641791045, "grad_norm": 0.4156240483038565, "learning_rate": 8.612667720418243e-06, "loss": 0.2809, "step": 17715 }, { "epoch": 4.132462686567164, "grad_norm": 0.40249218473646325, "learning_rate": 8.60324317470627e-06, "loss": 0.2738, "step": 17720 }, { "epoch": 4.133628731343284, "grad_norm": 0.3934841325796097, "learning_rate": 8.593829868060632e-06, "loss": 0.2674, "step": 17725 }, { "epoch": 4.134794776119403, "grad_norm": 0.4007634368816624, "learning_rate": 8.584427806079988e-06, "loss": 0.2751, "step": 17730 }, { "epoch": 4.135960820895522, "grad_norm": 0.4138556471912693, "learning_rate": 8.575036994356334e-06, "loss": 0.2787, "step": 17735 }, { "epoch": 4.137126865671641, "grad_norm": 0.40615935492996275, "learning_rate": 8.565657438474963e-06, "loss": 0.2776, "step": 17740 }, { "epoch": 4.1382929104477615, "grad_norm": 0.3867587031964204, "learning_rate": 8.556289144014474e-06, "loss": 0.2638, "step": 17745 }, { "epoch": 4.139458955223881, "grad_norm": 0.430155591472759, "learning_rate": 8.546932116546775e-06, "loss": 0.2793, "step": 17750 }, { "epoch": 4.140625, "grad_norm": 0.40298935237119554, "learning_rate": 8.53758636163706e-06, "loss": 0.2644, "step": 17755 }, { "epoch": 4.141791044776119, "grad_norm": 0.41138868654844857, "learning_rate": 8.528251884843829e-06, "loss": 0.2759, "step": 17760 }, { "epoch": 4.1429570895522385, "grad_norm": 0.41005162569903975, "learning_rate": 8.518928691718872e-06, "loss": 0.2892, "step": 17765 }, { "epoch": 4.144123134328359, "grad_norm": 0.38415735441945087, "learning_rate": 8.509616787807263e-06, "loss": 0.2731, "step": 17770 }, { "epoch": 4.145289179104478, "grad_norm": 0.40812940858209573, "learning_rate": 8.500316178647366e-06, "loss": 0.2771, "step": 17775 }, { "epoch": 4.146455223880597, "grad_norm": 0.4229080087039047, "learning_rate": 8.491026869770832e-06, "loss": 0.2775, "step": 17780 }, { "epoch": 4.147621268656716, "grad_norm": 0.4162540906219848, "learning_rate": 8.48174886670258e-06, "loss": 0.2678, "step": 17785 }, { "epoch": 4.148787313432836, "grad_norm": 0.4096473455980717, "learning_rate": 8.472482174960808e-06, "loss": 0.2834, "step": 17790 }, { "epoch": 4.149953358208955, "grad_norm": 0.4024387626065952, "learning_rate": 8.463226800056995e-06, "loss": 0.2663, "step": 17795 }, { "epoch": 4.151119402985074, "grad_norm": 0.39489254863081147, "learning_rate": 8.453982747495881e-06, "loss": 0.2764, "step": 17800 }, { "epoch": 4.152285447761194, "grad_norm": 0.4185944125024017, "learning_rate": 8.44475002277548e-06, "loss": 0.2736, "step": 17805 }, { "epoch": 4.153451492537314, "grad_norm": 0.38401269701391966, "learning_rate": 8.435528631387052e-06, "loss": 0.2761, "step": 17810 }, { "epoch": 4.154617537313433, "grad_norm": 0.433321075218259, "learning_rate": 8.426318578815128e-06, "loss": 0.2767, "step": 17815 }, { "epoch": 4.155783582089552, "grad_norm": 0.39581775967542454, "learning_rate": 8.417119870537503e-06, "loss": 0.2875, "step": 17820 }, { "epoch": 4.156949626865671, "grad_norm": 0.38618524509810154, "learning_rate": 8.407932512025207e-06, "loss": 0.2662, "step": 17825 }, { "epoch": 4.158115671641791, "grad_norm": 0.3984497079434106, "learning_rate": 8.398756508742536e-06, "loss": 0.2633, "step": 17830 }, { "epoch": 4.159281716417911, "grad_norm": 0.3982649240862175, "learning_rate": 8.38959186614702e-06, "loss": 0.2698, "step": 17835 }, { "epoch": 4.16044776119403, "grad_norm": 0.40275814991539843, "learning_rate": 8.380438589689438e-06, "loss": 0.2777, "step": 17840 }, { "epoch": 4.161613805970149, "grad_norm": 0.40564226347858123, "learning_rate": 8.371296684813806e-06, "loss": 0.2688, "step": 17845 }, { "epoch": 4.1627798507462686, "grad_norm": 0.40044806911939124, "learning_rate": 8.36216615695738e-06, "loss": 0.281, "step": 17850 }, { "epoch": 4.163945895522388, "grad_norm": 0.3777701394706234, "learning_rate": 8.353047011550654e-06, "loss": 0.2792, "step": 17855 }, { "epoch": 4.165111940298507, "grad_norm": 0.4127086292962352, "learning_rate": 8.343939254017336e-06, "loss": 0.2784, "step": 17860 }, { "epoch": 4.166277985074627, "grad_norm": 0.40379573089311377, "learning_rate": 8.334842889774374e-06, "loss": 0.2722, "step": 17865 }, { "epoch": 4.1674440298507465, "grad_norm": 0.4277490808244483, "learning_rate": 8.325757924231938e-06, "loss": 0.2775, "step": 17870 }, { "epoch": 4.168610074626866, "grad_norm": 0.38276952284794696, "learning_rate": 8.31668436279342e-06, "loss": 0.2679, "step": 17875 }, { "epoch": 4.169776119402985, "grad_norm": 0.42944563393474117, "learning_rate": 8.307622210855425e-06, "loss": 0.2613, "step": 17880 }, { "epoch": 4.170942164179104, "grad_norm": 0.40801134956065815, "learning_rate": 8.298571473807767e-06, "loss": 0.2761, "step": 17885 }, { "epoch": 4.1721082089552235, "grad_norm": 0.4112413108518129, "learning_rate": 8.289532157033481e-06, "loss": 0.2863, "step": 17890 }, { "epoch": 4.173274253731344, "grad_norm": 0.4048818565106462, "learning_rate": 8.28050426590881e-06, "loss": 0.267, "step": 17895 }, { "epoch": 4.174440298507463, "grad_norm": 0.38934026131417887, "learning_rate": 8.271487805803193e-06, "loss": 0.2641, "step": 17900 }, { "epoch": 4.175606343283582, "grad_norm": 0.3953189315405114, "learning_rate": 8.262482782079281e-06, "loss": 0.27, "step": 17905 }, { "epoch": 4.176772388059701, "grad_norm": 0.39999437963549983, "learning_rate": 8.253489200092912e-06, "loss": 0.261, "step": 17910 }, { "epoch": 4.177938432835821, "grad_norm": 0.386670395609924, "learning_rate": 8.244507065193117e-06, "loss": 0.2658, "step": 17915 }, { "epoch": 4.17910447761194, "grad_norm": 0.3856087444116748, "learning_rate": 8.235536382722133e-06, "loss": 0.2663, "step": 17920 }, { "epoch": 4.18027052238806, "grad_norm": 0.4004318455532569, "learning_rate": 8.226577158015383e-06, "loss": 0.2701, "step": 17925 }, { "epoch": 4.181436567164179, "grad_norm": 0.4029934263843612, "learning_rate": 8.217629396401465e-06, "loss": 0.2914, "step": 17930 }, { "epoch": 4.182602611940299, "grad_norm": 0.4297519561669356, "learning_rate": 8.208693103202158e-06, "loss": 0.2676, "step": 17935 }, { "epoch": 4.183768656716418, "grad_norm": 0.41071873659549063, "learning_rate": 8.199768283732432e-06, "loss": 0.2699, "step": 17940 }, { "epoch": 4.184934701492537, "grad_norm": 0.3927487458988259, "learning_rate": 8.190854943300436e-06, "loss": 0.2783, "step": 17945 }, { "epoch": 4.186100746268656, "grad_norm": 0.39544062944596514, "learning_rate": 8.181953087207467e-06, "loss": 0.2655, "step": 17950 }, { "epoch": 4.1872667910447765, "grad_norm": 0.3880874898146546, "learning_rate": 8.17306272074802e-06, "loss": 0.2674, "step": 17955 }, { "epoch": 4.188432835820896, "grad_norm": 0.3958773152588233, "learning_rate": 8.164183849209741e-06, "loss": 0.2629, "step": 17960 }, { "epoch": 4.189598880597015, "grad_norm": 0.37874451988691205, "learning_rate": 8.155316477873438e-06, "loss": 0.2792, "step": 17965 }, { "epoch": 4.190764925373134, "grad_norm": 0.42105218931902155, "learning_rate": 8.146460612013083e-06, "loss": 0.2722, "step": 17970 }, { "epoch": 4.1919309701492535, "grad_norm": 0.4367285017785108, "learning_rate": 8.137616256895811e-06, "loss": 0.2752, "step": 17975 }, { "epoch": 4.193097014925373, "grad_norm": 0.3927999057775576, "learning_rate": 8.128783417781909e-06, "loss": 0.2702, "step": 17980 }, { "epoch": 4.194263059701493, "grad_norm": 0.3967807633098899, "learning_rate": 8.119962099924797e-06, "loss": 0.269, "step": 17985 }, { "epoch": 4.195429104477612, "grad_norm": 0.39769987661014367, "learning_rate": 8.111152308571065e-06, "loss": 0.2753, "step": 17990 }, { "epoch": 4.1965951492537314, "grad_norm": 0.412846110018462, "learning_rate": 8.10235404896044e-06, "loss": 0.2772, "step": 17995 }, { "epoch": 4.197761194029851, "grad_norm": 0.38426551987470575, "learning_rate": 8.09356732632579e-06, "loss": 0.2733, "step": 18000 }, { "epoch": 4.19892723880597, "grad_norm": 0.3668974286942403, "learning_rate": 8.084792145893122e-06, "loss": 0.2588, "step": 18005 }, { "epoch": 4.200093283582089, "grad_norm": 0.41338717569233074, "learning_rate": 8.07602851288157e-06, "loss": 0.2754, "step": 18010 }, { "epoch": 4.201259328358209, "grad_norm": 0.3986613892467767, "learning_rate": 8.067276432503406e-06, "loss": 0.2809, "step": 18015 }, { "epoch": 4.202425373134329, "grad_norm": 0.4229143083671442, "learning_rate": 8.058535909964041e-06, "loss": 0.2701, "step": 18020 }, { "epoch": 4.203591417910448, "grad_norm": 0.4225483163449904, "learning_rate": 8.049806950461996e-06, "loss": 0.2765, "step": 18025 }, { "epoch": 4.204757462686567, "grad_norm": 0.40954725712677237, "learning_rate": 8.041089559188929e-06, "loss": 0.2627, "step": 18030 }, { "epoch": 4.205923507462686, "grad_norm": 0.3928293796239581, "learning_rate": 8.032383741329598e-06, "loss": 0.2677, "step": 18035 }, { "epoch": 4.207089552238806, "grad_norm": 0.4052297327521418, "learning_rate": 8.023689502061897e-06, "loss": 0.2567, "step": 18040 }, { "epoch": 4.208255597014926, "grad_norm": 0.393476277539444, "learning_rate": 8.015006846556825e-06, "loss": 0.2636, "step": 18045 }, { "epoch": 4.209421641791045, "grad_norm": 0.3923784394452194, "learning_rate": 8.006335779978494e-06, "loss": 0.2804, "step": 18050 }, { "epoch": 4.210587686567164, "grad_norm": 0.36991243854819816, "learning_rate": 7.997676307484123e-06, "loss": 0.2691, "step": 18055 }, { "epoch": 4.211753731343284, "grad_norm": 0.43169159772478505, "learning_rate": 7.989028434224028e-06, "loss": 0.2811, "step": 18060 }, { "epoch": 4.212919776119403, "grad_norm": 0.40923562740502417, "learning_rate": 7.980392165341636e-06, "loss": 0.2758, "step": 18065 }, { "epoch": 4.214085820895522, "grad_norm": 0.422018177827729, "learning_rate": 7.971767505973468e-06, "loss": 0.2789, "step": 18070 }, { "epoch": 4.215251865671641, "grad_norm": 0.39494749417562225, "learning_rate": 7.963154461249143e-06, "loss": 0.2776, "step": 18075 }, { "epoch": 4.2164179104477615, "grad_norm": 0.3867227498750476, "learning_rate": 7.95455303629137e-06, "loss": 0.2763, "step": 18080 }, { "epoch": 4.217583955223881, "grad_norm": 0.3753961520495688, "learning_rate": 7.945963236215944e-06, "loss": 0.272, "step": 18085 }, { "epoch": 4.21875, "grad_norm": 0.42936881893191087, "learning_rate": 7.937385066131745e-06, "loss": 0.2913, "step": 18090 }, { "epoch": 4.219916044776119, "grad_norm": 0.41154807923770514, "learning_rate": 7.928818531140748e-06, "loss": 0.2728, "step": 18095 }, { "epoch": 4.2210820895522385, "grad_norm": 0.3877003504655198, "learning_rate": 7.920263636337994e-06, "loss": 0.2697, "step": 18100 }, { "epoch": 4.222248134328359, "grad_norm": 0.39692175480407094, "learning_rate": 7.911720386811613e-06, "loss": 0.2758, "step": 18105 }, { "epoch": 4.223414179104478, "grad_norm": 0.426405668197753, "learning_rate": 7.90318878764279e-06, "loss": 0.2775, "step": 18110 }, { "epoch": 4.224580223880597, "grad_norm": 0.42296221155951175, "learning_rate": 7.894668843905803e-06, "loss": 0.2783, "step": 18115 }, { "epoch": 4.225746268656716, "grad_norm": 0.38304370256201864, "learning_rate": 7.886160560667984e-06, "loss": 0.2827, "step": 18120 }, { "epoch": 4.226912313432836, "grad_norm": 0.4243619646936963, "learning_rate": 7.87766394298974e-06, "loss": 0.274, "step": 18125 }, { "epoch": 4.228078358208955, "grad_norm": 0.3908501604076978, "learning_rate": 7.869178995924525e-06, "loss": 0.2762, "step": 18130 }, { "epoch": 4.229244402985074, "grad_norm": 0.39394592450348964, "learning_rate": 7.860705724518857e-06, "loss": 0.2782, "step": 18135 }, { "epoch": 4.230410447761194, "grad_norm": 0.40271481185017766, "learning_rate": 7.852244133812332e-06, "loss": 0.2774, "step": 18140 }, { "epoch": 4.231576492537314, "grad_norm": 0.39968102282688084, "learning_rate": 7.843794228837556e-06, "loss": 0.2853, "step": 18145 }, { "epoch": 4.232742537313433, "grad_norm": 0.4247008418363225, "learning_rate": 7.83535601462022e-06, "loss": 0.2669, "step": 18150 }, { "epoch": 4.233908582089552, "grad_norm": 0.39448119459676123, "learning_rate": 7.82692949617905e-06, "loss": 0.2877, "step": 18155 }, { "epoch": 4.235074626865671, "grad_norm": 0.41898670602918947, "learning_rate": 7.818514678525822e-06, "loss": 0.2665, "step": 18160 }, { "epoch": 4.236240671641791, "grad_norm": 0.41629875977458675, "learning_rate": 7.810111566665333e-06, "loss": 0.2764, "step": 18165 }, { "epoch": 4.237406716417911, "grad_norm": 0.41336606695873085, "learning_rate": 7.80172016559544e-06, "loss": 0.2739, "step": 18170 }, { "epoch": 4.23857276119403, "grad_norm": 0.4229416876666708, "learning_rate": 7.793340480307027e-06, "loss": 0.259, "step": 18175 }, { "epoch": 4.239738805970149, "grad_norm": 0.41907162173078616, "learning_rate": 7.784972515784004e-06, "loss": 0.2752, "step": 18180 }, { "epoch": 4.2409048507462686, "grad_norm": 0.41674551957571665, "learning_rate": 7.776616277003328e-06, "loss": 0.2805, "step": 18185 }, { "epoch": 4.242070895522388, "grad_norm": 0.410588553164536, "learning_rate": 7.768271768934955e-06, "loss": 0.2722, "step": 18190 }, { "epoch": 4.243236940298507, "grad_norm": 0.402796268396602, "learning_rate": 7.759938996541886e-06, "loss": 0.2858, "step": 18195 }, { "epoch": 4.244402985074627, "grad_norm": 0.3850130874319362, "learning_rate": 7.751617964780131e-06, "loss": 0.2754, "step": 18200 }, { "epoch": 4.2455690298507465, "grad_norm": 0.42831256969779924, "learning_rate": 7.743308678598722e-06, "loss": 0.2821, "step": 18205 }, { "epoch": 4.246735074626866, "grad_norm": 0.424981782694087, "learning_rate": 7.73501114293971e-06, "loss": 0.2783, "step": 18210 }, { "epoch": 4.247901119402985, "grad_norm": 0.4510353226283879, "learning_rate": 7.726725362738141e-06, "loss": 0.2805, "step": 18215 }, { "epoch": 4.249067164179104, "grad_norm": 0.3677540093562044, "learning_rate": 7.71845134292208e-06, "loss": 0.2747, "step": 18220 }, { "epoch": 4.2502332089552235, "grad_norm": 0.4041495223777481, "learning_rate": 7.710189088412604e-06, "loss": 0.2792, "step": 18225 }, { "epoch": 4.251399253731344, "grad_norm": 0.4209873163499664, "learning_rate": 7.70193860412378e-06, "loss": 0.2791, "step": 18230 }, { "epoch": 4.252565298507463, "grad_norm": 0.4050461190548646, "learning_rate": 7.693699894962686e-06, "loss": 0.2754, "step": 18235 }, { "epoch": 4.253731343283582, "grad_norm": 0.40044208229826345, "learning_rate": 7.68547296582938e-06, "loss": 0.2754, "step": 18240 }, { "epoch": 4.254897388059701, "grad_norm": 0.3806130309779204, "learning_rate": 7.67725782161693e-06, "loss": 0.273, "step": 18245 }, { "epoch": 4.256063432835821, "grad_norm": 0.4071739565270072, "learning_rate": 7.669054467211388e-06, "loss": 0.2669, "step": 18250 }, { "epoch": 4.25722947761194, "grad_norm": 0.4102686853577913, "learning_rate": 7.660862907491795e-06, "loss": 0.2792, "step": 18255 }, { "epoch": 4.25839552238806, "grad_norm": 0.3934025175959887, "learning_rate": 7.652683147330177e-06, "loss": 0.2782, "step": 18260 }, { "epoch": 4.259561567164179, "grad_norm": 0.39507155314076864, "learning_rate": 7.644515191591542e-06, "loss": 0.2813, "step": 18265 }, { "epoch": 4.260727611940299, "grad_norm": 0.3890717689329465, "learning_rate": 7.636359045133873e-06, "loss": 0.2705, "step": 18270 }, { "epoch": 4.261893656716418, "grad_norm": 0.3933419963216611, "learning_rate": 7.6282147128081364e-06, "loss": 0.2705, "step": 18275 }, { "epoch": 4.263059701492537, "grad_norm": 0.40378936435653107, "learning_rate": 7.620082199458269e-06, "loss": 0.2808, "step": 18280 }, { "epoch": 4.264225746268656, "grad_norm": 0.4218348692990293, "learning_rate": 7.611961509921182e-06, "loss": 0.2744, "step": 18285 }, { "epoch": 4.2653917910447765, "grad_norm": 0.4371185608735183, "learning_rate": 7.603852649026738e-06, "loss": 0.2695, "step": 18290 }, { "epoch": 4.266557835820896, "grad_norm": 0.36353804865112804, "learning_rate": 7.595755621597788e-06, "loss": 0.2701, "step": 18295 }, { "epoch": 4.267723880597015, "grad_norm": 0.4031347424289463, "learning_rate": 7.587670432450131e-06, "loss": 0.2743, "step": 18300 }, { "epoch": 4.268889925373134, "grad_norm": 0.40866978560488426, "learning_rate": 7.57959708639252e-06, "loss": 0.279, "step": 18305 }, { "epoch": 4.2700559701492535, "grad_norm": 0.41098549774145526, "learning_rate": 7.5715355882266815e-06, "loss": 0.2735, "step": 18310 }, { "epoch": 4.271222014925373, "grad_norm": 0.40197967026229475, "learning_rate": 7.5634859427472835e-06, "loss": 0.2718, "step": 18315 }, { "epoch": 4.272388059701493, "grad_norm": 0.4326629450762017, "learning_rate": 7.5554481547419395e-06, "loss": 0.2817, "step": 18320 }, { "epoch": 4.273554104477612, "grad_norm": 0.4164172872124524, "learning_rate": 7.547422228991223e-06, "loss": 0.2845, "step": 18325 }, { "epoch": 4.2747201492537314, "grad_norm": 0.4115319235447634, "learning_rate": 7.539408170268644e-06, "loss": 0.28, "step": 18330 }, { "epoch": 4.275886194029851, "grad_norm": 0.3922002276692683, "learning_rate": 7.531405983340668e-06, "loss": 0.2681, "step": 18335 }, { "epoch": 4.27705223880597, "grad_norm": 0.4006349977272167, "learning_rate": 7.523415672966675e-06, "loss": 0.2613, "step": 18340 }, { "epoch": 4.278218283582089, "grad_norm": 0.42867198696884184, "learning_rate": 7.515437243898998e-06, "loss": 0.2716, "step": 18345 }, { "epoch": 4.279384328358209, "grad_norm": 0.395475700228639, "learning_rate": 7.507470700882905e-06, "loss": 0.2728, "step": 18350 }, { "epoch": 4.280550373134329, "grad_norm": 0.4105932715424159, "learning_rate": 7.499516048656589e-06, "loss": 0.267, "step": 18355 }, { "epoch": 4.281716417910448, "grad_norm": 0.402741507907578, "learning_rate": 7.491573291951176e-06, "loss": 0.2593, "step": 18360 }, { "epoch": 4.282882462686567, "grad_norm": 0.40645558368972784, "learning_rate": 7.483642435490706e-06, "loss": 0.2966, "step": 18365 }, { "epoch": 4.284048507462686, "grad_norm": 0.417345801951849, "learning_rate": 7.475723483992149e-06, "loss": 0.28, "step": 18370 }, { "epoch": 4.285214552238806, "grad_norm": 0.42859196731371385, "learning_rate": 7.467816442165397e-06, "loss": 0.2736, "step": 18375 }, { "epoch": 4.286380597014926, "grad_norm": 0.42671733070056267, "learning_rate": 7.459921314713253e-06, "loss": 0.2799, "step": 18380 }, { "epoch": 4.287546641791045, "grad_norm": 0.39637314816055846, "learning_rate": 7.452038106331442e-06, "loss": 0.263, "step": 18385 }, { "epoch": 4.288712686567164, "grad_norm": 0.4060549873320143, "learning_rate": 7.444166821708584e-06, "loss": 0.2742, "step": 18390 }, { "epoch": 4.289878731343284, "grad_norm": 0.4248193501855419, "learning_rate": 7.436307465526224e-06, "loss": 0.2769, "step": 18395 }, { "epoch": 4.291044776119403, "grad_norm": 0.4000788833508795, "learning_rate": 7.4284600424588045e-06, "loss": 0.275, "step": 18400 }, { "epoch": 4.292210820895522, "grad_norm": 0.40681180347843243, "learning_rate": 7.42062455717367e-06, "loss": 0.2672, "step": 18405 }, { "epoch": 4.293376865671641, "grad_norm": 0.3794132400613795, "learning_rate": 7.412801014331075e-06, "loss": 0.2658, "step": 18410 }, { "epoch": 4.2945429104477615, "grad_norm": 0.4195999135179553, "learning_rate": 7.4049894185841476e-06, "loss": 0.275, "step": 18415 }, { "epoch": 4.295708955223881, "grad_norm": 0.3891850858333817, "learning_rate": 7.397189774578939e-06, "loss": 0.2684, "step": 18420 }, { "epoch": 4.296875, "grad_norm": 0.42414664545538083, "learning_rate": 7.389402086954368e-06, "loss": 0.2736, "step": 18425 }, { "epoch": 4.298041044776119, "grad_norm": 0.3932808123921889, "learning_rate": 7.38162636034226e-06, "loss": 0.2751, "step": 18430 }, { "epoch": 4.2992070895522385, "grad_norm": 0.40286976532965957, "learning_rate": 7.373862599367316e-06, "loss": 0.2787, "step": 18435 }, { "epoch": 4.300373134328359, "grad_norm": 0.4191664541492445, "learning_rate": 7.366110808647128e-06, "loss": 0.2796, "step": 18440 }, { "epoch": 4.301539179104478, "grad_norm": 0.380688004195661, "learning_rate": 7.3583709927921574e-06, "loss": 0.2714, "step": 18445 }, { "epoch": 4.302705223880597, "grad_norm": 0.38275380874369747, "learning_rate": 7.350643156405751e-06, "loss": 0.2652, "step": 18450 }, { "epoch": 4.303871268656716, "grad_norm": 0.3953094826945292, "learning_rate": 7.342927304084132e-06, "loss": 0.2748, "step": 18455 }, { "epoch": 4.305037313432836, "grad_norm": 0.40109883831737486, "learning_rate": 7.335223440416391e-06, "loss": 0.263, "step": 18460 }, { "epoch": 4.306203358208955, "grad_norm": 0.41201085772616464, "learning_rate": 7.327531569984497e-06, "loss": 0.2746, "step": 18465 }, { "epoch": 4.307369402985074, "grad_norm": 0.4032558562628837, "learning_rate": 7.319851697363271e-06, "loss": 0.2694, "step": 18470 }, { "epoch": 4.308535447761194, "grad_norm": 0.40220772868725246, "learning_rate": 7.31218382712041e-06, "loss": 0.2648, "step": 18475 }, { "epoch": 4.309701492537314, "grad_norm": 0.39557134116257103, "learning_rate": 7.304527963816472e-06, "loss": 0.2719, "step": 18480 }, { "epoch": 4.310867537313433, "grad_norm": 0.3955770890075112, "learning_rate": 7.2968841120048666e-06, "loss": 0.2711, "step": 18485 }, { "epoch": 4.312033582089552, "grad_norm": 0.4401251018553915, "learning_rate": 7.289252276231863e-06, "loss": 0.2701, "step": 18490 }, { "epoch": 4.313199626865671, "grad_norm": 0.40667506023810096, "learning_rate": 7.281632461036594e-06, "loss": 0.2676, "step": 18495 }, { "epoch": 4.314365671641791, "grad_norm": 0.42782409846654357, "learning_rate": 7.27402467095102e-06, "loss": 0.2837, "step": 18500 }, { "epoch": 4.315531716417911, "grad_norm": 0.4158211862037495, "learning_rate": 7.266428910499971e-06, "loss": 0.2888, "step": 18505 }, { "epoch": 4.31669776119403, "grad_norm": 0.40022210349031667, "learning_rate": 7.258845184201111e-06, "loss": 0.2791, "step": 18510 }, { "epoch": 4.317863805970149, "grad_norm": 0.39224874072895266, "learning_rate": 7.251273496564957e-06, "loss": 0.2735, "step": 18515 }, { "epoch": 4.3190298507462686, "grad_norm": 0.3828169151730118, "learning_rate": 7.243713852094848e-06, "loss": 0.2712, "step": 18520 }, { "epoch": 4.320195895522388, "grad_norm": 0.3785602543827901, "learning_rate": 7.2361662552869734e-06, "loss": 0.2757, "step": 18525 }, { "epoch": 4.321361940298507, "grad_norm": 0.40368842673001093, "learning_rate": 7.228630710630356e-06, "loss": 0.2813, "step": 18530 }, { "epoch": 4.322527985074627, "grad_norm": 0.3966995045907521, "learning_rate": 7.221107222606851e-06, "loss": 0.2683, "step": 18535 }, { "epoch": 4.3236940298507465, "grad_norm": 0.3881458660154604, "learning_rate": 7.21359579569114e-06, "loss": 0.2741, "step": 18540 }, { "epoch": 4.324860074626866, "grad_norm": 0.38040306667446083, "learning_rate": 7.206096434350728e-06, "loss": 0.276, "step": 18545 }, { "epoch": 4.326026119402985, "grad_norm": 0.4210535710387319, "learning_rate": 7.198609143045948e-06, "loss": 0.2746, "step": 18550 }, { "epoch": 4.327192164179104, "grad_norm": 0.3957509557638466, "learning_rate": 7.191133926229957e-06, "loss": 0.2694, "step": 18555 }, { "epoch": 4.3283582089552235, "grad_norm": 0.3868809353549212, "learning_rate": 7.183670788348726e-06, "loss": 0.2641, "step": 18560 }, { "epoch": 4.329524253731344, "grad_norm": 0.39010445915469755, "learning_rate": 7.176219733841047e-06, "loss": 0.2538, "step": 18565 }, { "epoch": 4.330690298507463, "grad_norm": 0.396442626935043, "learning_rate": 7.168780767138512e-06, "loss": 0.2606, "step": 18570 }, { "epoch": 4.331856343283582, "grad_norm": 0.4006272597680199, "learning_rate": 7.161353892665538e-06, "loss": 0.2704, "step": 18575 }, { "epoch": 4.333022388059701, "grad_norm": 0.4025410382377788, "learning_rate": 7.1539391148393474e-06, "loss": 0.2761, "step": 18580 }, { "epoch": 4.334188432835821, "grad_norm": 0.43821229012187163, "learning_rate": 7.146536438069963e-06, "loss": 0.2902, "step": 18585 }, { "epoch": 4.33535447761194, "grad_norm": 0.42203900554396184, "learning_rate": 7.139145866760217e-06, "loss": 0.2863, "step": 18590 }, { "epoch": 4.33652052238806, "grad_norm": 0.3928273962646149, "learning_rate": 7.1317674053057335e-06, "loss": 0.267, "step": 18595 }, { "epoch": 4.337686567164179, "grad_norm": 0.4475315306879053, "learning_rate": 7.124401058094938e-06, "loss": 0.2832, "step": 18600 }, { "epoch": 4.338852611940299, "grad_norm": 0.3875417608232742, "learning_rate": 7.117046829509057e-06, "loss": 0.2714, "step": 18605 }, { "epoch": 4.340018656716418, "grad_norm": 0.40792835610705425, "learning_rate": 7.109704723922094e-06, "loss": 0.2811, "step": 18610 }, { "epoch": 4.341184701492537, "grad_norm": 0.394978446970077, "learning_rate": 7.102374745700866e-06, "loss": 0.2823, "step": 18615 }, { "epoch": 4.342350746268656, "grad_norm": 0.3873837118235636, "learning_rate": 7.0950568992049494e-06, "loss": 0.2708, "step": 18620 }, { "epoch": 4.3435167910447765, "grad_norm": 0.39206257530074834, "learning_rate": 7.087751188786723e-06, "loss": 0.2777, "step": 18625 }, { "epoch": 4.344682835820896, "grad_norm": 0.39565087671877125, "learning_rate": 7.080457618791344e-06, "loss": 0.2743, "step": 18630 }, { "epoch": 4.345848880597015, "grad_norm": 0.41411369302212386, "learning_rate": 7.0731761935567495e-06, "loss": 0.2883, "step": 18635 }, { "epoch": 4.347014925373134, "grad_norm": 0.36316642570779184, "learning_rate": 7.0659069174136544e-06, "loss": 0.2532, "step": 18640 }, { "epoch": 4.3481809701492535, "grad_norm": 0.41132054512797556, "learning_rate": 7.058649794685537e-06, "loss": 0.2698, "step": 18645 }, { "epoch": 4.349347014925373, "grad_norm": 0.4114841400832778, "learning_rate": 7.051404829688663e-06, "loss": 0.2676, "step": 18650 }, { "epoch": 4.350513059701493, "grad_norm": 0.4257416427557622, "learning_rate": 7.044172026732059e-06, "loss": 0.2844, "step": 18655 }, { "epoch": 4.351679104477612, "grad_norm": 0.420027093639048, "learning_rate": 7.036951390117512e-06, "loss": 0.273, "step": 18660 }, { "epoch": 4.3528451492537314, "grad_norm": 0.3897984050752887, "learning_rate": 7.029742924139586e-06, "loss": 0.2707, "step": 18665 }, { "epoch": 4.354011194029851, "grad_norm": 0.38179591064784923, "learning_rate": 7.022546633085604e-06, "loss": 0.2697, "step": 18670 }, { "epoch": 4.35517723880597, "grad_norm": 0.40566034270554724, "learning_rate": 7.015362521235632e-06, "loss": 0.2697, "step": 18675 }, { "epoch": 4.356343283582089, "grad_norm": 0.4006639140691593, "learning_rate": 7.008190592862514e-06, "loss": 0.2841, "step": 18680 }, { "epoch": 4.357509328358209, "grad_norm": 0.41275912265736375, "learning_rate": 7.0010308522318355e-06, "loss": 0.2776, "step": 18685 }, { "epoch": 4.358675373134329, "grad_norm": 0.40050083868302266, "learning_rate": 6.9938833036019365e-06, "loss": 0.2687, "step": 18690 }, { "epoch": 4.359841417910448, "grad_norm": 0.39616132942510224, "learning_rate": 6.9867479512239e-06, "loss": 0.2865, "step": 18695 }, { "epoch": 4.361007462686567, "grad_norm": 0.41896509762753836, "learning_rate": 6.979624799341565e-06, "loss": 0.2678, "step": 18700 }, { "epoch": 4.362173507462686, "grad_norm": 0.39898689634510065, "learning_rate": 6.972513852191508e-06, "loss": 0.2838, "step": 18705 }, { "epoch": 4.363339552238806, "grad_norm": 0.39019308904651356, "learning_rate": 6.965415114003046e-06, "loss": 0.2729, "step": 18710 }, { "epoch": 4.364505597014926, "grad_norm": 0.41372980766676376, "learning_rate": 6.958328588998242e-06, "loss": 0.2801, "step": 18715 }, { "epoch": 4.365671641791045, "grad_norm": 0.38547078697304504, "learning_rate": 6.951254281391881e-06, "loss": 0.2638, "step": 18720 }, { "epoch": 4.366837686567164, "grad_norm": 0.388440955064004, "learning_rate": 6.944192195391494e-06, "loss": 0.2706, "step": 18725 }, { "epoch": 4.368003731343284, "grad_norm": 0.4340887735110533, "learning_rate": 6.937142335197338e-06, "loss": 0.2835, "step": 18730 }, { "epoch": 4.369169776119403, "grad_norm": 0.3925250738284526, "learning_rate": 6.930104705002403e-06, "loss": 0.2641, "step": 18735 }, { "epoch": 4.370335820895522, "grad_norm": 0.41935211598110145, "learning_rate": 6.9230793089924005e-06, "loss": 0.2752, "step": 18740 }, { "epoch": 4.371501865671641, "grad_norm": 0.3858755016319207, "learning_rate": 6.916066151345761e-06, "loss": 0.2764, "step": 18745 }, { "epoch": 4.3726679104477615, "grad_norm": 0.3988249900306538, "learning_rate": 6.909065236233644e-06, "loss": 0.2765, "step": 18750 }, { "epoch": 4.373833955223881, "grad_norm": 0.39211849279452404, "learning_rate": 6.90207656781993e-06, "loss": 0.2706, "step": 18755 }, { "epoch": 4.375, "grad_norm": 0.4099475077210503, "learning_rate": 6.8951001502612065e-06, "loss": 0.2681, "step": 18760 }, { "epoch": 4.376166044776119, "grad_norm": 0.40630297492656764, "learning_rate": 6.888135987706787e-06, "loss": 0.2879, "step": 18765 }, { "epoch": 4.3773320895522385, "grad_norm": 0.42286882372567736, "learning_rate": 6.881184084298675e-06, "loss": 0.2806, "step": 18770 }, { "epoch": 4.378498134328359, "grad_norm": 0.3960834884757748, "learning_rate": 6.874244444171607e-06, "loss": 0.2688, "step": 18775 }, { "epoch": 4.379664179104478, "grad_norm": 0.41468075646088814, "learning_rate": 6.867317071453007e-06, "loss": 0.2653, "step": 18780 }, { "epoch": 4.380830223880597, "grad_norm": 0.4023368857309421, "learning_rate": 6.860401970263017e-06, "loss": 0.2626, "step": 18785 }, { "epoch": 4.381996268656716, "grad_norm": 0.3793091077410151, "learning_rate": 6.8534991447144706e-06, "loss": 0.2634, "step": 18790 }, { "epoch": 4.383162313432836, "grad_norm": 0.37678824298536073, "learning_rate": 6.8466085989129066e-06, "loss": 0.2595, "step": 18795 }, { "epoch": 4.384328358208955, "grad_norm": 0.40251889586948386, "learning_rate": 6.839730336956554e-06, "loss": 0.2709, "step": 18800 }, { "epoch": 4.385494402985074, "grad_norm": 0.4056816699893774, "learning_rate": 6.83286436293634e-06, "loss": 0.2744, "step": 18805 }, { "epoch": 4.386660447761194, "grad_norm": 0.41362262742566164, "learning_rate": 6.826010680935886e-06, "loss": 0.2724, "step": 18810 }, { "epoch": 4.387826492537314, "grad_norm": 0.3784388793501856, "learning_rate": 6.819169295031493e-06, "loss": 0.2678, "step": 18815 }, { "epoch": 4.388992537313433, "grad_norm": 0.40012074332417263, "learning_rate": 6.812340209292164e-06, "loss": 0.2833, "step": 18820 }, { "epoch": 4.390158582089552, "grad_norm": 0.39757265960329474, "learning_rate": 6.80552342777957e-06, "loss": 0.275, "step": 18825 }, { "epoch": 4.391324626865671, "grad_norm": 0.41927261976118096, "learning_rate": 6.79871895454807e-06, "loss": 0.2897, "step": 18830 }, { "epoch": 4.392490671641791, "grad_norm": 0.39977536858299106, "learning_rate": 6.791926793644713e-06, "loss": 0.2682, "step": 18835 }, { "epoch": 4.393656716417911, "grad_norm": 0.41731170175521587, "learning_rate": 6.785146949109206e-06, "loss": 0.2745, "step": 18840 }, { "epoch": 4.39482276119403, "grad_norm": 0.39821488851666337, "learning_rate": 6.778379424973943e-06, "loss": 0.2648, "step": 18845 }, { "epoch": 4.395988805970149, "grad_norm": 0.40766819813214755, "learning_rate": 6.771624225263994e-06, "loss": 0.2618, "step": 18850 }, { "epoch": 4.3971548507462686, "grad_norm": 0.4142756771919827, "learning_rate": 6.764881353997082e-06, "loss": 0.2741, "step": 18855 }, { "epoch": 4.398320895522388, "grad_norm": 0.42668661830470156, "learning_rate": 6.758150815183618e-06, "loss": 0.2875, "step": 18860 }, { "epoch": 4.399486940298507, "grad_norm": 0.4113454794124042, "learning_rate": 6.751432612826664e-06, "loss": 0.2853, "step": 18865 }, { "epoch": 4.400652985074627, "grad_norm": 0.3945741472602751, "learning_rate": 6.7447267509219494e-06, "loss": 0.2686, "step": 18870 }, { "epoch": 4.4018190298507465, "grad_norm": 0.39115576733852403, "learning_rate": 6.738033233457863e-06, "loss": 0.2782, "step": 18875 }, { "epoch": 4.402985074626866, "grad_norm": 0.3988196565507617, "learning_rate": 6.7313520644154555e-06, "loss": 0.2698, "step": 18880 }, { "epoch": 4.404151119402985, "grad_norm": 0.3676469105261775, "learning_rate": 6.724683247768427e-06, "loss": 0.264, "step": 18885 }, { "epoch": 4.405317164179104, "grad_norm": 0.3970448843150535, "learning_rate": 6.718026787483131e-06, "loss": 0.2684, "step": 18890 }, { "epoch": 4.4064832089552235, "grad_norm": 0.40734340524720397, "learning_rate": 6.7113826875185885e-06, "loss": 0.2901, "step": 18895 }, { "epoch": 4.407649253731344, "grad_norm": 0.3879153206101479, "learning_rate": 6.704750951826438e-06, "loss": 0.2728, "step": 18900 }, { "epoch": 4.408815298507463, "grad_norm": 0.37419427548808687, "learning_rate": 6.698131584350989e-06, "loss": 0.2789, "step": 18905 }, { "epoch": 4.409981343283582, "grad_norm": 0.41270892431821893, "learning_rate": 6.691524589029188e-06, "loss": 0.2801, "step": 18910 }, { "epoch": 4.411147388059701, "grad_norm": 0.3982928618966707, "learning_rate": 6.684929969790622e-06, "loss": 0.2698, "step": 18915 }, { "epoch": 4.412313432835821, "grad_norm": 0.39749924018053034, "learning_rate": 6.6783477305575215e-06, "loss": 0.2843, "step": 18920 }, { "epoch": 4.41347947761194, "grad_norm": 0.37837567587105775, "learning_rate": 6.671777875244745e-06, "loss": 0.2729, "step": 18925 }, { "epoch": 4.41464552238806, "grad_norm": 0.39945407037967195, "learning_rate": 6.665220407759788e-06, "loss": 0.2871, "step": 18930 }, { "epoch": 4.415811567164179, "grad_norm": 0.4057597958414668, "learning_rate": 6.658675332002787e-06, "loss": 0.2697, "step": 18935 }, { "epoch": 4.416977611940299, "grad_norm": 0.40746892828624903, "learning_rate": 6.652142651866497e-06, "loss": 0.2685, "step": 18940 }, { "epoch": 4.418143656716418, "grad_norm": 0.3918110842437935, "learning_rate": 6.645622371236314e-06, "loss": 0.2756, "step": 18945 }, { "epoch": 4.419309701492537, "grad_norm": 0.37102893586161895, "learning_rate": 6.639114493990238e-06, "loss": 0.2734, "step": 18950 }, { "epoch": 4.420475746268656, "grad_norm": 0.4056425541286255, "learning_rate": 6.6326190239989135e-06, "loss": 0.2743, "step": 18955 }, { "epoch": 4.4216417910447765, "grad_norm": 0.39256078118021526, "learning_rate": 6.626135965125597e-06, "loss": 0.265, "step": 18960 }, { "epoch": 4.422807835820896, "grad_norm": 0.3935822845824643, "learning_rate": 6.61966532122616e-06, "loss": 0.2838, "step": 18965 }, { "epoch": 4.423973880597015, "grad_norm": 0.4158362871831473, "learning_rate": 6.613207096149099e-06, "loss": 0.2673, "step": 18970 }, { "epoch": 4.425139925373134, "grad_norm": 0.40875456504871766, "learning_rate": 6.606761293735513e-06, "loss": 0.2767, "step": 18975 }, { "epoch": 4.4263059701492535, "grad_norm": 0.3878233054183334, "learning_rate": 6.600327917819114e-06, "loss": 0.2646, "step": 18980 }, { "epoch": 4.427472014925373, "grad_norm": 0.37276222693146205, "learning_rate": 6.593906972226238e-06, "loss": 0.2608, "step": 18985 }, { "epoch": 4.428638059701493, "grad_norm": 0.41202104775478976, "learning_rate": 6.587498460775811e-06, "loss": 0.2812, "step": 18990 }, { "epoch": 4.429804104477612, "grad_norm": 0.421211060105778, "learning_rate": 6.581102387279374e-06, "loss": 0.2852, "step": 18995 }, { "epoch": 4.4309701492537314, "grad_norm": 0.4191625377655691, "learning_rate": 6.574718755541061e-06, "loss": 0.2805, "step": 19000 }, { "epoch": 4.432136194029851, "grad_norm": 0.3941124652552664, "learning_rate": 6.568347569357611e-06, "loss": 0.2695, "step": 19005 }, { "epoch": 4.43330223880597, "grad_norm": 0.3793229746609596, "learning_rate": 6.561988832518367e-06, "loss": 0.2732, "step": 19010 }, { "epoch": 4.434468283582089, "grad_norm": 0.43193453612308014, "learning_rate": 6.555642548805262e-06, "loss": 0.2944, "step": 19015 }, { "epoch": 4.435634328358209, "grad_norm": 0.42671415627695103, "learning_rate": 6.5493087219928114e-06, "loss": 0.2714, "step": 19020 }, { "epoch": 4.436800373134329, "grad_norm": 0.3906920961525172, "learning_rate": 6.542987355848144e-06, "loss": 0.2596, "step": 19025 }, { "epoch": 4.437966417910448, "grad_norm": 0.40912308298665784, "learning_rate": 6.536678454130965e-06, "loss": 0.2839, "step": 19030 }, { "epoch": 4.439132462686567, "grad_norm": 0.39463773036597727, "learning_rate": 6.530382020593559e-06, "loss": 0.2741, "step": 19035 }, { "epoch": 4.440298507462686, "grad_norm": 0.3967586998553728, "learning_rate": 6.52409805898081e-06, "loss": 0.2726, "step": 19040 }, { "epoch": 4.441464552238806, "grad_norm": 0.40978112634281993, "learning_rate": 6.517826573030178e-06, "loss": 0.2848, "step": 19045 }, { "epoch": 4.442630597014926, "grad_norm": 0.4142811019822367, "learning_rate": 6.511567566471697e-06, "loss": 0.2741, "step": 19050 }, { "epoch": 4.443796641791045, "grad_norm": 0.40419837663053265, "learning_rate": 6.50532104302799e-06, "loss": 0.2732, "step": 19055 }, { "epoch": 4.444962686567164, "grad_norm": 0.4187348332592611, "learning_rate": 6.499087006414245e-06, "loss": 0.2775, "step": 19060 }, { "epoch": 4.446128731343284, "grad_norm": 0.4116433904244841, "learning_rate": 6.492865460338228e-06, "loss": 0.2611, "step": 19065 }, { "epoch": 4.447294776119403, "grad_norm": 0.3952524457641686, "learning_rate": 6.4866564085002826e-06, "loss": 0.2787, "step": 19070 }, { "epoch": 4.448460820895522, "grad_norm": 0.41531152114803566, "learning_rate": 6.480459854593305e-06, "loss": 0.2838, "step": 19075 }, { "epoch": 4.449626865671641, "grad_norm": 0.41930044222211527, "learning_rate": 6.474275802302776e-06, "loss": 0.2823, "step": 19080 }, { "epoch": 4.4507929104477615, "grad_norm": 0.4268489902373017, "learning_rate": 6.468104255306728e-06, "loss": 0.2682, "step": 19085 }, { "epoch": 4.451958955223881, "grad_norm": 0.41252067564112116, "learning_rate": 6.461945217275761e-06, "loss": 0.2816, "step": 19090 }, { "epoch": 4.453125, "grad_norm": 0.38449147715064336, "learning_rate": 6.455798691873042e-06, "loss": 0.2605, "step": 19095 }, { "epoch": 4.454291044776119, "grad_norm": 0.4533579608370104, "learning_rate": 6.449664682754278e-06, "loss": 0.2734, "step": 19100 }, { "epoch": 4.4554570895522385, "grad_norm": 0.3904560363082716, "learning_rate": 6.443543193567745e-06, "loss": 0.2808, "step": 19105 }, { "epoch": 4.456623134328359, "grad_norm": 0.40327441751142873, "learning_rate": 6.4374342279542726e-06, "loss": 0.2831, "step": 19110 }, { "epoch": 4.457789179104478, "grad_norm": 0.41445646555278115, "learning_rate": 6.431337789547239e-06, "loss": 0.2714, "step": 19115 }, { "epoch": 4.458955223880597, "grad_norm": 0.3737900447135218, "learning_rate": 6.425253881972573e-06, "loss": 0.261, "step": 19120 }, { "epoch": 4.460121268656716, "grad_norm": 0.4066120230512756, "learning_rate": 6.419182508848745e-06, "loss": 0.2722, "step": 19125 }, { "epoch": 4.461287313432836, "grad_norm": 0.3979750471063328, "learning_rate": 6.4131236737867795e-06, "loss": 0.2707, "step": 19130 }, { "epoch": 4.462453358208955, "grad_norm": 0.3897999751940203, "learning_rate": 6.407077380390236e-06, "loss": 0.2721, "step": 19135 }, { "epoch": 4.463619402985074, "grad_norm": 0.4001411615680101, "learning_rate": 6.4010436322552204e-06, "loss": 0.2642, "step": 19140 }, { "epoch": 4.464785447761194, "grad_norm": 0.41422149509694084, "learning_rate": 6.395022432970375e-06, "loss": 0.2571, "step": 19145 }, { "epoch": 4.465951492537314, "grad_norm": 0.4154919561238171, "learning_rate": 6.389013786116878e-06, "loss": 0.2808, "step": 19150 }, { "epoch": 4.467117537313433, "grad_norm": 0.3978764925884262, "learning_rate": 6.383017695268441e-06, "loss": 0.2639, "step": 19155 }, { "epoch": 4.468283582089552, "grad_norm": 0.39306752609181533, "learning_rate": 6.377034163991308e-06, "loss": 0.2796, "step": 19160 }, { "epoch": 4.469449626865671, "grad_norm": 0.384063747009322, "learning_rate": 6.3710631958442524e-06, "loss": 0.2622, "step": 19165 }, { "epoch": 4.470615671641791, "grad_norm": 0.40126341615131395, "learning_rate": 6.365104794378582e-06, "loss": 0.275, "step": 19170 }, { "epoch": 4.471781716417911, "grad_norm": 0.42365641017668654, "learning_rate": 6.3591589631381286e-06, "loss": 0.2688, "step": 19175 }, { "epoch": 4.47294776119403, "grad_norm": 0.40908085859655946, "learning_rate": 6.353225705659234e-06, "loss": 0.2782, "step": 19180 }, { "epoch": 4.474113805970149, "grad_norm": 0.39825533025059007, "learning_rate": 6.347305025470776e-06, "loss": 0.2749, "step": 19185 }, { "epoch": 4.4752798507462686, "grad_norm": 0.41525520232326524, "learning_rate": 6.341396926094155e-06, "loss": 0.2864, "step": 19190 }, { "epoch": 4.476445895522388, "grad_norm": 0.4338386050024731, "learning_rate": 6.335501411043274e-06, "loss": 0.2939, "step": 19195 }, { "epoch": 4.477611940298507, "grad_norm": 0.39885416238441795, "learning_rate": 6.329618483824559e-06, "loss": 0.2777, "step": 19200 }, { "epoch": 4.478777985074627, "grad_norm": 0.4197249220668985, "learning_rate": 6.323748147936959e-06, "loss": 0.2821, "step": 19205 }, { "epoch": 4.4799440298507465, "grad_norm": 0.42926796475044454, "learning_rate": 6.317890406871914e-06, "loss": 0.2851, "step": 19210 }, { "epoch": 4.481110074626866, "grad_norm": 0.3951438331874199, "learning_rate": 6.312045264113388e-06, "loss": 0.2807, "step": 19215 }, { "epoch": 4.482276119402985, "grad_norm": 0.4255003447727782, "learning_rate": 6.306212723137846e-06, "loss": 0.2747, "step": 19220 }, { "epoch": 4.483442164179104, "grad_norm": 0.4074220430752247, "learning_rate": 6.300392787414265e-06, "loss": 0.2793, "step": 19225 }, { "epoch": 4.4846082089552235, "grad_norm": 0.4025382438197484, "learning_rate": 6.2945854604041135e-06, "loss": 0.2792, "step": 19230 }, { "epoch": 4.485774253731344, "grad_norm": 0.3921972243318409, "learning_rate": 6.28879074556137e-06, "loss": 0.2795, "step": 19235 }, { "epoch": 4.486940298507463, "grad_norm": 0.40775686243195514, "learning_rate": 6.283008646332507e-06, "loss": 0.274, "step": 19240 }, { "epoch": 4.488106343283582, "grad_norm": 0.3870994218374946, "learning_rate": 6.277239166156497e-06, "loss": 0.2641, "step": 19245 }, { "epoch": 4.489272388059701, "grad_norm": 0.3906998650606952, "learning_rate": 6.271482308464807e-06, "loss": 0.2712, "step": 19250 }, { "epoch": 4.490438432835821, "grad_norm": 0.3817095400466071, "learning_rate": 6.265738076681392e-06, "loss": 0.2665, "step": 19255 }, { "epoch": 4.49160447761194, "grad_norm": 0.42633466345186055, "learning_rate": 6.2600064742227e-06, "loss": 0.2916, "step": 19260 }, { "epoch": 4.49277052238806, "grad_norm": 0.4241079906906127, "learning_rate": 6.254287504497672e-06, "loss": 0.2836, "step": 19265 }, { "epoch": 4.493936567164179, "grad_norm": 0.4053351899896776, "learning_rate": 6.248581170907729e-06, "loss": 0.2696, "step": 19270 }, { "epoch": 4.495102611940299, "grad_norm": 0.4092687463740792, "learning_rate": 6.242887476846785e-06, "loss": 0.2758, "step": 19275 }, { "epoch": 4.496268656716418, "grad_norm": 0.4457329920963302, "learning_rate": 6.237206425701223e-06, "loss": 0.2794, "step": 19280 }, { "epoch": 4.497434701492537, "grad_norm": 0.422589189665392, "learning_rate": 6.231538020849919e-06, "loss": 0.2833, "step": 19285 }, { "epoch": 4.498600746268656, "grad_norm": 0.40719545940377416, "learning_rate": 6.225882265664218e-06, "loss": 0.2826, "step": 19290 }, { "epoch": 4.4997667910447765, "grad_norm": 0.3894236450328694, "learning_rate": 6.220239163507955e-06, "loss": 0.2722, "step": 19295 }, { "epoch": 4.500932835820896, "grad_norm": 0.4093844279695568, "learning_rate": 6.214608717737426e-06, "loss": 0.2894, "step": 19300 }, { "epoch": 4.502098880597015, "grad_norm": 0.3999452596114185, "learning_rate": 6.2089909317014e-06, "loss": 0.2637, "step": 19305 }, { "epoch": 4.503264925373134, "grad_norm": 0.37928564631271194, "learning_rate": 6.2033858087411275e-06, "loss": 0.2602, "step": 19310 }, { "epoch": 4.5044309701492535, "grad_norm": 0.40258765319208045, "learning_rate": 6.197793352190316e-06, "loss": 0.2823, "step": 19315 }, { "epoch": 4.505597014925373, "grad_norm": 0.39881501025472776, "learning_rate": 6.192213565375147e-06, "loss": 0.2727, "step": 19320 }, { "epoch": 4.506763059701493, "grad_norm": 0.38961424574655634, "learning_rate": 6.186646451614265e-06, "loss": 0.2708, "step": 19325 }, { "epoch": 4.507929104477612, "grad_norm": 0.40239205310246, "learning_rate": 6.1810920142187726e-06, "loss": 0.2751, "step": 19330 }, { "epoch": 4.5090951492537314, "grad_norm": 0.40769869988114504, "learning_rate": 6.175550256492235e-06, "loss": 0.2913, "step": 19335 }, { "epoch": 4.510261194029851, "grad_norm": 0.41681622084301867, "learning_rate": 6.170021181730681e-06, "loss": 0.2796, "step": 19340 }, { "epoch": 4.51142723880597, "grad_norm": 0.40521223352998204, "learning_rate": 6.164504793222589e-06, "loss": 0.2763, "step": 19345 }, { "epoch": 4.512593283582089, "grad_norm": 0.3794976557676561, "learning_rate": 6.159001094248904e-06, "loss": 0.2887, "step": 19350 }, { "epoch": 4.5137593283582085, "grad_norm": 0.4259876082606273, "learning_rate": 6.153510088083e-06, "loss": 0.2719, "step": 19355 }, { "epoch": 4.514925373134329, "grad_norm": 0.39316638200400666, "learning_rate": 6.1480317779907285e-06, "loss": 0.2757, "step": 19360 }, { "epoch": 4.516091417910448, "grad_norm": 0.39203245337716425, "learning_rate": 6.1425661672303735e-06, "loss": 0.2695, "step": 19365 }, { "epoch": 4.517257462686567, "grad_norm": 0.4167874420088122, "learning_rate": 6.1371132590526744e-06, "loss": 0.2723, "step": 19370 }, { "epoch": 4.518423507462686, "grad_norm": 0.40082339237716336, "learning_rate": 6.1316730567008086e-06, "loss": 0.2741, "step": 19375 }, { "epoch": 4.519589552238806, "grad_norm": 0.39196663617399374, "learning_rate": 6.126245563410399e-06, "loss": 0.2826, "step": 19380 }, { "epoch": 4.520755597014926, "grad_norm": 0.4154410313830451, "learning_rate": 6.120830782409515e-06, "loss": 0.2798, "step": 19385 }, { "epoch": 4.521921641791045, "grad_norm": 0.41780321723229025, "learning_rate": 6.115428716918657e-06, "loss": 0.276, "step": 19390 }, { "epoch": 4.523087686567164, "grad_norm": 0.40963979476987106, "learning_rate": 6.110039370150765e-06, "loss": 0.2678, "step": 19395 }, { "epoch": 4.524253731343284, "grad_norm": 0.4111879076863036, "learning_rate": 6.104662745311222e-06, "loss": 0.2765, "step": 19400 }, { "epoch": 4.525419776119403, "grad_norm": 0.41693216869068, "learning_rate": 6.099298845597832e-06, "loss": 0.2821, "step": 19405 }, { "epoch": 4.526585820895522, "grad_norm": 0.392640636302135, "learning_rate": 6.093947674200838e-06, "loss": 0.2815, "step": 19410 }, { "epoch": 4.527751865671641, "grad_norm": 0.4132072694079558, "learning_rate": 6.088609234302912e-06, "loss": 0.2698, "step": 19415 }, { "epoch": 4.5289179104477615, "grad_norm": 0.40173065497838006, "learning_rate": 6.083283529079157e-06, "loss": 0.2716, "step": 19420 }, { "epoch": 4.530083955223881, "grad_norm": 0.39584872427771123, "learning_rate": 6.077970561697095e-06, "loss": 0.2862, "step": 19425 }, { "epoch": 4.53125, "grad_norm": 0.4061946706562812, "learning_rate": 6.072670335316676e-06, "loss": 0.2716, "step": 19430 }, { "epoch": 4.532416044776119, "grad_norm": 0.40805514869112874, "learning_rate": 6.067382853090269e-06, "loss": 0.2746, "step": 19435 }, { "epoch": 4.5335820895522385, "grad_norm": 0.3963705910534693, "learning_rate": 6.062108118162669e-06, "loss": 0.2716, "step": 19440 }, { "epoch": 4.534748134328359, "grad_norm": 0.40975563096672996, "learning_rate": 6.056846133671083e-06, "loss": 0.2679, "step": 19445 }, { "epoch": 4.535914179104478, "grad_norm": 0.39497962693592065, "learning_rate": 6.051596902745143e-06, "loss": 0.2769, "step": 19450 }, { "epoch": 4.537080223880597, "grad_norm": 0.4096775616097535, "learning_rate": 6.0463604285068834e-06, "loss": 0.2666, "step": 19455 }, { "epoch": 4.538246268656716, "grad_norm": 0.38747793264249547, "learning_rate": 6.0411367140707625e-06, "loss": 0.2743, "step": 19460 }, { "epoch": 4.539412313432836, "grad_norm": 0.40292660808703534, "learning_rate": 6.035925762543644e-06, "loss": 0.2664, "step": 19465 }, { "epoch": 4.540578358208955, "grad_norm": 0.3971369327299724, "learning_rate": 6.030727577024802e-06, "loss": 0.2694, "step": 19470 }, { "epoch": 4.541744402985074, "grad_norm": 0.376268510836514, "learning_rate": 6.025542160605923e-06, "loss": 0.2743, "step": 19475 }, { "epoch": 4.542910447761194, "grad_norm": 0.37070145138374705, "learning_rate": 6.020369516371085e-06, "loss": 0.2862, "step": 19480 }, { "epoch": 4.544076492537314, "grad_norm": 0.3940824099672883, "learning_rate": 6.015209647396781e-06, "loss": 0.2903, "step": 19485 }, { "epoch": 4.545242537313433, "grad_norm": 0.4135855527891064, "learning_rate": 6.010062556751906e-06, "loss": 0.2816, "step": 19490 }, { "epoch": 4.546408582089552, "grad_norm": 0.3998933292712433, "learning_rate": 6.00492824749775e-06, "loss": 0.2726, "step": 19495 }, { "epoch": 4.547574626865671, "grad_norm": 0.40362698833453464, "learning_rate": 5.999806722688007e-06, "loss": 0.2706, "step": 19500 }, { "epoch": 4.5487406716417915, "grad_norm": 0.3996476049128751, "learning_rate": 5.994697985368761e-06, "loss": 0.2759, "step": 19505 }, { "epoch": 4.549906716417911, "grad_norm": 0.4358477852289081, "learning_rate": 5.98960203857849e-06, "loss": 0.2759, "step": 19510 }, { "epoch": 4.55107276119403, "grad_norm": 0.42666702668186296, "learning_rate": 5.98451888534807e-06, "loss": 0.2844, "step": 19515 }, { "epoch": 4.552238805970149, "grad_norm": 0.38888095487037455, "learning_rate": 5.9794485287007696e-06, "loss": 0.2722, "step": 19520 }, { "epoch": 4.5534048507462686, "grad_norm": 0.4222347972637875, "learning_rate": 5.974390971652237e-06, "loss": 0.2823, "step": 19525 }, { "epoch": 4.554570895522388, "grad_norm": 0.4043149810792717, "learning_rate": 5.9693462172105165e-06, "loss": 0.2746, "step": 19530 }, { "epoch": 4.555736940298507, "grad_norm": 0.384161856818575, "learning_rate": 5.964314268376031e-06, "loss": 0.2714, "step": 19535 }, { "epoch": 4.556902985074627, "grad_norm": 0.4074095968516756, "learning_rate": 5.959295128141596e-06, "loss": 0.274, "step": 19540 }, { "epoch": 4.5580690298507465, "grad_norm": 0.37147033638693244, "learning_rate": 5.9542887994923985e-06, "loss": 0.2706, "step": 19545 }, { "epoch": 4.559235074626866, "grad_norm": 0.42004201414060405, "learning_rate": 5.949295285406015e-06, "loss": 0.2894, "step": 19550 }, { "epoch": 4.560401119402985, "grad_norm": 0.405002870436818, "learning_rate": 5.944314588852393e-06, "loss": 0.286, "step": 19555 }, { "epoch": 4.561567164179104, "grad_norm": 0.39430856424292815, "learning_rate": 5.93934671279386e-06, "loss": 0.2703, "step": 19560 }, { "epoch": 4.5627332089552235, "grad_norm": 0.40082691907537743, "learning_rate": 5.934391660185121e-06, "loss": 0.2709, "step": 19565 }, { "epoch": 4.563899253731344, "grad_norm": 0.41785338119901133, "learning_rate": 5.929449433973249e-06, "loss": 0.2651, "step": 19570 }, { "epoch": 4.565065298507463, "grad_norm": 0.4279430163773111, "learning_rate": 5.924520037097688e-06, "loss": 0.2818, "step": 19575 }, { "epoch": 4.566231343283582, "grad_norm": 0.40038913922060654, "learning_rate": 5.919603472490263e-06, "loss": 0.2951, "step": 19580 }, { "epoch": 4.567397388059701, "grad_norm": 0.39948244960271895, "learning_rate": 5.914699743075149e-06, "loss": 0.2605, "step": 19585 }, { "epoch": 4.568563432835821, "grad_norm": 0.43136010733909785, "learning_rate": 5.909808851768898e-06, "loss": 0.2746, "step": 19590 }, { "epoch": 4.56972947761194, "grad_norm": 0.43280422382694816, "learning_rate": 5.904930801480427e-06, "loss": 0.2771, "step": 19595 }, { "epoch": 4.57089552238806, "grad_norm": 0.3957451950378775, "learning_rate": 5.900065595111014e-06, "loss": 0.2749, "step": 19600 }, { "epoch": 4.572061567164179, "grad_norm": 0.44394443324252325, "learning_rate": 5.895213235554298e-06, "loss": 0.2892, "step": 19605 }, { "epoch": 4.573227611940299, "grad_norm": 0.400056466974145, "learning_rate": 5.890373725696271e-06, "loss": 0.2639, "step": 19610 }, { "epoch": 4.574393656716418, "grad_norm": 0.41458887736370276, "learning_rate": 5.885547068415289e-06, "loss": 0.281, "step": 19615 }, { "epoch": 4.575559701492537, "grad_norm": 0.40621557083779697, "learning_rate": 5.880733266582066e-06, "loss": 0.2735, "step": 19620 }, { "epoch": 4.576725746268656, "grad_norm": 0.4023809100073007, "learning_rate": 5.875932323059667e-06, "loss": 0.2731, "step": 19625 }, { "epoch": 4.5778917910447765, "grad_norm": 0.38252664921450025, "learning_rate": 5.871144240703507e-06, "loss": 0.265, "step": 19630 }, { "epoch": 4.579057835820896, "grad_norm": 0.3912285302237702, "learning_rate": 5.866369022361354e-06, "loss": 0.2629, "step": 19635 }, { "epoch": 4.580223880597015, "grad_norm": 0.41530629012469544, "learning_rate": 5.8616066708733255e-06, "loss": 0.2794, "step": 19640 }, { "epoch": 4.581389925373134, "grad_norm": 0.427373998670521, "learning_rate": 5.856857189071884e-06, "loss": 0.2724, "step": 19645 }, { "epoch": 4.5825559701492535, "grad_norm": 0.4235487584855721, "learning_rate": 5.852120579781838e-06, "loss": 0.2759, "step": 19650 }, { "epoch": 4.583722014925373, "grad_norm": 0.4305721437683783, "learning_rate": 5.847396845820349e-06, "loss": 0.2939, "step": 19655 }, { "epoch": 4.584888059701493, "grad_norm": 0.41505408391745124, "learning_rate": 5.8426859899969034e-06, "loss": 0.2799, "step": 19660 }, { "epoch": 4.586054104477612, "grad_norm": 0.3952767658187638, "learning_rate": 5.83798801511334e-06, "loss": 0.2816, "step": 19665 }, { "epoch": 4.5872201492537314, "grad_norm": 0.40807007047949034, "learning_rate": 5.833302923963837e-06, "loss": 0.2826, "step": 19670 }, { "epoch": 4.588386194029851, "grad_norm": 0.39343617196658465, "learning_rate": 5.828630719334905e-06, "loss": 0.2797, "step": 19675 }, { "epoch": 4.58955223880597, "grad_norm": 0.4027098901842826, "learning_rate": 5.8239714040053936e-06, "loss": 0.2898, "step": 19680 }, { "epoch": 4.590718283582089, "grad_norm": 0.38531328965715933, "learning_rate": 5.819324980746483e-06, "loss": 0.2729, "step": 19685 }, { "epoch": 4.5918843283582085, "grad_norm": 0.3879302233639575, "learning_rate": 5.814691452321687e-06, "loss": 0.2645, "step": 19690 }, { "epoch": 4.593050373134329, "grad_norm": 0.39080847205786856, "learning_rate": 5.810070821486854e-06, "loss": 0.2803, "step": 19695 }, { "epoch": 4.594216417910448, "grad_norm": 0.40894217807761285, "learning_rate": 5.805463090990154e-06, "loss": 0.2793, "step": 19700 }, { "epoch": 4.595382462686567, "grad_norm": 0.3825694244005513, "learning_rate": 5.800868263572093e-06, "loss": 0.2749, "step": 19705 }, { "epoch": 4.596548507462686, "grad_norm": 0.37684047859810854, "learning_rate": 5.796286341965492e-06, "loss": 0.2676, "step": 19710 }, { "epoch": 4.597714552238806, "grad_norm": 0.373842441950314, "learning_rate": 5.7917173288955105e-06, "loss": 0.272, "step": 19715 }, { "epoch": 4.598880597014926, "grad_norm": 0.42330168939749574, "learning_rate": 5.787161227079613e-06, "loss": 0.2884, "step": 19720 }, { "epoch": 4.600046641791045, "grad_norm": 0.39687130006718135, "learning_rate": 5.782618039227603e-06, "loss": 0.2705, "step": 19725 }, { "epoch": 4.601212686567164, "grad_norm": 0.4116001899644282, "learning_rate": 5.778087768041589e-06, "loss": 0.2845, "step": 19730 }, { "epoch": 4.602378731343284, "grad_norm": 0.40456122964502916, "learning_rate": 5.7735704162160005e-06, "loss": 0.27, "step": 19735 }, { "epoch": 4.603544776119403, "grad_norm": 0.384095039579496, "learning_rate": 5.769065986437591e-06, "loss": 0.2898, "step": 19740 }, { "epoch": 4.604710820895522, "grad_norm": 0.3855658442097468, "learning_rate": 5.764574481385419e-06, "loss": 0.2631, "step": 19745 }, { "epoch": 4.605876865671641, "grad_norm": 0.4333675633039686, "learning_rate": 5.7600959037308626e-06, "loss": 0.278, "step": 19750 }, { "epoch": 4.6070429104477615, "grad_norm": 0.41376429703298284, "learning_rate": 5.755630256137605e-06, "loss": 0.286, "step": 19755 }, { "epoch": 4.608208955223881, "grad_norm": 0.41920233257169415, "learning_rate": 5.7511775412616415e-06, "loss": 0.2693, "step": 19760 }, { "epoch": 4.609375, "grad_norm": 0.40718260204158097, "learning_rate": 5.74673776175128e-06, "loss": 0.2706, "step": 19765 }, { "epoch": 4.610541044776119, "grad_norm": 0.3864503289913195, "learning_rate": 5.742310920247127e-06, "loss": 0.2612, "step": 19770 }, { "epoch": 4.6117070895522385, "grad_norm": 0.4162571718892432, "learning_rate": 5.737897019382098e-06, "loss": 0.2882, "step": 19775 }, { "epoch": 4.612873134328359, "grad_norm": 0.4005442030212411, "learning_rate": 5.733496061781418e-06, "loss": 0.2735, "step": 19780 }, { "epoch": 4.614039179104478, "grad_norm": 0.4176972373199422, "learning_rate": 5.729108050062603e-06, "loss": 0.2752, "step": 19785 }, { "epoch": 4.615205223880597, "grad_norm": 0.4254386676294371, "learning_rate": 5.7247329868354705e-06, "loss": 0.2674, "step": 19790 }, { "epoch": 4.616371268656716, "grad_norm": 0.4198990073695244, "learning_rate": 5.720370874702148e-06, "loss": 0.2849, "step": 19795 }, { "epoch": 4.617537313432836, "grad_norm": 0.41152217793888024, "learning_rate": 5.716021716257047e-06, "loss": 0.2766, "step": 19800 }, { "epoch": 4.618703358208955, "grad_norm": 0.392074902485431, "learning_rate": 5.7116855140868874e-06, "loss": 0.2658, "step": 19805 }, { "epoch": 4.619869402985074, "grad_norm": 0.4130968665526762, "learning_rate": 5.707362270770665e-06, "loss": 0.2698, "step": 19810 }, { "epoch": 4.621035447761194, "grad_norm": 0.3889066425584659, "learning_rate": 5.703051988879689e-06, "loss": 0.2693, "step": 19815 }, { "epoch": 4.622201492537314, "grad_norm": 0.40396315931555515, "learning_rate": 5.698754670977544e-06, "loss": 0.2681, "step": 19820 }, { "epoch": 4.623367537313433, "grad_norm": 0.45162345998255143, "learning_rate": 5.69447031962011e-06, "loss": 0.2908, "step": 19825 }, { "epoch": 4.624533582089552, "grad_norm": 0.3647318833276829, "learning_rate": 5.690198937355561e-06, "loss": 0.27, "step": 19830 }, { "epoch": 4.625699626865671, "grad_norm": 0.4030906159068544, "learning_rate": 5.685940526724344e-06, "loss": 0.2709, "step": 19835 }, { "epoch": 4.6268656716417915, "grad_norm": 0.39945894837975454, "learning_rate": 5.6816950902592005e-06, "loss": 0.2707, "step": 19840 }, { "epoch": 4.628031716417911, "grad_norm": 0.4005422592919341, "learning_rate": 5.6774626304851555e-06, "loss": 0.2677, "step": 19845 }, { "epoch": 4.62919776119403, "grad_norm": 0.423893919526871, "learning_rate": 5.673243149919512e-06, "loss": 0.2698, "step": 19850 }, { "epoch": 4.630363805970149, "grad_norm": 0.3957984543906955, "learning_rate": 5.669036651071857e-06, "loss": 0.274, "step": 19855 }, { "epoch": 4.6315298507462686, "grad_norm": 0.40995172521649115, "learning_rate": 5.664843136444054e-06, "loss": 0.2818, "step": 19860 }, { "epoch": 4.632695895522388, "grad_norm": 0.4106170054479251, "learning_rate": 5.660662608530239e-06, "loss": 0.2665, "step": 19865 }, { "epoch": 4.633861940298507, "grad_norm": 0.41235426028584793, "learning_rate": 5.6564950698168385e-06, "loss": 0.2743, "step": 19870 }, { "epoch": 4.635027985074627, "grad_norm": 0.39748498292025686, "learning_rate": 5.652340522782542e-06, "loss": 0.27, "step": 19875 }, { "epoch": 4.6361940298507465, "grad_norm": 0.4066110480697797, "learning_rate": 5.648198969898311e-06, "loss": 0.2718, "step": 19880 }, { "epoch": 4.637360074626866, "grad_norm": 0.3675449216703294, "learning_rate": 5.644070413627386e-06, "loss": 0.259, "step": 19885 }, { "epoch": 4.638526119402985, "grad_norm": 0.4288644330626841, "learning_rate": 5.639954856425273e-06, "loss": 0.2897, "step": 19890 }, { "epoch": 4.639692164179104, "grad_norm": 0.3938760486236866, "learning_rate": 5.6358523007397485e-06, "loss": 0.2795, "step": 19895 }, { "epoch": 4.6408582089552235, "grad_norm": 0.3837875410548764, "learning_rate": 5.631762749010855e-06, "loss": 0.2626, "step": 19900 }, { "epoch": 4.642024253731344, "grad_norm": 0.37882241259163735, "learning_rate": 5.6276862036709e-06, "loss": 0.2838, "step": 19905 }, { "epoch": 4.643190298507463, "grad_norm": 0.4461753407156029, "learning_rate": 5.6236226671444555e-06, "loss": 0.2764, "step": 19910 }, { "epoch": 4.644356343283582, "grad_norm": 0.38132448693471266, "learning_rate": 5.619572141848358e-06, "loss": 0.2685, "step": 19915 }, { "epoch": 4.645522388059701, "grad_norm": 0.3946667902969425, "learning_rate": 5.615534630191708e-06, "loss": 0.2691, "step": 19920 }, { "epoch": 4.646688432835821, "grad_norm": 0.42992782334530544, "learning_rate": 5.611510134575859e-06, "loss": 0.2721, "step": 19925 }, { "epoch": 4.64785447761194, "grad_norm": 0.41687010629281285, "learning_rate": 5.607498657394424e-06, "loss": 0.2834, "step": 19930 }, { "epoch": 4.64902052238806, "grad_norm": 0.41904424231712306, "learning_rate": 5.603500201033285e-06, "loss": 0.2925, "step": 19935 }, { "epoch": 4.650186567164179, "grad_norm": 0.41797449445289314, "learning_rate": 5.59951476787056e-06, "loss": 0.2607, "step": 19940 }, { "epoch": 4.651352611940299, "grad_norm": 0.4167939162797424, "learning_rate": 5.595542360276636e-06, "loss": 0.2692, "step": 19945 }, { "epoch": 4.652518656716418, "grad_norm": 0.40344143571319735, "learning_rate": 5.591582980614151e-06, "loss": 0.2705, "step": 19950 }, { "epoch": 4.653684701492537, "grad_norm": 0.4342451674505531, "learning_rate": 5.587636631237991e-06, "loss": 0.2732, "step": 19955 }, { "epoch": 4.654850746268656, "grad_norm": 0.3938663686243035, "learning_rate": 5.583703314495294e-06, "loss": 0.2678, "step": 19960 }, { "epoch": 4.6560167910447765, "grad_norm": 0.391969984782422, "learning_rate": 5.579783032725441e-06, "loss": 0.284, "step": 19965 }, { "epoch": 4.657182835820896, "grad_norm": 0.4009376587634659, "learning_rate": 5.5758757882600706e-06, "loss": 0.2675, "step": 19970 }, { "epoch": 4.658348880597015, "grad_norm": 0.40241606810174496, "learning_rate": 5.57198158342306e-06, "loss": 0.2852, "step": 19975 }, { "epoch": 4.659514925373134, "grad_norm": 0.4247647502726461, "learning_rate": 5.568100420530533e-06, "loss": 0.2768, "step": 19980 }, { "epoch": 4.6606809701492535, "grad_norm": 0.4084528798804982, "learning_rate": 5.5642323018908595e-06, "loss": 0.2786, "step": 19985 }, { "epoch": 4.661847014925373, "grad_norm": 0.4074659575509635, "learning_rate": 5.560377229804644e-06, "loss": 0.27, "step": 19990 }, { "epoch": 4.663013059701493, "grad_norm": 0.3820600401203203, "learning_rate": 5.556535206564733e-06, "loss": 0.2834, "step": 19995 }, { "epoch": 4.664179104477612, "grad_norm": 0.40351571148753534, "learning_rate": 5.55270623445622e-06, "loss": 0.2648, "step": 20000 }, { "epoch": 4.6653451492537314, "grad_norm": 0.3964258851523608, "learning_rate": 5.548890315756433e-06, "loss": 0.2741, "step": 20005 }, { "epoch": 4.666511194029851, "grad_norm": 0.37152087763676445, "learning_rate": 5.545087452734928e-06, "loss": 0.2822, "step": 20010 }, { "epoch": 4.66767723880597, "grad_norm": 0.4253016328676518, "learning_rate": 5.541297647653505e-06, "loss": 0.2714, "step": 20015 }, { "epoch": 4.668843283582089, "grad_norm": 0.3998650134106237, "learning_rate": 5.537520902766193e-06, "loss": 0.2919, "step": 20020 }, { "epoch": 4.6700093283582085, "grad_norm": 0.40601634420089444, "learning_rate": 5.533757220319257e-06, "loss": 0.2767, "step": 20025 }, { "epoch": 4.671175373134329, "grad_norm": 0.41907168348590956, "learning_rate": 5.5300066025511885e-06, "loss": 0.2694, "step": 20030 }, { "epoch": 4.672341417910448, "grad_norm": 0.4157035283843186, "learning_rate": 5.526269051692717e-06, "loss": 0.286, "step": 20035 }, { "epoch": 4.673507462686567, "grad_norm": 0.39126692904238075, "learning_rate": 5.522544569966786e-06, "loss": 0.2747, "step": 20040 }, { "epoch": 4.674673507462686, "grad_norm": 0.4174392835172584, "learning_rate": 5.518833159588582e-06, "loss": 0.2715, "step": 20045 }, { "epoch": 4.675839552238806, "grad_norm": 0.37641835072158225, "learning_rate": 5.515134822765504e-06, "loss": 0.2629, "step": 20050 }, { "epoch": 4.677005597014926, "grad_norm": 0.4158241667768276, "learning_rate": 5.511449561697183e-06, "loss": 0.2758, "step": 20055 }, { "epoch": 4.678171641791045, "grad_norm": 0.40241732091317955, "learning_rate": 5.507777378575474e-06, "loss": 0.2828, "step": 20060 }, { "epoch": 4.679337686567164, "grad_norm": 0.40779033540993415, "learning_rate": 5.504118275584444e-06, "loss": 0.269, "step": 20065 }, { "epoch": 4.680503731343284, "grad_norm": 0.42844495534181765, "learning_rate": 5.500472254900392e-06, "loss": 0.2926, "step": 20070 }, { "epoch": 4.681669776119403, "grad_norm": 0.38903131772197835, "learning_rate": 5.49683931869183e-06, "loss": 0.2604, "step": 20075 }, { "epoch": 4.682835820895522, "grad_norm": 0.39269896031724016, "learning_rate": 5.4932194691194905e-06, "loss": 0.2666, "step": 20080 }, { "epoch": 4.684001865671641, "grad_norm": 0.4428629130930986, "learning_rate": 5.489612708336324e-06, "loss": 0.2769, "step": 20085 }, { "epoch": 4.6851679104477615, "grad_norm": 0.42070859736541616, "learning_rate": 5.486019038487483e-06, "loss": 0.2842, "step": 20090 }, { "epoch": 4.686333955223881, "grad_norm": 0.41839978965281005, "learning_rate": 5.482438461710355e-06, "loss": 0.2792, "step": 20095 }, { "epoch": 4.6875, "grad_norm": 0.41402089350167637, "learning_rate": 5.4788709801345244e-06, "loss": 0.2749, "step": 20100 }, { "epoch": 4.688666044776119, "grad_norm": 0.41518823880099653, "learning_rate": 5.475316595881796e-06, "loss": 0.2755, "step": 20105 }, { "epoch": 4.6898320895522385, "grad_norm": 0.4041644682151216, "learning_rate": 5.471775311066177e-06, "loss": 0.2878, "step": 20110 }, { "epoch": 4.690998134328359, "grad_norm": 0.3913561730786025, "learning_rate": 5.468247127793893e-06, "loss": 0.2834, "step": 20115 }, { "epoch": 4.692164179104478, "grad_norm": 0.4061629665339246, "learning_rate": 5.464732048163365e-06, "loss": 0.2719, "step": 20120 }, { "epoch": 4.693330223880597, "grad_norm": 0.3703512330712156, "learning_rate": 5.461230074265233e-06, "loss": 0.2889, "step": 20125 }, { "epoch": 4.694496268656716, "grad_norm": 0.39130635011992626, "learning_rate": 5.4577412081823355e-06, "loss": 0.2715, "step": 20130 }, { "epoch": 4.695662313432836, "grad_norm": 0.40949348608109915, "learning_rate": 5.45426545198972e-06, "loss": 0.2838, "step": 20135 }, { "epoch": 4.696828358208955, "grad_norm": 0.4139020715594625, "learning_rate": 5.450802807754625e-06, "loss": 0.2875, "step": 20140 }, { "epoch": 4.697994402985074, "grad_norm": 0.4078866618718476, "learning_rate": 5.4473532775365026e-06, "loss": 0.2799, "step": 20145 }, { "epoch": 4.699160447761194, "grad_norm": 0.4337431895927768, "learning_rate": 5.443916863387002e-06, "loss": 0.2739, "step": 20150 }, { "epoch": 4.700326492537314, "grad_norm": 0.40120045642838176, "learning_rate": 5.4404935673499685e-06, "loss": 0.2797, "step": 20155 }, { "epoch": 4.701492537313433, "grad_norm": 0.3969535063563502, "learning_rate": 5.437083391461452e-06, "loss": 0.2804, "step": 20160 }, { "epoch": 4.702658582089552, "grad_norm": 0.42195915136841367, "learning_rate": 5.43368633774969e-06, "loss": 0.2832, "step": 20165 }, { "epoch": 4.703824626865671, "grad_norm": 0.4090587649795599, "learning_rate": 5.43030240823512e-06, "loss": 0.2707, "step": 20170 }, { "epoch": 4.7049906716417915, "grad_norm": 0.39695667224760095, "learning_rate": 5.426931604930375e-06, "loss": 0.259, "step": 20175 }, { "epoch": 4.706156716417911, "grad_norm": 0.415981260709335, "learning_rate": 5.423573929840277e-06, "loss": 0.2788, "step": 20180 }, { "epoch": 4.70732276119403, "grad_norm": 0.42426830296680235, "learning_rate": 5.420229384961847e-06, "loss": 0.2759, "step": 20185 }, { "epoch": 4.708488805970149, "grad_norm": 0.41406649717023797, "learning_rate": 5.416897972284287e-06, "loss": 0.2718, "step": 20190 }, { "epoch": 4.7096548507462686, "grad_norm": 0.38536876981679996, "learning_rate": 5.413579693788995e-06, "loss": 0.269, "step": 20195 }, { "epoch": 4.710820895522388, "grad_norm": 0.3833330688114252, "learning_rate": 5.410274551449559e-06, "loss": 0.2845, "step": 20200 }, { "epoch": 4.711986940298507, "grad_norm": 0.3994575259692321, "learning_rate": 5.406982547231746e-06, "loss": 0.2707, "step": 20205 }, { "epoch": 4.713152985074627, "grad_norm": 0.37610555057233974, "learning_rate": 5.403703683093517e-06, "loss": 0.2724, "step": 20210 }, { "epoch": 4.7143190298507465, "grad_norm": 0.39062542516181153, "learning_rate": 5.400437960985017e-06, "loss": 0.2829, "step": 20215 }, { "epoch": 4.715485074626866, "grad_norm": 0.4061890750292189, "learning_rate": 5.397185382848568e-06, "loss": 0.2806, "step": 20220 }, { "epoch": 4.716651119402985, "grad_norm": 0.4034494158687929, "learning_rate": 5.393945950618678e-06, "loss": 0.281, "step": 20225 }, { "epoch": 4.717817164179104, "grad_norm": 0.404306940460495, "learning_rate": 5.39071966622204e-06, "loss": 0.2787, "step": 20230 }, { "epoch": 4.7189832089552235, "grad_norm": 0.3952795988985536, "learning_rate": 5.387506531577523e-06, "loss": 0.2818, "step": 20235 }, { "epoch": 4.720149253731344, "grad_norm": 0.4097339875261114, "learning_rate": 5.384306548596178e-06, "loss": 0.276, "step": 20240 }, { "epoch": 4.721315298507463, "grad_norm": 0.4143551337285383, "learning_rate": 5.3811197191812296e-06, "loss": 0.2694, "step": 20245 }, { "epoch": 4.722481343283582, "grad_norm": 0.3925335566603688, "learning_rate": 5.377946045228084e-06, "loss": 0.2679, "step": 20250 }, { "epoch": 4.723647388059701, "grad_norm": 0.3979737513923886, "learning_rate": 5.374785528624317e-06, "loss": 0.2969, "step": 20255 }, { "epoch": 4.724813432835821, "grad_norm": 0.4122181264677655, "learning_rate": 5.37163817124969e-06, "loss": 0.273, "step": 20260 }, { "epoch": 4.72597947761194, "grad_norm": 0.4062442845332021, "learning_rate": 5.368503974976122e-06, "loss": 0.2729, "step": 20265 }, { "epoch": 4.72714552238806, "grad_norm": 0.4388084829839234, "learning_rate": 5.36538294166772e-06, "loss": 0.284, "step": 20270 }, { "epoch": 4.728311567164179, "grad_norm": 0.4110496964471569, "learning_rate": 5.362275073180749e-06, "loss": 0.2791, "step": 20275 }, { "epoch": 4.729477611940299, "grad_norm": 0.42563170081055096, "learning_rate": 5.3591803713636545e-06, "loss": 0.2747, "step": 20280 }, { "epoch": 4.730643656716418, "grad_norm": 0.4011040672858432, "learning_rate": 5.3560988380570405e-06, "loss": 0.2828, "step": 20285 }, { "epoch": 4.731809701492537, "grad_norm": 0.38093855338310534, "learning_rate": 5.353030475093694e-06, "loss": 0.277, "step": 20290 }, { "epoch": 4.732975746268656, "grad_norm": 0.3856859545089293, "learning_rate": 5.349975284298552e-06, "loss": 0.2767, "step": 20295 }, { "epoch": 4.7341417910447765, "grad_norm": 0.4069441410447371, "learning_rate": 5.346933267488726e-06, "loss": 0.2815, "step": 20300 }, { "epoch": 4.735307835820896, "grad_norm": 0.41705152172515086, "learning_rate": 5.343904426473493e-06, "loss": 0.2692, "step": 20305 }, { "epoch": 4.736473880597015, "grad_norm": 0.40305160376437654, "learning_rate": 5.340888763054291e-06, "loss": 0.2832, "step": 20310 }, { "epoch": 4.737639925373134, "grad_norm": 0.3821681930196034, "learning_rate": 5.337886279024722e-06, "loss": 0.2747, "step": 20315 }, { "epoch": 4.7388059701492535, "grad_norm": 0.3773940462874887, "learning_rate": 5.3348969761705446e-06, "loss": 0.2735, "step": 20320 }, { "epoch": 4.739972014925373, "grad_norm": 0.41765947000111353, "learning_rate": 5.331920856269686e-06, "loss": 0.2666, "step": 20325 }, { "epoch": 4.741138059701493, "grad_norm": 0.392884893447926, "learning_rate": 5.328957921092224e-06, "loss": 0.277, "step": 20330 }, { "epoch": 4.742304104477612, "grad_norm": 0.426386450404409, "learning_rate": 5.326008172400402e-06, "loss": 0.2887, "step": 20335 }, { "epoch": 4.7434701492537314, "grad_norm": 0.4057923794315817, "learning_rate": 5.323071611948619e-06, "loss": 0.2664, "step": 20340 }, { "epoch": 4.744636194029851, "grad_norm": 0.399774726871975, "learning_rate": 5.320148241483422e-06, "loss": 0.2701, "step": 20345 }, { "epoch": 4.74580223880597, "grad_norm": 0.4391061103101001, "learning_rate": 5.317238062743527e-06, "loss": 0.2925, "step": 20350 }, { "epoch": 4.746968283582089, "grad_norm": 0.3843696557078419, "learning_rate": 5.31434107745979e-06, "loss": 0.2648, "step": 20355 }, { "epoch": 4.7481343283582085, "grad_norm": 0.38943130902787915, "learning_rate": 5.311457287355232e-06, "loss": 0.2735, "step": 20360 }, { "epoch": 4.749300373134329, "grad_norm": 0.39352769937236864, "learning_rate": 5.3085866941450185e-06, "loss": 0.262, "step": 20365 }, { "epoch": 4.750466417910448, "grad_norm": 0.3997169457448534, "learning_rate": 5.3057292995364695e-06, "loss": 0.2893, "step": 20370 }, { "epoch": 4.751632462686567, "grad_norm": 0.4270993888536849, "learning_rate": 5.302885105229052e-06, "loss": 0.2727, "step": 20375 }, { "epoch": 4.752798507462686, "grad_norm": 0.41379495154201595, "learning_rate": 5.300054112914385e-06, "loss": 0.2768, "step": 20380 }, { "epoch": 4.753964552238806, "grad_norm": 0.42526662832515993, "learning_rate": 5.297236324276231e-06, "loss": 0.2741, "step": 20385 }, { "epoch": 4.755130597014926, "grad_norm": 0.4205437580875176, "learning_rate": 5.294431740990509e-06, "loss": 0.2599, "step": 20390 }, { "epoch": 4.756296641791045, "grad_norm": 0.42970850645324027, "learning_rate": 5.291640364725272e-06, "loss": 0.2776, "step": 20395 }, { "epoch": 4.757462686567164, "grad_norm": 0.42266879394528883, "learning_rate": 5.288862197140726e-06, "loss": 0.2771, "step": 20400 }, { "epoch": 4.758628731343284, "grad_norm": 0.4144657388495723, "learning_rate": 5.286097239889219e-06, "loss": 0.2891, "step": 20405 }, { "epoch": 4.759794776119403, "grad_norm": 0.40661242470869285, "learning_rate": 5.283345494615238e-06, "loss": 0.2723, "step": 20410 }, { "epoch": 4.760960820895522, "grad_norm": 0.4191269034167923, "learning_rate": 5.280606962955423e-06, "loss": 0.2771, "step": 20415 }, { "epoch": 4.762126865671641, "grad_norm": 0.4030049130565329, "learning_rate": 5.277881646538537e-06, "loss": 0.281, "step": 20420 }, { "epoch": 4.7632929104477615, "grad_norm": 0.3973846618368994, "learning_rate": 5.275169546985502e-06, "loss": 0.2748, "step": 20425 }, { "epoch": 4.764458955223881, "grad_norm": 0.4012487763683455, "learning_rate": 5.272470665909368e-06, "loss": 0.2681, "step": 20430 }, { "epoch": 4.765625, "grad_norm": 0.3917302257351195, "learning_rate": 5.269785004915328e-06, "loss": 0.2738, "step": 20435 }, { "epoch": 4.766791044776119, "grad_norm": 0.4015905245652249, "learning_rate": 5.267112565600707e-06, "loss": 0.2827, "step": 20440 }, { "epoch": 4.7679570895522385, "grad_norm": 0.41158626768188383, "learning_rate": 5.26445334955497e-06, "loss": 0.2666, "step": 20445 }, { "epoch": 4.769123134328359, "grad_norm": 0.4153865577892089, "learning_rate": 5.261807358359719e-06, "loss": 0.2785, "step": 20450 }, { "epoch": 4.770289179104478, "grad_norm": 0.4074937342163048, "learning_rate": 5.259174593588688e-06, "loss": 0.2771, "step": 20455 }, { "epoch": 4.771455223880597, "grad_norm": 0.42260098375275656, "learning_rate": 5.25655505680774e-06, "loss": 0.2706, "step": 20460 }, { "epoch": 4.772621268656716, "grad_norm": 0.4386691660849028, "learning_rate": 5.253948749574879e-06, "loss": 0.2911, "step": 20465 }, { "epoch": 4.773787313432836, "grad_norm": 0.389190161514281, "learning_rate": 5.2513556734402384e-06, "loss": 0.2659, "step": 20470 }, { "epoch": 4.774953358208955, "grad_norm": 0.3967901724050337, "learning_rate": 5.248775829946076e-06, "loss": 0.2783, "step": 20475 }, { "epoch": 4.776119402985074, "grad_norm": 0.397550656014172, "learning_rate": 5.2462092206267864e-06, "loss": 0.2783, "step": 20480 }, { "epoch": 4.777285447761194, "grad_norm": 0.405818189936009, "learning_rate": 5.243655847008888e-06, "loss": 0.2764, "step": 20485 }, { "epoch": 4.778451492537314, "grad_norm": 0.4079539673137217, "learning_rate": 5.241115710611033e-06, "loss": 0.2582, "step": 20490 }, { "epoch": 4.779617537313433, "grad_norm": 0.41731085220924946, "learning_rate": 5.2385888129439934e-06, "loss": 0.2878, "step": 20495 }, { "epoch": 4.780783582089552, "grad_norm": 0.41954206535870575, "learning_rate": 5.236075155510675e-06, "loss": 0.2878, "step": 20500 }, { "epoch": 4.781949626865671, "grad_norm": 0.3800071249584396, "learning_rate": 5.2335747398061e-06, "loss": 0.2631, "step": 20505 }, { "epoch": 4.7831156716417915, "grad_norm": 0.405722668641218, "learning_rate": 5.231087567317425e-06, "loss": 0.2655, "step": 20510 }, { "epoch": 4.784281716417911, "grad_norm": 0.4223186540224272, "learning_rate": 5.228613639523922e-06, "loss": 0.2822, "step": 20515 }, { "epoch": 4.78544776119403, "grad_norm": 0.3747955830740057, "learning_rate": 5.2261529578969905e-06, "loss": 0.2602, "step": 20520 }, { "epoch": 4.786613805970149, "grad_norm": 0.4166857677830499, "learning_rate": 5.223705523900145e-06, "loss": 0.2692, "step": 20525 }, { "epoch": 4.7877798507462686, "grad_norm": 0.39583760675436, "learning_rate": 5.22127133898903e-06, "loss": 0.2763, "step": 20530 }, { "epoch": 4.788945895522388, "grad_norm": 0.38369609698701834, "learning_rate": 5.2188504046114005e-06, "loss": 0.2838, "step": 20535 }, { "epoch": 4.790111940298507, "grad_norm": 0.41067503935590915, "learning_rate": 5.216442722207141e-06, "loss": 0.2719, "step": 20540 }, { "epoch": 4.791277985074627, "grad_norm": 0.4185264107269963, "learning_rate": 5.214048293208246e-06, "loss": 0.2776, "step": 20545 }, { "epoch": 4.7924440298507465, "grad_norm": 0.40455556927216235, "learning_rate": 5.211667119038829e-06, "loss": 0.2892, "step": 20550 }, { "epoch": 4.793610074626866, "grad_norm": 0.38896140722711875, "learning_rate": 5.209299201115125e-06, "loss": 0.2715, "step": 20555 }, { "epoch": 4.794776119402985, "grad_norm": 0.39473402626185594, "learning_rate": 5.206944540845476e-06, "loss": 0.2725, "step": 20560 }, { "epoch": 4.795942164179104, "grad_norm": 0.4181201762247158, "learning_rate": 5.204603139630345e-06, "loss": 0.2802, "step": 20565 }, { "epoch": 4.7971082089552235, "grad_norm": 0.40088658611859335, "learning_rate": 5.202274998862312e-06, "loss": 0.2852, "step": 20570 }, { "epoch": 4.798274253731344, "grad_norm": 0.42271892088067387, "learning_rate": 5.199960119926059e-06, "loss": 0.2796, "step": 20575 }, { "epoch": 4.799440298507463, "grad_norm": 0.41903168028385773, "learning_rate": 5.197658504198392e-06, "loss": 0.2808, "step": 20580 }, { "epoch": 4.800606343283582, "grad_norm": 0.38513278786793087, "learning_rate": 5.1953701530482215e-06, "loss": 0.2755, "step": 20585 }, { "epoch": 4.801772388059701, "grad_norm": 0.4143591996979099, "learning_rate": 5.1930950678365715e-06, "loss": 0.2923, "step": 20590 }, { "epoch": 4.802938432835821, "grad_norm": 0.4141765979120398, "learning_rate": 5.190833249916577e-06, "loss": 0.2837, "step": 20595 }, { "epoch": 4.80410447761194, "grad_norm": 0.4111899582436221, "learning_rate": 5.188584700633478e-06, "loss": 0.2632, "step": 20600 }, { "epoch": 4.80527052238806, "grad_norm": 0.40661213239556154, "learning_rate": 5.186349421324627e-06, "loss": 0.273, "step": 20605 }, { "epoch": 4.806436567164179, "grad_norm": 0.4241697822125163, "learning_rate": 5.184127413319482e-06, "loss": 0.2669, "step": 20610 }, { "epoch": 4.807602611940299, "grad_norm": 0.3873464057053273, "learning_rate": 5.181918677939608e-06, "loss": 0.2617, "step": 20615 }, { "epoch": 4.808768656716418, "grad_norm": 0.39057400514464946, "learning_rate": 5.179723216498677e-06, "loss": 0.2688, "step": 20620 }, { "epoch": 4.809934701492537, "grad_norm": 0.4436534986256238, "learning_rate": 5.177541030302462e-06, "loss": 0.2889, "step": 20625 }, { "epoch": 4.811100746268656, "grad_norm": 0.39557667111546235, "learning_rate": 5.17537212064885e-06, "loss": 0.2797, "step": 20630 }, { "epoch": 4.8122667910447765, "grad_norm": 0.4025533900646421, "learning_rate": 5.173216488827822e-06, "loss": 0.2665, "step": 20635 }, { "epoch": 4.813432835820896, "grad_norm": 0.4099993726819529, "learning_rate": 5.171074136121461e-06, "loss": 0.27, "step": 20640 }, { "epoch": 4.814598880597015, "grad_norm": 0.4017661316263101, "learning_rate": 5.168945063803962e-06, "loss": 0.2768, "step": 20645 }, { "epoch": 4.815764925373134, "grad_norm": 0.44565005631447957, "learning_rate": 5.166829273141612e-06, "loss": 0.284, "step": 20650 }, { "epoch": 4.8169309701492535, "grad_norm": 0.37438785755230786, "learning_rate": 5.164726765392805e-06, "loss": 0.265, "step": 20655 }, { "epoch": 4.818097014925373, "grad_norm": 0.4614441214998527, "learning_rate": 5.162637541808031e-06, "loss": 0.2807, "step": 20660 }, { "epoch": 4.819263059701493, "grad_norm": 0.4175894918737853, "learning_rate": 5.16056160362988e-06, "loss": 0.2826, "step": 20665 }, { "epoch": 4.820429104477612, "grad_norm": 0.4080039712506842, "learning_rate": 5.158498952093038e-06, "loss": 0.2728, "step": 20670 }, { "epoch": 4.8215951492537314, "grad_norm": 0.40019707337954497, "learning_rate": 5.156449588424295e-06, "loss": 0.28, "step": 20675 }, { "epoch": 4.822761194029851, "grad_norm": 0.3877080707890139, "learning_rate": 5.154413513842533e-06, "loss": 0.258, "step": 20680 }, { "epoch": 4.82392723880597, "grad_norm": 0.4002480845719403, "learning_rate": 5.152390729558727e-06, "loss": 0.2751, "step": 20685 }, { "epoch": 4.825093283582089, "grad_norm": 0.40572437026470765, "learning_rate": 5.1503812367759575e-06, "loss": 0.2758, "step": 20690 }, { "epoch": 4.8262593283582085, "grad_norm": 0.4166201389331063, "learning_rate": 5.148385036689391e-06, "loss": 0.2856, "step": 20695 }, { "epoch": 4.827425373134329, "grad_norm": 0.42408475063675277, "learning_rate": 5.146402130486288e-06, "loss": 0.2779, "step": 20700 }, { "epoch": 4.828591417910448, "grad_norm": 0.40101898017000503, "learning_rate": 5.144432519346011e-06, "loss": 0.2742, "step": 20705 }, { "epoch": 4.829757462686567, "grad_norm": 0.3928200240336515, "learning_rate": 5.142476204440002e-06, "loss": 0.2715, "step": 20710 }, { "epoch": 4.830923507462686, "grad_norm": 0.4181371835163029, "learning_rate": 5.140533186931809e-06, "loss": 0.2856, "step": 20715 }, { "epoch": 4.832089552238806, "grad_norm": 0.4010293431365239, "learning_rate": 5.138603467977062e-06, "loss": 0.2756, "step": 20720 }, { "epoch": 4.833255597014926, "grad_norm": 0.411341035638822, "learning_rate": 5.136687048723483e-06, "loss": 0.2804, "step": 20725 }, { "epoch": 4.834421641791045, "grad_norm": 0.4148270501291078, "learning_rate": 5.134783930310883e-06, "loss": 0.2925, "step": 20730 }, { "epoch": 4.835587686567164, "grad_norm": 0.4198749943133584, "learning_rate": 5.132894113871167e-06, "loss": 0.2797, "step": 20735 }, { "epoch": 4.836753731343284, "grad_norm": 0.38647493430190527, "learning_rate": 5.131017600528324e-06, "loss": 0.2659, "step": 20740 }, { "epoch": 4.837919776119403, "grad_norm": 0.4087886980424793, "learning_rate": 5.129154391398433e-06, "loss": 0.2764, "step": 20745 }, { "epoch": 4.839085820895522, "grad_norm": 0.42352821566942966, "learning_rate": 5.127304487589658e-06, "loss": 0.2836, "step": 20750 }, { "epoch": 4.840251865671641, "grad_norm": 0.4064733010483587, "learning_rate": 5.12546789020225e-06, "loss": 0.2763, "step": 20755 }, { "epoch": 4.8414179104477615, "grad_norm": 0.4057512468643699, "learning_rate": 5.123644600328549e-06, "loss": 0.2799, "step": 20760 }, { "epoch": 4.842583955223881, "grad_norm": 0.41425201830861147, "learning_rate": 5.121834619052979e-06, "loss": 0.2764, "step": 20765 }, { "epoch": 4.84375, "grad_norm": 0.40693348888896497, "learning_rate": 5.120037947452043e-06, "loss": 0.2793, "step": 20770 }, { "epoch": 4.844916044776119, "grad_norm": 0.37555379991166826, "learning_rate": 5.118254586594335e-06, "loss": 0.2669, "step": 20775 }, { "epoch": 4.8460820895522385, "grad_norm": 0.3770990206275446, "learning_rate": 5.116484537540532e-06, "loss": 0.2713, "step": 20780 }, { "epoch": 4.847248134328359, "grad_norm": 0.40450907830436417, "learning_rate": 5.114727801343385e-06, "loss": 0.2729, "step": 20785 }, { "epoch": 4.848414179104478, "grad_norm": 0.4169318326002382, "learning_rate": 5.11298437904774e-06, "loss": 0.2838, "step": 20790 }, { "epoch": 4.849580223880597, "grad_norm": 0.42606791499071295, "learning_rate": 5.111254271690516e-06, "loss": 0.2816, "step": 20795 }, { "epoch": 4.850746268656716, "grad_norm": 0.40524229727924804, "learning_rate": 5.1095374803007115e-06, "loss": 0.2656, "step": 20800 }, { "epoch": 4.851912313432836, "grad_norm": 0.4204196037155859, "learning_rate": 5.107834005899409e-06, "loss": 0.284, "step": 20805 }, { "epoch": 4.853078358208955, "grad_norm": 0.4074731828219992, "learning_rate": 5.1061438494997726e-06, "loss": 0.2719, "step": 20810 }, { "epoch": 4.854244402985074, "grad_norm": 0.41820534589092, "learning_rate": 5.104467012107041e-06, "loss": 0.2826, "step": 20815 }, { "epoch": 4.855410447761194, "grad_norm": 0.4385629601178043, "learning_rate": 5.102803494718532e-06, "loss": 0.2795, "step": 20820 }, { "epoch": 4.856576492537314, "grad_norm": 0.4117020983487212, "learning_rate": 5.101153298323643e-06, "loss": 0.2795, "step": 20825 }, { "epoch": 4.857742537313433, "grad_norm": 0.40007321945644764, "learning_rate": 5.099516423903844e-06, "loss": 0.2743, "step": 20830 }, { "epoch": 4.858908582089552, "grad_norm": 0.37531894798379556, "learning_rate": 5.097892872432691e-06, "loss": 0.2582, "step": 20835 }, { "epoch": 4.860074626865671, "grad_norm": 0.3967798096406278, "learning_rate": 5.096282644875807e-06, "loss": 0.2684, "step": 20840 }, { "epoch": 4.8612406716417915, "grad_norm": 0.4359728323670395, "learning_rate": 5.094685742190896e-06, "loss": 0.271, "step": 20845 }, { "epoch": 4.862406716417911, "grad_norm": 0.40289826215219937, "learning_rate": 5.093102165327729e-06, "loss": 0.2769, "step": 20850 }, { "epoch": 4.86357276119403, "grad_norm": 0.39137328077031774, "learning_rate": 5.09153191522816e-06, "loss": 0.27, "step": 20855 }, { "epoch": 4.864738805970149, "grad_norm": 0.40514599405796825, "learning_rate": 5.089974992826117e-06, "loss": 0.2681, "step": 20860 }, { "epoch": 4.8659048507462686, "grad_norm": 0.4011535322183252, "learning_rate": 5.08843139904759e-06, "loss": 0.2762, "step": 20865 }, { "epoch": 4.867070895522388, "grad_norm": 0.3857881219293404, "learning_rate": 5.086901134810658e-06, "loss": 0.2683, "step": 20870 }, { "epoch": 4.868236940298507, "grad_norm": 0.39294086703585596, "learning_rate": 5.085384201025457e-06, "loss": 0.275, "step": 20875 }, { "epoch": 4.869402985074627, "grad_norm": 0.4397434877985555, "learning_rate": 5.083880598594204e-06, "loss": 0.2882, "step": 20880 }, { "epoch": 4.8705690298507465, "grad_norm": 0.39389140326985644, "learning_rate": 5.082390328411184e-06, "loss": 0.2745, "step": 20885 }, { "epoch": 4.871735074626866, "grad_norm": 0.39485809597519445, "learning_rate": 5.080913391362749e-06, "loss": 0.2759, "step": 20890 }, { "epoch": 4.872901119402985, "grad_norm": 0.40063280239618126, "learning_rate": 5.079449788327332e-06, "loss": 0.2851, "step": 20895 }, { "epoch": 4.874067164179104, "grad_norm": 0.3827772878754662, "learning_rate": 5.0779995201754225e-06, "loss": 0.2723, "step": 20900 }, { "epoch": 4.8752332089552235, "grad_norm": 0.43225014747643237, "learning_rate": 5.076562587769584e-06, "loss": 0.2862, "step": 20905 }, { "epoch": 4.876399253731344, "grad_norm": 0.41449429723546716, "learning_rate": 5.07513899196445e-06, "loss": 0.2693, "step": 20910 }, { "epoch": 4.877565298507463, "grad_norm": 0.39608998902119325, "learning_rate": 5.073728733606722e-06, "loss": 0.2722, "step": 20915 }, { "epoch": 4.878731343283582, "grad_norm": 0.4016620610164004, "learning_rate": 5.072331813535166e-06, "loss": 0.266, "step": 20920 }, { "epoch": 4.879897388059701, "grad_norm": 0.4151739389003867, "learning_rate": 5.070948232580618e-06, "loss": 0.2728, "step": 20925 }, { "epoch": 4.881063432835821, "grad_norm": 0.39938321055353626, "learning_rate": 5.069577991565977e-06, "loss": 0.2626, "step": 20930 }, { "epoch": 4.88222947761194, "grad_norm": 0.4259708662374472, "learning_rate": 5.06822109130621e-06, "loss": 0.2882, "step": 20935 }, { "epoch": 4.88339552238806, "grad_norm": 0.3950929612813323, "learning_rate": 5.066877532608349e-06, "loss": 0.2745, "step": 20940 }, { "epoch": 4.884561567164179, "grad_norm": 0.44903327030586204, "learning_rate": 5.065547316271494e-06, "loss": 0.2878, "step": 20945 }, { "epoch": 4.885727611940299, "grad_norm": 0.4123597349979514, "learning_rate": 5.064230443086805e-06, "loss": 0.2657, "step": 20950 }, { "epoch": 4.886893656716418, "grad_norm": 0.39160944428542344, "learning_rate": 5.062926913837507e-06, "loss": 0.2764, "step": 20955 }, { "epoch": 4.888059701492537, "grad_norm": 0.44649339030687424, "learning_rate": 5.06163672929889e-06, "loss": 0.27, "step": 20960 }, { "epoch": 4.889225746268656, "grad_norm": 0.40361681778558994, "learning_rate": 5.060359890238305e-06, "loss": 0.2793, "step": 20965 }, { "epoch": 4.8903917910447765, "grad_norm": 0.4199921652860125, "learning_rate": 5.059096397415167e-06, "loss": 0.281, "step": 20970 }, { "epoch": 4.891557835820896, "grad_norm": 0.3907780228757323, "learning_rate": 5.057846251580957e-06, "loss": 0.2673, "step": 20975 }, { "epoch": 4.892723880597015, "grad_norm": 0.4156772282704941, "learning_rate": 5.056609453479208e-06, "loss": 0.2788, "step": 20980 }, { "epoch": 4.893889925373134, "grad_norm": 0.39500932422262897, "learning_rate": 5.055386003845524e-06, "loss": 0.2743, "step": 20985 }, { "epoch": 4.8950559701492535, "grad_norm": 0.40733193078774504, "learning_rate": 5.0541759034075645e-06, "loss": 0.2771, "step": 20990 }, { "epoch": 4.896222014925373, "grad_norm": 0.39320460492035453, "learning_rate": 5.0529791528850515e-06, "loss": 0.2816, "step": 20995 }, { "epoch": 4.897388059701493, "grad_norm": 0.3834414057678221, "learning_rate": 5.051795752989764e-06, "loss": 0.2659, "step": 21000 }, { "epoch": 4.898554104477612, "grad_norm": 0.40682123041297497, "learning_rate": 5.050625704425547e-06, "loss": 0.2771, "step": 21005 }, { "epoch": 4.8997201492537314, "grad_norm": 0.3873899334787755, "learning_rate": 5.049469007888298e-06, "loss": 0.2604, "step": 21010 }, { "epoch": 4.900886194029851, "grad_norm": 0.4091484298868379, "learning_rate": 5.048325664065975e-06, "loss": 0.2844, "step": 21015 }, { "epoch": 4.90205223880597, "grad_norm": 0.42939427693103144, "learning_rate": 5.047195673638596e-06, "loss": 0.2894, "step": 21020 }, { "epoch": 4.903218283582089, "grad_norm": 0.41019724653897144, "learning_rate": 5.046079037278237e-06, "loss": 0.2652, "step": 21025 }, { "epoch": 4.9043843283582085, "grad_norm": 0.3975076882210794, "learning_rate": 5.044975755649028e-06, "loss": 0.2738, "step": 21030 }, { "epoch": 4.905550373134329, "grad_norm": 0.40728381591915686, "learning_rate": 5.043885829407164e-06, "loss": 0.2841, "step": 21035 }, { "epoch": 4.906716417910448, "grad_norm": 0.41166680066812117, "learning_rate": 5.042809259200885e-06, "loss": 0.272, "step": 21040 }, { "epoch": 4.907882462686567, "grad_norm": 0.42353064926068584, "learning_rate": 5.041746045670495e-06, "loss": 0.2811, "step": 21045 }, { "epoch": 4.909048507462686, "grad_norm": 0.4080630065092933, "learning_rate": 5.040696189448356e-06, "loss": 0.2732, "step": 21050 }, { "epoch": 4.910214552238806, "grad_norm": 0.40443150645082443, "learning_rate": 5.039659691158878e-06, "loss": 0.2628, "step": 21055 }, { "epoch": 4.911380597014926, "grad_norm": 0.3962804134326119, "learning_rate": 5.038636551418533e-06, "loss": 0.271, "step": 21060 }, { "epoch": 4.912546641791045, "grad_norm": 0.4361127930648337, "learning_rate": 5.0376267708358455e-06, "loss": 0.2826, "step": 21065 }, { "epoch": 4.913712686567164, "grad_norm": 0.4042774117247444, "learning_rate": 5.036630350011395e-06, "loss": 0.2827, "step": 21070 }, { "epoch": 4.914878731343284, "grad_norm": 0.3954326308091401, "learning_rate": 5.03564728953781e-06, "loss": 0.282, "step": 21075 }, { "epoch": 4.916044776119403, "grad_norm": 0.41373453186212483, "learning_rate": 5.034677589999783e-06, "loss": 0.2768, "step": 21080 }, { "epoch": 4.917210820895522, "grad_norm": 0.4055191287627898, "learning_rate": 5.033721251974047e-06, "loss": 0.266, "step": 21085 }, { "epoch": 4.918376865671641, "grad_norm": 0.4484508716939671, "learning_rate": 5.032778276029403e-06, "loss": 0.272, "step": 21090 }, { "epoch": 4.9195429104477615, "grad_norm": 0.4157892756366168, "learning_rate": 5.031848662726692e-06, "loss": 0.2593, "step": 21095 }, { "epoch": 4.920708955223881, "grad_norm": 0.4107094443613056, "learning_rate": 5.030932412618815e-06, "loss": 0.2769, "step": 21100 }, { "epoch": 4.921875, "grad_norm": 0.42663644972197884, "learning_rate": 5.030029526250719e-06, "loss": 0.2715, "step": 21105 }, { "epoch": 4.923041044776119, "grad_norm": 0.4256142655857328, "learning_rate": 5.029140004159409e-06, "loss": 0.2791, "step": 21110 }, { "epoch": 4.9242070895522385, "grad_norm": 0.4328807366078749, "learning_rate": 5.028263846873938e-06, "loss": 0.2797, "step": 21115 }, { "epoch": 4.925373134328359, "grad_norm": 0.42725135225287103, "learning_rate": 5.02740105491541e-06, "loss": 0.2702, "step": 21120 }, { "epoch": 4.926539179104478, "grad_norm": 0.40976720684938533, "learning_rate": 5.026551628796982e-06, "loss": 0.2933, "step": 21125 }, { "epoch": 4.927705223880597, "grad_norm": 0.3838294277414686, "learning_rate": 5.025715569023859e-06, "loss": 0.2745, "step": 21130 }, { "epoch": 4.928871268656716, "grad_norm": 0.40850157188644237, "learning_rate": 5.024892876093299e-06, "loss": 0.2697, "step": 21135 }, { "epoch": 4.930037313432836, "grad_norm": 0.41855334152521473, "learning_rate": 5.024083550494606e-06, "loss": 0.2777, "step": 21140 }, { "epoch": 4.931203358208955, "grad_norm": 0.40228325958343863, "learning_rate": 5.023287592709136e-06, "loss": 0.2701, "step": 21145 }, { "epoch": 4.932369402985074, "grad_norm": 0.38806415061591565, "learning_rate": 5.0225050032102965e-06, "loss": 0.2734, "step": 21150 }, { "epoch": 4.933535447761194, "grad_norm": 0.4301196796501381, "learning_rate": 5.021735782463537e-06, "loss": 0.2765, "step": 21155 }, { "epoch": 4.934701492537314, "grad_norm": 0.4125398239917131, "learning_rate": 5.020979930926365e-06, "loss": 0.2778, "step": 21160 }, { "epoch": 4.935867537313433, "grad_norm": 0.3839116135692184, "learning_rate": 5.020237449048333e-06, "loss": 0.2711, "step": 21165 }, { "epoch": 4.937033582089552, "grad_norm": 0.4031583385870646, "learning_rate": 5.0195083372710345e-06, "loss": 0.2804, "step": 21170 }, { "epoch": 4.938199626865671, "grad_norm": 0.3808448155322725, "learning_rate": 5.018792596028123e-06, "loss": 0.2688, "step": 21175 }, { "epoch": 4.9393656716417915, "grad_norm": 0.42703062283967935, "learning_rate": 5.018090225745291e-06, "loss": 0.2714, "step": 21180 }, { "epoch": 4.940531716417911, "grad_norm": 0.41642469146543964, "learning_rate": 5.017401226840284e-06, "loss": 0.2886, "step": 21185 }, { "epoch": 4.94169776119403, "grad_norm": 0.40773956431401676, "learning_rate": 5.016725599722889e-06, "loss": 0.2887, "step": 21190 }, { "epoch": 4.942863805970149, "grad_norm": 0.3887940281318315, "learning_rate": 5.016063344794947e-06, "loss": 0.2744, "step": 21195 }, { "epoch": 4.9440298507462686, "grad_norm": 0.4157645935918462, "learning_rate": 5.0154144624503365e-06, "loss": 0.2725, "step": 21200 }, { "epoch": 4.945195895522388, "grad_norm": 0.4210318173390705, "learning_rate": 5.014778953074992e-06, "loss": 0.2859, "step": 21205 }, { "epoch": 4.946361940298507, "grad_norm": 0.3981207785886553, "learning_rate": 5.014156817046891e-06, "loss": 0.2792, "step": 21210 }, { "epoch": 4.947527985074627, "grad_norm": 0.41941067877749943, "learning_rate": 5.013548054736049e-06, "loss": 0.2833, "step": 21215 }, { "epoch": 4.9486940298507465, "grad_norm": 0.39804302722844526, "learning_rate": 5.012952666504542e-06, "loss": 0.2754, "step": 21220 }, { "epoch": 4.949860074626866, "grad_norm": 0.39807259496106023, "learning_rate": 5.012370652706484e-06, "loss": 0.28, "step": 21225 }, { "epoch": 4.951026119402985, "grad_norm": 0.41755613450166845, "learning_rate": 5.011802013688029e-06, "loss": 0.2866, "step": 21230 }, { "epoch": 4.952192164179104, "grad_norm": 0.42508109516027964, "learning_rate": 5.011246749787385e-06, "loss": 0.2857, "step": 21235 }, { "epoch": 4.9533582089552235, "grad_norm": 0.4049200674752226, "learning_rate": 5.010704861334803e-06, "loss": 0.2645, "step": 21240 }, { "epoch": 4.954524253731344, "grad_norm": 0.40554740307454185, "learning_rate": 5.010176348652576e-06, "loss": 0.2879, "step": 21245 }, { "epoch": 4.955690298507463, "grad_norm": 0.4171370392183362, "learning_rate": 5.0096612120550436e-06, "loss": 0.2748, "step": 21250 }, { "epoch": 4.956856343283582, "grad_norm": 0.4135318586798704, "learning_rate": 5.009159451848587e-06, "loss": 0.276, "step": 21255 }, { "epoch": 4.958022388059701, "grad_norm": 0.41017795039602367, "learning_rate": 5.008671068331634e-06, "loss": 0.2835, "step": 21260 }, { "epoch": 4.959188432835821, "grad_norm": 0.38694873358561843, "learning_rate": 5.00819606179466e-06, "loss": 0.2702, "step": 21265 }, { "epoch": 4.96035447761194, "grad_norm": 0.41101565067261825, "learning_rate": 5.007734432520179e-06, "loss": 0.2918, "step": 21270 }, { "epoch": 4.96152052238806, "grad_norm": 0.40074163029173104, "learning_rate": 5.0072861807827505e-06, "loss": 0.2789, "step": 21275 }, { "epoch": 4.962686567164179, "grad_norm": 0.4234714500967549, "learning_rate": 5.0068513068489765e-06, "loss": 0.2775, "step": 21280 }, { "epoch": 4.963852611940299, "grad_norm": 0.39278240101882295, "learning_rate": 5.0064298109775035e-06, "loss": 0.2665, "step": 21285 }, { "epoch": 4.965018656716418, "grad_norm": 0.40837934561077827, "learning_rate": 5.006021693419021e-06, "loss": 0.2788, "step": 21290 }, { "epoch": 4.966184701492537, "grad_norm": 0.4142150863607586, "learning_rate": 5.0056269544162635e-06, "loss": 0.2732, "step": 21295 }, { "epoch": 4.967350746268656, "grad_norm": 0.40573497340467674, "learning_rate": 5.0052455942040045e-06, "loss": 0.2821, "step": 21300 }, { "epoch": 4.9685167910447765, "grad_norm": 0.39800806054666515, "learning_rate": 5.004877613009064e-06, "loss": 0.2818, "step": 21305 }, { "epoch": 4.969682835820896, "grad_norm": 0.38979452945694254, "learning_rate": 5.0045230110503e-06, "loss": 0.267, "step": 21310 }, { "epoch": 4.970848880597015, "grad_norm": 0.42055164882379603, "learning_rate": 5.00418178853862e-06, "loss": 0.2715, "step": 21315 }, { "epoch": 4.972014925373134, "grad_norm": 0.42732687241285094, "learning_rate": 5.003853945676969e-06, "loss": 0.2833, "step": 21320 }, { "epoch": 4.9731809701492535, "grad_norm": 0.38840733087442564, "learning_rate": 5.0035394826603345e-06, "loss": 0.2809, "step": 21325 }, { "epoch": 4.974347014925373, "grad_norm": 0.41284386720082655, "learning_rate": 5.003238399675746e-06, "loss": 0.2728, "step": 21330 }, { "epoch": 4.975513059701493, "grad_norm": 0.413687332011822, "learning_rate": 5.002950696902278e-06, "loss": 0.2717, "step": 21335 }, { "epoch": 4.976679104477612, "grad_norm": 0.39369282860208743, "learning_rate": 5.002676374511046e-06, "loss": 0.2678, "step": 21340 }, { "epoch": 4.9778451492537314, "grad_norm": 0.410582306177516, "learning_rate": 5.0024154326652044e-06, "loss": 0.2773, "step": 21345 }, { "epoch": 4.979011194029851, "grad_norm": 0.40082441046553735, "learning_rate": 5.002167871519951e-06, "loss": 0.2735, "step": 21350 }, { "epoch": 4.98017723880597, "grad_norm": 0.40048548194936884, "learning_rate": 5.001933691222527e-06, "loss": 0.285, "step": 21355 }, { "epoch": 4.981343283582089, "grad_norm": 0.40933313203727506, "learning_rate": 5.001712891912217e-06, "loss": 0.2828, "step": 21360 }, { "epoch": 4.9825093283582085, "grad_norm": 0.37561592164505403, "learning_rate": 5.001505473720337e-06, "loss": 0.2697, "step": 21365 }, { "epoch": 4.983675373134329, "grad_norm": 0.3940218577106496, "learning_rate": 5.001311436770255e-06, "loss": 0.2884, "step": 21370 }, { "epoch": 4.984841417910448, "grad_norm": 0.40789697316039814, "learning_rate": 5.001130781177377e-06, "loss": 0.2771, "step": 21375 }, { "epoch": 4.986007462686567, "grad_norm": 0.38966392808622125, "learning_rate": 5.000963507049151e-06, "loss": 0.2777, "step": 21380 }, { "epoch": 4.987173507462686, "grad_norm": 0.42082273116940005, "learning_rate": 5.000809614485062e-06, "loss": 0.2759, "step": 21385 }, { "epoch": 4.988339552238806, "grad_norm": 0.39524031881150823, "learning_rate": 5.000669103576643e-06, "loss": 0.2711, "step": 21390 }, { "epoch": 4.989505597014926, "grad_norm": 0.4243407252055143, "learning_rate": 5.000541974407462e-06, "loss": 0.2781, "step": 21395 }, { "epoch": 4.990671641791045, "grad_norm": 0.39442825126110137, "learning_rate": 5.000428227053131e-06, "loss": 0.2769, "step": 21400 }, { "epoch": 4.991837686567164, "grad_norm": 0.39636388485481344, "learning_rate": 5.000327861581302e-06, "loss": 0.2745, "step": 21405 }, { "epoch": 4.993003731343284, "grad_norm": 0.4109817745413501, "learning_rate": 5.000240878051671e-06, "loss": 0.2763, "step": 21410 }, { "epoch": 4.994169776119403, "grad_norm": 0.419159995584321, "learning_rate": 5.0001672765159696e-06, "loss": 0.267, "step": 21415 }, { "epoch": 4.995335820895522, "grad_norm": 0.40579325116029774, "learning_rate": 5.000107057017976e-06, "loss": 0.2907, "step": 21420 }, { "epoch": 4.996501865671641, "grad_norm": 0.4013985907244209, "learning_rate": 5.0000602195935046e-06, "loss": 0.2784, "step": 21425 }, { "epoch": 4.9976679104477615, "grad_norm": 0.40870836269752825, "learning_rate": 5.000026764270413e-06, "loss": 0.2677, "step": 21430 }, { "epoch": 4.998833955223881, "grad_norm": 0.40234064324932556, "learning_rate": 5.0000066910686e-06, "loss": 0.2887, "step": 21435 }, { "epoch": 5.0, "grad_norm": 0.4134369961168365, "learning_rate": 5e-06, "loss": 0.2589, "step": 21440 }, { "epoch": 5.0, "step": 21440, "total_flos": 2439673306546176.0, "train_loss": 0.4061120442879289, "train_runtime": 35572.055, "train_samples_per_second": 4.821, "train_steps_per_second": 0.603 } ], "logging_steps": 5, "max_steps": 21440, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2439673306546176.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }