diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,30058 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 21440, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0011660447761194029, + "grad_norm": 2.31470614163852, + "learning_rate": 2.3320895522388058e-07, + "loss": 0.859, + "step": 5 + }, + { + "epoch": 0.0023320895522388058, + "grad_norm": 2.1806524067205517, + "learning_rate": 4.6641791044776116e-07, + "loss": 0.8409, + "step": 10 + }, + { + "epoch": 0.003498134328358209, + "grad_norm": 2.0336016383542637, + "learning_rate": 6.996268656716418e-07, + "loss": 0.8403, + "step": 15 + }, + { + "epoch": 0.0046641791044776115, + "grad_norm": 1.8107765601305243, + "learning_rate": 9.328358208955223e-07, + "loss": 0.8402, + "step": 20 + }, + { + "epoch": 0.005830223880597015, + "grad_norm": 1.5214305765409384, + "learning_rate": 1.1660447761194032e-06, + "loss": 0.8399, + "step": 25 + }, + { + "epoch": 0.006996268656716418, + "grad_norm": 1.3729288900773917, + "learning_rate": 1.3992537313432837e-06, + "loss": 0.8529, + "step": 30 + }, + { + "epoch": 0.00816231343283582, + "grad_norm": 1.110318324330149, + "learning_rate": 1.6324626865671642e-06, + "loss": 0.8044, + "step": 35 + }, + { + "epoch": 0.009328358208955223, + "grad_norm": 0.91480035139308, + "learning_rate": 1.8656716417910446e-06, + "loss": 0.7846, + "step": 40 + }, + { + "epoch": 0.010494402985074628, + "grad_norm": 0.8043185199854158, + "learning_rate": 2.0988805970149257e-06, + "loss": 0.771, + "step": 45 + }, + { + "epoch": 0.01166044776119403, + "grad_norm": 0.7383123349724773, + "learning_rate": 2.3320895522388064e-06, + "loss": 0.7736, + "step": 50 + }, + { + "epoch": 0.012826492537313433, + "grad_norm": 0.7490548326880867, + "learning_rate": 2.5652985074626867e-06, + "loss": 0.736, + "step": 55 + }, + { + "epoch": 0.013992537313432836, + "grad_norm": 0.6375751886591362, + "learning_rate": 2.7985074626865674e-06, + "loss": 0.7309, + "step": 60 + }, + { + "epoch": 0.01515858208955224, + "grad_norm": 0.6475768724712477, + "learning_rate": 3.031716417910448e-06, + "loss": 0.711, + "step": 65 + }, + { + "epoch": 0.01632462686567164, + "grad_norm": 0.6106272866800404, + "learning_rate": 3.2649253731343283e-06, + "loss": 0.7357, + "step": 70 + }, + { + "epoch": 0.017490671641791043, + "grad_norm": 0.6589391454469965, + "learning_rate": 3.498134328358209e-06, + "loss": 0.7276, + "step": 75 + }, + { + "epoch": 0.018656716417910446, + "grad_norm": 0.6409686727344286, + "learning_rate": 3.7313432835820893e-06, + "loss": 0.6865, + "step": 80 + }, + { + "epoch": 0.019822761194029852, + "grad_norm": 0.6336098527814419, + "learning_rate": 3.96455223880597e-06, + "loss": 0.6994, + "step": 85 + }, + { + "epoch": 0.020988805970149255, + "grad_norm": 0.7346724850292417, + "learning_rate": 4.1977611940298515e-06, + "loss": 0.7019, + "step": 90 + }, + { + "epoch": 0.022154850746268658, + "grad_norm": 0.6583598026893135, + "learning_rate": 4.430970149253732e-06, + "loss": 0.6827, + "step": 95 + }, + { + "epoch": 0.02332089552238806, + "grad_norm": 0.5990590447236036, + "learning_rate": 4.664179104477613e-06, + "loss": 0.6674, + "step": 100 + }, + { + "epoch": 0.024486940298507464, + "grad_norm": 0.5795709703569923, + "learning_rate": 4.897388059701493e-06, + "loss": 0.6599, + "step": 105 + }, + { + "epoch": 0.025652985074626867, + "grad_norm": 0.5892028434107501, + "learning_rate": 5.130597014925373e-06, + "loss": 0.6777, + "step": 110 + }, + { + "epoch": 0.02681902985074627, + "grad_norm": 0.6253265633727113, + "learning_rate": 5.3638059701492545e-06, + "loss": 0.6602, + "step": 115 + }, + { + "epoch": 0.027985074626865673, + "grad_norm": 0.5779782549880188, + "learning_rate": 5.597014925373135e-06, + "loss": 0.6746, + "step": 120 + }, + { + "epoch": 0.029151119402985076, + "grad_norm": 0.5651696911532211, + "learning_rate": 5.830223880597015e-06, + "loss": 0.6584, + "step": 125 + }, + { + "epoch": 0.03031716417910448, + "grad_norm": 0.5548638684876713, + "learning_rate": 6.063432835820896e-06, + "loss": 0.6538, + "step": 130 + }, + { + "epoch": 0.03148320895522388, + "grad_norm": 0.5527878439636197, + "learning_rate": 6.2966417910447755e-06, + "loss": 0.6355, + "step": 135 + }, + { + "epoch": 0.03264925373134328, + "grad_norm": 0.5588523850308228, + "learning_rate": 6.529850746268657e-06, + "loss": 0.6243, + "step": 140 + }, + { + "epoch": 0.033815298507462684, + "grad_norm": 0.5680280651347817, + "learning_rate": 6.763059701492537e-06, + "loss": 0.6406, + "step": 145 + }, + { + "epoch": 0.034981343283582086, + "grad_norm": 0.5680483228113563, + "learning_rate": 6.996268656716418e-06, + "loss": 0.6318, + "step": 150 + }, + { + "epoch": 0.03614738805970149, + "grad_norm": 0.562964090892077, + "learning_rate": 7.229477611940298e-06, + "loss": 0.6278, + "step": 155 + }, + { + "epoch": 0.03731343283582089, + "grad_norm": 0.635553701478694, + "learning_rate": 7.4626865671641785e-06, + "loss": 0.658, + "step": 160 + }, + { + "epoch": 0.038479477611940295, + "grad_norm": 0.6557036509575375, + "learning_rate": 7.69589552238806e-06, + "loss": 0.6405, + "step": 165 + }, + { + "epoch": 0.039645522388059705, + "grad_norm": 0.5867982800908484, + "learning_rate": 7.92910447761194e-06, + "loss": 0.633, + "step": 170 + }, + { + "epoch": 0.04081156716417911, + "grad_norm": 0.6141705031654435, + "learning_rate": 8.162313432835822e-06, + "loss": 0.6229, + "step": 175 + }, + { + "epoch": 0.04197761194029851, + "grad_norm": 0.6248136285598961, + "learning_rate": 8.395522388059703e-06, + "loss": 0.6502, + "step": 180 + }, + { + "epoch": 0.043143656716417914, + "grad_norm": 0.6019477617952123, + "learning_rate": 8.628731343283582e-06, + "loss": 0.6216, + "step": 185 + }, + { + "epoch": 0.044309701492537316, + "grad_norm": 0.6240291503277375, + "learning_rate": 8.861940298507463e-06, + "loss": 0.6232, + "step": 190 + }, + { + "epoch": 0.04547574626865672, + "grad_norm": 0.5871751457850535, + "learning_rate": 9.095149253731345e-06, + "loss": 0.6121, + "step": 195 + }, + { + "epoch": 0.04664179104477612, + "grad_norm": 0.5469566198329419, + "learning_rate": 9.328358208955226e-06, + "loss": 0.6208, + "step": 200 + }, + { + "epoch": 0.047807835820895525, + "grad_norm": 0.620559082451502, + "learning_rate": 9.561567164179105e-06, + "loss": 0.6371, + "step": 205 + }, + { + "epoch": 0.04897388059701493, + "grad_norm": 0.6006349876167401, + "learning_rate": 9.794776119402986e-06, + "loss": 0.6478, + "step": 210 + }, + { + "epoch": 0.05013992537313433, + "grad_norm": 0.5744573514050928, + "learning_rate": 1.0027985074626867e-05, + "loss": 0.6437, + "step": 215 + }, + { + "epoch": 0.051305970149253734, + "grad_norm": 0.6132138150827955, + "learning_rate": 1.0261194029850747e-05, + "loss": 0.6296, + "step": 220 + }, + { + "epoch": 0.05247201492537314, + "grad_norm": 0.6550090348688805, + "learning_rate": 1.0494402985074628e-05, + "loss": 0.6359, + "step": 225 + }, + { + "epoch": 0.05363805970149254, + "grad_norm": 0.6070565138983731, + "learning_rate": 1.0727611940298509e-05, + "loss": 0.636, + "step": 230 + }, + { + "epoch": 0.05480410447761194, + "grad_norm": 0.612999711828607, + "learning_rate": 1.0960820895522388e-05, + "loss": 0.6391, + "step": 235 + }, + { + "epoch": 0.055970149253731345, + "grad_norm": 0.636063155653856, + "learning_rate": 1.119402985074627e-05, + "loss": 0.6206, + "step": 240 + }, + { + "epoch": 0.05713619402985075, + "grad_norm": 0.5896975464405161, + "learning_rate": 1.142723880597015e-05, + "loss": 0.6052, + "step": 245 + }, + { + "epoch": 0.05830223880597015, + "grad_norm": 0.5590212693105078, + "learning_rate": 1.166044776119403e-05, + "loss": 0.62, + "step": 250 + }, + { + "epoch": 0.059468283582089554, + "grad_norm": 0.6286006057858667, + "learning_rate": 1.1893656716417911e-05, + "loss": 0.6121, + "step": 255 + }, + { + "epoch": 0.06063432835820896, + "grad_norm": 0.6319032629223692, + "learning_rate": 1.2126865671641792e-05, + "loss": 0.6111, + "step": 260 + }, + { + "epoch": 0.06180037313432836, + "grad_norm": 0.6227274913468611, + "learning_rate": 1.2360074626865673e-05, + "loss": 0.667, + "step": 265 + }, + { + "epoch": 0.06296641791044776, + "grad_norm": 0.6291796320514405, + "learning_rate": 1.2593283582089551e-05, + "loss": 0.645, + "step": 270 + }, + { + "epoch": 0.06413246268656717, + "grad_norm": 0.5883497046558631, + "learning_rate": 1.2826492537313434e-05, + "loss": 0.6205, + "step": 275 + }, + { + "epoch": 0.06529850746268656, + "grad_norm": 0.6117993081700775, + "learning_rate": 1.3059701492537313e-05, + "loss": 0.633, + "step": 280 + }, + { + "epoch": 0.06646455223880597, + "grad_norm": 0.6115079820522081, + "learning_rate": 1.3292910447761194e-05, + "loss": 0.6186, + "step": 285 + }, + { + "epoch": 0.06763059701492537, + "grad_norm": 0.6264704135890352, + "learning_rate": 1.3526119402985074e-05, + "loss": 0.6043, + "step": 290 + }, + { + "epoch": 0.06879664179104478, + "grad_norm": 0.6143832560745185, + "learning_rate": 1.3759328358208957e-05, + "loss": 0.6327, + "step": 295 + }, + { + "epoch": 0.06996268656716417, + "grad_norm": 0.6263813560874937, + "learning_rate": 1.3992537313432836e-05, + "loss": 0.6209, + "step": 300 + }, + { + "epoch": 0.07112873134328358, + "grad_norm": 0.5709188077991506, + "learning_rate": 1.4225746268656717e-05, + "loss": 0.6292, + "step": 305 + }, + { + "epoch": 0.07229477611940298, + "grad_norm": 0.6034458566222612, + "learning_rate": 1.4458955223880596e-05, + "loss": 0.5954, + "step": 310 + }, + { + "epoch": 0.07346082089552239, + "grad_norm": 0.6105447525677008, + "learning_rate": 1.4692164179104478e-05, + "loss": 0.6142, + "step": 315 + }, + { + "epoch": 0.07462686567164178, + "grad_norm": 0.6082492860147549, + "learning_rate": 1.4925373134328357e-05, + "loss": 0.6181, + "step": 320 + }, + { + "epoch": 0.0757929104477612, + "grad_norm": 0.6127880023341611, + "learning_rate": 1.515858208955224e-05, + "loss": 0.6181, + "step": 325 + }, + { + "epoch": 0.07695895522388059, + "grad_norm": 0.5851972780654783, + "learning_rate": 1.539179104477612e-05, + "loss": 0.6136, + "step": 330 + }, + { + "epoch": 0.078125, + "grad_norm": 0.6325445573033428, + "learning_rate": 1.5625e-05, + "loss": 0.5949, + "step": 335 + }, + { + "epoch": 0.07929104477611941, + "grad_norm": 0.6544545895796555, + "learning_rate": 1.585820895522388e-05, + "loss": 0.5996, + "step": 340 + }, + { + "epoch": 0.0804570895522388, + "grad_norm": 0.6136147810235407, + "learning_rate": 1.6091417910447763e-05, + "loss": 0.5878, + "step": 345 + }, + { + "epoch": 0.08162313432835822, + "grad_norm": 0.6292253029881693, + "learning_rate": 1.6324626865671644e-05, + "loss": 0.5868, + "step": 350 + }, + { + "epoch": 0.08278917910447761, + "grad_norm": 0.6639712455230693, + "learning_rate": 1.6557835820895525e-05, + "loss": 0.6326, + "step": 355 + }, + { + "epoch": 0.08395522388059702, + "grad_norm": 0.6149576522483596, + "learning_rate": 1.6791044776119406e-05, + "loss": 0.5785, + "step": 360 + }, + { + "epoch": 0.08512126865671642, + "grad_norm": 0.6157016571844574, + "learning_rate": 1.7024253731343284e-05, + "loss": 0.5942, + "step": 365 + }, + { + "epoch": 0.08628731343283583, + "grad_norm": 0.5731436313839582, + "learning_rate": 1.7257462686567165e-05, + "loss": 0.6061, + "step": 370 + }, + { + "epoch": 0.08745335820895522, + "grad_norm": 0.572969266685798, + "learning_rate": 1.7490671641791046e-05, + "loss": 0.5917, + "step": 375 + }, + { + "epoch": 0.08861940298507463, + "grad_norm": 0.5474810292542341, + "learning_rate": 1.7723880597014927e-05, + "loss": 0.6018, + "step": 380 + }, + { + "epoch": 0.08978544776119403, + "grad_norm": 0.580804803981787, + "learning_rate": 1.7957089552238808e-05, + "loss": 0.6128, + "step": 385 + }, + { + "epoch": 0.09095149253731344, + "grad_norm": 0.6481861841112424, + "learning_rate": 1.819029850746269e-05, + "loss": 0.6279, + "step": 390 + }, + { + "epoch": 0.09211753731343283, + "grad_norm": 0.5838423347872619, + "learning_rate": 1.8423507462686567e-05, + "loss": 0.6042, + "step": 395 + }, + { + "epoch": 0.09328358208955224, + "grad_norm": 0.6455198727922007, + "learning_rate": 1.865671641791045e-05, + "loss": 0.6024, + "step": 400 + }, + { + "epoch": 0.09444962686567164, + "grad_norm": 0.6523931295134143, + "learning_rate": 1.888992537313433e-05, + "loss": 0.6048, + "step": 405 + }, + { + "epoch": 0.09561567164179105, + "grad_norm": 0.6173638885707777, + "learning_rate": 1.912313432835821e-05, + "loss": 0.6359, + "step": 410 + }, + { + "epoch": 0.09678171641791045, + "grad_norm": 0.6620853772560328, + "learning_rate": 1.935634328358209e-05, + "loss": 0.5996, + "step": 415 + }, + { + "epoch": 0.09794776119402986, + "grad_norm": 0.6565267779653624, + "learning_rate": 1.9589552238805972e-05, + "loss": 0.63, + "step": 420 + }, + { + "epoch": 0.09911380597014925, + "grad_norm": 0.7395452985055094, + "learning_rate": 1.982276119402985e-05, + "loss": 0.5999, + "step": 425 + }, + { + "epoch": 0.10027985074626866, + "grad_norm": 0.7276853929918868, + "learning_rate": 2.0055970149253735e-05, + "loss": 0.6167, + "step": 430 + }, + { + "epoch": 0.10144589552238806, + "grad_norm": 0.5711327807157935, + "learning_rate": 2.0289179104477612e-05, + "loss": 0.6055, + "step": 435 + }, + { + "epoch": 0.10261194029850747, + "grad_norm": 0.6225503982377574, + "learning_rate": 2.0522388059701493e-05, + "loss": 0.5971, + "step": 440 + }, + { + "epoch": 0.10377798507462686, + "grad_norm": 0.639440056865781, + "learning_rate": 2.0755597014925375e-05, + "loss": 0.6072, + "step": 445 + }, + { + "epoch": 0.10494402985074627, + "grad_norm": 0.5900464509968311, + "learning_rate": 2.0988805970149256e-05, + "loss": 0.5899, + "step": 450 + }, + { + "epoch": 0.10611007462686567, + "grad_norm": 0.6018506954073033, + "learning_rate": 2.1222014925373133e-05, + "loss": 0.5785, + "step": 455 + }, + { + "epoch": 0.10727611940298508, + "grad_norm": 0.6321374013938041, + "learning_rate": 2.1455223880597018e-05, + "loss": 0.6034, + "step": 460 + }, + { + "epoch": 0.10844216417910447, + "grad_norm": 0.5882403627611281, + "learning_rate": 2.1688432835820896e-05, + "loss": 0.5884, + "step": 465 + }, + { + "epoch": 0.10960820895522388, + "grad_norm": 0.6225348600881592, + "learning_rate": 2.1921641791044777e-05, + "loss": 0.5711, + "step": 470 + }, + { + "epoch": 0.11077425373134328, + "grad_norm": 0.6262344050303364, + "learning_rate": 2.2154850746268658e-05, + "loss": 0.6288, + "step": 475 + }, + { + "epoch": 0.11194029850746269, + "grad_norm": 0.6555452742873066, + "learning_rate": 2.238805970149254e-05, + "loss": 0.6226, + "step": 480 + }, + { + "epoch": 0.11310634328358209, + "grad_norm": 0.6775582702092503, + "learning_rate": 2.262126865671642e-05, + "loss": 0.6283, + "step": 485 + }, + { + "epoch": 0.1142723880597015, + "grad_norm": 0.8254727401525609, + "learning_rate": 2.28544776119403e-05, + "loss": 0.5886, + "step": 490 + }, + { + "epoch": 0.11543843283582089, + "grad_norm": 0.6610453358422199, + "learning_rate": 2.308768656716418e-05, + "loss": 0.5993, + "step": 495 + }, + { + "epoch": 0.1166044776119403, + "grad_norm": 0.6934901928302911, + "learning_rate": 2.332089552238806e-05, + "loss": 0.5882, + "step": 500 + }, + { + "epoch": 0.1177705223880597, + "grad_norm": 0.6135522526969803, + "learning_rate": 2.355410447761194e-05, + "loss": 0.5861, + "step": 505 + }, + { + "epoch": 0.11893656716417911, + "grad_norm": 0.5960309586201494, + "learning_rate": 2.3787313432835822e-05, + "loss": 0.5918, + "step": 510 + }, + { + "epoch": 0.1201026119402985, + "grad_norm": 0.6740541197005134, + "learning_rate": 2.4020522388059703e-05, + "loss": 0.6014, + "step": 515 + }, + { + "epoch": 0.12126865671641791, + "grad_norm": 0.712642754013648, + "learning_rate": 2.4253731343283584e-05, + "loss": 0.5931, + "step": 520 + }, + { + "epoch": 0.12243470149253731, + "grad_norm": 0.6138777829198983, + "learning_rate": 2.4486940298507462e-05, + "loss": 0.5993, + "step": 525 + }, + { + "epoch": 0.12360074626865672, + "grad_norm": 0.6860446691695918, + "learning_rate": 2.4720149253731347e-05, + "loss": 0.5894, + "step": 530 + }, + { + "epoch": 0.12476679104477612, + "grad_norm": 0.5600239225937539, + "learning_rate": 2.4953358208955224e-05, + "loss": 0.5653, + "step": 535 + }, + { + "epoch": 0.1259328358208955, + "grad_norm": 0.7015962241233562, + "learning_rate": 2.5186567164179102e-05, + "loss": 0.6196, + "step": 540 + }, + { + "epoch": 0.12709888059701493, + "grad_norm": 0.6556531567053278, + "learning_rate": 2.5419776119402987e-05, + "loss": 0.5905, + "step": 545 + }, + { + "epoch": 0.12826492537313433, + "grad_norm": 0.5759020094439726, + "learning_rate": 2.5652985074626868e-05, + "loss": 0.5882, + "step": 550 + }, + { + "epoch": 0.12943097014925373, + "grad_norm": 0.6590948750158928, + "learning_rate": 2.5886194029850745e-05, + "loss": 0.6075, + "step": 555 + }, + { + "epoch": 0.13059701492537312, + "grad_norm": 0.5946418488501953, + "learning_rate": 2.6119402985074626e-05, + "loss": 0.5944, + "step": 560 + }, + { + "epoch": 0.13176305970149255, + "grad_norm": 0.552275921080956, + "learning_rate": 2.635261194029851e-05, + "loss": 0.603, + "step": 565 + }, + { + "epoch": 0.13292910447761194, + "grad_norm": 0.6102324631143958, + "learning_rate": 2.658582089552239e-05, + "loss": 0.5911, + "step": 570 + }, + { + "epoch": 0.13409514925373134, + "grad_norm": 0.6615381703401475, + "learning_rate": 2.681902985074627e-05, + "loss": 0.6299, + "step": 575 + }, + { + "epoch": 0.13526119402985073, + "grad_norm": 0.571717872663118, + "learning_rate": 2.7052238805970147e-05, + "loss": 0.572, + "step": 580 + }, + { + "epoch": 0.13642723880597016, + "grad_norm": 0.6475707634151389, + "learning_rate": 2.7285447761194032e-05, + "loss": 0.5779, + "step": 585 + }, + { + "epoch": 0.13759328358208955, + "grad_norm": 0.6368002029643398, + "learning_rate": 2.7518656716417913e-05, + "loss": 0.5809, + "step": 590 + }, + { + "epoch": 0.13875932835820895, + "grad_norm": 0.6827885325554015, + "learning_rate": 2.775186567164179e-05, + "loss": 0.583, + "step": 595 + }, + { + "epoch": 0.13992537313432835, + "grad_norm": 0.6642314706691799, + "learning_rate": 2.7985074626865672e-05, + "loss": 0.5899, + "step": 600 + }, + { + "epoch": 0.14109141791044777, + "grad_norm": 0.6762868283771502, + "learning_rate": 2.8218283582089556e-05, + "loss": 0.574, + "step": 605 + }, + { + "epoch": 0.14225746268656717, + "grad_norm": 0.6481495750991825, + "learning_rate": 2.8451492537313434e-05, + "loss": 0.5772, + "step": 610 + }, + { + "epoch": 0.14342350746268656, + "grad_norm": 0.5892324009182466, + "learning_rate": 2.8684701492537315e-05, + "loss": 0.6066, + "step": 615 + }, + { + "epoch": 0.14458955223880596, + "grad_norm": 0.6254959051349941, + "learning_rate": 2.8917910447761193e-05, + "loss": 0.5924, + "step": 620 + }, + { + "epoch": 0.14575559701492538, + "grad_norm": 0.5847897324041825, + "learning_rate": 2.9151119402985077e-05, + "loss": 0.5887, + "step": 625 + }, + { + "epoch": 0.14692164179104478, + "grad_norm": 0.6649402988506068, + "learning_rate": 2.9384328358208955e-05, + "loss": 0.616, + "step": 630 + }, + { + "epoch": 0.14808768656716417, + "grad_norm": 0.6320253638275484, + "learning_rate": 2.9617537313432836e-05, + "loss": 0.5718, + "step": 635 + }, + { + "epoch": 0.14925373134328357, + "grad_norm": 0.635075778375142, + "learning_rate": 2.9850746268656714e-05, + "loss": 0.6073, + "step": 640 + }, + { + "epoch": 0.150419776119403, + "grad_norm": 0.6976457881972329, + "learning_rate": 3.00839552238806e-05, + "loss": 0.5889, + "step": 645 + }, + { + "epoch": 0.1515858208955224, + "grad_norm": 0.5937396394518106, + "learning_rate": 3.031716417910448e-05, + "loss": 0.5907, + "step": 650 + }, + { + "epoch": 0.15275186567164178, + "grad_norm": 0.5801081729770634, + "learning_rate": 3.055037313432836e-05, + "loss": 0.5913, + "step": 655 + }, + { + "epoch": 0.15391791044776118, + "grad_norm": 0.6667536375513913, + "learning_rate": 3.078358208955224e-05, + "loss": 0.5969, + "step": 660 + }, + { + "epoch": 0.1550839552238806, + "grad_norm": 0.565340013321232, + "learning_rate": 3.101679104477612e-05, + "loss": 0.5858, + "step": 665 + }, + { + "epoch": 0.15625, + "grad_norm": 0.5992712228605234, + "learning_rate": 3.125e-05, + "loss": 0.5978, + "step": 670 + }, + { + "epoch": 0.1574160447761194, + "grad_norm": 0.6016621578315273, + "learning_rate": 3.148320895522388e-05, + "loss": 0.6107, + "step": 675 + }, + { + "epoch": 0.15858208955223882, + "grad_norm": 0.6624206201218021, + "learning_rate": 3.171641791044776e-05, + "loss": 0.6197, + "step": 680 + }, + { + "epoch": 0.15974813432835822, + "grad_norm": 0.5656129717345737, + "learning_rate": 3.1949626865671644e-05, + "loss": 0.6051, + "step": 685 + }, + { + "epoch": 0.1609141791044776, + "grad_norm": 0.5716458801031394, + "learning_rate": 3.2182835820895525e-05, + "loss": 0.5917, + "step": 690 + }, + { + "epoch": 0.162080223880597, + "grad_norm": 0.56275562236787, + "learning_rate": 3.2416044776119406e-05, + "loss": 0.606, + "step": 695 + }, + { + "epoch": 0.16324626865671643, + "grad_norm": 0.6196256045734249, + "learning_rate": 3.264925373134329e-05, + "loss": 0.5887, + "step": 700 + }, + { + "epoch": 0.16441231343283583, + "grad_norm": 0.6125006602113579, + "learning_rate": 3.288246268656717e-05, + "loss": 0.6054, + "step": 705 + }, + { + "epoch": 0.16557835820895522, + "grad_norm": 0.5797375121091652, + "learning_rate": 3.311567164179105e-05, + "loss": 0.5748, + "step": 710 + }, + { + "epoch": 0.16674440298507462, + "grad_norm": 0.555974564114352, + "learning_rate": 3.3348880597014924e-05, + "loss": 0.618, + "step": 715 + }, + { + "epoch": 0.16791044776119404, + "grad_norm": 0.6230701097857418, + "learning_rate": 3.358208955223881e-05, + "loss": 0.5775, + "step": 720 + }, + { + "epoch": 0.16907649253731344, + "grad_norm": 0.5883157056075351, + "learning_rate": 3.3815298507462686e-05, + "loss": 0.5935, + "step": 725 + }, + { + "epoch": 0.17024253731343283, + "grad_norm": 0.5313103469638913, + "learning_rate": 3.404850746268657e-05, + "loss": 0.5903, + "step": 730 + }, + { + "epoch": 0.17140858208955223, + "grad_norm": 0.5269472732980944, + "learning_rate": 3.428171641791045e-05, + "loss": 0.5589, + "step": 735 + }, + { + "epoch": 0.17257462686567165, + "grad_norm": 0.5883967944258575, + "learning_rate": 3.451492537313433e-05, + "loss": 0.6068, + "step": 740 + }, + { + "epoch": 0.17374067164179105, + "grad_norm": 0.6430325388079234, + "learning_rate": 3.474813432835821e-05, + "loss": 0.569, + "step": 745 + }, + { + "epoch": 0.17490671641791045, + "grad_norm": 0.6426885251690175, + "learning_rate": 3.498134328358209e-05, + "loss": 0.5806, + "step": 750 + }, + { + "epoch": 0.17607276119402984, + "grad_norm": 0.6733446219892759, + "learning_rate": 3.521455223880597e-05, + "loss": 0.5958, + "step": 755 + }, + { + "epoch": 0.17723880597014927, + "grad_norm": 0.5943382748973256, + "learning_rate": 3.5447761194029854e-05, + "loss": 0.5766, + "step": 760 + }, + { + "epoch": 0.17840485074626866, + "grad_norm": 0.6034703115342142, + "learning_rate": 3.5680970149253735e-05, + "loss": 0.5677, + "step": 765 + }, + { + "epoch": 0.17957089552238806, + "grad_norm": 0.543048749101742, + "learning_rate": 3.5914179104477616e-05, + "loss": 0.582, + "step": 770 + }, + { + "epoch": 0.18073694029850745, + "grad_norm": 0.6062053068378871, + "learning_rate": 3.614738805970149e-05, + "loss": 0.5948, + "step": 775 + }, + { + "epoch": 0.18190298507462688, + "grad_norm": 0.5318696868283348, + "learning_rate": 3.638059701492538e-05, + "loss": 0.5803, + "step": 780 + }, + { + "epoch": 0.18306902985074627, + "grad_norm": 0.614282680422898, + "learning_rate": 3.661380597014926e-05, + "loss": 0.5725, + "step": 785 + }, + { + "epoch": 0.18423507462686567, + "grad_norm": 0.5822339500378908, + "learning_rate": 3.6847014925373134e-05, + "loss": 0.566, + "step": 790 + }, + { + "epoch": 0.18540111940298507, + "grad_norm": 0.578500299025908, + "learning_rate": 3.7080223880597015e-05, + "loss": 0.6007, + "step": 795 + }, + { + "epoch": 0.1865671641791045, + "grad_norm": 0.6855748875441977, + "learning_rate": 3.73134328358209e-05, + "loss": 0.5875, + "step": 800 + }, + { + "epoch": 0.18773320895522388, + "grad_norm": 0.5831814969944836, + "learning_rate": 3.754664179104478e-05, + "loss": 0.5998, + "step": 805 + }, + { + "epoch": 0.18889925373134328, + "grad_norm": 0.5260055027606848, + "learning_rate": 3.777985074626866e-05, + "loss": 0.5854, + "step": 810 + }, + { + "epoch": 0.19006529850746268, + "grad_norm": 0.5964193533820314, + "learning_rate": 3.801305970149254e-05, + "loss": 0.6024, + "step": 815 + }, + { + "epoch": 0.1912313432835821, + "grad_norm": 0.5107453949700954, + "learning_rate": 3.824626865671642e-05, + "loss": 0.5935, + "step": 820 + }, + { + "epoch": 0.1923973880597015, + "grad_norm": 0.5648317168782171, + "learning_rate": 3.84794776119403e-05, + "loss": 0.5498, + "step": 825 + }, + { + "epoch": 0.1935634328358209, + "grad_norm": 0.5562713574675907, + "learning_rate": 3.871268656716418e-05, + "loss": 0.6104, + "step": 830 + }, + { + "epoch": 0.1947294776119403, + "grad_norm": 0.6192932504991097, + "learning_rate": 3.894589552238806e-05, + "loss": 0.601, + "step": 835 + }, + { + "epoch": 0.1958955223880597, + "grad_norm": 0.6121054128335968, + "learning_rate": 3.9179104477611945e-05, + "loss": 0.5878, + "step": 840 + }, + { + "epoch": 0.1970615671641791, + "grad_norm": 0.5906087442373493, + "learning_rate": 3.9412313432835826e-05, + "loss": 0.5657, + "step": 845 + }, + { + "epoch": 0.1982276119402985, + "grad_norm": 0.5816800031851801, + "learning_rate": 3.96455223880597e-05, + "loss": 0.5676, + "step": 850 + }, + { + "epoch": 0.1993936567164179, + "grad_norm": 0.5977952262779495, + "learning_rate": 3.987873134328358e-05, + "loss": 0.5999, + "step": 855 + }, + { + "epoch": 0.20055970149253732, + "grad_norm": 0.5432525463268602, + "learning_rate": 4.011194029850747e-05, + "loss": 0.6065, + "step": 860 + }, + { + "epoch": 0.20172574626865672, + "grad_norm": 0.5662596636748775, + "learning_rate": 4.0345149253731344e-05, + "loss": 0.6021, + "step": 865 + }, + { + "epoch": 0.20289179104477612, + "grad_norm": 0.5550451000970232, + "learning_rate": 4.0578358208955225e-05, + "loss": 0.5767, + "step": 870 + }, + { + "epoch": 0.2040578358208955, + "grad_norm": 0.6584490116997408, + "learning_rate": 4.0811567164179106e-05, + "loss": 0.5811, + "step": 875 + }, + { + "epoch": 0.20522388059701493, + "grad_norm": 0.6196068395971027, + "learning_rate": 4.104477611940299e-05, + "loss": 0.5678, + "step": 880 + }, + { + "epoch": 0.20638992537313433, + "grad_norm": 0.569578990966687, + "learning_rate": 4.127798507462687e-05, + "loss": 0.576, + "step": 885 + }, + { + "epoch": 0.20755597014925373, + "grad_norm": 0.540443760471811, + "learning_rate": 4.151119402985075e-05, + "loss": 0.5549, + "step": 890 + }, + { + "epoch": 0.20872201492537312, + "grad_norm": 0.5079297415494084, + "learning_rate": 4.174440298507462e-05, + "loss": 0.5682, + "step": 895 + }, + { + "epoch": 0.20988805970149255, + "grad_norm": 0.6069256063141537, + "learning_rate": 4.197761194029851e-05, + "loss": 0.602, + "step": 900 + }, + { + "epoch": 0.21105410447761194, + "grad_norm": 0.5255861953251896, + "learning_rate": 4.221082089552239e-05, + "loss": 0.5739, + "step": 905 + }, + { + "epoch": 0.21222014925373134, + "grad_norm": 0.6771457707390894, + "learning_rate": 4.244402985074627e-05, + "loss": 0.5969, + "step": 910 + }, + { + "epoch": 0.21338619402985073, + "grad_norm": 0.5924999294439693, + "learning_rate": 4.267723880597015e-05, + "loss": 0.5716, + "step": 915 + }, + { + "epoch": 0.21455223880597016, + "grad_norm": 0.5628296247978087, + "learning_rate": 4.2910447761194036e-05, + "loss": 0.6034, + "step": 920 + }, + { + "epoch": 0.21571828358208955, + "grad_norm": 0.6335505374297353, + "learning_rate": 4.314365671641791e-05, + "loss": 0.594, + "step": 925 + }, + { + "epoch": 0.21688432835820895, + "grad_norm": 0.5933156164806249, + "learning_rate": 4.337686567164179e-05, + "loss": 0.6059, + "step": 930 + }, + { + "epoch": 0.21805037313432835, + "grad_norm": 0.5577452592820481, + "learning_rate": 4.361007462686567e-05, + "loss": 0.5821, + "step": 935 + }, + { + "epoch": 0.21921641791044777, + "grad_norm": 0.5447180264806184, + "learning_rate": 4.384328358208955e-05, + "loss": 0.5892, + "step": 940 + }, + { + "epoch": 0.22038246268656717, + "grad_norm": 0.532791679164916, + "learning_rate": 4.4076492537313434e-05, + "loss": 0.5828, + "step": 945 + }, + { + "epoch": 0.22154850746268656, + "grad_norm": 0.5583820131957794, + "learning_rate": 4.4309701492537316e-05, + "loss": 0.601, + "step": 950 + }, + { + "epoch": 0.22271455223880596, + "grad_norm": 0.5003777558691288, + "learning_rate": 4.45429104477612e-05, + "loss": 0.5959, + "step": 955 + }, + { + "epoch": 0.22388059701492538, + "grad_norm": 0.5358343725297245, + "learning_rate": 4.477611940298508e-05, + "loss": 0.5906, + "step": 960 + }, + { + "epoch": 0.22504664179104478, + "grad_norm": 0.5479409267391404, + "learning_rate": 4.500932835820896e-05, + "loss": 0.5963, + "step": 965 + }, + { + "epoch": 0.22621268656716417, + "grad_norm": 0.6183224784323952, + "learning_rate": 4.524253731343284e-05, + "loss": 0.6274, + "step": 970 + }, + { + "epoch": 0.22737873134328357, + "grad_norm": 0.5433881231736056, + "learning_rate": 4.5475746268656714e-05, + "loss": 0.527, + "step": 975 + }, + { + "epoch": 0.228544776119403, + "grad_norm": 0.5506055406053556, + "learning_rate": 4.57089552238806e-05, + "loss": 0.5657, + "step": 980 + }, + { + "epoch": 0.2297108208955224, + "grad_norm": 0.5536131078091451, + "learning_rate": 4.5942164179104477e-05, + "loss": 0.59, + "step": 985 + }, + { + "epoch": 0.23087686567164178, + "grad_norm": 0.5299963995134134, + "learning_rate": 4.617537313432836e-05, + "loss": 0.5938, + "step": 990 + }, + { + "epoch": 0.23204291044776118, + "grad_norm": 0.5530020286887515, + "learning_rate": 4.640858208955224e-05, + "loss": 0.5513, + "step": 995 + }, + { + "epoch": 0.2332089552238806, + "grad_norm": 0.5652090655865671, + "learning_rate": 4.664179104477612e-05, + "loss": 0.5808, + "step": 1000 + }, + { + "epoch": 0.234375, + "grad_norm": 0.5442231136582252, + "learning_rate": 4.6875e-05, + "loss": 0.5729, + "step": 1005 + }, + { + "epoch": 0.2355410447761194, + "grad_norm": 0.5201566882142586, + "learning_rate": 4.710820895522388e-05, + "loss": 0.5683, + "step": 1010 + }, + { + "epoch": 0.23670708955223882, + "grad_norm": 0.564490392879513, + "learning_rate": 4.734141791044776e-05, + "loss": 0.5725, + "step": 1015 + }, + { + "epoch": 0.23787313432835822, + "grad_norm": 0.5585859112380516, + "learning_rate": 4.7574626865671644e-05, + "loss": 0.5866, + "step": 1020 + }, + { + "epoch": 0.2390391791044776, + "grad_norm": 0.5332960766216192, + "learning_rate": 4.7807835820895525e-05, + "loss": 0.58, + "step": 1025 + }, + { + "epoch": 0.240205223880597, + "grad_norm": 0.5684421565070324, + "learning_rate": 4.8041044776119407e-05, + "loss": 0.5862, + "step": 1030 + }, + { + "epoch": 0.24137126865671643, + "grad_norm": 0.519204840668517, + "learning_rate": 4.827425373134329e-05, + "loss": 0.5804, + "step": 1035 + }, + { + "epoch": 0.24253731343283583, + "grad_norm": 0.5549119910611328, + "learning_rate": 4.850746268656717e-05, + "loss": 0.5909, + "step": 1040 + }, + { + "epoch": 0.24370335820895522, + "grad_norm": 0.5344473129466101, + "learning_rate": 4.874067164179105e-05, + "loss": 0.5844, + "step": 1045 + }, + { + "epoch": 0.24486940298507462, + "grad_norm": 0.46984697348471055, + "learning_rate": 4.8973880597014924e-05, + "loss": 0.5558, + "step": 1050 + }, + { + "epoch": 0.24603544776119404, + "grad_norm": 0.5049122569293845, + "learning_rate": 4.920708955223881e-05, + "loss": 0.5767, + "step": 1055 + }, + { + "epoch": 0.24720149253731344, + "grad_norm": 0.5165157903140213, + "learning_rate": 4.944029850746269e-05, + "loss": 0.5775, + "step": 1060 + }, + { + "epoch": 0.24836753731343283, + "grad_norm": 0.5432283053409598, + "learning_rate": 4.967350746268657e-05, + "loss": 0.5873, + "step": 1065 + }, + { + "epoch": 0.24953358208955223, + "grad_norm": 0.5957847227963389, + "learning_rate": 4.990671641791045e-05, + "loss": 0.5697, + "step": 1070 + }, + { + "epoch": 0.25069962686567165, + "grad_norm": 0.5251045167384748, + "learning_rate": 4.999999759121523e-05, + "loss": 0.5791, + "step": 1075 + }, + { + "epoch": 0.251865671641791, + "grad_norm": 0.5305719836101288, + "learning_rate": 4.9999982870865717e-05, + "loss": 0.5533, + "step": 1080 + }, + { + "epoch": 0.25303171641791045, + "grad_norm": 0.489282165443993, + "learning_rate": 4.9999954768389194e-05, + "loss": 0.5584, + "step": 1085 + }, + { + "epoch": 0.25419776119402987, + "grad_norm": 0.4792650955406527, + "learning_rate": 4.999991328380238e-05, + "loss": 0.5997, + "step": 1090 + }, + { + "epoch": 0.25536380597014924, + "grad_norm": 0.5067911392365152, + "learning_rate": 4.999985841712994e-05, + "loss": 0.5724, + "step": 1095 + }, + { + "epoch": 0.25652985074626866, + "grad_norm": 0.5906860460933071, + "learning_rate": 4.999979016840452e-05, + "loss": 0.5778, + "step": 1100 + }, + { + "epoch": 0.2576958955223881, + "grad_norm": 0.5296167398051762, + "learning_rate": 4.9999708537666696e-05, + "loss": 0.5459, + "step": 1105 + }, + { + "epoch": 0.25886194029850745, + "grad_norm": 0.5638313474773577, + "learning_rate": 4.999961352496503e-05, + "loss": 0.5883, + "step": 1110 + }, + { + "epoch": 0.2600279850746269, + "grad_norm": 0.5396231839865687, + "learning_rate": 4.999950513035602e-05, + "loss": 0.5948, + "step": 1115 + }, + { + "epoch": 0.26119402985074625, + "grad_norm": 0.5387745529714087, + "learning_rate": 4.9999383353904156e-05, + "loss": 0.5907, + "step": 1120 + }, + { + "epoch": 0.26236007462686567, + "grad_norm": 0.4823196011549847, + "learning_rate": 4.999924819568185e-05, + "loss": 0.5747, + "step": 1125 + }, + { + "epoch": 0.2635261194029851, + "grad_norm": 0.5093099848396279, + "learning_rate": 4.999909965576949e-05, + "loss": 0.5915, + "step": 1130 + }, + { + "epoch": 0.26469216417910446, + "grad_norm": 0.5302438248697375, + "learning_rate": 4.9998937734255424e-05, + "loss": 0.5987, + "step": 1135 + }, + { + "epoch": 0.2658582089552239, + "grad_norm": 0.5670135020812648, + "learning_rate": 4.9998762431235955e-05, + "loss": 0.578, + "step": 1140 + }, + { + "epoch": 0.2670242537313433, + "grad_norm": 0.5283848808200766, + "learning_rate": 4.9998573746815355e-05, + "loss": 0.5697, + "step": 1145 + }, + { + "epoch": 0.2681902985074627, + "grad_norm": 0.48975246214458645, + "learning_rate": 4.999837168110584e-05, + "loss": 0.5644, + "step": 1150 + }, + { + "epoch": 0.2693563432835821, + "grad_norm": 0.4776034322287359, + "learning_rate": 4.9998156234227586e-05, + "loss": 0.5581, + "step": 1155 + }, + { + "epoch": 0.27052238805970147, + "grad_norm": 0.5252762499974786, + "learning_rate": 4.999792740630874e-05, + "loss": 0.5861, + "step": 1160 + }, + { + "epoch": 0.2716884328358209, + "grad_norm": 0.448611905979275, + "learning_rate": 4.9997685197485396e-05, + "loss": 0.5753, + "step": 1165 + }, + { + "epoch": 0.2728544776119403, + "grad_norm": 0.49406828117713886, + "learning_rate": 4.999742960790161e-05, + "loss": 0.5687, + "step": 1170 + }, + { + "epoch": 0.2740205223880597, + "grad_norm": 0.49948080039138953, + "learning_rate": 4.9997160637709395e-05, + "loss": 0.5639, + "step": 1175 + }, + { + "epoch": 0.2751865671641791, + "grad_norm": 0.46726100492164313, + "learning_rate": 4.999687828706874e-05, + "loss": 0.5454, + "step": 1180 + }, + { + "epoch": 0.27635261194029853, + "grad_norm": 0.472396240079208, + "learning_rate": 4.999658255614756e-05, + "loss": 0.5729, + "step": 1185 + }, + { + "epoch": 0.2775186567164179, + "grad_norm": 0.5824879991684005, + "learning_rate": 4.9996273445121744e-05, + "loss": 0.5766, + "step": 1190 + }, + { + "epoch": 0.2786847014925373, + "grad_norm": 0.5120369900726413, + "learning_rate": 4.9995950954175145e-05, + "loss": 0.5564, + "step": 1195 + }, + { + "epoch": 0.2798507462686567, + "grad_norm": 0.6821504634222254, + "learning_rate": 4.999561508349957e-05, + "loss": 0.6108, + "step": 1200 + }, + { + "epoch": 0.2810167910447761, + "grad_norm": 0.45526066922122244, + "learning_rate": 4.9995265833294774e-05, + "loss": 0.5493, + "step": 1205 + }, + { + "epoch": 0.28218283582089554, + "grad_norm": 0.5120804639574477, + "learning_rate": 4.9994903203768486e-05, + "loss": 0.5515, + "step": 1210 + }, + { + "epoch": 0.2833488805970149, + "grad_norm": 0.4541809372974789, + "learning_rate": 4.999452719513638e-05, + "loss": 0.5698, + "step": 1215 + }, + { + "epoch": 0.28451492537313433, + "grad_norm": 0.49158525259783803, + "learning_rate": 4.99941378076221e-05, + "loss": 0.5799, + "step": 1220 + }, + { + "epoch": 0.28568097014925375, + "grad_norm": 0.5289894603163678, + "learning_rate": 4.9993735041457226e-05, + "loss": 0.5995, + "step": 1225 + }, + { + "epoch": 0.2868470149253731, + "grad_norm": 0.4641712754839069, + "learning_rate": 4.999331889688131e-05, + "loss": 0.5662, + "step": 1230 + }, + { + "epoch": 0.28801305970149255, + "grad_norm": 0.4868098256257308, + "learning_rate": 4.999288937414186e-05, + "loss": 0.5688, + "step": 1235 + }, + { + "epoch": 0.2891791044776119, + "grad_norm": 0.4924844059462168, + "learning_rate": 4.999244647349435e-05, + "loss": 0.5506, + "step": 1240 + }, + { + "epoch": 0.29034514925373134, + "grad_norm": 0.4983358257275362, + "learning_rate": 4.999199019520219e-05, + "loss": 0.5883, + "step": 1245 + }, + { + "epoch": 0.29151119402985076, + "grad_norm": 0.48195597056031647, + "learning_rate": 4.999152053953675e-05, + "loss": 0.5601, + "step": 1250 + }, + { + "epoch": 0.29267723880597013, + "grad_norm": 0.5242089410028173, + "learning_rate": 4.9991037506777384e-05, + "loss": 0.586, + "step": 1255 + }, + { + "epoch": 0.29384328358208955, + "grad_norm": 0.48781671854762115, + "learning_rate": 4.999054109721136e-05, + "loss": 0.588, + "step": 1260 + }, + { + "epoch": 0.295009328358209, + "grad_norm": 0.5635402788279851, + "learning_rate": 4.9990031311133944e-05, + "loss": 0.5743, + "step": 1265 + }, + { + "epoch": 0.29617537313432835, + "grad_norm": 0.4928633568357669, + "learning_rate": 4.9989508148848315e-05, + "loss": 0.6074, + "step": 1270 + }, + { + "epoch": 0.29734141791044777, + "grad_norm": 0.49767968993096073, + "learning_rate": 4.9988971610665645e-05, + "loss": 0.5875, + "step": 1275 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 0.46249236791400944, + "learning_rate": 4.998842169690504e-05, + "loss": 0.5737, + "step": 1280 + }, + { + "epoch": 0.29967350746268656, + "grad_norm": 0.4495065875541309, + "learning_rate": 4.9987858407893576e-05, + "loss": 0.5775, + "step": 1285 + }, + { + "epoch": 0.300839552238806, + "grad_norm": 0.48948231779530305, + "learning_rate": 4.998728174396626e-05, + "loss": 0.5536, + "step": 1290 + }, + { + "epoch": 0.30200559701492535, + "grad_norm": 0.543688758605199, + "learning_rate": 4.998669170546609e-05, + "loss": 0.574, + "step": 1295 + }, + { + "epoch": 0.3031716417910448, + "grad_norm": 0.45285718788241364, + "learning_rate": 4.998608829274398e-05, + "loss": 0.5724, + "step": 1300 + }, + { + "epoch": 0.3043376865671642, + "grad_norm": 0.5269509970734879, + "learning_rate": 4.998547150615882e-05, + "loss": 0.577, + "step": 1305 + }, + { + "epoch": 0.30550373134328357, + "grad_norm": 0.4651413513800348, + "learning_rate": 4.998484134607746e-05, + "loss": 0.5762, + "step": 1310 + }, + { + "epoch": 0.306669776119403, + "grad_norm": 0.5329583323973838, + "learning_rate": 4.998419781287469e-05, + "loss": 0.5686, + "step": 1315 + }, + { + "epoch": 0.30783582089552236, + "grad_norm": 0.4906742183579196, + "learning_rate": 4.998354090693326e-05, + "loss": 0.5846, + "step": 1320 + }, + { + "epoch": 0.3090018656716418, + "grad_norm": 0.4935895797614555, + "learning_rate": 4.9982870628643876e-05, + "loss": 0.5611, + "step": 1325 + }, + { + "epoch": 0.3101679104477612, + "grad_norm": 0.4636994251106214, + "learning_rate": 4.9982186978405175e-05, + "loss": 0.5575, + "step": 1330 + }, + { + "epoch": 0.3113339552238806, + "grad_norm": 0.4891057406321723, + "learning_rate": 4.998148995662379e-05, + "loss": 0.5684, + "step": 1335 + }, + { + "epoch": 0.3125, + "grad_norm": 0.47100059226298446, + "learning_rate": 4.9980779563714274e-05, + "loss": 0.5953, + "step": 1340 + }, + { + "epoch": 0.3136660447761194, + "grad_norm": 0.5743422188018308, + "learning_rate": 4.998005580009914e-05, + "loss": 0.5369, + "step": 1345 + }, + { + "epoch": 0.3148320895522388, + "grad_norm": 0.49522739813094924, + "learning_rate": 4.9979318666208855e-05, + "loss": 0.5655, + "step": 1350 + }, + { + "epoch": 0.3159981343283582, + "grad_norm": 0.4412980180642329, + "learning_rate": 4.997856816248184e-05, + "loss": 0.5468, + "step": 1355 + }, + { + "epoch": 0.31716417910447764, + "grad_norm": 0.4472553803069325, + "learning_rate": 4.997780428936446e-05, + "loss": 0.575, + "step": 1360 + }, + { + "epoch": 0.318330223880597, + "grad_norm": 0.49682919492884364, + "learning_rate": 4.9977027047311046e-05, + "loss": 0.5861, + "step": 1365 + }, + { + "epoch": 0.31949626865671643, + "grad_norm": 0.4695043714505331, + "learning_rate": 4.9976236436783865e-05, + "loss": 0.5753, + "step": 1370 + }, + { + "epoch": 0.3206623134328358, + "grad_norm": 0.4433496667188403, + "learning_rate": 4.9975432458253136e-05, + "loss": 0.5601, + "step": 1375 + }, + { + "epoch": 0.3218283582089552, + "grad_norm": 0.4621878380740119, + "learning_rate": 4.997461511219705e-05, + "loss": 0.5604, + "step": 1380 + }, + { + "epoch": 0.32299440298507465, + "grad_norm": 0.4406248363104525, + "learning_rate": 4.997378439910173e-05, + "loss": 0.57, + "step": 1385 + }, + { + "epoch": 0.324160447761194, + "grad_norm": 0.48970285941888225, + "learning_rate": 4.997294031946124e-05, + "loss": 0.5926, + "step": 1390 + }, + { + "epoch": 0.32532649253731344, + "grad_norm": 0.4813570655780272, + "learning_rate": 4.9972082873777626e-05, + "loss": 0.5606, + "step": 1395 + }, + { + "epoch": 0.32649253731343286, + "grad_norm": 0.4937323426247838, + "learning_rate": 4.9971212062560844e-05, + "loss": 0.5564, + "step": 1400 + }, + { + "epoch": 0.32765858208955223, + "grad_norm": 0.4738148221539011, + "learning_rate": 4.9970327886328824e-05, + "loss": 0.5634, + "step": 1405 + }, + { + "epoch": 0.32882462686567165, + "grad_norm": 0.4944510657021552, + "learning_rate": 4.9969430345607445e-05, + "loss": 0.5651, + "step": 1410 + }, + { + "epoch": 0.329990671641791, + "grad_norm": 0.4603449829292863, + "learning_rate": 4.9968519440930536e-05, + "loss": 0.5655, + "step": 1415 + }, + { + "epoch": 0.33115671641791045, + "grad_norm": 0.43710997519260814, + "learning_rate": 4.996759517283986e-05, + "loss": 0.555, + "step": 1420 + }, + { + "epoch": 0.33232276119402987, + "grad_norm": 0.5016530800822935, + "learning_rate": 4.996665754188513e-05, + "loss": 0.5965, + "step": 1425 + }, + { + "epoch": 0.33348880597014924, + "grad_norm": 0.45950447005990486, + "learning_rate": 4.996570654862402e-05, + "loss": 0.5744, + "step": 1430 + }, + { + "epoch": 0.33465485074626866, + "grad_norm": 0.479158893542782, + "learning_rate": 4.996474219362215e-05, + "loss": 0.5513, + "step": 1435 + }, + { + "epoch": 0.3358208955223881, + "grad_norm": 0.47063062571255804, + "learning_rate": 4.996376447745307e-05, + "loss": 0.5935, + "step": 1440 + }, + { + "epoch": 0.33698694029850745, + "grad_norm": 0.46312430894878515, + "learning_rate": 4.9962773400698295e-05, + "loss": 0.5565, + "step": 1445 + }, + { + "epoch": 0.3381529850746269, + "grad_norm": 0.48463397201420777, + "learning_rate": 4.996176896394728e-05, + "loss": 0.5787, + "step": 1450 + }, + { + "epoch": 0.33931902985074625, + "grad_norm": 0.43419167021782, + "learning_rate": 4.9960751167797414e-05, + "loss": 0.5711, + "step": 1455 + }, + { + "epoch": 0.34048507462686567, + "grad_norm": 0.4653074420498655, + "learning_rate": 4.995972001285406e-05, + "loss": 0.5868, + "step": 1460 + }, + { + "epoch": 0.3416511194029851, + "grad_norm": 0.45197311826393544, + "learning_rate": 4.99586754997305e-05, + "loss": 0.5659, + "step": 1465 + }, + { + "epoch": 0.34281716417910446, + "grad_norm": 0.4493851701904994, + "learning_rate": 4.995761762904797e-05, + "loss": 0.5677, + "step": 1470 + }, + { + "epoch": 0.3439832089552239, + "grad_norm": 0.48396020059964967, + "learning_rate": 4.9956546401435654e-05, + "loss": 0.5941, + "step": 1475 + }, + { + "epoch": 0.3451492537313433, + "grad_norm": 0.45853252669502864, + "learning_rate": 4.995546181753069e-05, + "loss": 0.5726, + "step": 1480 + }, + { + "epoch": 0.3463152985074627, + "grad_norm": 0.45960292628732224, + "learning_rate": 4.995436387797811e-05, + "loss": 0.566, + "step": 1485 + }, + { + "epoch": 0.3474813432835821, + "grad_norm": 0.45722447478413303, + "learning_rate": 4.9953252583430965e-05, + "loss": 0.5482, + "step": 1490 + }, + { + "epoch": 0.34864738805970147, + "grad_norm": 0.49733374956686327, + "learning_rate": 4.995212793455019e-05, + "loss": 0.5787, + "step": 1495 + }, + { + "epoch": 0.3498134328358209, + "grad_norm": 0.4475154420431213, + "learning_rate": 4.9950989932004684e-05, + "loss": 0.5575, + "step": 1500 + }, + { + "epoch": 0.3509794776119403, + "grad_norm": 0.5039137900484434, + "learning_rate": 4.9949838576471296e-05, + "loss": 0.5628, + "step": 1505 + }, + { + "epoch": 0.3521455223880597, + "grad_norm": 0.4607828784861782, + "learning_rate": 4.9948673868634806e-05, + "loss": 0.565, + "step": 1510 + }, + { + "epoch": 0.3533115671641791, + "grad_norm": 0.45572689063215227, + "learning_rate": 4.994749580918793e-05, + "loss": 0.5534, + "step": 1515 + }, + { + "epoch": 0.35447761194029853, + "grad_norm": 0.47030198648203614, + "learning_rate": 4.9946304398831336e-05, + "loss": 0.5679, + "step": 1520 + }, + { + "epoch": 0.3556436567164179, + "grad_norm": 0.4596430481486753, + "learning_rate": 4.9945099638273635e-05, + "loss": 0.5642, + "step": 1525 + }, + { + "epoch": 0.3568097014925373, + "grad_norm": 0.45333240594045915, + "learning_rate": 4.9943881528231365e-05, + "loss": 0.5635, + "step": 1530 + }, + { + "epoch": 0.3579757462686567, + "grad_norm": 0.5450126141011754, + "learning_rate": 4.9942650069429016e-05, + "loss": 0.5619, + "step": 1535 + }, + { + "epoch": 0.3591417910447761, + "grad_norm": 0.4638764230629832, + "learning_rate": 4.994140526259901e-05, + "loss": 0.5564, + "step": 1540 + }, + { + "epoch": 0.36030783582089554, + "grad_norm": 0.461856082768502, + "learning_rate": 4.994014710848171e-05, + "loss": 0.5579, + "step": 1545 + }, + { + "epoch": 0.3614738805970149, + "grad_norm": 0.46665112067850467, + "learning_rate": 4.993887560782541e-05, + "loss": 0.5638, + "step": 1550 + }, + { + "epoch": 0.36263992537313433, + "grad_norm": 0.6118091321397457, + "learning_rate": 4.993759076138637e-05, + "loss": 0.5976, + "step": 1555 + }, + { + "epoch": 0.36380597014925375, + "grad_norm": 0.5380381051727333, + "learning_rate": 4.993629256992876e-05, + "loss": 0.5377, + "step": 1560 + }, + { + "epoch": 0.3649720149253731, + "grad_norm": 0.46844246829034303, + "learning_rate": 4.993498103422469e-05, + "loss": 0.5647, + "step": 1565 + }, + { + "epoch": 0.36613805970149255, + "grad_norm": 0.4639832657096169, + "learning_rate": 4.99336561550542e-05, + "loss": 0.548, + "step": 1570 + }, + { + "epoch": 0.3673041044776119, + "grad_norm": 0.43083325820687435, + "learning_rate": 4.993231793320529e-05, + "loss": 0.5766, + "step": 1575 + }, + { + "epoch": 0.36847014925373134, + "grad_norm": 0.4376649250540398, + "learning_rate": 4.993096636947389e-05, + "loss": 0.5876, + "step": 1580 + }, + { + "epoch": 0.36963619402985076, + "grad_norm": 0.46991883724171807, + "learning_rate": 4.992960146466384e-05, + "loss": 0.5543, + "step": 1585 + }, + { + "epoch": 0.37080223880597013, + "grad_norm": 0.4442143989894822, + "learning_rate": 4.992822321958695e-05, + "loss": 0.5414, + "step": 1590 + }, + { + "epoch": 0.37196828358208955, + "grad_norm": 0.4352259309288988, + "learning_rate": 4.9926831635062955e-05, + "loss": 0.5681, + "step": 1595 + }, + { + "epoch": 0.373134328358209, + "grad_norm": 0.4481233562706409, + "learning_rate": 4.992542671191948e-05, + "loss": 0.5777, + "step": 1600 + }, + { + "epoch": 0.37430037313432835, + "grad_norm": 0.43213203607139844, + "learning_rate": 4.992400845099215e-05, + "loss": 0.5631, + "step": 1605 + }, + { + "epoch": 0.37546641791044777, + "grad_norm": 0.46838479756333634, + "learning_rate": 4.992257685312448e-05, + "loss": 0.5615, + "step": 1610 + }, + { + "epoch": 0.37663246268656714, + "grad_norm": 0.4790157694826662, + "learning_rate": 4.992113191916794e-05, + "loss": 0.5722, + "step": 1615 + }, + { + "epoch": 0.37779850746268656, + "grad_norm": 0.5074206106763016, + "learning_rate": 4.991967364998191e-05, + "loss": 0.5569, + "step": 1620 + }, + { + "epoch": 0.378964552238806, + "grad_norm": 0.4939666577027776, + "learning_rate": 4.9918202046433714e-05, + "loss": 0.5443, + "step": 1625 + }, + { + "epoch": 0.38013059701492535, + "grad_norm": 0.4603493031471122, + "learning_rate": 4.991671710939861e-05, + "loss": 0.5605, + "step": 1630 + }, + { + "epoch": 0.3812966417910448, + "grad_norm": 0.41270849678671384, + "learning_rate": 4.991521883975978e-05, + "loss": 0.5456, + "step": 1635 + }, + { + "epoch": 0.3824626865671642, + "grad_norm": 0.4164437714939307, + "learning_rate": 4.991370723840834e-05, + "loss": 0.5416, + "step": 1640 + }, + { + "epoch": 0.38362873134328357, + "grad_norm": 0.44774976729021265, + "learning_rate": 4.991218230624332e-05, + "loss": 0.543, + "step": 1645 + }, + { + "epoch": 0.384794776119403, + "grad_norm": 0.4256827310125916, + "learning_rate": 4.9910644044171714e-05, + "loss": 0.5646, + "step": 1650 + }, + { + "epoch": 0.38596082089552236, + "grad_norm": 0.5176286353898649, + "learning_rate": 4.9909092453108394e-05, + "loss": 0.5705, + "step": 1655 + }, + { + "epoch": 0.3871268656716418, + "grad_norm": 0.4399705219539844, + "learning_rate": 4.9907527533976214e-05, + "loss": 0.56, + "step": 1660 + }, + { + "epoch": 0.3882929104477612, + "grad_norm": 0.518195076648471, + "learning_rate": 4.990594928770591e-05, + "loss": 0.5725, + "step": 1665 + }, + { + "epoch": 0.3894589552238806, + "grad_norm": 0.43061197075942564, + "learning_rate": 4.9904357715236164e-05, + "loss": 0.5644, + "step": 1670 + }, + { + "epoch": 0.390625, + "grad_norm": 0.4619368440512794, + "learning_rate": 4.9902752817513586e-05, + "loss": 0.5664, + "step": 1675 + }, + { + "epoch": 0.3917910447761194, + "grad_norm": 0.4638437137264606, + "learning_rate": 4.990113459549271e-05, + "loss": 0.5562, + "step": 1680 + }, + { + "epoch": 0.3929570895522388, + "grad_norm": 0.42811331971642497, + "learning_rate": 4.989950305013599e-05, + "loss": 0.5712, + "step": 1685 + }, + { + "epoch": 0.3941231343283582, + "grad_norm": 0.43336616831376673, + "learning_rate": 4.98978581824138e-05, + "loss": 0.5701, + "step": 1690 + }, + { + "epoch": 0.39528917910447764, + "grad_norm": 0.5021149588786751, + "learning_rate": 4.989619999330446e-05, + "loss": 0.5485, + "step": 1695 + }, + { + "epoch": 0.396455223880597, + "grad_norm": 0.5041381985514305, + "learning_rate": 4.9894528483794175e-05, + "loss": 0.5793, + "step": 1700 + }, + { + "epoch": 0.39762126865671643, + "grad_norm": 0.500144974307226, + "learning_rate": 4.989284365487712e-05, + "loss": 0.5757, + "step": 1705 + }, + { + "epoch": 0.3987873134328358, + "grad_norm": 0.4306528925895123, + "learning_rate": 4.9891145507555346e-05, + "loss": 0.5597, + "step": 1710 + }, + { + "epoch": 0.3999533582089552, + "grad_norm": 0.44274857897206693, + "learning_rate": 4.988943404283886e-05, + "loss": 0.5675, + "step": 1715 + }, + { + "epoch": 0.40111940298507465, + "grad_norm": 0.42368506541882556, + "learning_rate": 4.9887709261745566e-05, + "loss": 0.5544, + "step": 1720 + }, + { + "epoch": 0.402285447761194, + "grad_norm": 0.41733710751037534, + "learning_rate": 4.9885971165301296e-05, + "loss": 0.5637, + "step": 1725 + }, + { + "epoch": 0.40345149253731344, + "grad_norm": 0.46279885315214764, + "learning_rate": 4.988421975453982e-05, + "loss": 0.5812, + "step": 1730 + }, + { + "epoch": 0.40461753731343286, + "grad_norm": 0.474553185605738, + "learning_rate": 4.988245503050279e-05, + "loss": 0.5759, + "step": 1735 + }, + { + "epoch": 0.40578358208955223, + "grad_norm": 0.4577653422711988, + "learning_rate": 4.9880676994239805e-05, + "loss": 0.5513, + "step": 1740 + }, + { + "epoch": 0.40694962686567165, + "grad_norm": 0.4187272091284333, + "learning_rate": 4.987888564680837e-05, + "loss": 0.5652, + "step": 1745 + }, + { + "epoch": 0.408115671641791, + "grad_norm": 0.5184535633687853, + "learning_rate": 4.9877080989273925e-05, + "loss": 0.5877, + "step": 1750 + }, + { + "epoch": 0.40928171641791045, + "grad_norm": 0.44602635138498936, + "learning_rate": 4.9875263022709786e-05, + "loss": 0.5398, + "step": 1755 + }, + { + "epoch": 0.41044776119402987, + "grad_norm": 0.559920348015096, + "learning_rate": 4.987343174819723e-05, + "loss": 0.549, + "step": 1760 + }, + { + "epoch": 0.41161380597014924, + "grad_norm": 0.40212601810851556, + "learning_rate": 4.9871587166825405e-05, + "loss": 0.5261, + "step": 1765 + }, + { + "epoch": 0.41277985074626866, + "grad_norm": 0.46302360712943924, + "learning_rate": 4.9869729279691425e-05, + "loss": 0.5612, + "step": 1770 + }, + { + "epoch": 0.4139458955223881, + "grad_norm": 0.48495455648218605, + "learning_rate": 4.986785808790028e-05, + "loss": 0.5824, + "step": 1775 + }, + { + "epoch": 0.41511194029850745, + "grad_norm": 0.4303272638769056, + "learning_rate": 4.9865973592564876e-05, + "loss": 0.5646, + "step": 1780 + }, + { + "epoch": 0.4162779850746269, + "grad_norm": 0.4124997140863719, + "learning_rate": 4.986407579480604e-05, + "loss": 0.5603, + "step": 1785 + }, + { + "epoch": 0.41744402985074625, + "grad_norm": 0.4311398379117858, + "learning_rate": 4.9862164695752524e-05, + "loss": 0.558, + "step": 1790 + }, + { + "epoch": 0.41861007462686567, + "grad_norm": 0.4241377020007968, + "learning_rate": 4.986024029654095e-05, + "loss": 0.5536, + "step": 1795 + }, + { + "epoch": 0.4197761194029851, + "grad_norm": 0.4248495962681604, + "learning_rate": 4.98583025983159e-05, + "loss": 0.5693, + "step": 1800 + }, + { + "epoch": 0.42094216417910446, + "grad_norm": 0.4431106772288949, + "learning_rate": 4.9856351602229846e-05, + "loss": 0.5431, + "step": 1805 + }, + { + "epoch": 0.4221082089552239, + "grad_norm": 0.4791949644486913, + "learning_rate": 4.985438730944314e-05, + "loss": 0.5585, + "step": 1810 + }, + { + "epoch": 0.4232742537313433, + "grad_norm": 0.4377501102334117, + "learning_rate": 4.985240972112409e-05, + "loss": 0.5805, + "step": 1815 + }, + { + "epoch": 0.4244402985074627, + "grad_norm": 0.4507178181055429, + "learning_rate": 4.985041883844888e-05, + "loss": 0.5373, + "step": 1820 + }, + { + "epoch": 0.4256063432835821, + "grad_norm": 0.4705472621000598, + "learning_rate": 4.984841466260161e-05, + "loss": 0.5602, + "step": 1825 + }, + { + "epoch": 0.42677238805970147, + "grad_norm": 0.4140988235587778, + "learning_rate": 4.9846397194774294e-05, + "loss": 0.5389, + "step": 1830 + }, + { + "epoch": 0.4279384328358209, + "grad_norm": 0.40544501813240136, + "learning_rate": 4.9844366436166837e-05, + "loss": 0.5418, + "step": 1835 + }, + { + "epoch": 0.4291044776119403, + "grad_norm": 0.42668985298136625, + "learning_rate": 4.984232238798707e-05, + "loss": 0.5638, + "step": 1840 + }, + { + "epoch": 0.4302705223880597, + "grad_norm": 0.4618355005213451, + "learning_rate": 4.9840265051450694e-05, + "loss": 0.5533, + "step": 1845 + }, + { + "epoch": 0.4314365671641791, + "grad_norm": 0.405012555937919, + "learning_rate": 4.983819442778134e-05, + "loss": 0.5423, + "step": 1850 + }, + { + "epoch": 0.43260261194029853, + "grad_norm": 0.44687169495578805, + "learning_rate": 4.983611051821055e-05, + "loss": 0.5439, + "step": 1855 + }, + { + "epoch": 0.4337686567164179, + "grad_norm": 0.40695918232481937, + "learning_rate": 4.983401332397775e-05, + "loss": 0.5232, + "step": 1860 + }, + { + "epoch": 0.4349347014925373, + "grad_norm": 0.42811566851360106, + "learning_rate": 4.983190284633025e-05, + "loss": 0.5604, + "step": 1865 + }, + { + "epoch": 0.4361007462686567, + "grad_norm": 0.42869973179034077, + "learning_rate": 4.9829779086523295e-05, + "loss": 0.5711, + "step": 1870 + }, + { + "epoch": 0.4372667910447761, + "grad_norm": 0.5494959455877744, + "learning_rate": 4.9827642045820016e-05, + "loss": 0.5419, + "step": 1875 + }, + { + "epoch": 0.43843283582089554, + "grad_norm": 0.42568192378292363, + "learning_rate": 4.982549172549145e-05, + "loss": 0.5825, + "step": 1880 + }, + { + "epoch": 0.4395988805970149, + "grad_norm": 0.46959577972035027, + "learning_rate": 4.982332812681651e-05, + "loss": 0.5814, + "step": 1885 + }, + { + "epoch": 0.44076492537313433, + "grad_norm": 0.4708004823112155, + "learning_rate": 4.9821151251082035e-05, + "loss": 0.5735, + "step": 1890 + }, + { + "epoch": 0.44193097014925375, + "grad_norm": 0.4279741612325164, + "learning_rate": 4.981896109958274e-05, + "loss": 0.5437, + "step": 1895 + }, + { + "epoch": 0.4430970149253731, + "grad_norm": 0.4095434730326501, + "learning_rate": 4.981675767362125e-05, + "loss": 0.5593, + "step": 1900 + }, + { + "epoch": 0.44426305970149255, + "grad_norm": 0.42685723472991594, + "learning_rate": 4.981454097450806e-05, + "loss": 0.5661, + "step": 1905 + }, + { + "epoch": 0.4454291044776119, + "grad_norm": 0.44948867506310025, + "learning_rate": 4.98123110035616e-05, + "loss": 0.5457, + "step": 1910 + }, + { + "epoch": 0.44659514925373134, + "grad_norm": 0.43929816483518985, + "learning_rate": 4.981006776210816e-05, + "loss": 0.5571, + "step": 1915 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 0.43876619295403296, + "learning_rate": 4.980781125148194e-05, + "loss": 0.5525, + "step": 1920 + }, + { + "epoch": 0.44892723880597013, + "grad_norm": 0.45083808200498976, + "learning_rate": 4.9805541473025016e-05, + "loss": 0.5722, + "step": 1925 + }, + { + "epoch": 0.45009328358208955, + "grad_norm": 0.5087947632583268, + "learning_rate": 4.980325842808737e-05, + "loss": 0.5602, + "step": 1930 + }, + { + "epoch": 0.451259328358209, + "grad_norm": 0.4298992123911525, + "learning_rate": 4.980096211802688e-05, + "loss": 0.5527, + "step": 1935 + }, + { + "epoch": 0.45242537313432835, + "grad_norm": 0.42488150264102736, + "learning_rate": 4.979865254420929e-05, + "loss": 0.5231, + "step": 1940 + }, + { + "epoch": 0.45359141791044777, + "grad_norm": 0.4139456996916272, + "learning_rate": 4.979632970800824e-05, + "loss": 0.5634, + "step": 1945 + }, + { + "epoch": 0.45475746268656714, + "grad_norm": 0.4697915547711206, + "learning_rate": 4.9793993610805276e-05, + "loss": 0.5489, + "step": 1950 + }, + { + "epoch": 0.45592350746268656, + "grad_norm": 0.4583626661754686, + "learning_rate": 4.979164425398983e-05, + "loss": 0.5864, + "step": 1955 + }, + { + "epoch": 0.457089552238806, + "grad_norm": 0.446217179949066, + "learning_rate": 4.9789281638959184e-05, + "loss": 0.5608, + "step": 1960 + }, + { + "epoch": 0.45825559701492535, + "grad_norm": 0.45063170677328174, + "learning_rate": 4.978690576711855e-05, + "loss": 0.5463, + "step": 1965 + }, + { + "epoch": 0.4594216417910448, + "grad_norm": 0.4335952872820952, + "learning_rate": 4.978451663988099e-05, + "loss": 0.5537, + "step": 1970 + }, + { + "epoch": 0.4605876865671642, + "grad_norm": 0.4198703921554321, + "learning_rate": 4.978211425866748e-05, + "loss": 0.5202, + "step": 1975 + }, + { + "epoch": 0.46175373134328357, + "grad_norm": 0.44012417799479364, + "learning_rate": 4.977969862490685e-05, + "loss": 0.576, + "step": 1980 + }, + { + "epoch": 0.462919776119403, + "grad_norm": 0.4106293282394111, + "learning_rate": 4.9777269740035844e-05, + "loss": 0.5417, + "step": 1985 + }, + { + "epoch": 0.46408582089552236, + "grad_norm": 0.42664083312692597, + "learning_rate": 4.977482760549905e-05, + "loss": 0.5507, + "step": 1990 + }, + { + "epoch": 0.4652518656716418, + "grad_norm": 0.37376513992329213, + "learning_rate": 4.977237222274897e-05, + "loss": 0.5352, + "step": 1995 + }, + { + "epoch": 0.4664179104477612, + "grad_norm": 0.4130064098570384, + "learning_rate": 4.976990359324597e-05, + "loss": 0.5497, + "step": 2000 + }, + { + "epoch": 0.4675839552238806, + "grad_norm": 0.43241982641910154, + "learning_rate": 4.9767421718458304e-05, + "loss": 0.5532, + "step": 2005 + }, + { + "epoch": 0.46875, + "grad_norm": 0.42166614491803217, + "learning_rate": 4.9764926599862065e-05, + "loss": 0.574, + "step": 2010 + }, + { + "epoch": 0.4699160447761194, + "grad_norm": 0.469266061518071, + "learning_rate": 4.9762418238941285e-05, + "loss": 0.5414, + "step": 2015 + }, + { + "epoch": 0.4710820895522388, + "grad_norm": 0.441909513049608, + "learning_rate": 4.9759896637187826e-05, + "loss": 0.5596, + "step": 2020 + }, + { + "epoch": 0.4722481343283582, + "grad_norm": 0.4679809012304934, + "learning_rate": 4.9757361796101445e-05, + "loss": 0.5538, + "step": 2025 + }, + { + "epoch": 0.47341417910447764, + "grad_norm": 0.4136933429405254, + "learning_rate": 4.9754813717189765e-05, + "loss": 0.5648, + "step": 2030 + }, + { + "epoch": 0.474580223880597, + "grad_norm": 0.3977775912844767, + "learning_rate": 4.975225240196829e-05, + "loss": 0.5524, + "step": 2035 + }, + { + "epoch": 0.47574626865671643, + "grad_norm": 0.40316601277755537, + "learning_rate": 4.974967785196039e-05, + "loss": 0.5347, + "step": 2040 + }, + { + "epoch": 0.4769123134328358, + "grad_norm": 0.417585225911704, + "learning_rate": 4.974709006869731e-05, + "loss": 0.5568, + "step": 2045 + }, + { + "epoch": 0.4780783582089552, + "grad_norm": 0.3891660537025456, + "learning_rate": 4.974448905371816e-05, + "loss": 0.5504, + "step": 2050 + }, + { + "epoch": 0.47924440298507465, + "grad_norm": 0.44591964309369564, + "learning_rate": 4.974187480856993e-05, + "loss": 0.5529, + "step": 2055 + }, + { + "epoch": 0.480410447761194, + "grad_norm": 0.39423545578129715, + "learning_rate": 4.973924733480747e-05, + "loss": 0.5407, + "step": 2060 + }, + { + "epoch": 0.48157649253731344, + "grad_norm": 0.368291270463148, + "learning_rate": 4.973660663399349e-05, + "loss": 0.5369, + "step": 2065 + }, + { + "epoch": 0.48274253731343286, + "grad_norm": 0.4301251852936052, + "learning_rate": 4.9733952707698606e-05, + "loss": 0.5387, + "step": 2070 + }, + { + "epoch": 0.48390858208955223, + "grad_norm": 0.4207409252637043, + "learning_rate": 4.9731285557501245e-05, + "loss": 0.5431, + "step": 2075 + }, + { + "epoch": 0.48507462686567165, + "grad_norm": 0.4336520003007907, + "learning_rate": 4.9728605184987724e-05, + "loss": 0.5489, + "step": 2080 + }, + { + "epoch": 0.486240671641791, + "grad_norm": 0.40021246117533077, + "learning_rate": 4.972591159175225e-05, + "loss": 0.5396, + "step": 2085 + }, + { + "epoch": 0.48740671641791045, + "grad_norm": 0.40834453386389635, + "learning_rate": 4.972320477939685e-05, + "loss": 0.531, + "step": 2090 + }, + { + "epoch": 0.48857276119402987, + "grad_norm": 0.431163778817016, + "learning_rate": 4.9720484749531434e-05, + "loss": 0.5372, + "step": 2095 + }, + { + "epoch": 0.48973880597014924, + "grad_norm": 0.41321888129096157, + "learning_rate": 4.971775150377378e-05, + "loss": 0.5712, + "step": 2100 + }, + { + "epoch": 0.49090485074626866, + "grad_norm": 0.4000479374442276, + "learning_rate": 4.971500504374951e-05, + "loss": 0.5161, + "step": 2105 + }, + { + "epoch": 0.4920708955223881, + "grad_norm": 0.4125452194985789, + "learning_rate": 4.971224537109211e-05, + "loss": 0.5427, + "step": 2110 + }, + { + "epoch": 0.49323694029850745, + "grad_norm": 0.4335278042799295, + "learning_rate": 4.970947248744294e-05, + "loss": 0.5255, + "step": 2115 + }, + { + "epoch": 0.4944029850746269, + "grad_norm": 0.45345614437500326, + "learning_rate": 4.970668639445119e-05, + "loss": 0.5548, + "step": 2120 + }, + { + "epoch": 0.49556902985074625, + "grad_norm": 0.4104769008914523, + "learning_rate": 4.9703887093773935e-05, + "loss": 0.5674, + "step": 2125 + }, + { + "epoch": 0.49673507462686567, + "grad_norm": 0.42388957127023674, + "learning_rate": 4.970107458707608e-05, + "loss": 0.5321, + "step": 2130 + }, + { + "epoch": 0.4979011194029851, + "grad_norm": 0.41865598540915067, + "learning_rate": 4.969824887603042e-05, + "loss": 0.5641, + "step": 2135 + }, + { + "epoch": 0.49906716417910446, + "grad_norm": 0.40493387297681477, + "learning_rate": 4.969540996231754e-05, + "loss": 0.562, + "step": 2140 + }, + { + "epoch": 0.5002332089552238, + "grad_norm": 0.433698724664108, + "learning_rate": 4.9692557847625946e-05, + "loss": 0.5411, + "step": 2145 + }, + { + "epoch": 0.5013992537313433, + "grad_norm": 0.462819257352637, + "learning_rate": 4.968969253365196e-05, + "loss": 0.5661, + "step": 2150 + }, + { + "epoch": 0.5025652985074627, + "grad_norm": 0.414743883976037, + "learning_rate": 4.968681402209976e-05, + "loss": 0.554, + "step": 2155 + }, + { + "epoch": 0.503731343283582, + "grad_norm": 0.4180694749018957, + "learning_rate": 4.9683922314681374e-05, + "loss": 0.5502, + "step": 2160 + }, + { + "epoch": 0.5048973880597015, + "grad_norm": 0.43182947210847594, + "learning_rate": 4.968101741311668e-05, + "loss": 0.5465, + "step": 2165 + }, + { + "epoch": 0.5060634328358209, + "grad_norm": 0.4134036726718013, + "learning_rate": 4.96780993191334e-05, + "loss": 0.5533, + "step": 2170 + }, + { + "epoch": 0.5072294776119403, + "grad_norm": 0.39978988822243555, + "learning_rate": 4.96751680344671e-05, + "loss": 0.5354, + "step": 2175 + }, + { + "epoch": 0.5083955223880597, + "grad_norm": 0.4325283182624236, + "learning_rate": 4.9672223560861204e-05, + "loss": 0.5883, + "step": 2180 + }, + { + "epoch": 0.5095615671641791, + "grad_norm": 0.43897133429666435, + "learning_rate": 4.966926590006697e-05, + "loss": 0.5403, + "step": 2185 + }, + { + "epoch": 0.5107276119402985, + "grad_norm": 0.4411985754252747, + "learning_rate": 4.9666295053843495e-05, + "loss": 0.5451, + "step": 2190 + }, + { + "epoch": 0.511893656716418, + "grad_norm": 0.42414306291691434, + "learning_rate": 4.9663311023957744e-05, + "loss": 0.5588, + "step": 2195 + }, + { + "epoch": 0.5130597014925373, + "grad_norm": 0.45615950583419457, + "learning_rate": 4.966031381218447e-05, + "loss": 0.5688, + "step": 2200 + }, + { + "epoch": 0.5142257462686567, + "grad_norm": 0.4025743805608124, + "learning_rate": 4.965730342030633e-05, + "loss": 0.5581, + "step": 2205 + }, + { + "epoch": 0.5153917910447762, + "grad_norm": 0.449274095779492, + "learning_rate": 4.9654279850113775e-05, + "loss": 0.5497, + "step": 2210 + }, + { + "epoch": 0.5165578358208955, + "grad_norm": 0.4099326830474774, + "learning_rate": 4.965124310340511e-05, + "loss": 0.5507, + "step": 2215 + }, + { + "epoch": 0.5177238805970149, + "grad_norm": 0.42211103230606756, + "learning_rate": 4.964819318198648e-05, + "loss": 0.5403, + "step": 2220 + }, + { + "epoch": 0.5188899253731343, + "grad_norm": 0.3977512342414685, + "learning_rate": 4.9645130087671866e-05, + "loss": 0.5292, + "step": 2225 + }, + { + "epoch": 0.5200559701492538, + "grad_norm": 0.4104065309174799, + "learning_rate": 4.9642053822283066e-05, + "loss": 0.5459, + "step": 2230 + }, + { + "epoch": 0.5212220149253731, + "grad_norm": 0.4442365588240732, + "learning_rate": 4.963896438764973e-05, + "loss": 0.5705, + "step": 2235 + }, + { + "epoch": 0.5223880597014925, + "grad_norm": 0.380223914558644, + "learning_rate": 4.9635861785609333e-05, + "loss": 0.5587, + "step": 2240 + }, + { + "epoch": 0.523554104477612, + "grad_norm": 0.39221509175325275, + "learning_rate": 4.9632746018007184e-05, + "loss": 0.54, + "step": 2245 + }, + { + "epoch": 0.5247201492537313, + "grad_norm": 0.4218750790208063, + "learning_rate": 4.9629617086696434e-05, + "loss": 0.5431, + "step": 2250 + }, + { + "epoch": 0.5258861940298507, + "grad_norm": 0.42184482397484385, + "learning_rate": 4.962647499353803e-05, + "loss": 0.5647, + "step": 2255 + }, + { + "epoch": 0.5270522388059702, + "grad_norm": 0.39624179577870444, + "learning_rate": 4.962331974040079e-05, + "loss": 0.5229, + "step": 2260 + }, + { + "epoch": 0.5282182835820896, + "grad_norm": 0.3884637349231805, + "learning_rate": 4.962015132916133e-05, + "loss": 0.5403, + "step": 2265 + }, + { + "epoch": 0.5293843283582089, + "grad_norm": 0.38503446280263115, + "learning_rate": 4.961696976170409e-05, + "loss": 0.5268, + "step": 2270 + }, + { + "epoch": 0.5305503731343284, + "grad_norm": 0.4014110774875387, + "learning_rate": 4.9613775039921355e-05, + "loss": 0.5279, + "step": 2275 + }, + { + "epoch": 0.5317164179104478, + "grad_norm": 0.36379396965145155, + "learning_rate": 4.961056716571322e-05, + "loss": 0.5349, + "step": 2280 + }, + { + "epoch": 0.5328824626865671, + "grad_norm": 0.4325186706011893, + "learning_rate": 4.96073461409876e-05, + "loss": 0.5406, + "step": 2285 + }, + { + "epoch": 0.5340485074626866, + "grad_norm": 0.42859599239320695, + "learning_rate": 4.960411196766025e-05, + "loss": 0.5715, + "step": 2290 + }, + { + "epoch": 0.535214552238806, + "grad_norm": 0.43631387946986677, + "learning_rate": 4.960086464765472e-05, + "loss": 0.5356, + "step": 2295 + }, + { + "epoch": 0.5363805970149254, + "grad_norm": 0.3908360471093934, + "learning_rate": 4.95976041829024e-05, + "loss": 0.5413, + "step": 2300 + }, + { + "epoch": 0.5375466417910447, + "grad_norm": 0.412186673321893, + "learning_rate": 4.959433057534248e-05, + "loss": 0.5594, + "step": 2305 + }, + { + "epoch": 0.5387126865671642, + "grad_norm": 0.38206980527085477, + "learning_rate": 4.9591043826921984e-05, + "loss": 0.5525, + "step": 2310 + }, + { + "epoch": 0.5398787313432836, + "grad_norm": 0.42150140836879146, + "learning_rate": 4.958774393959574e-05, + "loss": 0.5312, + "step": 2315 + }, + { + "epoch": 0.5410447761194029, + "grad_norm": 0.38256885412752933, + "learning_rate": 4.95844309153264e-05, + "loss": 0.5662, + "step": 2320 + }, + { + "epoch": 0.5422108208955224, + "grad_norm": 0.4221705350921838, + "learning_rate": 4.958110475608442e-05, + "loss": 0.5328, + "step": 2325 + }, + { + "epoch": 0.5433768656716418, + "grad_norm": 0.41339404123608425, + "learning_rate": 4.9577765463848065e-05, + "loss": 0.5214, + "step": 2330 + }, + { + "epoch": 0.5445429104477612, + "grad_norm": 0.4489741191179022, + "learning_rate": 4.957441304060343e-05, + "loss": 0.5502, + "step": 2335 + }, + { + "epoch": 0.5457089552238806, + "grad_norm": 0.45846439883111634, + "learning_rate": 4.957104748834441e-05, + "loss": 0.5552, + "step": 2340 + }, + { + "epoch": 0.546875, + "grad_norm": 0.46384208003104427, + "learning_rate": 4.956766880907269e-05, + "loss": 0.5333, + "step": 2345 + }, + { + "epoch": 0.5480410447761194, + "grad_norm": 0.38904250662100903, + "learning_rate": 4.9564277004797784e-05, + "loss": 0.5402, + "step": 2350 + }, + { + "epoch": 0.5492070895522388, + "grad_norm": 0.41833862097627633, + "learning_rate": 4.956087207753702e-05, + "loss": 0.5452, + "step": 2355 + }, + { + "epoch": 0.5503731343283582, + "grad_norm": 0.40467595517570154, + "learning_rate": 4.95574540293155e-05, + "loss": 0.5321, + "step": 2360 + }, + { + "epoch": 0.5515391791044776, + "grad_norm": 0.4036471233217711, + "learning_rate": 4.955402286216617e-05, + "loss": 0.5471, + "step": 2365 + }, + { + "epoch": 0.5527052238805971, + "grad_norm": 0.4446716550676256, + "learning_rate": 4.9550578578129734e-05, + "loss": 0.564, + "step": 2370 + }, + { + "epoch": 0.5538712686567164, + "grad_norm": 0.4307597876599641, + "learning_rate": 4.954712117925473e-05, + "loss": 0.536, + "step": 2375 + }, + { + "epoch": 0.5550373134328358, + "grad_norm": 0.42377340401702124, + "learning_rate": 4.954365066759748e-05, + "loss": 0.5451, + "step": 2380 + }, + { + "epoch": 0.5562033582089553, + "grad_norm": 0.40759561883047934, + "learning_rate": 4.954016704522213e-05, + "loss": 0.5427, + "step": 2385 + }, + { + "epoch": 0.5573694029850746, + "grad_norm": 0.38288930360428963, + "learning_rate": 4.95366703142006e-05, + "loss": 0.5438, + "step": 2390 + }, + { + "epoch": 0.558535447761194, + "grad_norm": 0.47471663867410535, + "learning_rate": 4.9533160476612584e-05, + "loss": 0.5344, + "step": 2395 + }, + { + "epoch": 0.5597014925373134, + "grad_norm": 0.4128590228252018, + "learning_rate": 4.952963753454563e-05, + "loss": 0.5356, + "step": 2400 + }, + { + "epoch": 0.5608675373134329, + "grad_norm": 0.4126062052427728, + "learning_rate": 4.9526101490095035e-05, + "loss": 0.5332, + "step": 2405 + }, + { + "epoch": 0.5620335820895522, + "grad_norm": 0.3850389069038923, + "learning_rate": 4.95225523453639e-05, + "loss": 0.5359, + "step": 2410 + }, + { + "epoch": 0.5631996268656716, + "grad_norm": 0.4051951555671248, + "learning_rate": 4.9518990102463133e-05, + "loss": 0.5306, + "step": 2415 + }, + { + "epoch": 0.5643656716417911, + "grad_norm": 0.4093554562992661, + "learning_rate": 4.951541476351141e-05, + "loss": 0.5227, + "step": 2420 + }, + { + "epoch": 0.5655317164179104, + "grad_norm": 0.4260462395399217, + "learning_rate": 4.9511826330635205e-05, + "loss": 0.5402, + "step": 2425 + }, + { + "epoch": 0.5666977611940298, + "grad_norm": 0.3955030974367377, + "learning_rate": 4.9508224805968784e-05, + "loss": 0.5397, + "step": 2430 + }, + { + "epoch": 0.5678638059701493, + "grad_norm": 0.4258726059708142, + "learning_rate": 4.9504610191654195e-05, + "loss": 0.5324, + "step": 2435 + }, + { + "epoch": 0.5690298507462687, + "grad_norm": 0.41515662260898495, + "learning_rate": 4.950098248984127e-05, + "loss": 0.5691, + "step": 2440 + }, + { + "epoch": 0.570195895522388, + "grad_norm": 0.42425090749716615, + "learning_rate": 4.949734170268763e-05, + "loss": 0.5533, + "step": 2445 + }, + { + "epoch": 0.5713619402985075, + "grad_norm": 0.4252546180107593, + "learning_rate": 4.949368783235867e-05, + "loss": 0.5656, + "step": 2450 + }, + { + "epoch": 0.5725279850746269, + "grad_norm": 0.3825097097032984, + "learning_rate": 4.949002088102758e-05, + "loss": 0.5532, + "step": 2455 + }, + { + "epoch": 0.5736940298507462, + "grad_norm": 0.39714340156998973, + "learning_rate": 4.9486340850875316e-05, + "loss": 0.536, + "step": 2460 + }, + { + "epoch": 0.5748600746268657, + "grad_norm": 0.4280746733530717, + "learning_rate": 4.948264774409062e-05, + "loss": 0.5616, + "step": 2465 + }, + { + "epoch": 0.5760261194029851, + "grad_norm": 0.40412029049673165, + "learning_rate": 4.947894156287001e-05, + "loss": 0.5221, + "step": 2470 + }, + { + "epoch": 0.5771921641791045, + "grad_norm": 0.43128378795650346, + "learning_rate": 4.947522230941779e-05, + "loss": 0.5358, + "step": 2475 + }, + { + "epoch": 0.5783582089552238, + "grad_norm": 0.3985041609839594, + "learning_rate": 4.947148998594601e-05, + "loss": 0.5254, + "step": 2480 + }, + { + "epoch": 0.5795242537313433, + "grad_norm": 0.3772993372811755, + "learning_rate": 4.946774459467454e-05, + "loss": 0.5261, + "step": 2485 + }, + { + "epoch": 0.5806902985074627, + "grad_norm": 0.4130113004940263, + "learning_rate": 4.946398613783096e-05, + "loss": 0.5473, + "step": 2490 + }, + { + "epoch": 0.581856343283582, + "grad_norm": 0.4357760759909216, + "learning_rate": 4.946021461765069e-05, + "loss": 0.5768, + "step": 2495 + }, + { + "epoch": 0.5830223880597015, + "grad_norm": 0.3935256613674847, + "learning_rate": 4.945643003637686e-05, + "loss": 0.5115, + "step": 2500 + }, + { + "epoch": 0.5841884328358209, + "grad_norm": 0.4039689209345031, + "learning_rate": 4.945263239626039e-05, + "loss": 0.5665, + "step": 2505 + }, + { + "epoch": 0.5853544776119403, + "grad_norm": 0.39673543418986956, + "learning_rate": 4.944882169956001e-05, + "loss": 0.5298, + "step": 2510 + }, + { + "epoch": 0.5865205223880597, + "grad_norm": 0.393928069739327, + "learning_rate": 4.944499794854215e-05, + "loss": 0.5308, + "step": 2515 + }, + { + "epoch": 0.5876865671641791, + "grad_norm": 0.43568570484896163, + "learning_rate": 4.9441161145481016e-05, + "loss": 0.5399, + "step": 2520 + }, + { + "epoch": 0.5888526119402985, + "grad_norm": 0.459960614460092, + "learning_rate": 4.943731129265862e-05, + "loss": 0.5547, + "step": 2525 + }, + { + "epoch": 0.590018656716418, + "grad_norm": 0.41496044754685024, + "learning_rate": 4.9433448392364694e-05, + "loss": 0.5536, + "step": 2530 + }, + { + "epoch": 0.5911847014925373, + "grad_norm": 0.4025528875737623, + "learning_rate": 4.942957244689673e-05, + "loss": 0.5266, + "step": 2535 + }, + { + "epoch": 0.5923507462686567, + "grad_norm": 0.3808899346150937, + "learning_rate": 4.942568345856002e-05, + "loss": 0.5442, + "step": 2540 + }, + { + "epoch": 0.5935167910447762, + "grad_norm": 0.6016204533773503, + "learning_rate": 4.9421781429667555e-05, + "loss": 0.5517, + "step": 2545 + }, + { + "epoch": 0.5946828358208955, + "grad_norm": 0.3901280993232808, + "learning_rate": 4.941786636254014e-05, + "loss": 0.5069, + "step": 2550 + }, + { + "epoch": 0.5958488805970149, + "grad_norm": 0.4296414713053638, + "learning_rate": 4.9413938259506286e-05, + "loss": 0.5407, + "step": 2555 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 0.4542224493780557, + "learning_rate": 4.940999712290229e-05, + "loss": 0.5215, + "step": 2560 + }, + { + "epoch": 0.5981809701492538, + "grad_norm": 0.3699306372038709, + "learning_rate": 4.940604295507218e-05, + "loss": 0.5334, + "step": 2565 + }, + { + "epoch": 0.5993470149253731, + "grad_norm": 0.39978710700203823, + "learning_rate": 4.940207575836775e-05, + "loss": 0.546, + "step": 2570 + }, + { + "epoch": 0.6005130597014925, + "grad_norm": 0.38181128292940997, + "learning_rate": 4.9398095535148535e-05, + "loss": 0.5446, + "step": 2575 + }, + { + "epoch": 0.601679104477612, + "grad_norm": 0.5004932568037355, + "learning_rate": 4.9394102287781816e-05, + "loss": 0.5456, + "step": 2580 + }, + { + "epoch": 0.6028451492537313, + "grad_norm": 0.3768598673122833, + "learning_rate": 4.939009601864263e-05, + "loss": 0.538, + "step": 2585 + }, + { + "epoch": 0.6040111940298507, + "grad_norm": 0.39413056994014956, + "learning_rate": 4.938607673011375e-05, + "loss": 0.5137, + "step": 2590 + }, + { + "epoch": 0.6051772388059702, + "grad_norm": 0.410121290723826, + "learning_rate": 4.938204442458569e-05, + "loss": 0.5384, + "step": 2595 + }, + { + "epoch": 0.6063432835820896, + "grad_norm": 0.371103195286021, + "learning_rate": 4.9377999104456704e-05, + "loss": 0.5225, + "step": 2600 + }, + { + "epoch": 0.6075093283582089, + "grad_norm": 0.428376461747673, + "learning_rate": 4.937394077213281e-05, + "loss": 0.5392, + "step": 2605 + }, + { + "epoch": 0.6086753731343284, + "grad_norm": 0.40068274167948215, + "learning_rate": 4.9369869430027756e-05, + "loss": 0.5147, + "step": 2610 + }, + { + "epoch": 0.6098414179104478, + "grad_norm": 0.46353704587455485, + "learning_rate": 4.9365785080562984e-05, + "loss": 0.5504, + "step": 2615 + }, + { + "epoch": 0.6110074626865671, + "grad_norm": 0.41837179049201895, + "learning_rate": 4.9361687726167746e-05, + "loss": 0.5504, + "step": 2620 + }, + { + "epoch": 0.6121735074626866, + "grad_norm": 0.46141765429478143, + "learning_rate": 4.935757736927896e-05, + "loss": 0.5411, + "step": 2625 + }, + { + "epoch": 0.613339552238806, + "grad_norm": 0.4184873410957794, + "learning_rate": 4.9353454012341346e-05, + "loss": 0.5357, + "step": 2630 + }, + { + "epoch": 0.6145055970149254, + "grad_norm": 0.420514986213068, + "learning_rate": 4.934931765780727e-05, + "loss": 0.5401, + "step": 2635 + }, + { + "epoch": 0.6156716417910447, + "grad_norm": 0.43917359395612987, + "learning_rate": 4.934516830813693e-05, + "loss": 0.5524, + "step": 2640 + }, + { + "epoch": 0.6168376865671642, + "grad_norm": 0.35672615682155906, + "learning_rate": 4.9341005965798155e-05, + "loss": 0.5401, + "step": 2645 + }, + { + "epoch": 0.6180037313432836, + "grad_norm": 0.3786713219886741, + "learning_rate": 4.9336830633266565e-05, + "loss": 0.5363, + "step": 2650 + }, + { + "epoch": 0.6191697761194029, + "grad_norm": 0.4064460236883073, + "learning_rate": 4.9332642313025495e-05, + "loss": 0.5464, + "step": 2655 + }, + { + "epoch": 0.6203358208955224, + "grad_norm": 0.39567223901783555, + "learning_rate": 4.932844100756599e-05, + "loss": 0.5263, + "step": 2660 + }, + { + "epoch": 0.6215018656716418, + "grad_norm": 0.40123811323141495, + "learning_rate": 4.9324226719386826e-05, + "loss": 0.5356, + "step": 2665 + }, + { + "epoch": 0.6226679104477612, + "grad_norm": 0.3915925580507895, + "learning_rate": 4.931999945099449e-05, + "loss": 0.5473, + "step": 2670 + }, + { + "epoch": 0.6238339552238806, + "grad_norm": 0.38500253649294836, + "learning_rate": 4.931575920490322e-05, + "loss": 0.5256, + "step": 2675 + }, + { + "epoch": 0.625, + "grad_norm": 0.3604303898684078, + "learning_rate": 4.931150598363494e-05, + "loss": 0.5288, + "step": 2680 + }, + { + "epoch": 0.6261660447761194, + "grad_norm": 0.41682983974343424, + "learning_rate": 4.93072397897193e-05, + "loss": 0.5577, + "step": 2685 + }, + { + "epoch": 0.6273320895522388, + "grad_norm": 0.383675309143092, + "learning_rate": 4.9302960625693666e-05, + "loss": 0.5102, + "step": 2690 + }, + { + "epoch": 0.6284981343283582, + "grad_norm": 0.3994211112597252, + "learning_rate": 4.929866849410313e-05, + "loss": 0.5522, + "step": 2695 + }, + { + "epoch": 0.6296641791044776, + "grad_norm": 0.39450428451057956, + "learning_rate": 4.929436339750049e-05, + "loss": 0.5529, + "step": 2700 + }, + { + "epoch": 0.6308302238805971, + "grad_norm": 0.4217539177584992, + "learning_rate": 4.9290045338446245e-05, + "loss": 0.5395, + "step": 2705 + }, + { + "epoch": 0.6319962686567164, + "grad_norm": 0.39723731617923325, + "learning_rate": 4.9285714319508607e-05, + "loss": 0.5216, + "step": 2710 + }, + { + "epoch": 0.6331623134328358, + "grad_norm": 0.38234661399826736, + "learning_rate": 4.9281370343263514e-05, + "loss": 0.5255, + "step": 2715 + }, + { + "epoch": 0.6343283582089553, + "grad_norm": 0.4738795364697418, + "learning_rate": 4.927701341229457e-05, + "loss": 0.5538, + "step": 2720 + }, + { + "epoch": 0.6354944029850746, + "grad_norm": 0.4140546339821553, + "learning_rate": 4.927264352919315e-05, + "loss": 0.5401, + "step": 2725 + }, + { + "epoch": 0.636660447761194, + "grad_norm": 0.37663373107692194, + "learning_rate": 4.9268260696558264e-05, + "loss": 0.536, + "step": 2730 + }, + { + "epoch": 0.6378264925373134, + "grad_norm": 0.3963608702532208, + "learning_rate": 4.926386491699665e-05, + "loss": 0.5087, + "step": 2735 + }, + { + "epoch": 0.6389925373134329, + "grad_norm": 0.38669349050721274, + "learning_rate": 4.925945619312277e-05, + "loss": 0.5526, + "step": 2740 + }, + { + "epoch": 0.6401585820895522, + "grad_norm": 0.3836229304354064, + "learning_rate": 4.925503452755875e-05, + "loss": 0.5296, + "step": 2745 + }, + { + "epoch": 0.6413246268656716, + "grad_norm": 0.3667584538602013, + "learning_rate": 4.925059992293443e-05, + "loss": 0.5473, + "step": 2750 + }, + { + "epoch": 0.6424906716417911, + "grad_norm": 0.3896534523747938, + "learning_rate": 4.924615238188734e-05, + "loss": 0.5065, + "step": 2755 + }, + { + "epoch": 0.6436567164179104, + "grad_norm": 0.391843351397095, + "learning_rate": 4.924169190706271e-05, + "loss": 0.5279, + "step": 2760 + }, + { + "epoch": 0.6448227611940298, + "grad_norm": 0.40227777754572197, + "learning_rate": 4.9237218501113466e-05, + "loss": 0.5119, + "step": 2765 + }, + { + "epoch": 0.6459888059701493, + "grad_norm": 0.4109208776707434, + "learning_rate": 4.92327321667002e-05, + "loss": 0.5287, + "step": 2770 + }, + { + "epoch": 0.6471548507462687, + "grad_norm": 0.40814412746294954, + "learning_rate": 4.922823290649122e-05, + "loss": 0.5428, + "step": 2775 + }, + { + "epoch": 0.648320895522388, + "grad_norm": 0.38176058409566654, + "learning_rate": 4.922372072316253e-05, + "loss": 0.5451, + "step": 2780 + }, + { + "epoch": 0.6494869402985075, + "grad_norm": 0.40985499403940806, + "learning_rate": 4.921919561939779e-05, + "loss": 0.5766, + "step": 2785 + }, + { + "epoch": 0.6506529850746269, + "grad_norm": 0.3987752571923002, + "learning_rate": 4.9214657597888354e-05, + "loss": 0.5466, + "step": 2790 + }, + { + "epoch": 0.6518190298507462, + "grad_norm": 0.42378717634202273, + "learning_rate": 4.921010666133326e-05, + "loss": 0.5351, + "step": 2795 + }, + { + "epoch": 0.6529850746268657, + "grad_norm": 0.43538203338320064, + "learning_rate": 4.920554281243925e-05, + "loss": 0.5397, + "step": 2800 + }, + { + "epoch": 0.6541511194029851, + "grad_norm": 0.4422239695813655, + "learning_rate": 4.920096605392071e-05, + "loss": 0.5279, + "step": 2805 + }, + { + "epoch": 0.6553171641791045, + "grad_norm": 0.3863682728766982, + "learning_rate": 4.919637638849972e-05, + "loss": 0.5223, + "step": 2810 + }, + { + "epoch": 0.6564832089552238, + "grad_norm": 0.3928925654796181, + "learning_rate": 4.9191773818906044e-05, + "loss": 0.5528, + "step": 2815 + }, + { + "epoch": 0.6576492537313433, + "grad_norm": 0.38761234569742775, + "learning_rate": 4.918715834787711e-05, + "loss": 0.5246, + "step": 2820 + }, + { + "epoch": 0.6588152985074627, + "grad_norm": 0.3866078885204687, + "learning_rate": 4.918252997815802e-05, + "loss": 0.539, + "step": 2825 + }, + { + "epoch": 0.659981343283582, + "grad_norm": 0.42287402253037515, + "learning_rate": 4.917788871250157e-05, + "loss": 0.5194, + "step": 2830 + }, + { + "epoch": 0.6611473880597015, + "grad_norm": 0.38153243447938595, + "learning_rate": 4.9173234553668194e-05, + "loss": 0.4957, + "step": 2835 + }, + { + "epoch": 0.6623134328358209, + "grad_norm": 0.3988851192480335, + "learning_rate": 4.9168567504425994e-05, + "loss": 0.5428, + "step": 2840 + }, + { + "epoch": 0.6634794776119403, + "grad_norm": 0.38608291875633494, + "learning_rate": 4.916388756755077e-05, + "loss": 0.5207, + "step": 2845 + }, + { + "epoch": 0.6646455223880597, + "grad_norm": 0.3857104904637888, + "learning_rate": 4.915919474582596e-05, + "loss": 0.5365, + "step": 2850 + }, + { + "epoch": 0.6658115671641791, + "grad_norm": 0.37253461634433516, + "learning_rate": 4.915448904204268e-05, + "loss": 0.5273, + "step": 2855 + }, + { + "epoch": 0.6669776119402985, + "grad_norm": 0.3947450807549253, + "learning_rate": 4.914977045899969e-05, + "loss": 0.5238, + "step": 2860 + }, + { + "epoch": 0.668143656716418, + "grad_norm": 0.4063505083191156, + "learning_rate": 4.914503899950344e-05, + "loss": 0.5359, + "step": 2865 + }, + { + "epoch": 0.6693097014925373, + "grad_norm": 0.3857521939474964, + "learning_rate": 4.914029466636801e-05, + "loss": 0.5255, + "step": 2870 + }, + { + "epoch": 0.6704757462686567, + "grad_norm": 0.3912979781077615, + "learning_rate": 4.9135537462415146e-05, + "loss": 0.513, + "step": 2875 + }, + { + "epoch": 0.6716417910447762, + "grad_norm": 0.3745982543502313, + "learning_rate": 4.913076739047425e-05, + "loss": 0.5424, + "step": 2880 + }, + { + "epoch": 0.6728078358208955, + "grad_norm": 0.3968073349699562, + "learning_rate": 4.91259844533824e-05, + "loss": 0.5394, + "step": 2885 + }, + { + "epoch": 0.6739738805970149, + "grad_norm": 0.39206443475901537, + "learning_rate": 4.9121188653984266e-05, + "loss": 0.5319, + "step": 2890 + }, + { + "epoch": 0.6751399253731343, + "grad_norm": 0.39111573373033887, + "learning_rate": 4.911637999513224e-05, + "loss": 0.537, + "step": 2895 + }, + { + "epoch": 0.6763059701492538, + "grad_norm": 0.3889569274195957, + "learning_rate": 4.9111558479686296e-05, + "loss": 0.5307, + "step": 2900 + }, + { + "epoch": 0.6774720149253731, + "grad_norm": 2.951639883433305, + "learning_rate": 4.910672411051412e-05, + "loss": 0.5514, + "step": 2905 + }, + { + "epoch": 0.6786380597014925, + "grad_norm": 0.6846608488178547, + "learning_rate": 4.910187689049099e-05, + "loss": 0.5314, + "step": 2910 + }, + { + "epoch": 0.679804104477612, + "grad_norm": 0.4093428473728174, + "learning_rate": 4.909701682249985e-05, + "loss": 0.5348, + "step": 2915 + }, + { + "epoch": 0.6809701492537313, + "grad_norm": 0.38578253112754274, + "learning_rate": 4.909214390943127e-05, + "loss": 0.5223, + "step": 2920 + }, + { + "epoch": 0.6821361940298507, + "grad_norm": 0.3898653529650086, + "learning_rate": 4.908725815418349e-05, + "loss": 0.5358, + "step": 2925 + }, + { + "epoch": 0.6833022388059702, + "grad_norm": 0.3911821894109652, + "learning_rate": 4.908235955966236e-05, + "loss": 0.5313, + "step": 2930 + }, + { + "epoch": 0.6844682835820896, + "grad_norm": 0.39835380288671635, + "learning_rate": 4.907744812878138e-05, + "loss": 0.5287, + "step": 2935 + }, + { + "epoch": 0.6856343283582089, + "grad_norm": 0.4294557267238251, + "learning_rate": 4.907252386446169e-05, + "loss": 0.5306, + "step": 2940 + }, + { + "epoch": 0.6868003731343284, + "grad_norm": 0.38270702565340614, + "learning_rate": 4.906758676963204e-05, + "loss": 0.5236, + "step": 2945 + }, + { + "epoch": 0.6879664179104478, + "grad_norm": 0.36823714228457355, + "learning_rate": 4.906263684722883e-05, + "loss": 0.5326, + "step": 2950 + }, + { + "epoch": 0.6891324626865671, + "grad_norm": 0.39194862605492836, + "learning_rate": 4.905767410019607e-05, + "loss": 0.5315, + "step": 2955 + }, + { + "epoch": 0.6902985074626866, + "grad_norm": 0.43321836299604577, + "learning_rate": 4.905269853148543e-05, + "loss": 0.5349, + "step": 2960 + }, + { + "epoch": 0.691464552238806, + "grad_norm": 0.37809749169598833, + "learning_rate": 4.904771014405618e-05, + "loss": 0.5163, + "step": 2965 + }, + { + "epoch": 0.6926305970149254, + "grad_norm": 0.38605371180030074, + "learning_rate": 4.9042708940875225e-05, + "loss": 0.5278, + "step": 2970 + }, + { + "epoch": 0.6937966417910447, + "grad_norm": 0.38021129180239116, + "learning_rate": 4.903769492491709e-05, + "loss": 0.5306, + "step": 2975 + }, + { + "epoch": 0.6949626865671642, + "grad_norm": 0.42687512055029797, + "learning_rate": 4.903266809916392e-05, + "loss": 0.5307, + "step": 2980 + }, + { + "epoch": 0.6961287313432836, + "grad_norm": 0.42495050285927866, + "learning_rate": 4.902762846660546e-05, + "loss": 0.5327, + "step": 2985 + }, + { + "epoch": 0.6972947761194029, + "grad_norm": 0.4115848460391631, + "learning_rate": 4.902257603023912e-05, + "loss": 0.5659, + "step": 2990 + }, + { + "epoch": 0.6984608208955224, + "grad_norm": 0.4092517042755935, + "learning_rate": 4.901751079306987e-05, + "loss": 0.5362, + "step": 2995 + }, + { + "epoch": 0.6996268656716418, + "grad_norm": 0.4108255952331313, + "learning_rate": 4.901243275811034e-05, + "loss": 0.5357, + "step": 3000 + }, + { + "epoch": 0.7007929104477612, + "grad_norm": 0.4044470832031311, + "learning_rate": 4.900734192838073e-05, + "loss": 0.541, + "step": 3005 + }, + { + "epoch": 0.7019589552238806, + "grad_norm": 0.3735587539523039, + "learning_rate": 4.9002238306908884e-05, + "loss": 0.5273, + "step": 3010 + }, + { + "epoch": 0.703125, + "grad_norm": 0.3767725560311478, + "learning_rate": 4.899712189673022e-05, + "loss": 0.5547, + "step": 3015 + }, + { + "epoch": 0.7042910447761194, + "grad_norm": 0.3892345398916338, + "learning_rate": 4.899199270088782e-05, + "loss": 0.5485, + "step": 3020 + }, + { + "epoch": 0.7054570895522388, + "grad_norm": 0.3860586851895598, + "learning_rate": 4.898685072243231e-05, + "loss": 0.537, + "step": 3025 + }, + { + "epoch": 0.7066231343283582, + "grad_norm": 0.38368334621088224, + "learning_rate": 4.8981695964421934e-05, + "loss": 0.5197, + "step": 3030 + }, + { + "epoch": 0.7077891791044776, + "grad_norm": 0.3722183706729155, + "learning_rate": 4.897652842992256e-05, + "loss": 0.5286, + "step": 3035 + }, + { + "epoch": 0.7089552238805971, + "grad_norm": 0.40111216056211474, + "learning_rate": 4.897134812200763e-05, + "loss": 0.5276, + "step": 3040 + }, + { + "epoch": 0.7101212686567164, + "grad_norm": 0.3677350592197864, + "learning_rate": 4.896615504375819e-05, + "loss": 0.5176, + "step": 3045 + }, + { + "epoch": 0.7112873134328358, + "grad_norm": 0.43186027733762905, + "learning_rate": 4.8960949198262896e-05, + "loss": 0.5504, + "step": 3050 + }, + { + "epoch": 0.7124533582089553, + "grad_norm": 0.3754092397371525, + "learning_rate": 4.895573058861798e-05, + "loss": 0.5298, + "step": 3055 + }, + { + "epoch": 0.7136194029850746, + "grad_norm": 0.37437284767163637, + "learning_rate": 4.895049921792727e-05, + "loss": 0.5494, + "step": 3060 + }, + { + "epoch": 0.714785447761194, + "grad_norm": 0.36959904830475193, + "learning_rate": 4.894525508930218e-05, + "loss": 0.5259, + "step": 3065 + }, + { + "epoch": 0.7159514925373134, + "grad_norm": 0.43939464674563955, + "learning_rate": 4.893999820586172e-05, + "loss": 0.5716, + "step": 3070 + }, + { + "epoch": 0.7171175373134329, + "grad_norm": 0.40922209897051337, + "learning_rate": 4.893472857073249e-05, + "loss": 0.5361, + "step": 3075 + }, + { + "epoch": 0.7182835820895522, + "grad_norm": 0.40570250040927736, + "learning_rate": 4.892944618704865e-05, + "loss": 0.526, + "step": 3080 + }, + { + "epoch": 0.7194496268656716, + "grad_norm": 0.3758106893416268, + "learning_rate": 4.892415105795197e-05, + "loss": 0.5365, + "step": 3085 + }, + { + "epoch": 0.7206156716417911, + "grad_norm": 0.3762267615069055, + "learning_rate": 4.89188431865918e-05, + "loss": 0.5489, + "step": 3090 + }, + { + "epoch": 0.7217817164179104, + "grad_norm": 0.47300186208433503, + "learning_rate": 4.891352257612505e-05, + "loss": 0.5592, + "step": 3095 + }, + { + "epoch": 0.7229477611940298, + "grad_norm": 0.42577961562317984, + "learning_rate": 4.89081892297162e-05, + "loss": 0.5392, + "step": 3100 + }, + { + "epoch": 0.7241138059701493, + "grad_norm": 0.3636682370248496, + "learning_rate": 4.8902843150537345e-05, + "loss": 0.5126, + "step": 3105 + }, + { + "epoch": 0.7252798507462687, + "grad_norm": 0.37366394466126196, + "learning_rate": 4.8897484341768104e-05, + "loss": 0.5455, + "step": 3110 + }, + { + "epoch": 0.726445895522388, + "grad_norm": 0.374747080930876, + "learning_rate": 4.88921128065957e-05, + "loss": 0.5413, + "step": 3115 + }, + { + "epoch": 0.7276119402985075, + "grad_norm": 0.41837222928371043, + "learning_rate": 4.8886728548214933e-05, + "loss": 0.5101, + "step": 3120 + }, + { + "epoch": 0.7287779850746269, + "grad_norm": 0.38118401717070516, + "learning_rate": 4.8881331569828134e-05, + "loss": 0.5252, + "step": 3125 + }, + { + "epoch": 0.7299440298507462, + "grad_norm": 0.4354876996923206, + "learning_rate": 4.887592187464522e-05, + "loss": 0.5367, + "step": 3130 + }, + { + "epoch": 0.7311100746268657, + "grad_norm": 0.3823240177123982, + "learning_rate": 4.8870499465883676e-05, + "loss": 0.5354, + "step": 3135 + }, + { + "epoch": 0.7322761194029851, + "grad_norm": 0.38921016614206255, + "learning_rate": 4.886506434676854e-05, + "loss": 0.5246, + "step": 3140 + }, + { + "epoch": 0.7334421641791045, + "grad_norm": 0.37598254128147834, + "learning_rate": 4.885961652053242e-05, + "loss": 0.5228, + "step": 3145 + }, + { + "epoch": 0.7346082089552238, + "grad_norm": 0.37488353157168236, + "learning_rate": 4.885415599041545e-05, + "loss": 0.5421, + "step": 3150 + }, + { + "epoch": 0.7357742537313433, + "grad_norm": 0.38206799211213144, + "learning_rate": 4.884868275966538e-05, + "loss": 0.5567, + "step": 3155 + }, + { + "epoch": 0.7369402985074627, + "grad_norm": 0.3697930725962389, + "learning_rate": 4.884319683153746e-05, + "loss": 0.5313, + "step": 3160 + }, + { + "epoch": 0.738106343283582, + "grad_norm": 0.35928636492820076, + "learning_rate": 4.88376982092945e-05, + "loss": 0.519, + "step": 3165 + }, + { + "epoch": 0.7392723880597015, + "grad_norm": 0.39872417925703796, + "learning_rate": 4.883218689620688e-05, + "loss": 0.5382, + "step": 3170 + }, + { + "epoch": 0.7404384328358209, + "grad_norm": 0.3603561466972036, + "learning_rate": 4.882666289555251e-05, + "loss": 0.5057, + "step": 3175 + }, + { + "epoch": 0.7416044776119403, + "grad_norm": 0.3817389594188072, + "learning_rate": 4.882112621061687e-05, + "loss": 0.5379, + "step": 3180 + }, + { + "epoch": 0.7427705223880597, + "grad_norm": 0.3856273538293339, + "learning_rate": 4.881557684469295e-05, + "loss": 0.516, + "step": 3185 + }, + { + "epoch": 0.7439365671641791, + "grad_norm": 0.3921503674609452, + "learning_rate": 4.881001480108131e-05, + "loss": 0.5499, + "step": 3190 + }, + { + "epoch": 0.7451026119402985, + "grad_norm": 0.3624242863944163, + "learning_rate": 4.880444008309004e-05, + "loss": 0.5241, + "step": 3195 + }, + { + "epoch": 0.746268656716418, + "grad_norm": 0.36529864721183014, + "learning_rate": 4.8798852694034775e-05, + "loss": 0.5375, + "step": 3200 + }, + { + "epoch": 0.7474347014925373, + "grad_norm": 0.3683021692125095, + "learning_rate": 4.8793252637238656e-05, + "loss": 0.5361, + "step": 3205 + }, + { + "epoch": 0.7486007462686567, + "grad_norm": 0.41150876515923124, + "learning_rate": 4.878763991603241e-05, + "loss": 0.5565, + "step": 3210 + }, + { + "epoch": 0.7497667910447762, + "grad_norm": 0.40867839984227117, + "learning_rate": 4.878201453375425e-05, + "loss": 0.5523, + "step": 3215 + }, + { + "epoch": 0.7509328358208955, + "grad_norm": 0.4095340603407869, + "learning_rate": 4.877637649374994e-05, + "loss": 0.5351, + "step": 3220 + }, + { + "epoch": 0.7520988805970149, + "grad_norm": 0.36660890111556915, + "learning_rate": 4.877072579937278e-05, + "loss": 0.5238, + "step": 3225 + }, + { + "epoch": 0.7532649253731343, + "grad_norm": 0.3693950147252224, + "learning_rate": 4.876506245398358e-05, + "loss": 0.5265, + "step": 3230 + }, + { + "epoch": 0.7544309701492538, + "grad_norm": 0.39183392565814196, + "learning_rate": 4.8759386460950676e-05, + "loss": 0.5343, + "step": 3235 + }, + { + "epoch": 0.7555970149253731, + "grad_norm": 0.542803265870136, + "learning_rate": 4.875369782364994e-05, + "loss": 0.5145, + "step": 3240 + }, + { + "epoch": 0.7567630597014925, + "grad_norm": 0.3796918877465813, + "learning_rate": 4.8747996545464746e-05, + "loss": 0.5387, + "step": 3245 + }, + { + "epoch": 0.757929104477612, + "grad_norm": 0.36993099878408103, + "learning_rate": 4.8742282629786005e-05, + "loss": 0.5257, + "step": 3250 + }, + { + "epoch": 0.7590951492537313, + "grad_norm": 0.35937870943600997, + "learning_rate": 4.8736556080012125e-05, + "loss": 0.5138, + "step": 3255 + }, + { + "epoch": 0.7602611940298507, + "grad_norm": 0.3685247204565086, + "learning_rate": 4.8730816899549046e-05, + "loss": 0.5396, + "step": 3260 + }, + { + "epoch": 0.7614272388059702, + "grad_norm": 0.34953024881079675, + "learning_rate": 4.872506509181021e-05, + "loss": 0.5058, + "step": 3265 + }, + { + "epoch": 0.7625932835820896, + "grad_norm": 0.38937210431785063, + "learning_rate": 4.871930066021658e-05, + "loss": 0.5121, + "step": 3270 + }, + { + "epoch": 0.7637593283582089, + "grad_norm": 0.3713733929526791, + "learning_rate": 4.8713523608196595e-05, + "loss": 0.5077, + "step": 3275 + }, + { + "epoch": 0.7649253731343284, + "grad_norm": 0.34539804422553044, + "learning_rate": 4.8707733939186254e-05, + "loss": 0.5244, + "step": 3280 + }, + { + "epoch": 0.7660914179104478, + "grad_norm": 0.3953032545102582, + "learning_rate": 4.8701931656629e-05, + "loss": 0.5126, + "step": 3285 + }, + { + "epoch": 0.7672574626865671, + "grad_norm": 0.4274727255915733, + "learning_rate": 4.869611676397584e-05, + "loss": 0.5425, + "step": 3290 + }, + { + "epoch": 0.7684235074626866, + "grad_norm": 0.3992773367160212, + "learning_rate": 4.8690289264685226e-05, + "loss": 0.5522, + "step": 3295 + }, + { + "epoch": 0.769589552238806, + "grad_norm": 0.37929867188570593, + "learning_rate": 4.868444916222313e-05, + "loss": 0.527, + "step": 3300 + }, + { + "epoch": 0.7707555970149254, + "grad_norm": 0.38375418179681725, + "learning_rate": 4.8678596460063046e-05, + "loss": 0.523, + "step": 3305 + }, + { + "epoch": 0.7719216417910447, + "grad_norm": 0.35887407029642154, + "learning_rate": 4.867273116168591e-05, + "loss": 0.538, + "step": 3310 + }, + { + "epoch": 0.7730876865671642, + "grad_norm": 0.33680304878390593, + "learning_rate": 4.866685327058018e-05, + "loss": 0.5438, + "step": 3315 + }, + { + "epoch": 0.7742537313432836, + "grad_norm": 0.40367292142093464, + "learning_rate": 4.8660962790241824e-05, + "loss": 0.5474, + "step": 3320 + }, + { + "epoch": 0.7754197761194029, + "grad_norm": 0.3535253816287866, + "learning_rate": 4.865505972417424e-05, + "loss": 0.5398, + "step": 3325 + }, + { + "epoch": 0.7765858208955224, + "grad_norm": 0.367223155545177, + "learning_rate": 4.864914407588837e-05, + "loss": 0.5104, + "step": 3330 + }, + { + "epoch": 0.7777518656716418, + "grad_norm": 0.35517393232062466, + "learning_rate": 4.864321584890261e-05, + "loss": 0.5204, + "step": 3335 + }, + { + "epoch": 0.7789179104477612, + "grad_norm": 0.35960337444973683, + "learning_rate": 4.863727504674282e-05, + "loss": 0.5402, + "step": 3340 + }, + { + "epoch": 0.7800839552238806, + "grad_norm": 0.35899268759291825, + "learning_rate": 4.86313216729424e-05, + "loss": 0.5414, + "step": 3345 + }, + { + "epoch": 0.78125, + "grad_norm": 0.3854500530158688, + "learning_rate": 4.8625355731042174e-05, + "loss": 0.5311, + "step": 3350 + }, + { + "epoch": 0.7824160447761194, + "grad_norm": 0.37821710706001427, + "learning_rate": 4.8619377224590435e-05, + "loss": 0.5235, + "step": 3355 + }, + { + "epoch": 0.7835820895522388, + "grad_norm": 0.3665037290417063, + "learning_rate": 4.861338615714299e-05, + "loss": 0.5135, + "step": 3360 + }, + { + "epoch": 0.7847481343283582, + "grad_norm": 0.3713560738949342, + "learning_rate": 4.8607382532263085e-05, + "loss": 0.5047, + "step": 3365 + }, + { + "epoch": 0.7859141791044776, + "grad_norm": 0.3635466877964919, + "learning_rate": 4.860136635352145e-05, + "loss": 0.5379, + "step": 3370 + }, + { + "epoch": 0.7870802238805971, + "grad_norm": 0.40042517177476933, + "learning_rate": 4.8595337624496284e-05, + "loss": 0.5421, + "step": 3375 + }, + { + "epoch": 0.7882462686567164, + "grad_norm": 0.38015847460292385, + "learning_rate": 4.8589296348773244e-05, + "loss": 0.5252, + "step": 3380 + }, + { + "epoch": 0.7894123134328358, + "grad_norm": 0.3554037084450133, + "learning_rate": 4.858324252994543e-05, + "loss": 0.5466, + "step": 3385 + }, + { + "epoch": 0.7905783582089553, + "grad_norm": 0.36613079355211486, + "learning_rate": 4.857717617161345e-05, + "loss": 0.5002, + "step": 3390 + }, + { + "epoch": 0.7917444029850746, + "grad_norm": 0.37651594332970095, + "learning_rate": 4.857109727738532e-05, + "loss": 0.5387, + "step": 3395 + }, + { + "epoch": 0.792910447761194, + "grad_norm": 0.4389797234162256, + "learning_rate": 4.856500585087654e-05, + "loss": 0.531, + "step": 3400 + }, + { + "epoch": 0.7940764925373134, + "grad_norm": 0.3738498054416888, + "learning_rate": 4.855890189571005e-05, + "loss": 0.505, + "step": 3405 + }, + { + "epoch": 0.7952425373134329, + "grad_norm": 0.3674465646905238, + "learning_rate": 4.855278541551626e-05, + "loss": 0.5459, + "step": 3410 + }, + { + "epoch": 0.7964085820895522, + "grad_norm": 0.39058597736030176, + "learning_rate": 4.8546656413933014e-05, + "loss": 0.5207, + "step": 3415 + }, + { + "epoch": 0.7975746268656716, + "grad_norm": 0.38345389631005905, + "learning_rate": 4.85405148946056e-05, + "loss": 0.4985, + "step": 3420 + }, + { + "epoch": 0.7987406716417911, + "grad_norm": 0.3612601910233199, + "learning_rate": 4.853436086118677e-05, + "loss": 0.5405, + "step": 3425 + }, + { + "epoch": 0.7999067164179104, + "grad_norm": 0.43426171790928614, + "learning_rate": 4.8528194317336703e-05, + "loss": 0.5468, + "step": 3430 + }, + { + "epoch": 0.8010727611940298, + "grad_norm": 0.364692653323174, + "learning_rate": 4.852201526672302e-05, + "loss": 0.5414, + "step": 3435 + }, + { + "epoch": 0.8022388059701493, + "grad_norm": 0.35787657334869305, + "learning_rate": 4.851582371302078e-05, + "loss": 0.5147, + "step": 3440 + }, + { + "epoch": 0.8034048507462687, + "grad_norm": 0.4025786719654805, + "learning_rate": 4.8509619659912486e-05, + "loss": 0.5584, + "step": 3445 + }, + { + "epoch": 0.804570895522388, + "grad_norm": 0.3653941639823354, + "learning_rate": 4.8503403111088075e-05, + "loss": 0.5346, + "step": 3450 + }, + { + "epoch": 0.8057369402985075, + "grad_norm": 0.3807238713664878, + "learning_rate": 4.849717407024491e-05, + "loss": 0.535, + "step": 3455 + }, + { + "epoch": 0.8069029850746269, + "grad_norm": 0.38322383255794096, + "learning_rate": 4.849093254108778e-05, + "loss": 0.5185, + "step": 3460 + }, + { + "epoch": 0.8080690298507462, + "grad_norm": 0.3574305340915936, + "learning_rate": 4.8484678527328906e-05, + "loss": 0.505, + "step": 3465 + }, + { + "epoch": 0.8092350746268657, + "grad_norm": 0.36188591159275424, + "learning_rate": 4.8478412032687956e-05, + "loss": 0.5416, + "step": 3470 + }, + { + "epoch": 0.8104011194029851, + "grad_norm": 0.36872477781138113, + "learning_rate": 4.847213306089197e-05, + "loss": 0.5254, + "step": 3475 + }, + { + "epoch": 0.8115671641791045, + "grad_norm": 0.37999000283471834, + "learning_rate": 4.8465841615675464e-05, + "loss": 0.5047, + "step": 3480 + }, + { + "epoch": 0.8127332089552238, + "grad_norm": 0.35855482155978396, + "learning_rate": 4.845953770078032e-05, + "loss": 0.5329, + "step": 3485 + }, + { + "epoch": 0.8138992537313433, + "grad_norm": 0.37240742569138474, + "learning_rate": 4.84532213199559e-05, + "loss": 0.5269, + "step": 3490 + }, + { + "epoch": 0.8150652985074627, + "grad_norm": 0.3537955013147227, + "learning_rate": 4.844689247695893e-05, + "loss": 0.5139, + "step": 3495 + }, + { + "epoch": 0.816231343283582, + "grad_norm": 0.5994885720859646, + "learning_rate": 4.844055117555355e-05, + "loss": 0.5179, + "step": 3500 + }, + { + "epoch": 0.8173973880597015, + "grad_norm": 0.3571813302219918, + "learning_rate": 4.8434197419511346e-05, + "loss": 0.5156, + "step": 3505 + }, + { + "epoch": 0.8185634328358209, + "grad_norm": 0.3558376327730202, + "learning_rate": 4.8427831212611276e-05, + "loss": 0.5085, + "step": 3510 + }, + { + "epoch": 0.8197294776119403, + "grad_norm": 0.3828186825763461, + "learning_rate": 4.8421452558639715e-05, + "loss": 0.5229, + "step": 3515 + }, + { + "epoch": 0.8208955223880597, + "grad_norm": 0.35916409943555344, + "learning_rate": 4.8415061461390444e-05, + "loss": 0.537, + "step": 3520 + }, + { + "epoch": 0.8220615671641791, + "grad_norm": 0.3779972441000561, + "learning_rate": 4.840865792466464e-05, + "loss": 0.5249, + "step": 3525 + }, + { + "epoch": 0.8232276119402985, + "grad_norm": 0.38367392212571394, + "learning_rate": 4.840224195227088e-05, + "loss": 0.5385, + "step": 3530 + }, + { + "epoch": 0.824393656716418, + "grad_norm": 0.4144581958197853, + "learning_rate": 4.839581354802516e-05, + "loss": 0.5313, + "step": 3535 + }, + { + "epoch": 0.8255597014925373, + "grad_norm": 0.39554430330589657, + "learning_rate": 4.8389372715750814e-05, + "loss": 0.5347, + "step": 3540 + }, + { + "epoch": 0.8267257462686567, + "grad_norm": 0.4231326348522534, + "learning_rate": 4.838291945927862e-05, + "loss": 0.5338, + "step": 3545 + }, + { + "epoch": 0.8278917910447762, + "grad_norm": 0.33844234195933864, + "learning_rate": 4.8376453782446724e-05, + "loss": 0.5347, + "step": 3550 + }, + { + "epoch": 0.8290578358208955, + "grad_norm": 0.40521372891231033, + "learning_rate": 4.836997568910067e-05, + "loss": 0.5362, + "step": 3555 + }, + { + "epoch": 0.8302238805970149, + "grad_norm": 0.41223456724651164, + "learning_rate": 4.836348518309337e-05, + "loss": 0.5272, + "step": 3560 + }, + { + "epoch": 0.8313899253731343, + "grad_norm": 0.38998789028488645, + "learning_rate": 4.835698226828513e-05, + "loss": 0.5334, + "step": 3565 + }, + { + "epoch": 0.8325559701492538, + "grad_norm": 0.36292234494655784, + "learning_rate": 4.835046694854364e-05, + "loss": 0.5287, + "step": 3570 + }, + { + "epoch": 0.8337220149253731, + "grad_norm": 0.37198758549059624, + "learning_rate": 4.834393922774397e-05, + "loss": 0.525, + "step": 3575 + }, + { + "epoch": 0.8348880597014925, + "grad_norm": 0.3588469223074909, + "learning_rate": 4.833739910976853e-05, + "loss": 0.5204, + "step": 3580 + }, + { + "epoch": 0.836054104477612, + "grad_norm": 0.3613777440664703, + "learning_rate": 4.833084659850715e-05, + "loss": 0.5194, + "step": 3585 + }, + { + "epoch": 0.8372201492537313, + "grad_norm": 0.3669099169462927, + "learning_rate": 4.8324281697857024e-05, + "loss": 0.5301, + "step": 3590 + }, + { + "epoch": 0.8383861940298507, + "grad_norm": 0.37356480434573064, + "learning_rate": 4.8317704411722676e-05, + "loss": 0.5372, + "step": 3595 + }, + { + "epoch": 0.8395522388059702, + "grad_norm": 0.3830462031746556, + "learning_rate": 4.831111474401604e-05, + "loss": 0.5333, + "step": 3600 + }, + { + "epoch": 0.8407182835820896, + "grad_norm": 0.34145441939187476, + "learning_rate": 4.830451269865639e-05, + "loss": 0.5087, + "step": 3605 + }, + { + "epoch": 0.8418843283582089, + "grad_norm": 0.3549704149825061, + "learning_rate": 4.8297898279570385e-05, + "loss": 0.5031, + "step": 3610 + }, + { + "epoch": 0.8430503731343284, + "grad_norm": 0.3908631136507107, + "learning_rate": 4.829127149069201e-05, + "loss": 0.5187, + "step": 3615 + }, + { + "epoch": 0.8442164179104478, + "grad_norm": 0.36814413083033554, + "learning_rate": 4.828463233596264e-05, + "loss": 0.5418, + "step": 3620 + }, + { + "epoch": 0.8453824626865671, + "grad_norm": 0.39205408593529883, + "learning_rate": 4.827798081933097e-05, + "loss": 0.5283, + "step": 3625 + }, + { + "epoch": 0.8465485074626866, + "grad_norm": 0.3728855870818213, + "learning_rate": 4.827131694475309e-05, + "loss": 0.5195, + "step": 3630 + }, + { + "epoch": 0.847714552238806, + "grad_norm": 0.4423186199754924, + "learning_rate": 4.826464071619239e-05, + "loss": 0.5288, + "step": 3635 + }, + { + "epoch": 0.8488805970149254, + "grad_norm": 0.34581435883322015, + "learning_rate": 4.825795213761967e-05, + "loss": 0.5259, + "step": 3640 + }, + { + "epoch": 0.8500466417910447, + "grad_norm": 0.36748786969462993, + "learning_rate": 4.825125121301301e-05, + "loss": 0.5219, + "step": 3645 + }, + { + "epoch": 0.8512126865671642, + "grad_norm": 0.3455389751097042, + "learning_rate": 4.824453794635788e-05, + "loss": 0.5374, + "step": 3650 + }, + { + "epoch": 0.8523787313432836, + "grad_norm": 0.3697450381090462, + "learning_rate": 4.823781234164706e-05, + "loss": 0.5176, + "step": 3655 + }, + { + "epoch": 0.8535447761194029, + "grad_norm": 0.37672444877375977, + "learning_rate": 4.8231074402880686e-05, + "loss": 0.5047, + "step": 3660 + }, + { + "epoch": 0.8547108208955224, + "grad_norm": 0.35794911351156683, + "learning_rate": 4.822432413406624e-05, + "loss": 0.5145, + "step": 3665 + }, + { + "epoch": 0.8558768656716418, + "grad_norm": 0.34990335094508795, + "learning_rate": 4.82175615392185e-05, + "loss": 0.5349, + "step": 3670 + }, + { + "epoch": 0.8570429104477612, + "grad_norm": 0.38359837586324985, + "learning_rate": 4.821078662235962e-05, + "loss": 0.5568, + "step": 3675 + }, + { + "epoch": 0.8582089552238806, + "grad_norm": 0.3352674153689726, + "learning_rate": 4.8203999387519036e-05, + "loss": 0.5576, + "step": 3680 + }, + { + "epoch": 0.859375, + "grad_norm": 0.36520638496981045, + "learning_rate": 4.8197199838733567e-05, + "loss": 0.5563, + "step": 3685 + }, + { + "epoch": 0.8605410447761194, + "grad_norm": 0.3574284015097453, + "learning_rate": 4.81903879800473e-05, + "loss": 0.5404, + "step": 3690 + }, + { + "epoch": 0.8617070895522388, + "grad_norm": 0.359570530567946, + "learning_rate": 4.818356381551167e-05, + "loss": 0.5166, + "step": 3695 + }, + { + "epoch": 0.8628731343283582, + "grad_norm": 0.3710298218316339, + "learning_rate": 4.817672734918543e-05, + "loss": 0.5443, + "step": 3700 + }, + { + "epoch": 0.8640391791044776, + "grad_norm": 0.3374403237529766, + "learning_rate": 4.816987858513465e-05, + "loss": 0.5205, + "step": 3705 + }, + { + "epoch": 0.8652052238805971, + "grad_norm": 0.348318746440703, + "learning_rate": 4.816301752743271e-05, + "loss": 0.5352, + "step": 3710 + }, + { + "epoch": 0.8663712686567164, + "grad_norm": 0.38484207390002323, + "learning_rate": 4.8156144180160315e-05, + "loss": 0.5366, + "step": 3715 + }, + { + "epoch": 0.8675373134328358, + "grad_norm": 0.37445796526863606, + "learning_rate": 4.8149258547405466e-05, + "loss": 0.5285, + "step": 3720 + }, + { + "epoch": 0.8687033582089553, + "grad_norm": 0.3712118304785722, + "learning_rate": 4.814236063326345e-05, + "loss": 0.5278, + "step": 3725 + }, + { + "epoch": 0.8698694029850746, + "grad_norm": 0.4048809909580681, + "learning_rate": 4.8135450441836905e-05, + "loss": 0.5245, + "step": 3730 + }, + { + "epoch": 0.871035447761194, + "grad_norm": 0.35286186346457543, + "learning_rate": 4.812852797723574e-05, + "loss": 0.5091, + "step": 3735 + }, + { + "epoch": 0.8722014925373134, + "grad_norm": 0.3885939730822929, + "learning_rate": 4.8121593243577176e-05, + "loss": 0.5213, + "step": 3740 + }, + { + "epoch": 0.8733675373134329, + "grad_norm": 0.3509136590403021, + "learning_rate": 4.8114646244985734e-05, + "loss": 0.4985, + "step": 3745 + }, + { + "epoch": 0.8745335820895522, + "grad_norm": 0.379327379093406, + "learning_rate": 4.8107686985593194e-05, + "loss": 0.5369, + "step": 3750 + }, + { + "epoch": 0.8756996268656716, + "grad_norm": 0.36629198233004384, + "learning_rate": 4.810071546953868e-05, + "loss": 0.5157, + "step": 3755 + }, + { + "epoch": 0.8768656716417911, + "grad_norm": 0.361590813988436, + "learning_rate": 4.809373170096859e-05, + "loss": 0.4995, + "step": 3760 + }, + { + "epoch": 0.8780317164179104, + "grad_norm": 0.38058987259985083, + "learning_rate": 4.808673568403657e-05, + "loss": 0.5403, + "step": 3765 + }, + { + "epoch": 0.8791977611940298, + "grad_norm": 0.3687321217294468, + "learning_rate": 4.8079727422903615e-05, + "loss": 0.5056, + "step": 3770 + }, + { + "epoch": 0.8803638059701493, + "grad_norm": 0.36448187208791616, + "learning_rate": 4.807270692173795e-05, + "loss": 0.5467, + "step": 3775 + }, + { + "epoch": 0.8815298507462687, + "grad_norm": 0.3518354824504398, + "learning_rate": 4.806567418471511e-05, + "loss": 0.4974, + "step": 3780 + }, + { + "epoch": 0.882695895522388, + "grad_norm": 0.3517464627873222, + "learning_rate": 4.8058629216017884e-05, + "loss": 0.5245, + "step": 3785 + }, + { + "epoch": 0.8838619402985075, + "grad_norm": 0.3501120459290764, + "learning_rate": 4.805157201983637e-05, + "loss": 0.5203, + "step": 3790 + }, + { + "epoch": 0.8850279850746269, + "grad_norm": 0.3752725179333214, + "learning_rate": 4.804450260036791e-05, + "loss": 0.5052, + "step": 3795 + }, + { + "epoch": 0.8861940298507462, + "grad_norm": 0.33128678885131735, + "learning_rate": 4.803742096181711e-05, + "loss": 0.5042, + "step": 3800 + }, + { + "epoch": 0.8873600746268657, + "grad_norm": 0.3893343866092791, + "learning_rate": 4.803032710839587e-05, + "loss": 0.5054, + "step": 3805 + }, + { + "epoch": 0.8885261194029851, + "grad_norm": 0.38272290845756407, + "learning_rate": 4.802322104432334e-05, + "loss": 0.5198, + "step": 3810 + }, + { + "epoch": 0.8896921641791045, + "grad_norm": 0.3518699066231957, + "learning_rate": 4.801610277382593e-05, + "loss": 0.5327, + "step": 3815 + }, + { + "epoch": 0.8908582089552238, + "grad_norm": 0.35723026748106745, + "learning_rate": 4.800897230113732e-05, + "loss": 0.5294, + "step": 3820 + }, + { + "epoch": 0.8920242537313433, + "grad_norm": 0.3648234750324044, + "learning_rate": 4.8001829630498445e-05, + "loss": 0.5333, + "step": 3825 + }, + { + "epoch": 0.8931902985074627, + "grad_norm": 0.35539484373934965, + "learning_rate": 4.799467476615748e-05, + "loss": 0.5176, + "step": 3830 + }, + { + "epoch": 0.894356343283582, + "grad_norm": 0.3506025617608877, + "learning_rate": 4.798750771236988e-05, + "loss": 0.5406, + "step": 3835 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 0.3604809232860036, + "learning_rate": 4.7980328473398314e-05, + "loss": 0.5093, + "step": 3840 + }, + { + "epoch": 0.8966884328358209, + "grad_norm": 0.37745557495788395, + "learning_rate": 4.797313705351273e-05, + "loss": 0.533, + "step": 3845 + }, + { + "epoch": 0.8978544776119403, + "grad_norm": 0.37778332560546296, + "learning_rate": 4.7965933456990306e-05, + "loss": 0.5249, + "step": 3850 + }, + { + "epoch": 0.8990205223880597, + "grad_norm": 0.3597428246838284, + "learning_rate": 4.795871768811547e-05, + "loss": 0.5498, + "step": 3855 + }, + { + "epoch": 0.9001865671641791, + "grad_norm": 0.3503018368391822, + "learning_rate": 4.795148975117988e-05, + "loss": 0.5155, + "step": 3860 + }, + { + "epoch": 0.9013526119402985, + "grad_norm": 0.36434390383698934, + "learning_rate": 4.794424965048243e-05, + "loss": 0.5427, + "step": 3865 + }, + { + "epoch": 0.902518656716418, + "grad_norm": 0.3364509220521192, + "learning_rate": 4.7936997390329266e-05, + "loss": 0.5033, + "step": 3870 + }, + { + "epoch": 0.9036847014925373, + "grad_norm": 0.3743439294123426, + "learning_rate": 4.7929732975033744e-05, + "loss": 0.5225, + "step": 3875 + }, + { + "epoch": 0.9048507462686567, + "grad_norm": 0.377358056832661, + "learning_rate": 4.7922456408916465e-05, + "loss": 0.5522, + "step": 3880 + }, + { + "epoch": 0.9060167910447762, + "grad_norm": 0.356898913926242, + "learning_rate": 4.791516769630526e-05, + "loss": 0.5231, + "step": 3885 + }, + { + "epoch": 0.9071828358208955, + "grad_norm": 0.3567691938451185, + "learning_rate": 4.790786684153516e-05, + "loss": 0.5088, + "step": 3890 + }, + { + "epoch": 0.9083488805970149, + "grad_norm": 0.34726261524405555, + "learning_rate": 4.790055384894844e-05, + "loss": 0.5243, + "step": 3895 + }, + { + "epoch": 0.9095149253731343, + "grad_norm": 0.3341187282693523, + "learning_rate": 4.7893228722894584e-05, + "loss": 0.5042, + "step": 3900 + }, + { + "epoch": 0.9106809701492538, + "grad_norm": 0.3734105353776965, + "learning_rate": 4.78858914677303e-05, + "loss": 0.5276, + "step": 3905 + }, + { + "epoch": 0.9118470149253731, + "grad_norm": 0.3646933409594863, + "learning_rate": 4.787854208781951e-05, + "loss": 0.5176, + "step": 3910 + }, + { + "epoch": 0.9130130597014925, + "grad_norm": 0.35598729767991766, + "learning_rate": 4.787118058753334e-05, + "loss": 0.5281, + "step": 3915 + }, + { + "epoch": 0.914179104477612, + "grad_norm": 0.3482901756658386, + "learning_rate": 4.786380697125012e-05, + "loss": 0.5029, + "step": 3920 + }, + { + "epoch": 0.9153451492537313, + "grad_norm": 0.34946469107236366, + "learning_rate": 4.7856421243355414e-05, + "loss": 0.5089, + "step": 3925 + }, + { + "epoch": 0.9165111940298507, + "grad_norm": 0.3374850022842466, + "learning_rate": 4.784902340824195e-05, + "loss": 0.5424, + "step": 3930 + }, + { + "epoch": 0.9176772388059702, + "grad_norm": 0.3451628345775439, + "learning_rate": 4.784161347030968e-05, + "loss": 0.5067, + "step": 3935 + }, + { + "epoch": 0.9188432835820896, + "grad_norm": 0.3396991426296273, + "learning_rate": 4.7834191433965756e-05, + "loss": 0.5368, + "step": 3940 + }, + { + "epoch": 0.9200093283582089, + "grad_norm": 0.35650498827828714, + "learning_rate": 4.782675730362452e-05, + "loss": 0.5236, + "step": 3945 + }, + { + "epoch": 0.9211753731343284, + "grad_norm": 0.3579016732583666, + "learning_rate": 4.781931108370751e-05, + "loss": 0.5091, + "step": 3950 + }, + { + "epoch": 0.9223414179104478, + "grad_norm": 0.38378319243031106, + "learning_rate": 4.781185277864344e-05, + "loss": 0.5163, + "step": 3955 + }, + { + "epoch": 0.9235074626865671, + "grad_norm": 0.34607506200104415, + "learning_rate": 4.780438239286824e-05, + "loss": 0.4975, + "step": 3960 + }, + { + "epoch": 0.9246735074626866, + "grad_norm": 0.3586809835931925, + "learning_rate": 4.7796899930825004e-05, + "loss": 0.5122, + "step": 3965 + }, + { + "epoch": 0.925839552238806, + "grad_norm": 0.38780022598471536, + "learning_rate": 4.7789405396964004e-05, + "loss": 0.525, + "step": 3970 + }, + { + "epoch": 0.9270055970149254, + "grad_norm": 0.3432810643625677, + "learning_rate": 4.7781898795742716e-05, + "loss": 0.5337, + "step": 3975 + }, + { + "epoch": 0.9281716417910447, + "grad_norm": 0.3648287666200129, + "learning_rate": 4.777438013162576e-05, + "loss": 0.5084, + "step": 3980 + }, + { + "epoch": 0.9293376865671642, + "grad_norm": 0.3570869085594697, + "learning_rate": 4.7766849409084976e-05, + "loss": 0.5153, + "step": 3985 + }, + { + "epoch": 0.9305037313432836, + "grad_norm": 0.3407670830006965, + "learning_rate": 4.775930663259932e-05, + "loss": 0.5007, + "step": 3990 + }, + { + "epoch": 0.9316697761194029, + "grad_norm": 0.3878252654284208, + "learning_rate": 4.7751751806654966e-05, + "loss": 0.5511, + "step": 3995 + }, + { + "epoch": 0.9328358208955224, + "grad_norm": 0.3734064688870557, + "learning_rate": 4.774418493574523e-05, + "loss": 0.5177, + "step": 4000 + }, + { + "epoch": 0.9340018656716418, + "grad_norm": 0.38658474263973996, + "learning_rate": 4.773660602437059e-05, + "loss": 0.5173, + "step": 4005 + }, + { + "epoch": 0.9351679104477612, + "grad_norm": 0.37559941555662035, + "learning_rate": 4.77290150770387e-05, + "loss": 0.5184, + "step": 4010 + }, + { + "epoch": 0.9363339552238806, + "grad_norm": 0.3649865471875475, + "learning_rate": 4.772141209826435e-05, + "loss": 0.52, + "step": 4015 + }, + { + "epoch": 0.9375, + "grad_norm": 0.3666204894036579, + "learning_rate": 4.771379709256953e-05, + "loss": 0.5016, + "step": 4020 + }, + { + "epoch": 0.9386660447761194, + "grad_norm": 0.3855570618196418, + "learning_rate": 4.770617006448332e-05, + "loss": 0.5139, + "step": 4025 + }, + { + "epoch": 0.9398320895522388, + "grad_norm": 0.3688556011606313, + "learning_rate": 4.769853101854201e-05, + "loss": 0.5314, + "step": 4030 + }, + { + "epoch": 0.9409981343283582, + "grad_norm": 0.35422971055747376, + "learning_rate": 4.7690879959288994e-05, + "loss": 0.502, + "step": 4035 + }, + { + "epoch": 0.9421641791044776, + "grad_norm": 0.41163266033343787, + "learning_rate": 4.768321689127483e-05, + "loss": 0.5191, + "step": 4040 + }, + { + "epoch": 0.9433302238805971, + "grad_norm": 0.35927951351420284, + "learning_rate": 4.767554181905723e-05, + "loss": 0.5037, + "step": 4045 + }, + { + "epoch": 0.9444962686567164, + "grad_norm": 0.4080315217338119, + "learning_rate": 4.766785474720102e-05, + "loss": 0.5289, + "step": 4050 + }, + { + "epoch": 0.9456623134328358, + "grad_norm": 0.3643245613714343, + "learning_rate": 4.766015568027818e-05, + "loss": 0.5248, + "step": 4055 + }, + { + "epoch": 0.9468283582089553, + "grad_norm": 0.36984525668316576, + "learning_rate": 4.765244462286782e-05, + "loss": 0.5178, + "step": 4060 + }, + { + "epoch": 0.9479944029850746, + "grad_norm": 0.357614208945119, + "learning_rate": 4.7644721579556184e-05, + "loss": 0.4851, + "step": 4065 + }, + { + "epoch": 0.949160447761194, + "grad_norm": 0.3625643886650782, + "learning_rate": 4.763698655493664e-05, + "loss": 0.5176, + "step": 4070 + }, + { + "epoch": 0.9503264925373134, + "grad_norm": 0.4340708025546624, + "learning_rate": 4.762923955360968e-05, + "loss": 0.5231, + "step": 4075 + }, + { + "epoch": 0.9514925373134329, + "grad_norm": 0.3867495253416601, + "learning_rate": 4.7621480580182925e-05, + "loss": 0.5242, + "step": 4080 + }, + { + "epoch": 0.9526585820895522, + "grad_norm": 0.33670691155146837, + "learning_rate": 4.761370963927112e-05, + "loss": 0.5014, + "step": 4085 + }, + { + "epoch": 0.9538246268656716, + "grad_norm": 0.3910906714932446, + "learning_rate": 4.760592673549611e-05, + "loss": 0.5507, + "step": 4090 + }, + { + "epoch": 0.9549906716417911, + "grad_norm": 0.3427975875880022, + "learning_rate": 4.759813187348688e-05, + "loss": 0.5198, + "step": 4095 + }, + { + "epoch": 0.9561567164179104, + "grad_norm": 0.3448331826575159, + "learning_rate": 4.759032505787952e-05, + "loss": 0.5037, + "step": 4100 + }, + { + "epoch": 0.9573227611940298, + "grad_norm": 0.4176221837743691, + "learning_rate": 4.758250629331721e-05, + "loss": 0.5309, + "step": 4105 + }, + { + "epoch": 0.9584888059701493, + "grad_norm": 0.3474680474425099, + "learning_rate": 4.7574675584450256e-05, + "loss": 0.5299, + "step": 4110 + }, + { + "epoch": 0.9596548507462687, + "grad_norm": 0.3380216012321242, + "learning_rate": 4.756683293593607e-05, + "loss": 0.4861, + "step": 4115 + }, + { + "epoch": 0.960820895522388, + "grad_norm": 0.3689763159872039, + "learning_rate": 4.755897835243916e-05, + "loss": 0.534, + "step": 4120 + }, + { + "epoch": 0.9619869402985075, + "grad_norm": 0.36353135415568916, + "learning_rate": 4.755111183863111e-05, + "loss": 0.5172, + "step": 4125 + }, + { + "epoch": 0.9631529850746269, + "grad_norm": 0.4102867037516881, + "learning_rate": 4.754323339919064e-05, + "loss": 0.5279, + "step": 4130 + }, + { + "epoch": 0.9643190298507462, + "grad_norm": 0.3629563614507666, + "learning_rate": 4.753534303880353e-05, + "loss": 0.5263, + "step": 4135 + }, + { + "epoch": 0.9654850746268657, + "grad_norm": 0.38282803357753165, + "learning_rate": 4.752744076216268e-05, + "loss": 0.5327, + "step": 4140 + }, + { + "epoch": 0.9666511194029851, + "grad_norm": 0.3759862448767285, + "learning_rate": 4.751952657396807e-05, + "loss": 0.4998, + "step": 4145 + }, + { + "epoch": 0.9678171641791045, + "grad_norm": 0.4225602759399125, + "learning_rate": 4.751160047892672e-05, + "loss": 0.5368, + "step": 4150 + }, + { + "epoch": 0.9689832089552238, + "grad_norm": 0.3667189317271687, + "learning_rate": 4.75036624817528e-05, + "loss": 0.5123, + "step": 4155 + }, + { + "epoch": 0.9701492537313433, + "grad_norm": 0.3759800202584298, + "learning_rate": 4.74957125871675e-05, + "loss": 0.5284, + "step": 4160 + }, + { + "epoch": 0.9713152985074627, + "grad_norm": 0.3516404484578329, + "learning_rate": 4.748775079989913e-05, + "loss": 0.513, + "step": 4165 + }, + { + "epoch": 0.972481343283582, + "grad_norm": 0.3667735402807629, + "learning_rate": 4.747977712468305e-05, + "loss": 0.5346, + "step": 4170 + }, + { + "epoch": 0.9736473880597015, + "grad_norm": 0.34657495681092837, + "learning_rate": 4.747179156626171e-05, + "loss": 0.5259, + "step": 4175 + }, + { + "epoch": 0.9748134328358209, + "grad_norm": 0.3686808027950596, + "learning_rate": 4.746379412938459e-05, + "loss": 0.5289, + "step": 4180 + }, + { + "epoch": 0.9759794776119403, + "grad_norm": 0.3710221094823407, + "learning_rate": 4.745578481880827e-05, + "loss": 0.5325, + "step": 4185 + }, + { + "epoch": 0.9771455223880597, + "grad_norm": 0.3718992110394827, + "learning_rate": 4.7447763639296384e-05, + "loss": 0.5192, + "step": 4190 + }, + { + "epoch": 0.9783115671641791, + "grad_norm": 0.38811688254320986, + "learning_rate": 4.743973059561962e-05, + "loss": 0.5166, + "step": 4195 + }, + { + "epoch": 0.9794776119402985, + "grad_norm": 0.37797901051638616, + "learning_rate": 4.743168569255572e-05, + "loss": 0.5152, + "step": 4200 + }, + { + "epoch": 0.980643656716418, + "grad_norm": 0.354444010887117, + "learning_rate": 4.742362893488949e-05, + "loss": 0.5089, + "step": 4205 + }, + { + "epoch": 0.9818097014925373, + "grad_norm": 0.38839405719200126, + "learning_rate": 4.741556032741278e-05, + "loss": 0.5149, + "step": 4210 + }, + { + "epoch": 0.9829757462686567, + "grad_norm": 0.3527632951015326, + "learning_rate": 4.7407479874924474e-05, + "loss": 0.4977, + "step": 4215 + }, + { + "epoch": 0.9841417910447762, + "grad_norm": 0.36559685649539897, + "learning_rate": 4.739938758223055e-05, + "loss": 0.532, + "step": 4220 + }, + { + "epoch": 0.9853078358208955, + "grad_norm": 0.40877482441563245, + "learning_rate": 4.739128345414395e-05, + "loss": 0.5419, + "step": 4225 + }, + { + "epoch": 0.9864738805970149, + "grad_norm": 0.39497249782350524, + "learning_rate": 4.738316749548473e-05, + "loss": 0.5458, + "step": 4230 + }, + { + "epoch": 0.9876399253731343, + "grad_norm": 0.3544546491958244, + "learning_rate": 4.737503971107994e-05, + "loss": 0.5106, + "step": 4235 + }, + { + "epoch": 0.9888059701492538, + "grad_norm": 0.3569979280697408, + "learning_rate": 4.736690010576368e-05, + "loss": 0.5265, + "step": 4240 + }, + { + "epoch": 0.9899720149253731, + "grad_norm": 0.36246491758499877, + "learning_rate": 4.735874868437705e-05, + "loss": 0.527, + "step": 4245 + }, + { + "epoch": 0.9911380597014925, + "grad_norm": 0.3605631002640691, + "learning_rate": 4.735058545176824e-05, + "loss": 0.5045, + "step": 4250 + }, + { + "epoch": 0.992304104477612, + "grad_norm": 0.3501311525847066, + "learning_rate": 4.73424104127924e-05, + "loss": 0.5352, + "step": 4255 + }, + { + "epoch": 0.9934701492537313, + "grad_norm": 0.3531197991557556, + "learning_rate": 4.733422357231176e-05, + "loss": 0.5201, + "step": 4260 + }, + { + "epoch": 0.9946361940298507, + "grad_norm": 0.3673995578841256, + "learning_rate": 4.7326024935195504e-05, + "loss": 0.524, + "step": 4265 + }, + { + "epoch": 0.9958022388059702, + "grad_norm": 0.3650499869368064, + "learning_rate": 4.731781450631988e-05, + "loss": 0.5138, + "step": 4270 + }, + { + "epoch": 0.9969682835820896, + "grad_norm": 0.3576361014814151, + "learning_rate": 4.7309592290568144e-05, + "loss": 0.5222, + "step": 4275 + }, + { + "epoch": 0.9981343283582089, + "grad_norm": 0.45808390293359386, + "learning_rate": 4.730135829283055e-05, + "loss": 0.5235, + "step": 4280 + }, + { + "epoch": 0.9993003731343284, + "grad_norm": 0.34157809235910325, + "learning_rate": 4.7293112518004357e-05, + "loss": 0.5038, + "step": 4285 + }, + { + "epoch": 1.0004664179104477, + "grad_norm": 0.3703872875286972, + "learning_rate": 4.728485497099385e-05, + "loss": 0.5048, + "step": 4290 + }, + { + "epoch": 1.0016324626865671, + "grad_norm": 0.39107106437997685, + "learning_rate": 4.7276585656710295e-05, + "loss": 0.463, + "step": 4295 + }, + { + "epoch": 1.0027985074626866, + "grad_norm": 0.3740341373445779, + "learning_rate": 4.726830458007194e-05, + "loss": 0.4489, + "step": 4300 + }, + { + "epoch": 1.0039645522388059, + "grad_norm": 0.3615235344255957, + "learning_rate": 4.72600117460041e-05, + "loss": 0.4654, + "step": 4305 + }, + { + "epoch": 1.0051305970149254, + "grad_norm": 0.37819710152198327, + "learning_rate": 4.725170715943898e-05, + "loss": 0.4807, + "step": 4310 + }, + { + "epoch": 1.0062966417910448, + "grad_norm": 0.3650150836539973, + "learning_rate": 4.724339082531588e-05, + "loss": 0.4477, + "step": 4315 + }, + { + "epoch": 1.007462686567164, + "grad_norm": 0.3564277718182905, + "learning_rate": 4.723506274858101e-05, + "loss": 0.4681, + "step": 4320 + }, + { + "epoch": 1.0086287313432836, + "grad_norm": 0.38019322833322156, + "learning_rate": 4.722672293418759e-05, + "loss": 0.4848, + "step": 4325 + }, + { + "epoch": 1.009794776119403, + "grad_norm": 0.3831442682341317, + "learning_rate": 4.721837138709582e-05, + "loss": 0.505, + "step": 4330 + }, + { + "epoch": 1.0109608208955223, + "grad_norm": 0.3757318543553241, + "learning_rate": 4.7210008112272895e-05, + "loss": 0.4911, + "step": 4335 + }, + { + "epoch": 1.0121268656716418, + "grad_norm": 0.4222622673290297, + "learning_rate": 4.720163311469296e-05, + "loss": 0.4661, + "step": 4340 + }, + { + "epoch": 1.0132929104477613, + "grad_norm": 0.3419450515547992, + "learning_rate": 4.7193246399337146e-05, + "loss": 0.4922, + "step": 4345 + }, + { + "epoch": 1.0144589552238805, + "grad_norm": 0.41543713936485166, + "learning_rate": 4.718484797119355e-05, + "loss": 0.4954, + "step": 4350 + }, + { + "epoch": 1.015625, + "grad_norm": 0.33108739537354487, + "learning_rate": 4.717643783525722e-05, + "loss": 0.4568, + "step": 4355 + }, + { + "epoch": 1.0167910447761195, + "grad_norm": 0.372470897375687, + "learning_rate": 4.7168015996530204e-05, + "loss": 0.4705, + "step": 4360 + }, + { + "epoch": 1.0179570895522387, + "grad_norm": 0.34207144199133666, + "learning_rate": 4.715958246002148e-05, + "loss": 0.4614, + "step": 4365 + }, + { + "epoch": 1.0191231343283582, + "grad_norm": 0.3749619013088552, + "learning_rate": 4.715113723074699e-05, + "loss": 0.4599, + "step": 4370 + }, + { + "epoch": 1.0202891791044777, + "grad_norm": 0.37088493349069046, + "learning_rate": 4.714268031372964e-05, + "loss": 0.4732, + "step": 4375 + }, + { + "epoch": 1.021455223880597, + "grad_norm": 0.3395518739628772, + "learning_rate": 4.7134211713999264e-05, + "loss": 0.4466, + "step": 4380 + }, + { + "epoch": 1.0226212686567164, + "grad_norm": 0.38790472672739523, + "learning_rate": 4.712573143659268e-05, + "loss": 0.4697, + "step": 4385 + }, + { + "epoch": 1.023787313432836, + "grad_norm": 0.34352109900158556, + "learning_rate": 4.711723948655362e-05, + "loss": 0.4672, + "step": 4390 + }, + { + "epoch": 1.0249533582089552, + "grad_norm": 0.36000936651907434, + "learning_rate": 4.710873586893276e-05, + "loss": 0.4472, + "step": 4395 + }, + { + "epoch": 1.0261194029850746, + "grad_norm": 0.3269748917403646, + "learning_rate": 4.7100220588787755e-05, + "loss": 0.469, + "step": 4400 + }, + { + "epoch": 1.0272854477611941, + "grad_norm": 0.3437252814245387, + "learning_rate": 4.7091693651183144e-05, + "loss": 0.483, + "step": 4405 + }, + { + "epoch": 1.0284514925373134, + "grad_norm": 0.36295468075037257, + "learning_rate": 4.7083155061190426e-05, + "loss": 0.4938, + "step": 4410 + }, + { + "epoch": 1.0296175373134329, + "grad_norm": 0.34170727382592825, + "learning_rate": 4.707460482388804e-05, + "loss": 0.4682, + "step": 4415 + }, + { + "epoch": 1.0307835820895523, + "grad_norm": 0.40347198297014825, + "learning_rate": 4.706604294436132e-05, + "loss": 0.4588, + "step": 4420 + }, + { + "epoch": 1.0319496268656716, + "grad_norm": 0.3798756515973087, + "learning_rate": 4.705746942770255e-05, + "loss": 0.4701, + "step": 4425 + }, + { + "epoch": 1.033115671641791, + "grad_norm": 0.37895617706651286, + "learning_rate": 4.704888427901094e-05, + "loss": 0.444, + "step": 4430 + }, + { + "epoch": 1.0342817164179103, + "grad_norm": 0.3426791493033279, + "learning_rate": 4.70402875033926e-05, + "loss": 0.4815, + "step": 4435 + }, + { + "epoch": 1.0354477611940298, + "grad_norm": 0.37433436951614435, + "learning_rate": 4.703167910596055e-05, + "loss": 0.4916, + "step": 4440 + }, + { + "epoch": 1.0366138059701493, + "grad_norm": 0.3648609106638476, + "learning_rate": 4.702305909183475e-05, + "loss": 0.4646, + "step": 4445 + }, + { + "epoch": 1.0377798507462686, + "grad_norm": 0.3292380253195559, + "learning_rate": 4.701442746614206e-05, + "loss": 0.4936, + "step": 4450 + }, + { + "epoch": 1.038945895522388, + "grad_norm": 0.34478649180279286, + "learning_rate": 4.700578423401622e-05, + "loss": 0.4571, + "step": 4455 + }, + { + "epoch": 1.0401119402985075, + "grad_norm": 0.3646134103678016, + "learning_rate": 4.699712940059791e-05, + "loss": 0.4653, + "step": 4460 + }, + { + "epoch": 1.0412779850746268, + "grad_norm": 0.35114381717321036, + "learning_rate": 4.6988462971034676e-05, + "loss": 0.4844, + "step": 4465 + }, + { + "epoch": 1.0424440298507462, + "grad_norm": 0.3298159067089294, + "learning_rate": 4.697978495048099e-05, + "loss": 0.4696, + "step": 4470 + }, + { + "epoch": 1.0436100746268657, + "grad_norm": 0.4339146415373856, + "learning_rate": 4.697109534409821e-05, + "loss": 0.4758, + "step": 4475 + }, + { + "epoch": 1.044776119402985, + "grad_norm": 0.37461865408584955, + "learning_rate": 4.696239415705458e-05, + "loss": 0.479, + "step": 4480 + }, + { + "epoch": 1.0459421641791045, + "grad_norm": 0.3365744173673012, + "learning_rate": 4.695368139452521e-05, + "loss": 0.4683, + "step": 4485 + }, + { + "epoch": 1.047108208955224, + "grad_norm": 0.3397519146528062, + "learning_rate": 4.694495706169214e-05, + "loss": 0.477, + "step": 4490 + }, + { + "epoch": 1.0482742537313432, + "grad_norm": 0.39498235820057703, + "learning_rate": 4.693622116374427e-05, + "loss": 0.496, + "step": 4495 + }, + { + "epoch": 1.0494402985074627, + "grad_norm": 0.3441985067767204, + "learning_rate": 4.692747370587737e-05, + "loss": 0.4739, + "step": 4500 + }, + { + "epoch": 1.0506063432835822, + "grad_norm": 0.3610968244663037, + "learning_rate": 4.691871469329408e-05, + "loss": 0.472, + "step": 4505 + }, + { + "epoch": 1.0517723880597014, + "grad_norm": 0.3646189009707427, + "learning_rate": 4.690994413120394e-05, + "loss": 0.4605, + "step": 4510 + }, + { + "epoch": 1.052938432835821, + "grad_norm": 0.3555160621444571, + "learning_rate": 4.690116202482335e-05, + "loss": 0.4592, + "step": 4515 + }, + { + "epoch": 1.0541044776119404, + "grad_norm": 0.37864888282318304, + "learning_rate": 4.689236837937556e-05, + "loss": 0.4555, + "step": 4520 + }, + { + "epoch": 1.0552705223880596, + "grad_norm": 0.351622874821909, + "learning_rate": 4.688356320009069e-05, + "loss": 0.495, + "step": 4525 + }, + { + "epoch": 1.056436567164179, + "grad_norm": 0.37630623033926713, + "learning_rate": 4.687474649220573e-05, + "loss": 0.478, + "step": 4530 + }, + { + "epoch": 1.0576026119402986, + "grad_norm": 0.36080528635217013, + "learning_rate": 4.6865918260964506e-05, + "loss": 0.4664, + "step": 4535 + }, + { + "epoch": 1.0587686567164178, + "grad_norm": 0.390910593040182, + "learning_rate": 4.685707851161773e-05, + "loss": 0.4655, + "step": 4540 + }, + { + "epoch": 1.0599347014925373, + "grad_norm": 0.34939052192195547, + "learning_rate": 4.6848227249422936e-05, + "loss": 0.4827, + "step": 4545 + }, + { + "epoch": 1.0611007462686568, + "grad_norm": 0.3776705402142592, + "learning_rate": 4.683936447964452e-05, + "loss": 0.4784, + "step": 4550 + }, + { + "epoch": 1.062266791044776, + "grad_norm": 0.41247129015358364, + "learning_rate": 4.683049020755372e-05, + "loss": 0.4824, + "step": 4555 + }, + { + "epoch": 1.0634328358208955, + "grad_norm": 0.37161703283515807, + "learning_rate": 4.6821604438428594e-05, + "loss": 0.4812, + "step": 4560 + }, + { + "epoch": 1.064598880597015, + "grad_norm": 0.3855721993358096, + "learning_rate": 4.681270717755409e-05, + "loss": 0.4871, + "step": 4565 + }, + { + "epoch": 1.0657649253731343, + "grad_norm": 0.3745003412323164, + "learning_rate": 4.680379843022192e-05, + "loss": 0.4653, + "step": 4570 + }, + { + "epoch": 1.0669309701492538, + "grad_norm": 0.3442522560370515, + "learning_rate": 4.679487820173069e-05, + "loss": 0.4529, + "step": 4575 + }, + { + "epoch": 1.0680970149253732, + "grad_norm": 0.36435916677044505, + "learning_rate": 4.678594649738581e-05, + "loss": 0.4471, + "step": 4580 + }, + { + "epoch": 1.0692630597014925, + "grad_norm": 0.36181593368564924, + "learning_rate": 4.67770033224995e-05, + "loss": 0.4651, + "step": 4585 + }, + { + "epoch": 1.070429104477612, + "grad_norm": 0.3595589483080218, + "learning_rate": 4.676804868239083e-05, + "loss": 0.4366, + "step": 4590 + }, + { + "epoch": 1.0715951492537314, + "grad_norm": 0.3620547821336687, + "learning_rate": 4.675908258238567e-05, + "loss": 0.4646, + "step": 4595 + }, + { + "epoch": 1.0727611940298507, + "grad_norm": 0.3665862033739012, + "learning_rate": 4.6750105027816716e-05, + "loss": 0.4741, + "step": 4600 + }, + { + "epoch": 1.0739272388059702, + "grad_norm": 0.3522918193078635, + "learning_rate": 4.6741116024023476e-05, + "loss": 0.4656, + "step": 4605 + }, + { + "epoch": 1.0750932835820897, + "grad_norm": 0.34441917709894415, + "learning_rate": 4.673211557635225e-05, + "loss": 0.4654, + "step": 4610 + }, + { + "epoch": 1.076259328358209, + "grad_norm": 0.36340291750873155, + "learning_rate": 4.672310369015619e-05, + "loss": 0.4849, + "step": 4615 + }, + { + "epoch": 1.0774253731343284, + "grad_norm": 0.3469345999215564, + "learning_rate": 4.671408037079519e-05, + "loss": 0.4666, + "step": 4620 + }, + { + "epoch": 1.0785914179104477, + "grad_norm": 0.3538303630694054, + "learning_rate": 4.670504562363598e-05, + "loss": 0.4561, + "step": 4625 + }, + { + "epoch": 1.0797574626865671, + "grad_norm": 0.36634890355747535, + "learning_rate": 4.669599945405208e-05, + "loss": 0.4921, + "step": 4630 + }, + { + "epoch": 1.0809235074626866, + "grad_norm": 0.3430965755301908, + "learning_rate": 4.668694186742383e-05, + "loss": 0.4618, + "step": 4635 + }, + { + "epoch": 1.0820895522388059, + "grad_norm": 0.36964625204254353, + "learning_rate": 4.6677872869138304e-05, + "loss": 0.4685, + "step": 4640 + }, + { + "epoch": 1.0832555970149254, + "grad_norm": 0.40764938594719646, + "learning_rate": 4.666879246458941e-05, + "loss": 0.4684, + "step": 4645 + }, + { + "epoch": 1.0844216417910448, + "grad_norm": 0.36940611020903746, + "learning_rate": 4.6659700659177814e-05, + "loss": 0.4672, + "step": 4650 + }, + { + "epoch": 1.085587686567164, + "grad_norm": 0.3534666311242132, + "learning_rate": 4.665059745831098e-05, + "loss": 0.4786, + "step": 4655 + }, + { + "epoch": 1.0867537313432836, + "grad_norm": 0.3708615966075552, + "learning_rate": 4.6641482867403156e-05, + "loss": 0.4615, + "step": 4660 + }, + { + "epoch": 1.087919776119403, + "grad_norm": 0.35557897859045173, + "learning_rate": 4.6632356891875336e-05, + "loss": 0.4715, + "step": 4665 + }, + { + "epoch": 1.0890858208955223, + "grad_norm": 0.3607744066748857, + "learning_rate": 4.662321953715529e-05, + "loss": 0.4591, + "step": 4670 + }, + { + "epoch": 1.0902518656716418, + "grad_norm": 0.36007222893395574, + "learning_rate": 4.661407080867759e-05, + "loss": 0.4574, + "step": 4675 + }, + { + "epoch": 1.0914179104477613, + "grad_norm": 0.38089461165127153, + "learning_rate": 4.660491071188353e-05, + "loss": 0.4846, + "step": 4680 + }, + { + "epoch": 1.0925839552238805, + "grad_norm": 0.3905329009773269, + "learning_rate": 4.6595739252221196e-05, + "loss": 0.5093, + "step": 4685 + }, + { + "epoch": 1.09375, + "grad_norm": 0.3602585268091229, + "learning_rate": 4.658655643514541e-05, + "loss": 0.4792, + "step": 4690 + }, + { + "epoch": 1.0949160447761195, + "grad_norm": 0.4522315050176882, + "learning_rate": 4.657736226611778e-05, + "loss": 0.4864, + "step": 4695 + }, + { + "epoch": 1.0960820895522387, + "grad_norm": 0.3405801635037323, + "learning_rate": 4.656815675060662e-05, + "loss": 0.4689, + "step": 4700 + }, + { + "epoch": 1.0972481343283582, + "grad_norm": 0.4041410578879345, + "learning_rate": 4.655893989408702e-05, + "loss": 0.4792, + "step": 4705 + }, + { + "epoch": 1.0984141791044777, + "grad_norm": 0.3387401672489702, + "learning_rate": 4.654971170204083e-05, + "loss": 0.4605, + "step": 4710 + }, + { + "epoch": 1.099580223880597, + "grad_norm": 0.3553977899178839, + "learning_rate": 4.6540472179956625e-05, + "loss": 0.471, + "step": 4715 + }, + { + "epoch": 1.1007462686567164, + "grad_norm": 0.35250369064647263, + "learning_rate": 4.6531221333329694e-05, + "loss": 0.4807, + "step": 4720 + }, + { + "epoch": 1.101912313432836, + "grad_norm": 0.34666820032492063, + "learning_rate": 4.652195916766211e-05, + "loss": 0.4577, + "step": 4725 + }, + { + "epoch": 1.1030783582089552, + "grad_norm": 0.3687788621939309, + "learning_rate": 4.6512685688462645e-05, + "loss": 0.4929, + "step": 4730 + }, + { + "epoch": 1.1042444029850746, + "grad_norm": 0.3604406893897734, + "learning_rate": 4.65034009012468e-05, + "loss": 0.4694, + "step": 4735 + }, + { + "epoch": 1.1054104477611941, + "grad_norm": 0.39492988501975834, + "learning_rate": 4.649410481153683e-05, + "loss": 0.4847, + "step": 4740 + }, + { + "epoch": 1.1065764925373134, + "grad_norm": 0.3556350936151747, + "learning_rate": 4.6484797424861675e-05, + "loss": 0.4709, + "step": 4745 + }, + { + "epoch": 1.1077425373134329, + "grad_norm": 0.36685815822526896, + "learning_rate": 4.6475478746757025e-05, + "loss": 0.4596, + "step": 4750 + }, + { + "epoch": 1.1089085820895523, + "grad_norm": 0.38179365682843375, + "learning_rate": 4.646614878276526e-05, + "loss": 0.4903, + "step": 4755 + }, + { + "epoch": 1.1100746268656716, + "grad_norm": 0.3477278156168366, + "learning_rate": 4.64568075384355e-05, + "loss": 0.467, + "step": 4760 + }, + { + "epoch": 1.111240671641791, + "grad_norm": 0.3494902566057672, + "learning_rate": 4.644745501932355e-05, + "loss": 0.4527, + "step": 4765 + }, + { + "epoch": 1.1124067164179103, + "grad_norm": 0.35263314338083995, + "learning_rate": 4.643809123099192e-05, + "loss": 0.4932, + "step": 4770 + }, + { + "epoch": 1.1135727611940298, + "grad_norm": 0.3874795877039355, + "learning_rate": 4.6428716179009844e-05, + "loss": 0.4708, + "step": 4775 + }, + { + "epoch": 1.1147388059701493, + "grad_norm": 0.36333961831243017, + "learning_rate": 4.641932986895325e-05, + "loss": 0.4585, + "step": 4780 + }, + { + "epoch": 1.1159048507462686, + "grad_norm": 0.351190710620662, + "learning_rate": 4.6409932306404735e-05, + "loss": 0.4804, + "step": 4785 + }, + { + "epoch": 1.117070895522388, + "grad_norm": 0.31329819772120054, + "learning_rate": 4.640052349695363e-05, + "loss": 0.4637, + "step": 4790 + }, + { + "epoch": 1.1182369402985075, + "grad_norm": 0.3792351860722495, + "learning_rate": 4.6391103446195915e-05, + "loss": 0.4584, + "step": 4795 + }, + { + "epoch": 1.1194029850746268, + "grad_norm": 0.3506865703413139, + "learning_rate": 4.6381672159734287e-05, + "loss": 0.4562, + "step": 4800 + }, + { + "epoch": 1.1205690298507462, + "grad_norm": 0.4326740549088606, + "learning_rate": 4.637222964317811e-05, + "loss": 0.4867, + "step": 4805 + }, + { + "epoch": 1.1217350746268657, + "grad_norm": 0.35337883468208786, + "learning_rate": 4.636277590214344e-05, + "loss": 0.4631, + "step": 4810 + }, + { + "epoch": 1.122901119402985, + "grad_norm": 0.3437797335400648, + "learning_rate": 4.6353310942252986e-05, + "loss": 0.4711, + "step": 4815 + }, + { + "epoch": 1.1240671641791045, + "grad_norm": 0.3349220385183838, + "learning_rate": 4.634383476913615e-05, + "loss": 0.4794, + "step": 4820 + }, + { + "epoch": 1.125233208955224, + "grad_norm": 0.34586059659937457, + "learning_rate": 4.6334347388429e-05, + "loss": 0.4527, + "step": 4825 + }, + { + "epoch": 1.1263992537313432, + "grad_norm": 0.33260407177537676, + "learning_rate": 4.632484880577425e-05, + "loss": 0.462, + "step": 4830 + }, + { + "epoch": 1.1275652985074627, + "grad_norm": 0.34136322338964215, + "learning_rate": 4.6315339026821305e-05, + "loss": 0.4707, + "step": 4835 + }, + { + "epoch": 1.1287313432835822, + "grad_norm": 0.3489751043295923, + "learning_rate": 4.6305818057226226e-05, + "loss": 0.4689, + "step": 4840 + }, + { + "epoch": 1.1298973880597014, + "grad_norm": 0.3693000467333574, + "learning_rate": 4.62962859026517e-05, + "loss": 0.4762, + "step": 4845 + }, + { + "epoch": 1.131063432835821, + "grad_norm": 0.4298711061127338, + "learning_rate": 4.62867425687671e-05, + "loss": 0.4969, + "step": 4850 + }, + { + "epoch": 1.1322294776119404, + "grad_norm": 0.3502124625404359, + "learning_rate": 4.6277188061248436e-05, + "loss": 0.4893, + "step": 4855 + }, + { + "epoch": 1.1333955223880596, + "grad_norm": 0.3682601185314399, + "learning_rate": 4.626762238577836e-05, + "loss": 0.4726, + "step": 4860 + }, + { + "epoch": 1.134561567164179, + "grad_norm": 0.35690149579209757, + "learning_rate": 4.6258045548046166e-05, + "loss": 0.4707, + "step": 4865 + }, + { + "epoch": 1.1357276119402986, + "grad_norm": 0.35016570452171414, + "learning_rate": 4.624845755374779e-05, + "loss": 0.4834, + "step": 4870 + }, + { + "epoch": 1.1368936567164178, + "grad_norm": 0.3563033283287152, + "learning_rate": 4.6238858408585804e-05, + "loss": 0.4571, + "step": 4875 + }, + { + "epoch": 1.1380597014925373, + "grad_norm": 0.3445818160791845, + "learning_rate": 4.622924811826942e-05, + "loss": 0.4722, + "step": 4880 + }, + { + "epoch": 1.1392257462686568, + "grad_norm": 0.34145338015025534, + "learning_rate": 4.6219626688514456e-05, + "loss": 0.4741, + "step": 4885 + }, + { + "epoch": 1.140391791044776, + "grad_norm": 0.36587625257751644, + "learning_rate": 4.620999412504338e-05, + "loss": 0.4618, + "step": 4890 + }, + { + "epoch": 1.1415578358208955, + "grad_norm": 0.34599353777788416, + "learning_rate": 4.620035043358526e-05, + "loss": 0.4747, + "step": 4895 + }, + { + "epoch": 1.142723880597015, + "grad_norm": 0.359034246944853, + "learning_rate": 4.619069561987581e-05, + "loss": 0.4765, + "step": 4900 + }, + { + "epoch": 1.1438899253731343, + "grad_norm": 0.3316031562213355, + "learning_rate": 4.618102968965733e-05, + "loss": 0.4629, + "step": 4905 + }, + { + "epoch": 1.1450559701492538, + "grad_norm": 0.3456782856940626, + "learning_rate": 4.6171352648678755e-05, + "loss": 0.4544, + "step": 4910 + }, + { + "epoch": 1.1462220149253732, + "grad_norm": 0.3597782283482135, + "learning_rate": 4.6161664502695606e-05, + "loss": 0.476, + "step": 4915 + }, + { + "epoch": 1.1473880597014925, + "grad_norm": 0.33986609170732546, + "learning_rate": 4.615196525747003e-05, + "loss": 0.4526, + "step": 4920 + }, + { + "epoch": 1.148554104477612, + "grad_norm": 0.34494264289228443, + "learning_rate": 4.6142254918770764e-05, + "loss": 0.4851, + "step": 4925 + }, + { + "epoch": 1.1497201492537314, + "grad_norm": 0.35939390691080125, + "learning_rate": 4.613253349237314e-05, + "loss": 0.4722, + "step": 4930 + }, + { + "epoch": 1.1508861940298507, + "grad_norm": 0.34825796229089734, + "learning_rate": 4.612280098405909e-05, + "loss": 0.4799, + "step": 4935 + }, + { + "epoch": 1.1520522388059702, + "grad_norm": 0.34466249371372343, + "learning_rate": 4.611305739961715e-05, + "loss": 0.4564, + "step": 4940 + }, + { + "epoch": 1.1532182835820897, + "grad_norm": 0.37636422115279045, + "learning_rate": 4.610330274484242e-05, + "loss": 0.4878, + "step": 4945 + }, + { + "epoch": 1.154384328358209, + "grad_norm": 0.3444610445631163, + "learning_rate": 4.609353702553659e-05, + "loss": 0.4591, + "step": 4950 + }, + { + "epoch": 1.1555503731343284, + "grad_norm": 0.44224030162597666, + "learning_rate": 4.6083760247507945e-05, + "loss": 0.4888, + "step": 4955 + }, + { + "epoch": 1.1567164179104479, + "grad_norm": 0.36083422304592466, + "learning_rate": 4.607397241657133e-05, + "loss": 0.4743, + "step": 4960 + }, + { + "epoch": 1.1578824626865671, + "grad_norm": 0.41777229964352414, + "learning_rate": 4.606417353854818e-05, + "loss": 0.4663, + "step": 4965 + }, + { + "epoch": 1.1590485074626866, + "grad_norm": 0.327211979110645, + "learning_rate": 4.605436361926648e-05, + "loss": 0.4544, + "step": 4970 + }, + { + "epoch": 1.1602145522388059, + "grad_norm": 0.3709306815528986, + "learning_rate": 4.6044542664560804e-05, + "loss": 0.4793, + "step": 4975 + }, + { + "epoch": 1.1613805970149254, + "grad_norm": 0.359192689249793, + "learning_rate": 4.6034710680272274e-05, + "loss": 0.4307, + "step": 4980 + }, + { + "epoch": 1.1625466417910448, + "grad_norm": 0.4667362119738335, + "learning_rate": 4.602486767224858e-05, + "loss": 0.4815, + "step": 4985 + }, + { + "epoch": 1.163712686567164, + "grad_norm": 0.39369758782073544, + "learning_rate": 4.601501364634397e-05, + "loss": 0.468, + "step": 4990 + }, + { + "epoch": 1.1648787313432836, + "grad_norm": 0.35195180185798663, + "learning_rate": 4.600514860841923e-05, + "loss": 0.4974, + "step": 4995 + }, + { + "epoch": 1.166044776119403, + "grad_norm": 0.38108608598276555, + "learning_rate": 4.599527256434171e-05, + "loss": 0.4731, + "step": 5000 + }, + { + "epoch": 1.1672108208955223, + "grad_norm": 0.3389828422725711, + "learning_rate": 4.598538551998531e-05, + "loss": 0.4672, + "step": 5005 + }, + { + "epoch": 1.1683768656716418, + "grad_norm": 0.3456064473874332, + "learning_rate": 4.597548748123046e-05, + "loss": 0.4763, + "step": 5010 + }, + { + "epoch": 1.1695429104477613, + "grad_norm": 0.3897515261074159, + "learning_rate": 4.596557845396412e-05, + "loss": 0.477, + "step": 5015 + }, + { + "epoch": 1.1707089552238805, + "grad_norm": 0.35285702888187354, + "learning_rate": 4.595565844407982e-05, + "loss": 0.47, + "step": 5020 + }, + { + "epoch": 1.171875, + "grad_norm": 0.3436729048905015, + "learning_rate": 4.59457274574776e-05, + "loss": 0.476, + "step": 5025 + }, + { + "epoch": 1.1730410447761195, + "grad_norm": 0.3395289915550358, + "learning_rate": 4.5935785500064014e-05, + "loss": 0.4514, + "step": 5030 + }, + { + "epoch": 1.1742070895522387, + "grad_norm": 0.3459533878168976, + "learning_rate": 4.5925832577752175e-05, + "loss": 0.4803, + "step": 5035 + }, + { + "epoch": 1.1753731343283582, + "grad_norm": 0.3817694396126775, + "learning_rate": 4.5915868696461685e-05, + "loss": 0.4901, + "step": 5040 + }, + { + "epoch": 1.1765391791044777, + "grad_norm": 0.3719161382099099, + "learning_rate": 4.590589386211869e-05, + "loss": 0.4723, + "step": 5045 + }, + { + "epoch": 1.177705223880597, + "grad_norm": 0.3589909527778616, + "learning_rate": 4.589590808065583e-05, + "loss": 0.4634, + "step": 5050 + }, + { + "epoch": 1.1788712686567164, + "grad_norm": 0.3446017057438538, + "learning_rate": 4.588591135801227e-05, + "loss": 0.4736, + "step": 5055 + }, + { + "epoch": 1.180037313432836, + "grad_norm": 0.36066282460265336, + "learning_rate": 4.587590370013367e-05, + "loss": 0.4813, + "step": 5060 + }, + { + "epoch": 1.1812033582089552, + "grad_norm": 0.3348515022796862, + "learning_rate": 4.5865885112972216e-05, + "loss": 0.4668, + "step": 5065 + }, + { + "epoch": 1.1823694029850746, + "grad_norm": 0.3806257961184635, + "learning_rate": 4.585585560248657e-05, + "loss": 0.464, + "step": 5070 + }, + { + "epoch": 1.1835354477611941, + "grad_norm": 0.4055021260870813, + "learning_rate": 4.58458151746419e-05, + "loss": 0.4771, + "step": 5075 + }, + { + "epoch": 1.1847014925373134, + "grad_norm": 0.36024792486991203, + "learning_rate": 4.5835763835409864e-05, + "loss": 0.4847, + "step": 5080 + }, + { + "epoch": 1.1858675373134329, + "grad_norm": 0.36432444987862567, + "learning_rate": 4.5825701590768625e-05, + "loss": 0.4674, + "step": 5085 + }, + { + "epoch": 1.1870335820895521, + "grad_norm": 0.35539556899013736, + "learning_rate": 4.58156284467028e-05, + "loss": 0.4644, + "step": 5090 + }, + { + "epoch": 1.1881996268656716, + "grad_norm": 0.36012386102501925, + "learning_rate": 4.5805544409203535e-05, + "loss": 0.4761, + "step": 5095 + }, + { + "epoch": 1.189365671641791, + "grad_norm": 0.37398174590872096, + "learning_rate": 4.579544948426841e-05, + "loss": 0.4799, + "step": 5100 + }, + { + "epoch": 1.1905317164179103, + "grad_norm": 0.3994282420738512, + "learning_rate": 4.57853436779015e-05, + "loss": 0.4879, + "step": 5105 + }, + { + "epoch": 1.1916977611940298, + "grad_norm": 0.39624462528288745, + "learning_rate": 4.577522699611336e-05, + "loss": 0.4687, + "step": 5110 + }, + { + "epoch": 1.1928638059701493, + "grad_norm": 0.36822580927497833, + "learning_rate": 4.576509944492101e-05, + "loss": 0.4743, + "step": 5115 + }, + { + "epoch": 1.1940298507462686, + "grad_norm": 0.3418725043685805, + "learning_rate": 4.57549610303479e-05, + "loss": 0.4653, + "step": 5120 + }, + { + "epoch": 1.195195895522388, + "grad_norm": 0.371728706042081, + "learning_rate": 4.5744811758424e-05, + "loss": 0.4931, + "step": 5125 + }, + { + "epoch": 1.1963619402985075, + "grad_norm": 0.3639777699347714, + "learning_rate": 4.573465163518569e-05, + "loss": 0.4556, + "step": 5130 + }, + { + "epoch": 1.1975279850746268, + "grad_norm": 0.3655513971592392, + "learning_rate": 4.572448066667584e-05, + "loss": 0.4759, + "step": 5135 + }, + { + "epoch": 1.1986940298507462, + "grad_norm": 0.3804150438485859, + "learning_rate": 4.571429885894373e-05, + "loss": 0.4786, + "step": 5140 + }, + { + "epoch": 1.1998600746268657, + "grad_norm": 0.37662124845828593, + "learning_rate": 4.5704106218045124e-05, + "loss": 0.49, + "step": 5145 + }, + { + "epoch": 1.201026119402985, + "grad_norm": 0.3538352986060505, + "learning_rate": 4.569390275004221e-05, + "loss": 0.4786, + "step": 5150 + }, + { + "epoch": 1.2021921641791045, + "grad_norm": 0.3401736071338025, + "learning_rate": 4.568368846100363e-05, + "loss": 0.4807, + "step": 5155 + }, + { + "epoch": 1.203358208955224, + "grad_norm": 0.3871943834832515, + "learning_rate": 4.567346335700442e-05, + "loss": 0.4951, + "step": 5160 + }, + { + "epoch": 1.2045242537313432, + "grad_norm": 0.34238863528262575, + "learning_rate": 4.5663227444126114e-05, + "loss": 0.4796, + "step": 5165 + }, + { + "epoch": 1.2056902985074627, + "grad_norm": 0.3120375829312124, + "learning_rate": 4.565298072845662e-05, + "loss": 0.4435, + "step": 5170 + }, + { + "epoch": 1.2068563432835822, + "grad_norm": 0.34929115168215075, + "learning_rate": 4.564272321609031e-05, + "loss": 0.4593, + "step": 5175 + }, + { + "epoch": 1.2080223880597014, + "grad_norm": 0.3642388243430463, + "learning_rate": 4.563245491312793e-05, + "loss": 0.4965, + "step": 5180 + }, + { + "epoch": 1.209188432835821, + "grad_norm": 0.34560827023747864, + "learning_rate": 4.5622175825676695e-05, + "loss": 0.4455, + "step": 5185 + }, + { + "epoch": 1.2103544776119404, + "grad_norm": 0.3544268002221307, + "learning_rate": 4.5611885959850216e-05, + "loss": 0.4656, + "step": 5190 + }, + { + "epoch": 1.2115205223880596, + "grad_norm": 0.35984517231494456, + "learning_rate": 4.560158532176849e-05, + "loss": 0.4618, + "step": 5195 + }, + { + "epoch": 1.212686567164179, + "grad_norm": 0.33659386973381583, + "learning_rate": 4.559127391755796e-05, + "loss": 0.452, + "step": 5200 + }, + { + "epoch": 1.2138526119402986, + "grad_norm": 0.3386594459424572, + "learning_rate": 4.558095175335145e-05, + "loss": 0.4861, + "step": 5205 + }, + { + "epoch": 1.2150186567164178, + "grad_norm": 0.38530834371843253, + "learning_rate": 4.557061883528818e-05, + "loss": 0.4703, + "step": 5210 + }, + { + "epoch": 1.2161847014925373, + "grad_norm": 0.34018779300657914, + "learning_rate": 4.5560275169513786e-05, + "loss": 0.4752, + "step": 5215 + }, + { + "epoch": 1.2173507462686568, + "grad_norm": 0.3665096540503898, + "learning_rate": 4.554992076218026e-05, + "loss": 0.4591, + "step": 5220 + }, + { + "epoch": 1.218516791044776, + "grad_norm": 0.3369144313377284, + "learning_rate": 4.553955561944603e-05, + "loss": 0.4564, + "step": 5225 + }, + { + "epoch": 1.2196828358208955, + "grad_norm": 0.3468759578660706, + "learning_rate": 4.552917974747588e-05, + "loss": 0.4855, + "step": 5230 + }, + { + "epoch": 1.220848880597015, + "grad_norm": 0.35481173701535745, + "learning_rate": 4.551879315244098e-05, + "loss": 0.4617, + "step": 5235 + }, + { + "epoch": 1.2220149253731343, + "grad_norm": 0.3450986925909998, + "learning_rate": 4.5508395840518884e-05, + "loss": 0.4499, + "step": 5240 + }, + { + "epoch": 1.2231809701492538, + "grad_norm": 0.3576714143656946, + "learning_rate": 4.549798781789349e-05, + "loss": 0.4615, + "step": 5245 + }, + { + "epoch": 1.2243470149253732, + "grad_norm": 0.34560502842767293, + "learning_rate": 4.548756909075511e-05, + "loss": 0.4898, + "step": 5250 + }, + { + "epoch": 1.2255130597014925, + "grad_norm": 0.34514981942949313, + "learning_rate": 4.5477139665300414e-05, + "loss": 0.458, + "step": 5255 + }, + { + "epoch": 1.226679104477612, + "grad_norm": 0.37756186234396855, + "learning_rate": 4.5466699547732405e-05, + "loss": 0.467, + "step": 5260 + }, + { + "epoch": 1.2278451492537314, + "grad_norm": 0.34040811641059854, + "learning_rate": 4.545624874426047e-05, + "loss": 0.4585, + "step": 5265 + }, + { + "epoch": 1.2290111940298507, + "grad_norm": 0.3559940480316581, + "learning_rate": 4.544578726110035e-05, + "loss": 0.4618, + "step": 5270 + }, + { + "epoch": 1.2301772388059702, + "grad_norm": 0.351376294469949, + "learning_rate": 4.5435315104474124e-05, + "loss": 0.4867, + "step": 5275 + }, + { + "epoch": 1.2313432835820897, + "grad_norm": 0.3670231506185556, + "learning_rate": 4.5424832280610245e-05, + "loss": 0.4791, + "step": 5280 + }, + { + "epoch": 1.232509328358209, + "grad_norm": 0.34028405863163114, + "learning_rate": 4.541433879574348e-05, + "loss": 0.4617, + "step": 5285 + }, + { + "epoch": 1.2336753731343284, + "grad_norm": 0.3393481641804084, + "learning_rate": 4.540383465611496e-05, + "loss": 0.457, + "step": 5290 + }, + { + "epoch": 1.2348414179104479, + "grad_norm": 0.3786249770896504, + "learning_rate": 4.539331986797215e-05, + "loss": 0.4803, + "step": 5295 + }, + { + "epoch": 1.2360074626865671, + "grad_norm": 0.374430924707444, + "learning_rate": 4.5382794437568824e-05, + "loss": 0.4729, + "step": 5300 + }, + { + "epoch": 1.2371735074626866, + "grad_norm": 0.3422094187059447, + "learning_rate": 4.537225837116512e-05, + "loss": 0.47, + "step": 5305 + }, + { + "epoch": 1.2383395522388059, + "grad_norm": 0.3530091405504405, + "learning_rate": 4.5361711675027484e-05, + "loss": 0.4682, + "step": 5310 + }, + { + "epoch": 1.2395055970149254, + "grad_norm": 0.3654578778498564, + "learning_rate": 4.535115435542868e-05, + "loss": 0.4688, + "step": 5315 + }, + { + "epoch": 1.2406716417910448, + "grad_norm": 0.3422839665065767, + "learning_rate": 4.53405864186478e-05, + "loss": 0.492, + "step": 5320 + }, + { + "epoch": 1.241837686567164, + "grad_norm": 0.34419522904570493, + "learning_rate": 4.5330007870970255e-05, + "loss": 0.4661, + "step": 5325 + }, + { + "epoch": 1.2430037313432836, + "grad_norm": 0.3820446962350419, + "learning_rate": 4.531941871868775e-05, + "loss": 0.4764, + "step": 5330 + }, + { + "epoch": 1.244169776119403, + "grad_norm": 0.34931863469762414, + "learning_rate": 4.530881896809831e-05, + "loss": 0.4662, + "step": 5335 + }, + { + "epoch": 1.2453358208955223, + "grad_norm": 0.40255326996579954, + "learning_rate": 4.5298208625506253e-05, + "loss": 0.4724, + "step": 5340 + }, + { + "epoch": 1.2465018656716418, + "grad_norm": 0.3471022066775054, + "learning_rate": 4.5287587697222215e-05, + "loss": 0.4677, + "step": 5345 + }, + { + "epoch": 1.2476679104477613, + "grad_norm": 0.3338908746986557, + "learning_rate": 4.527695618956312e-05, + "loss": 0.4906, + "step": 5350 + }, + { + "epoch": 1.2488339552238805, + "grad_norm": 0.3560939004514755, + "learning_rate": 4.5266314108852166e-05, + "loss": 0.4806, + "step": 5355 + }, + { + "epoch": 1.25, + "grad_norm": 0.34642499571322316, + "learning_rate": 4.5255661461418854e-05, + "loss": 0.4921, + "step": 5360 + }, + { + "epoch": 1.2511660447761195, + "grad_norm": 0.35942052354277376, + "learning_rate": 4.5244998253598994e-05, + "loss": 0.4673, + "step": 5365 + }, + { + "epoch": 1.2523320895522387, + "grad_norm": 0.33618468091116055, + "learning_rate": 4.5234324491734624e-05, + "loss": 0.4699, + "step": 5370 + }, + { + "epoch": 1.2534981343283582, + "grad_norm": 0.3621804767634846, + "learning_rate": 4.5223640182174115e-05, + "loss": 0.4937, + "step": 5375 + }, + { + "epoch": 1.2546641791044777, + "grad_norm": 0.39067172926492755, + "learning_rate": 4.521294533127206e-05, + "loss": 0.4747, + "step": 5380 + }, + { + "epoch": 1.255830223880597, + "grad_norm": 0.3519068497378385, + "learning_rate": 4.520223994538937e-05, + "loss": 0.4577, + "step": 5385 + }, + { + "epoch": 1.2569962686567164, + "grad_norm": 0.3346943899355124, + "learning_rate": 4.519152403089317e-05, + "loss": 0.4673, + "step": 5390 + }, + { + "epoch": 1.2581623134328357, + "grad_norm": 0.33450040481508014, + "learning_rate": 4.51807975941569e-05, + "loss": 0.4827, + "step": 5395 + }, + { + "epoch": 1.2593283582089552, + "grad_norm": 0.3395969296470776, + "learning_rate": 4.517006064156023e-05, + "loss": 0.456, + "step": 5400 + }, + { + "epoch": 1.2604944029850746, + "grad_norm": 0.3606714318871862, + "learning_rate": 4.515931317948907e-05, + "loss": 0.4648, + "step": 5405 + }, + { + "epoch": 1.261660447761194, + "grad_norm": 0.3598142629013287, + "learning_rate": 4.5148555214335616e-05, + "loss": 0.4586, + "step": 5410 + }, + { + "epoch": 1.2628264925373134, + "grad_norm": 0.3417923393849799, + "learning_rate": 4.5137786752498285e-05, + "loss": 0.4562, + "step": 5415 + }, + { + "epoch": 1.2639925373134329, + "grad_norm": 0.332696637003428, + "learning_rate": 4.512700780038174e-05, + "loss": 0.4686, + "step": 5420 + }, + { + "epoch": 1.2651585820895521, + "grad_norm": 0.3495175993937834, + "learning_rate": 4.5116218364396904e-05, + "loss": 0.453, + "step": 5425 + }, + { + "epoch": 1.2663246268656716, + "grad_norm": 0.35815858890808655, + "learning_rate": 4.510541845096091e-05, + "loss": 0.4444, + "step": 5430 + }, + { + "epoch": 1.267490671641791, + "grad_norm": 0.32114652584389525, + "learning_rate": 4.509460806649714e-05, + "loss": 0.4622, + "step": 5435 + }, + { + "epoch": 1.2686567164179103, + "grad_norm": 0.34840881577020344, + "learning_rate": 4.5083787217435175e-05, + "loss": 0.4686, + "step": 5440 + }, + { + "epoch": 1.2698227611940298, + "grad_norm": 0.3529734030459022, + "learning_rate": 4.507295591021087e-05, + "loss": 0.4811, + "step": 5445 + }, + { + "epoch": 1.2709888059701493, + "grad_norm": 0.3406895748034019, + "learning_rate": 4.506211415126624e-05, + "loss": 0.4707, + "step": 5450 + }, + { + "epoch": 1.2721548507462686, + "grad_norm": 0.34948257097100904, + "learning_rate": 4.505126194704958e-05, + "loss": 0.4657, + "step": 5455 + }, + { + "epoch": 1.273320895522388, + "grad_norm": 0.3481689704013532, + "learning_rate": 4.504039930401535e-05, + "loss": 0.4982, + "step": 5460 + }, + { + "epoch": 1.2744869402985075, + "grad_norm": 0.36571058022165265, + "learning_rate": 4.5029526228624226e-05, + "loss": 0.4894, + "step": 5465 + }, + { + "epoch": 1.2756529850746268, + "grad_norm": 0.3504718839400544, + "learning_rate": 4.501864272734311e-05, + "loss": 0.4909, + "step": 5470 + }, + { + "epoch": 1.2768190298507462, + "grad_norm": 0.3712428064742856, + "learning_rate": 4.500774880664508e-05, + "loss": 0.4614, + "step": 5475 + }, + { + "epoch": 1.2779850746268657, + "grad_norm": 0.3367222023286355, + "learning_rate": 4.4996844473009425e-05, + "loss": 0.4843, + "step": 5480 + }, + { + "epoch": 1.279151119402985, + "grad_norm": 0.3467565130703101, + "learning_rate": 4.498592973292162e-05, + "loss": 0.4748, + "step": 5485 + }, + { + "epoch": 1.2803171641791045, + "grad_norm": 0.352271810512614, + "learning_rate": 4.497500459287335e-05, + "loss": 0.4758, + "step": 5490 + }, + { + "epoch": 1.281483208955224, + "grad_norm": 0.3649509574716188, + "learning_rate": 4.496406905936246e-05, + "loss": 0.4618, + "step": 5495 + }, + { + "epoch": 1.2826492537313432, + "grad_norm": 0.34154397167538003, + "learning_rate": 4.4953123138892984e-05, + "loss": 0.4399, + "step": 5500 + }, + { + "epoch": 1.2838152985074627, + "grad_norm": 0.35770749183416967, + "learning_rate": 4.4942166837975134e-05, + "loss": 0.4734, + "step": 5505 + }, + { + "epoch": 1.2849813432835822, + "grad_norm": 0.33193004769625795, + "learning_rate": 4.4931200163125306e-05, + "loss": 0.4637, + "step": 5510 + }, + { + "epoch": 1.2861473880597014, + "grad_norm": 0.32636524823546637, + "learning_rate": 4.492022312086605e-05, + "loss": 0.4628, + "step": 5515 + }, + { + "epoch": 1.287313432835821, + "grad_norm": 0.37972064478182993, + "learning_rate": 4.4909235717726086e-05, + "loss": 0.4809, + "step": 5520 + }, + { + "epoch": 1.2884794776119404, + "grad_norm": 0.3705003936352805, + "learning_rate": 4.4898237960240315e-05, + "loss": 0.4862, + "step": 5525 + }, + { + "epoch": 1.2896455223880596, + "grad_norm": 0.35557349238462793, + "learning_rate": 4.488722985494978e-05, + "loss": 0.4713, + "step": 5530 + }, + { + "epoch": 1.290811567164179, + "grad_norm": 0.3520554264053171, + "learning_rate": 4.487621140840165e-05, + "loss": 0.4665, + "step": 5535 + }, + { + "epoch": 1.2919776119402986, + "grad_norm": 0.3493763195955227, + "learning_rate": 4.486518262714931e-05, + "loss": 0.4775, + "step": 5540 + }, + { + "epoch": 1.2931436567164178, + "grad_norm": 0.3434809439568833, + "learning_rate": 4.485414351775224e-05, + "loss": 0.4677, + "step": 5545 + }, + { + "epoch": 1.2943097014925373, + "grad_norm": 0.3448956584886712, + "learning_rate": 4.484309408677609e-05, + "loss": 0.4733, + "step": 5550 + }, + { + "epoch": 1.2954757462686568, + "grad_norm": 0.3511373110368711, + "learning_rate": 4.483203434079263e-05, + "loss": 0.4817, + "step": 5555 + }, + { + "epoch": 1.296641791044776, + "grad_norm": 0.34682648049040404, + "learning_rate": 4.4820964286379764e-05, + "loss": 0.4845, + "step": 5560 + }, + { + "epoch": 1.2978078358208955, + "grad_norm": 0.33948525207732083, + "learning_rate": 4.480988393012155e-05, + "loss": 0.4629, + "step": 5565 + }, + { + "epoch": 1.298973880597015, + "grad_norm": 0.357963666780957, + "learning_rate": 4.479879327860816e-05, + "loss": 0.4887, + "step": 5570 + }, + { + "epoch": 1.3001399253731343, + "grad_norm": 0.3540736603121829, + "learning_rate": 4.478769233843587e-05, + "loss": 0.4706, + "step": 5575 + }, + { + "epoch": 1.3013059701492538, + "grad_norm": 0.34082396033556095, + "learning_rate": 4.477658111620711e-05, + "loss": 0.4645, + "step": 5580 + }, + { + "epoch": 1.3024720149253732, + "grad_norm": 0.35335694823393465, + "learning_rate": 4.4765459618530405e-05, + "loss": 0.4775, + "step": 5585 + }, + { + "epoch": 1.3036380597014925, + "grad_norm": 0.35226879726747845, + "learning_rate": 4.47543278520204e-05, + "loss": 0.4592, + "step": 5590 + }, + { + "epoch": 1.304804104477612, + "grad_norm": 0.38845177796693403, + "learning_rate": 4.474318582329783e-05, + "loss": 0.4393, + "step": 5595 + }, + { + "epoch": 1.3059701492537314, + "grad_norm": 0.3656431063157445, + "learning_rate": 4.4732033538989556e-05, + "loss": 0.4551, + "step": 5600 + }, + { + "epoch": 1.3071361940298507, + "grad_norm": 0.3575967154476404, + "learning_rate": 4.4720871005728526e-05, + "loss": 0.4803, + "step": 5605 + }, + { + "epoch": 1.3083022388059702, + "grad_norm": 0.41037970036578786, + "learning_rate": 4.47096982301538e-05, + "loss": 0.4615, + "step": 5610 + }, + { + "epoch": 1.3094682835820897, + "grad_norm": 0.35118234109058316, + "learning_rate": 4.469851521891049e-05, + "loss": 0.4716, + "step": 5615 + }, + { + "epoch": 1.310634328358209, + "grad_norm": 0.35030180181912374, + "learning_rate": 4.468732197864984e-05, + "loss": 0.4825, + "step": 5620 + }, + { + "epoch": 1.3118003731343284, + "grad_norm": 0.34959973159151725, + "learning_rate": 4.467611851602916e-05, + "loss": 0.4823, + "step": 5625 + }, + { + "epoch": 1.3129664179104479, + "grad_norm": 0.34289899511328137, + "learning_rate": 4.4664904837711835e-05, + "loss": 0.4701, + "step": 5630 + }, + { + "epoch": 1.3141324626865671, + "grad_norm": 0.3369597712695564, + "learning_rate": 4.465368095036733e-05, + "loss": 0.4467, + "step": 5635 + }, + { + "epoch": 1.3152985074626866, + "grad_norm": 0.35788483394262643, + "learning_rate": 4.4642446860671185e-05, + "loss": 0.4806, + "step": 5640 + }, + { + "epoch": 1.316464552238806, + "grad_norm": 0.37717250387952733, + "learning_rate": 4.463120257530501e-05, + "loss": 0.4709, + "step": 5645 + }, + { + "epoch": 1.3176305970149254, + "grad_norm": 0.35296128412664984, + "learning_rate": 4.461994810095647e-05, + "loss": 0.456, + "step": 5650 + }, + { + "epoch": 1.3187966417910448, + "grad_norm": 0.33699362841592273, + "learning_rate": 4.46086834443193e-05, + "loss": 0.4627, + "step": 5655 + }, + { + "epoch": 1.3199626865671643, + "grad_norm": 0.336295217139395, + "learning_rate": 4.4597408612093265e-05, + "loss": 0.4826, + "step": 5660 + }, + { + "epoch": 1.3211287313432836, + "grad_norm": 0.3387351126786447, + "learning_rate": 4.458612361098423e-05, + "loss": 0.448, + "step": 5665 + }, + { + "epoch": 1.322294776119403, + "grad_norm": 0.3371270481559477, + "learning_rate": 4.457482844770408e-05, + "loss": 0.4835, + "step": 5670 + }, + { + "epoch": 1.3234608208955223, + "grad_norm": 0.38399747806048057, + "learning_rate": 4.456352312897072e-05, + "loss": 0.4961, + "step": 5675 + }, + { + "epoch": 1.3246268656716418, + "grad_norm": 0.34975638032156975, + "learning_rate": 4.455220766150814e-05, + "loss": 0.4802, + "step": 5680 + }, + { + "epoch": 1.3257929104477613, + "grad_norm": 0.36240364241292927, + "learning_rate": 4.454088205204634e-05, + "loss": 0.4773, + "step": 5685 + }, + { + "epoch": 1.3269589552238805, + "grad_norm": 0.3367159592821131, + "learning_rate": 4.452954630732136e-05, + "loss": 0.4494, + "step": 5690 + }, + { + "epoch": 1.328125, + "grad_norm": 0.3900161527854174, + "learning_rate": 4.451820043407527e-05, + "loss": 0.4731, + "step": 5695 + }, + { + "epoch": 1.3292910447761195, + "grad_norm": 0.3391857878049143, + "learning_rate": 4.450684443905615e-05, + "loss": 0.4843, + "step": 5700 + }, + { + "epoch": 1.3304570895522387, + "grad_norm": 0.36324288923377723, + "learning_rate": 4.4495478329018125e-05, + "loss": 0.4807, + "step": 5705 + }, + { + "epoch": 1.3316231343283582, + "grad_norm": 0.34646667743589166, + "learning_rate": 4.44841021107213e-05, + "loss": 0.4771, + "step": 5710 + }, + { + "epoch": 1.3327891791044777, + "grad_norm": 0.35641048169105694, + "learning_rate": 4.447271579093185e-05, + "loss": 0.4623, + "step": 5715 + }, + { + "epoch": 1.333955223880597, + "grad_norm": 0.3563101542477204, + "learning_rate": 4.4461319376421875e-05, + "loss": 0.4591, + "step": 5720 + }, + { + "epoch": 1.3351212686567164, + "grad_norm": 0.4141322825166876, + "learning_rate": 4.444991287396955e-05, + "loss": 0.461, + "step": 5725 + }, + { + "epoch": 1.3362873134328357, + "grad_norm": 0.3248529230379204, + "learning_rate": 4.443849629035903e-05, + "loss": 0.4478, + "step": 5730 + }, + { + "epoch": 1.3374533582089552, + "grad_norm": 0.3886089588340654, + "learning_rate": 4.4427069632380455e-05, + "loss": 0.4628, + "step": 5735 + }, + { + "epoch": 1.3386194029850746, + "grad_norm": 0.33379826652693817, + "learning_rate": 4.441563290682996e-05, + "loss": 0.469, + "step": 5740 + }, + { + "epoch": 1.339785447761194, + "grad_norm": 0.3429521150843274, + "learning_rate": 4.4404186120509674e-05, + "loss": 0.474, + "step": 5745 + }, + { + "epoch": 1.3409514925373134, + "grad_norm": 0.35876226703538694, + "learning_rate": 4.43927292802277e-05, + "loss": 0.4703, + "step": 5750 + }, + { + "epoch": 1.3421175373134329, + "grad_norm": 0.3569189498874493, + "learning_rate": 4.438126239279814e-05, + "loss": 0.4701, + "step": 5755 + }, + { + "epoch": 1.3432835820895521, + "grad_norm": 0.3189911195518358, + "learning_rate": 4.436978546504105e-05, + "loss": 0.4584, + "step": 5760 + }, + { + "epoch": 1.3444496268656716, + "grad_norm": 0.36161508269905673, + "learning_rate": 4.435829850378247e-05, + "loss": 0.4892, + "step": 5765 + }, + { + "epoch": 1.345615671641791, + "grad_norm": 0.3367798013900717, + "learning_rate": 4.43468015158544e-05, + "loss": 0.4642, + "step": 5770 + }, + { + "epoch": 1.3467817164179103, + "grad_norm": 0.39248959182020965, + "learning_rate": 4.433529450809481e-05, + "loss": 0.4714, + "step": 5775 + }, + { + "epoch": 1.3479477611940298, + "grad_norm": 0.3952304086704552, + "learning_rate": 4.432377748734763e-05, + "loss": 0.4659, + "step": 5780 + }, + { + "epoch": 1.3491138059701493, + "grad_norm": 0.36098942675552664, + "learning_rate": 4.431225046046274e-05, + "loss": 0.4585, + "step": 5785 + }, + { + "epoch": 1.3502798507462686, + "grad_norm": 0.34396963031792716, + "learning_rate": 4.430071343429597e-05, + "loss": 0.4575, + "step": 5790 + }, + { + "epoch": 1.351445895522388, + "grad_norm": 0.3516631835202446, + "learning_rate": 4.4289166415709096e-05, + "loss": 0.467, + "step": 5795 + }, + { + "epoch": 1.3526119402985075, + "grad_norm": 0.34830728074438166, + "learning_rate": 4.427760941156986e-05, + "loss": 0.443, + "step": 5800 + }, + { + "epoch": 1.3537779850746268, + "grad_norm": 0.3401554835447212, + "learning_rate": 4.426604242875191e-05, + "loss": 0.4736, + "step": 5805 + }, + { + "epoch": 1.3549440298507462, + "grad_norm": 0.31692934632881437, + "learning_rate": 4.4254465474134856e-05, + "loss": 0.4664, + "step": 5810 + }, + { + "epoch": 1.3561100746268657, + "grad_norm": 0.3344355543563697, + "learning_rate": 4.42428785546042e-05, + "loss": 0.4418, + "step": 5815 + }, + { + "epoch": 1.357276119402985, + "grad_norm": 0.3429853648105938, + "learning_rate": 4.423128167705144e-05, + "loss": 0.4934, + "step": 5820 + }, + { + "epoch": 1.3584421641791045, + "grad_norm": 0.3372551467137175, + "learning_rate": 4.4219674848373924e-05, + "loss": 0.4557, + "step": 5825 + }, + { + "epoch": 1.359608208955224, + "grad_norm": 0.33816161202855277, + "learning_rate": 4.4208058075474945e-05, + "loss": 0.4538, + "step": 5830 + }, + { + "epoch": 1.3607742537313432, + "grad_norm": 0.35285268917873747, + "learning_rate": 4.419643136526373e-05, + "loss": 0.4811, + "step": 5835 + }, + { + "epoch": 1.3619402985074627, + "grad_norm": 0.3488461077093203, + "learning_rate": 4.418479472465539e-05, + "loss": 0.4766, + "step": 5840 + }, + { + "epoch": 1.3631063432835822, + "grad_norm": 0.35482334452327685, + "learning_rate": 4.417314816057096e-05, + "loss": 0.4745, + "step": 5845 + }, + { + "epoch": 1.3642723880597014, + "grad_norm": 0.34765449286342404, + "learning_rate": 4.416149167993737e-05, + "loss": 0.4756, + "step": 5850 + }, + { + "epoch": 1.365438432835821, + "grad_norm": 0.33080434875289166, + "learning_rate": 4.4149825289687454e-05, + "loss": 0.4808, + "step": 5855 + }, + { + "epoch": 1.3666044776119404, + "grad_norm": 0.34179222147845023, + "learning_rate": 4.413814899675991e-05, + "loss": 0.4751, + "step": 5860 + }, + { + "epoch": 1.3677705223880596, + "grad_norm": 0.3439575534468635, + "learning_rate": 4.4126462808099364e-05, + "loss": 0.4775, + "step": 5865 + }, + { + "epoch": 1.368936567164179, + "grad_norm": 0.3184975726093386, + "learning_rate": 4.411476673065631e-05, + "loss": 0.4662, + "step": 5870 + }, + { + "epoch": 1.3701026119402986, + "grad_norm": 0.333398032842768, + "learning_rate": 4.410306077138713e-05, + "loss": 0.452, + "step": 5875 + }, + { + "epoch": 1.3712686567164178, + "grad_norm": 0.32159660884405294, + "learning_rate": 4.409134493725409e-05, + "loss": 0.4489, + "step": 5880 + }, + { + "epoch": 1.3724347014925373, + "grad_norm": 0.35401452948354284, + "learning_rate": 4.407961923522529e-05, + "loss": 0.5064, + "step": 5885 + }, + { + "epoch": 1.3736007462686568, + "grad_norm": 0.3436323801869561, + "learning_rate": 4.406788367227475e-05, + "loss": 0.4644, + "step": 5890 + }, + { + "epoch": 1.374766791044776, + "grad_norm": 0.34815397841072016, + "learning_rate": 4.4056138255382335e-05, + "loss": 0.4898, + "step": 5895 + }, + { + "epoch": 1.3759328358208955, + "grad_norm": 0.40504575116335834, + "learning_rate": 4.404438299153376e-05, + "loss": 0.5113, + "step": 5900 + }, + { + "epoch": 1.377098880597015, + "grad_norm": 0.3687852567765307, + "learning_rate": 4.4032617887720604e-05, + "loss": 0.4967, + "step": 5905 + }, + { + "epoch": 1.3782649253731343, + "grad_norm": 0.3738569955095955, + "learning_rate": 4.4020842950940294e-05, + "loss": 0.4832, + "step": 5910 + }, + { + "epoch": 1.3794309701492538, + "grad_norm": 0.365148671617145, + "learning_rate": 4.400905818819613e-05, + "loss": 0.4428, + "step": 5915 + }, + { + "epoch": 1.3805970149253732, + "grad_norm": 0.33811311087636564, + "learning_rate": 4.3997263606497225e-05, + "loss": 0.4714, + "step": 5920 + }, + { + "epoch": 1.3817630597014925, + "grad_norm": 0.33101762048774236, + "learning_rate": 4.3985459212858535e-05, + "loss": 0.4644, + "step": 5925 + }, + { + "epoch": 1.382929104477612, + "grad_norm": 0.4045628708096018, + "learning_rate": 4.397364501430088e-05, + "loss": 0.4933, + "step": 5930 + }, + { + "epoch": 1.3840951492537314, + "grad_norm": 0.34091667587368785, + "learning_rate": 4.396182101785089e-05, + "loss": 0.4815, + "step": 5935 + }, + { + "epoch": 1.3852611940298507, + "grad_norm": 0.330650194374689, + "learning_rate": 4.3949987230541e-05, + "loss": 0.4662, + "step": 5940 + }, + { + "epoch": 1.3864272388059702, + "grad_norm": 0.3410642768782291, + "learning_rate": 4.3938143659409515e-05, + "loss": 0.4733, + "step": 5945 + }, + { + "epoch": 1.3875932835820897, + "grad_norm": 0.3253332678449179, + "learning_rate": 4.392629031150054e-05, + "loss": 0.4757, + "step": 5950 + }, + { + "epoch": 1.388759328358209, + "grad_norm": 0.32630858099074683, + "learning_rate": 4.391442719386398e-05, + "loss": 0.4551, + "step": 5955 + }, + { + "epoch": 1.3899253731343284, + "grad_norm": 0.3713212930833286, + "learning_rate": 4.390255431355557e-05, + "loss": 0.452, + "step": 5960 + }, + { + "epoch": 1.3910914179104479, + "grad_norm": 0.3625155387958956, + "learning_rate": 4.389067167763683e-05, + "loss": 0.4541, + "step": 5965 + }, + { + "epoch": 1.3922574626865671, + "grad_norm": 0.3543990369217148, + "learning_rate": 4.387877929317512e-05, + "loss": 0.4975, + "step": 5970 + }, + { + "epoch": 1.3934235074626866, + "grad_norm": 0.3374150857174104, + "learning_rate": 4.3866877167243554e-05, + "loss": 0.4589, + "step": 5975 + }, + { + "epoch": 1.394589552238806, + "grad_norm": 0.3724150202757855, + "learning_rate": 4.3854965306921064e-05, + "loss": 0.4895, + "step": 5980 + }, + { + "epoch": 1.3957555970149254, + "grad_norm": 0.334775208986404, + "learning_rate": 4.384304371929238e-05, + "loss": 0.4702, + "step": 5985 + }, + { + "epoch": 1.3969216417910448, + "grad_norm": 0.34216010538815633, + "learning_rate": 4.383111241144798e-05, + "loss": 0.4443, + "step": 5990 + }, + { + "epoch": 1.3980876865671643, + "grad_norm": 0.3412124651166191, + "learning_rate": 4.3819171390484184e-05, + "loss": 0.4623, + "step": 5995 + }, + { + "epoch": 1.3992537313432836, + "grad_norm": 0.3425367727967644, + "learning_rate": 4.380722066350303e-05, + "loss": 0.474, + "step": 6000 + }, + { + "epoch": 1.400419776119403, + "grad_norm": 0.33250347319099394, + "learning_rate": 4.3795260237612353e-05, + "loss": 0.4695, + "step": 6005 + }, + { + "epoch": 1.4015858208955223, + "grad_norm": 0.35582044943792523, + "learning_rate": 4.378329011992575e-05, + "loss": 0.4767, + "step": 6010 + }, + { + "epoch": 1.4027518656716418, + "grad_norm": 0.329136701757757, + "learning_rate": 4.37713103175626e-05, + "loss": 0.4575, + "step": 6015 + }, + { + "epoch": 1.4039179104477613, + "grad_norm": 0.3375566200650066, + "learning_rate": 4.375932083764803e-05, + "loss": 0.4553, + "step": 6020 + }, + { + "epoch": 1.4050839552238805, + "grad_norm": 0.36949881723621175, + "learning_rate": 4.3747321687312916e-05, + "loss": 0.4606, + "step": 6025 + }, + { + "epoch": 1.40625, + "grad_norm": 0.3393860235345915, + "learning_rate": 4.37353128736939e-05, + "loss": 0.4854, + "step": 6030 + }, + { + "epoch": 1.4074160447761195, + "grad_norm": 0.39783506079987735, + "learning_rate": 4.3723294403933355e-05, + "loss": 0.4686, + "step": 6035 + }, + { + "epoch": 1.4085820895522387, + "grad_norm": 0.44253275431438494, + "learning_rate": 4.3711266285179415e-05, + "loss": 0.4696, + "step": 6040 + }, + { + "epoch": 1.4097481343283582, + "grad_norm": 0.34565348987597855, + "learning_rate": 4.369922852458594e-05, + "loss": 0.4792, + "step": 6045 + }, + { + "epoch": 1.4109141791044777, + "grad_norm": 0.3266358830905705, + "learning_rate": 4.3687181129312534e-05, + "loss": 0.4748, + "step": 6050 + }, + { + "epoch": 1.412080223880597, + "grad_norm": 0.3346189911122666, + "learning_rate": 4.3675124106524514e-05, + "loss": 0.4863, + "step": 6055 + }, + { + "epoch": 1.4132462686567164, + "grad_norm": 0.36015183728416794, + "learning_rate": 4.366305746339293e-05, + "loss": 0.4657, + "step": 6060 + }, + { + "epoch": 1.4144123134328357, + "grad_norm": 0.3445327226090057, + "learning_rate": 4.365098120709458e-05, + "loss": 0.4402, + "step": 6065 + }, + { + "epoch": 1.4155783582089552, + "grad_norm": 0.3578417001077714, + "learning_rate": 4.363889534481195e-05, + "loss": 0.4738, + "step": 6070 + }, + { + "epoch": 1.4167444029850746, + "grad_norm": 0.35258504782831435, + "learning_rate": 4.3626799883733236e-05, + "loss": 0.4587, + "step": 6075 + }, + { + "epoch": 1.417910447761194, + "grad_norm": 0.34996079677911557, + "learning_rate": 4.361469483105236e-05, + "loss": 0.4649, + "step": 6080 + }, + { + "epoch": 1.4190764925373134, + "grad_norm": 0.33313727721456643, + "learning_rate": 4.360258019396895e-05, + "loss": 0.4514, + "step": 6085 + }, + { + "epoch": 1.4202425373134329, + "grad_norm": 0.3737444393468166, + "learning_rate": 4.3590455979688335e-05, + "loss": 0.4844, + "step": 6090 + }, + { + "epoch": 1.4214085820895521, + "grad_norm": 0.36666239009502494, + "learning_rate": 4.357832219542151e-05, + "loss": 0.5074, + "step": 6095 + }, + { + "epoch": 1.4225746268656716, + "grad_norm": 0.34348607992487973, + "learning_rate": 4.3566178848385194e-05, + "loss": 0.4833, + "step": 6100 + }, + { + "epoch": 1.423740671641791, + "grad_norm": 0.32522904396512936, + "learning_rate": 4.35540259458018e-05, + "loss": 0.4553, + "step": 6105 + }, + { + "epoch": 1.4249067164179103, + "grad_norm": 0.3285642105741488, + "learning_rate": 4.3541863494899385e-05, + "loss": 0.4633, + "step": 6110 + }, + { + "epoch": 1.4260727611940298, + "grad_norm": 0.33496319615756404, + "learning_rate": 4.352969150291172e-05, + "loss": 0.4567, + "step": 6115 + }, + { + "epoch": 1.4272388059701493, + "grad_norm": 0.3280628349806029, + "learning_rate": 4.351750997707824e-05, + "loss": 0.4452, + "step": 6120 + }, + { + "epoch": 1.4284048507462686, + "grad_norm": 0.342932705677778, + "learning_rate": 4.3505318924644036e-05, + "loss": 0.451, + "step": 6125 + }, + { + "epoch": 1.429570895522388, + "grad_norm": 0.314679608343574, + "learning_rate": 4.34931183528599e-05, + "loss": 0.4588, + "step": 6130 + }, + { + "epoch": 1.4307369402985075, + "grad_norm": 0.3552548606888951, + "learning_rate": 4.348090826898225e-05, + "loss": 0.4664, + "step": 6135 + }, + { + "epoch": 1.4319029850746268, + "grad_norm": 0.3504993104642995, + "learning_rate": 4.346868868027318e-05, + "loss": 0.461, + "step": 6140 + }, + { + "epoch": 1.4330690298507462, + "grad_norm": 0.3402055721432462, + "learning_rate": 4.345645959400043e-05, + "loss": 0.4601, + "step": 6145 + }, + { + "epoch": 1.4342350746268657, + "grad_norm": 0.3299759511142303, + "learning_rate": 4.344422101743739e-05, + "loss": 0.4428, + "step": 6150 + }, + { + "epoch": 1.435401119402985, + "grad_norm": 0.33594338149547776, + "learning_rate": 4.3431972957863106e-05, + "loss": 0.4626, + "step": 6155 + }, + { + "epoch": 1.4365671641791045, + "grad_norm": 0.3746544537156815, + "learning_rate": 4.341971542256225e-05, + "loss": 0.475, + "step": 6160 + }, + { + "epoch": 1.437733208955224, + "grad_norm": 0.34845527069662197, + "learning_rate": 4.340744841882512e-05, + "loss": 0.4677, + "step": 6165 + }, + { + "epoch": 1.4388992537313432, + "grad_norm": 0.34970961524757327, + "learning_rate": 4.339517195394768e-05, + "loss": 0.4588, + "step": 6170 + }, + { + "epoch": 1.4400652985074627, + "grad_norm": 0.3615162800299732, + "learning_rate": 4.3382886035231484e-05, + "loss": 0.4809, + "step": 6175 + }, + { + "epoch": 1.4412313432835822, + "grad_norm": 0.31849620012464863, + "learning_rate": 4.3370590669983736e-05, + "loss": 0.4592, + "step": 6180 + }, + { + "epoch": 1.4423973880597014, + "grad_norm": 0.3357398139997132, + "learning_rate": 4.335828586551725e-05, + "loss": 0.4716, + "step": 6185 + }, + { + "epoch": 1.443563432835821, + "grad_norm": 0.3112798682013351, + "learning_rate": 4.334597162915045e-05, + "loss": 0.459, + "step": 6190 + }, + { + "epoch": 1.4447294776119404, + "grad_norm": 0.3546794675494792, + "learning_rate": 4.333364796820735e-05, + "loss": 0.4611, + "step": 6195 + }, + { + "epoch": 1.4458955223880596, + "grad_norm": 0.328212485716124, + "learning_rate": 4.332131489001762e-05, + "loss": 0.4829, + "step": 6200 + }, + { + "epoch": 1.447061567164179, + "grad_norm": 0.3254891995018081, + "learning_rate": 4.3308972401916495e-05, + "loss": 0.4409, + "step": 6205 + }, + { + "epoch": 1.4482276119402986, + "grad_norm": 0.3265612892352721, + "learning_rate": 4.3296620511244804e-05, + "loss": 0.4724, + "step": 6210 + }, + { + "epoch": 1.4493936567164178, + "grad_norm": 0.3273810014969138, + "learning_rate": 4.3284259225348985e-05, + "loss": 0.4663, + "step": 6215 + }, + { + "epoch": 1.4505597014925373, + "grad_norm": 0.3449418735589012, + "learning_rate": 4.327188855158106e-05, + "loss": 0.4692, + "step": 6220 + }, + { + "epoch": 1.4517257462686568, + "grad_norm": 0.3540063144251308, + "learning_rate": 4.325950849729862e-05, + "loss": 0.4767, + "step": 6225 + }, + { + "epoch": 1.452891791044776, + "grad_norm": 1.0643761025103295, + "learning_rate": 4.3247119069864856e-05, + "loss": 0.4571, + "step": 6230 + }, + { + "epoch": 1.4540578358208955, + "grad_norm": 0.33419173990646234, + "learning_rate": 4.323472027664852e-05, + "loss": 0.447, + "step": 6235 + }, + { + "epoch": 1.455223880597015, + "grad_norm": 0.34788092893745387, + "learning_rate": 4.322231212502394e-05, + "loss": 0.4597, + "step": 6240 + }, + { + "epoch": 1.4563899253731343, + "grad_norm": 0.3134974285379023, + "learning_rate": 4.320989462237101e-05, + "loss": 0.4545, + "step": 6245 + }, + { + "epoch": 1.4575559701492538, + "grad_norm": 0.35842579625615295, + "learning_rate": 4.3197467776075185e-05, + "loss": 0.4833, + "step": 6250 + }, + { + "epoch": 1.4587220149253732, + "grad_norm": 0.33672571732190615, + "learning_rate": 4.318503159352748e-05, + "loss": 0.47, + "step": 6255 + }, + { + "epoch": 1.4598880597014925, + "grad_norm": 0.3333232640983891, + "learning_rate": 4.317258608212444e-05, + "loss": 0.4645, + "step": 6260 + }, + { + "epoch": 1.461054104477612, + "grad_norm": 0.3677124920971659, + "learning_rate": 4.31601312492682e-05, + "loss": 0.4428, + "step": 6265 + }, + { + "epoch": 1.4622201492537314, + "grad_norm": 0.3579258774501656, + "learning_rate": 4.3147667102366415e-05, + "loss": 0.4858, + "step": 6270 + }, + { + "epoch": 1.4633861940298507, + "grad_norm": 0.3429301134861556, + "learning_rate": 4.313519364883227e-05, + "loss": 0.4787, + "step": 6275 + }, + { + "epoch": 1.4645522388059702, + "grad_norm": 0.3479253377443876, + "learning_rate": 4.3122710896084504e-05, + "loss": 0.467, + "step": 6280 + }, + { + "epoch": 1.4657182835820897, + "grad_norm": 0.3369620809903681, + "learning_rate": 4.3110218851547384e-05, + "loss": 0.4636, + "step": 6285 + }, + { + "epoch": 1.466884328358209, + "grad_norm": 0.33730464524968007, + "learning_rate": 4.309771752265069e-05, + "loss": 0.4573, + "step": 6290 + }, + { + "epoch": 1.4680503731343284, + "grad_norm": 0.3218846587482649, + "learning_rate": 4.308520691682974e-05, + "loss": 0.4473, + "step": 6295 + }, + { + "epoch": 1.4692164179104479, + "grad_norm": 0.32838855496131214, + "learning_rate": 4.307268704152535e-05, + "loss": 0.4604, + "step": 6300 + }, + { + "epoch": 1.4703824626865671, + "grad_norm": 0.34867492982284204, + "learning_rate": 4.3060157904183873e-05, + "loss": 0.4791, + "step": 6305 + }, + { + "epoch": 1.4715485074626866, + "grad_norm": 0.3330642008297452, + "learning_rate": 4.3047619512257164e-05, + "loss": 0.4431, + "step": 6310 + }, + { + "epoch": 1.472714552238806, + "grad_norm": 0.33600408093566253, + "learning_rate": 4.3035071873202563e-05, + "loss": 0.4525, + "step": 6315 + }, + { + "epoch": 1.4738805970149254, + "grad_norm": 0.3406265788599132, + "learning_rate": 4.302251499448294e-05, + "loss": 0.4767, + "step": 6320 + }, + { + "epoch": 1.4750466417910448, + "grad_norm": 0.3429515470944637, + "learning_rate": 4.3009948883566645e-05, + "loss": 0.4894, + "step": 6325 + }, + { + "epoch": 1.4762126865671643, + "grad_norm": 0.32648908815987504, + "learning_rate": 4.29973735479275e-05, + "loss": 0.4767, + "step": 6330 + }, + { + "epoch": 1.4773787313432836, + "grad_norm": 0.33673546137273785, + "learning_rate": 4.298478899504485e-05, + "loss": 0.4604, + "step": 6335 + }, + { + "epoch": 1.478544776119403, + "grad_norm": 0.33823780875850035, + "learning_rate": 4.297219523240349e-05, + "loss": 0.4531, + "step": 6340 + }, + { + "epoch": 1.4797108208955223, + "grad_norm": 0.35119200081415775, + "learning_rate": 4.2959592267493715e-05, + "loss": 0.4636, + "step": 6345 + }, + { + "epoch": 1.4808768656716418, + "grad_norm": 0.4322198520912854, + "learning_rate": 4.2946980107811295e-05, + "loss": 0.4787, + "step": 6350 + }, + { + "epoch": 1.4820429104477613, + "grad_norm": 0.36083460260316264, + "learning_rate": 4.2934358760857454e-05, + "loss": 0.4794, + "step": 6355 + }, + { + "epoch": 1.4832089552238805, + "grad_norm": 0.31173947674943203, + "learning_rate": 4.292172823413887e-05, + "loss": 0.4538, + "step": 6360 + }, + { + "epoch": 1.484375, + "grad_norm": 0.33214255574354906, + "learning_rate": 4.2909088535167714e-05, + "loss": 0.46, + "step": 6365 + }, + { + "epoch": 1.4855410447761195, + "grad_norm": 0.32569959491022565, + "learning_rate": 4.289643967146158e-05, + "loss": 0.4777, + "step": 6370 + }, + { + "epoch": 1.4867070895522387, + "grad_norm": 0.3134761071205022, + "learning_rate": 4.288378165054354e-05, + "loss": 0.4557, + "step": 6375 + }, + { + "epoch": 1.4878731343283582, + "grad_norm": 0.3418201443289579, + "learning_rate": 4.28711144799421e-05, + "loss": 0.4669, + "step": 6380 + }, + { + "epoch": 1.4890391791044777, + "grad_norm": 0.3612542299768223, + "learning_rate": 4.2858438167191185e-05, + "loss": 0.4641, + "step": 6385 + }, + { + "epoch": 1.490205223880597, + "grad_norm": 0.34867879366259485, + "learning_rate": 4.2845752719830206e-05, + "loss": 0.456, + "step": 6390 + }, + { + "epoch": 1.4913712686567164, + "grad_norm": 0.3731883625822092, + "learning_rate": 4.283305814540397e-05, + "loss": 0.4713, + "step": 6395 + }, + { + "epoch": 1.4925373134328357, + "grad_norm": 0.34935271530956713, + "learning_rate": 4.282035445146272e-05, + "loss": 0.4826, + "step": 6400 + }, + { + "epoch": 1.4937033582089552, + "grad_norm": 0.32927278997045184, + "learning_rate": 4.2807641645562134e-05, + "loss": 0.472, + "step": 6405 + }, + { + "epoch": 1.4948694029850746, + "grad_norm": 0.3400672819928905, + "learning_rate": 4.2794919735263295e-05, + "loss": 0.4582, + "step": 6410 + }, + { + "epoch": 1.496035447761194, + "grad_norm": 0.3259509490685773, + "learning_rate": 4.278218872813271e-05, + "loss": 0.4793, + "step": 6415 + }, + { + "epoch": 1.4972014925373134, + "grad_norm": 0.36147664082553604, + "learning_rate": 4.276944863174229e-05, + "loss": 0.4531, + "step": 6420 + }, + { + "epoch": 1.4983675373134329, + "grad_norm": 0.37015743529852446, + "learning_rate": 4.275669945366936e-05, + "loss": 0.4712, + "step": 6425 + }, + { + "epoch": 1.4995335820895521, + "grad_norm": 0.35148150922706417, + "learning_rate": 4.2743941201496644e-05, + "loss": 0.4593, + "step": 6430 + }, + { + "epoch": 1.5006996268656716, + "grad_norm": 0.33182234228008994, + "learning_rate": 4.2731173882812264e-05, + "loss": 0.467, + "step": 6435 + }, + { + "epoch": 1.501865671641791, + "grad_norm": 0.3407677529605748, + "learning_rate": 4.271839750520972e-05, + "loss": 0.4674, + "step": 6440 + }, + { + "epoch": 1.5030317164179103, + "grad_norm": 0.3345446593153718, + "learning_rate": 4.2705612076287907e-05, + "loss": 0.4507, + "step": 6445 + }, + { + "epoch": 1.5041977611940298, + "grad_norm": 0.3332154562763926, + "learning_rate": 4.2692817603651134e-05, + "loss": 0.4431, + "step": 6450 + }, + { + "epoch": 1.5053638059701493, + "grad_norm": 0.36323268860291735, + "learning_rate": 4.2680014094909035e-05, + "loss": 0.4566, + "step": 6455 + }, + { + "epoch": 1.5065298507462686, + "grad_norm": 0.3259349704090724, + "learning_rate": 4.2667201557676673e-05, + "loss": 0.4629, + "step": 6460 + }, + { + "epoch": 1.507695895522388, + "grad_norm": 0.33444066240013476, + "learning_rate": 4.2654379999574425e-05, + "loss": 0.4557, + "step": 6465 + }, + { + "epoch": 1.5088619402985075, + "grad_norm": 0.344096009710541, + "learning_rate": 4.2641549428228087e-05, + "loss": 0.4636, + "step": 6470 + }, + { + "epoch": 1.5100279850746268, + "grad_norm": 0.33982012028637054, + "learning_rate": 4.2628709851268775e-05, + "loss": 0.4588, + "step": 6475 + }, + { + "epoch": 1.5111940298507462, + "grad_norm": 0.32270394544416997, + "learning_rate": 4.261586127633297e-05, + "loss": 0.4554, + "step": 6480 + }, + { + "epoch": 1.5123600746268657, + "grad_norm": 0.3252570889517615, + "learning_rate": 4.2603003711062536e-05, + "loss": 0.4561, + "step": 6485 + }, + { + "epoch": 1.513526119402985, + "grad_norm": 0.3394768153452522, + "learning_rate": 4.259013716310465e-05, + "loss": 0.4636, + "step": 6490 + }, + { + "epoch": 1.5146921641791045, + "grad_norm": 0.3365118701807337, + "learning_rate": 4.2577261640111834e-05, + "loss": 0.4519, + "step": 6495 + }, + { + "epoch": 1.515858208955224, + "grad_norm": 0.32262503177912955, + "learning_rate": 4.256437714974196e-05, + "loss": 0.48, + "step": 6500 + }, + { + "epoch": 1.5170242537313432, + "grad_norm": 0.316392457569673, + "learning_rate": 4.255148369965822e-05, + "loss": 0.469, + "step": 6505 + }, + { + "epoch": 1.5181902985074627, + "grad_norm": 0.3490934274261248, + "learning_rate": 4.253858129752916e-05, + "loss": 0.465, + "step": 6510 + }, + { + "epoch": 1.5193563432835822, + "grad_norm": 0.33463059646920107, + "learning_rate": 4.252566995102864e-05, + "loss": 0.4741, + "step": 6515 + }, + { + "epoch": 1.5205223880597014, + "grad_norm": 0.3256308103198369, + "learning_rate": 4.251274966783579e-05, + "loss": 0.4583, + "step": 6520 + }, + { + "epoch": 1.521688432835821, + "grad_norm": 0.32298219383214943, + "learning_rate": 4.2499820455635154e-05, + "loss": 0.4481, + "step": 6525 + }, + { + "epoch": 1.5228544776119404, + "grad_norm": 0.3676236502372447, + "learning_rate": 4.24868823221165e-05, + "loss": 0.4703, + "step": 6530 + }, + { + "epoch": 1.5240205223880596, + "grad_norm": 0.3312495080130234, + "learning_rate": 4.2473935274974944e-05, + "loss": 0.4657, + "step": 6535 + }, + { + "epoch": 1.525186567164179, + "grad_norm": 0.30885099791103654, + "learning_rate": 4.246097932191088e-05, + "loss": 0.4579, + "step": 6540 + }, + { + "epoch": 1.5263526119402986, + "grad_norm": 0.3341932819145299, + "learning_rate": 4.2448014470630034e-05, + "loss": 0.4609, + "step": 6545 + }, + { + "epoch": 1.5275186567164178, + "grad_norm": 0.34122470078700223, + "learning_rate": 4.2435040728843376e-05, + "loss": 0.4676, + "step": 6550 + }, + { + "epoch": 1.5286847014925373, + "grad_norm": 0.32811744269493354, + "learning_rate": 4.2422058104267215e-05, + "loss": 0.4645, + "step": 6555 + }, + { + "epoch": 1.5298507462686568, + "grad_norm": 0.3547434028641149, + "learning_rate": 4.2409066604623096e-05, + "loss": 0.4717, + "step": 6560 + }, + { + "epoch": 1.531016791044776, + "grad_norm": 0.31996972002621243, + "learning_rate": 4.239606623763789e-05, + "loss": 0.4623, + "step": 6565 + }, + { + "epoch": 1.5321828358208955, + "grad_norm": 0.3355552932095701, + "learning_rate": 4.23830570110437e-05, + "loss": 0.4621, + "step": 6570 + }, + { + "epoch": 1.533348880597015, + "grad_norm": 0.32800163172000013, + "learning_rate": 4.237003893257791e-05, + "loss": 0.4818, + "step": 6575 + }, + { + "epoch": 1.5345149253731343, + "grad_norm": 0.3307796850338419, + "learning_rate": 4.2357012009983185e-05, + "loss": 0.486, + "step": 6580 + }, + { + "epoch": 1.5356809701492538, + "grad_norm": 0.338310889951412, + "learning_rate": 4.234397625100745e-05, + "loss": 0.4521, + "step": 6585 + }, + { + "epoch": 1.5368470149253732, + "grad_norm": 0.3647838461591258, + "learning_rate": 4.2330931663403844e-05, + "loss": 0.4706, + "step": 6590 + }, + { + "epoch": 1.5380130597014925, + "grad_norm": 0.3300642849353027, + "learning_rate": 4.231787825493081e-05, + "loss": 0.4492, + "step": 6595 + }, + { + "epoch": 1.539179104477612, + "grad_norm": 0.33557264157968786, + "learning_rate": 4.230481603335201e-05, + "loss": 0.4687, + "step": 6600 + }, + { + "epoch": 1.5403451492537314, + "grad_norm": 0.3114008445874425, + "learning_rate": 4.229174500643634e-05, + "loss": 0.4578, + "step": 6605 + }, + { + "epoch": 1.5415111940298507, + "grad_norm": 0.32152049048790204, + "learning_rate": 4.227866518195797e-05, + "loss": 0.4658, + "step": 6610 + }, + { + "epoch": 1.5426772388059702, + "grad_norm": 0.3527059542529975, + "learning_rate": 4.226557656769626e-05, + "loss": 0.4844, + "step": 6615 + }, + { + "epoch": 1.5438432835820897, + "grad_norm": 0.330570383716205, + "learning_rate": 4.225247917143582e-05, + "loss": 0.4832, + "step": 6620 + }, + { + "epoch": 1.545009328358209, + "grad_norm": 0.34996640622759717, + "learning_rate": 4.223937300096648e-05, + "loss": 0.4753, + "step": 6625 + }, + { + "epoch": 1.5461753731343284, + "grad_norm": 0.3425186961041473, + "learning_rate": 4.22262580640833e-05, + "loss": 0.4784, + "step": 6630 + }, + { + "epoch": 1.5473414179104479, + "grad_norm": 0.3197472159330529, + "learning_rate": 4.221313436858651e-05, + "loss": 0.462, + "step": 6635 + }, + { + "epoch": 1.5485074626865671, + "grad_norm": 0.36173561576993507, + "learning_rate": 4.220000192228161e-05, + "loss": 0.4864, + "step": 6640 + }, + { + "epoch": 1.5496735074626866, + "grad_norm": 0.3457277757550715, + "learning_rate": 4.218686073297926e-05, + "loss": 0.4924, + "step": 6645 + }, + { + "epoch": 1.550839552238806, + "grad_norm": 0.31799753718370405, + "learning_rate": 4.217371080849535e-05, + "loss": 0.4364, + "step": 6650 + }, + { + "epoch": 1.5520055970149254, + "grad_norm": 0.3196897952854417, + "learning_rate": 4.216055215665093e-05, + "loss": 0.4506, + "step": 6655 + }, + { + "epoch": 1.5531716417910446, + "grad_norm": 0.3257421590495531, + "learning_rate": 4.2147384785272284e-05, + "loss": 0.4553, + "step": 6660 + }, + { + "epoch": 1.5543376865671643, + "grad_norm": 0.3432550994802492, + "learning_rate": 4.213420870219084e-05, + "loss": 0.4817, + "step": 6665 + }, + { + "epoch": 1.5555037313432836, + "grad_norm": 0.34725168242420085, + "learning_rate": 4.212102391524324e-05, + "loss": 0.4648, + "step": 6670 + }, + { + "epoch": 1.5566697761194028, + "grad_norm": 0.33122172319023835, + "learning_rate": 4.210783043227129e-05, + "loss": 0.457, + "step": 6675 + }, + { + "epoch": 1.5578358208955225, + "grad_norm": 0.32042190315550045, + "learning_rate": 4.209462826112195e-05, + "loss": 0.4775, + "step": 6680 + }, + { + "epoch": 1.5590018656716418, + "grad_norm": 0.34115796189591796, + "learning_rate": 4.2081417409647386e-05, + "loss": 0.4725, + "step": 6685 + }, + { + "epoch": 1.560167910447761, + "grad_norm": 0.3349752626639639, + "learning_rate": 4.2068197885704904e-05, + "loss": 0.4722, + "step": 6690 + }, + { + "epoch": 1.5613339552238807, + "grad_norm": 0.3450973154659737, + "learning_rate": 4.205496969715696e-05, + "loss": 0.482, + "step": 6695 + }, + { + "epoch": 1.5625, + "grad_norm": 0.3346036864573799, + "learning_rate": 4.204173285187117e-05, + "loss": 0.4685, + "step": 6700 + }, + { + "epoch": 1.5636660447761193, + "grad_norm": 0.34800557133729426, + "learning_rate": 4.202848735772031e-05, + "loss": 0.4801, + "step": 6705 + }, + { + "epoch": 1.564832089552239, + "grad_norm": 0.3367084430898118, + "learning_rate": 4.201523322258231e-05, + "loss": 0.4274, + "step": 6710 + }, + { + "epoch": 1.5659981343283582, + "grad_norm": 0.35012078531123364, + "learning_rate": 4.2001970454340185e-05, + "loss": 0.4779, + "step": 6715 + }, + { + "epoch": 1.5671641791044775, + "grad_norm": 0.3464241011269615, + "learning_rate": 4.1988699060882144e-05, + "loss": 0.4474, + "step": 6720 + }, + { + "epoch": 1.5683302238805972, + "grad_norm": 0.32315881126500995, + "learning_rate": 4.197541905010149e-05, + "loss": 0.4557, + "step": 6725 + }, + { + "epoch": 1.5694962686567164, + "grad_norm": 0.33350654547882, + "learning_rate": 4.196213042989668e-05, + "loss": 0.4813, + "step": 6730 + }, + { + "epoch": 1.5706623134328357, + "grad_norm": 0.31996360129943924, + "learning_rate": 4.194883320817127e-05, + "loss": 0.4796, + "step": 6735 + }, + { + "epoch": 1.5718283582089554, + "grad_norm": 0.30918268023470596, + "learning_rate": 4.193552739283393e-05, + "loss": 0.47, + "step": 6740 + }, + { + "epoch": 1.5729944029850746, + "grad_norm": 0.34011552174858767, + "learning_rate": 4.192221299179845e-05, + "loss": 0.4677, + "step": 6745 + }, + { + "epoch": 1.574160447761194, + "grad_norm": 0.3129514549766107, + "learning_rate": 4.190889001298373e-05, + "loss": 0.4564, + "step": 6750 + }, + { + "epoch": 1.5753264925373134, + "grad_norm": 0.3319813884486381, + "learning_rate": 4.189555846431377e-05, + "loss": 0.4546, + "step": 6755 + }, + { + "epoch": 1.5764925373134329, + "grad_norm": 0.34061377560704664, + "learning_rate": 4.188221835371766e-05, + "loss": 0.4728, + "step": 6760 + }, + { + "epoch": 1.5776585820895521, + "grad_norm": 0.33008510935751006, + "learning_rate": 4.1868869689129584e-05, + "loss": 0.4557, + "step": 6765 + }, + { + "epoch": 1.5788246268656716, + "grad_norm": 0.34404215231714474, + "learning_rate": 4.1855512478488816e-05, + "loss": 0.4569, + "step": 6770 + }, + { + "epoch": 1.579990671641791, + "grad_norm": 0.3328531164247705, + "learning_rate": 4.184214672973971e-05, + "loss": 0.4749, + "step": 6775 + }, + { + "epoch": 1.5811567164179103, + "grad_norm": 0.3242877263909023, + "learning_rate": 4.182877245083172e-05, + "loss": 0.469, + "step": 6780 + }, + { + "epoch": 1.5823227611940298, + "grad_norm": 0.3250921673756281, + "learning_rate": 4.181538964971933e-05, + "loss": 0.442, + "step": 6785 + }, + { + "epoch": 1.5834888059701493, + "grad_norm": 0.33070321772700895, + "learning_rate": 4.180199833436213e-05, + "loss": 0.4635, + "step": 6790 + }, + { + "epoch": 1.5846548507462686, + "grad_norm": 0.34575544940838887, + "learning_rate": 4.178859851272475e-05, + "loss": 0.491, + "step": 6795 + }, + { + "epoch": 1.585820895522388, + "grad_norm": 0.35928774509106254, + "learning_rate": 4.1775190192776905e-05, + "loss": 0.4829, + "step": 6800 + }, + { + "epoch": 1.5869869402985075, + "grad_norm": 0.32912110954208285, + "learning_rate": 4.176177338249334e-05, + "loss": 0.4569, + "step": 6805 + }, + { + "epoch": 1.5881529850746268, + "grad_norm": 0.320790652546973, + "learning_rate": 4.1748348089853864e-05, + "loss": 0.4738, + "step": 6810 + }, + { + "epoch": 1.5893190298507462, + "grad_norm": 0.33958660148187925, + "learning_rate": 4.173491432284332e-05, + "loss": 0.4888, + "step": 6815 + }, + { + "epoch": 1.5904850746268657, + "grad_norm": 0.33037229113813193, + "learning_rate": 4.172147208945159e-05, + "loss": 0.4727, + "step": 6820 + }, + { + "epoch": 1.591651119402985, + "grad_norm": 0.33796195513843025, + "learning_rate": 4.170802139767362e-05, + "loss": 0.4578, + "step": 6825 + }, + { + "epoch": 1.5928171641791045, + "grad_norm": 0.3357550301495569, + "learning_rate": 4.1694562255509354e-05, + "loss": 0.4609, + "step": 6830 + }, + { + "epoch": 1.593983208955224, + "grad_norm": 0.32146300171119124, + "learning_rate": 4.168109467096378e-05, + "loss": 0.4589, + "step": 6835 + }, + { + "epoch": 1.5951492537313432, + "grad_norm": 0.33435126899728823, + "learning_rate": 4.1667618652046894e-05, + "loss": 0.4557, + "step": 6840 + }, + { + "epoch": 1.5963152985074627, + "grad_norm": 0.3373892914826476, + "learning_rate": 4.165413420677372e-05, + "loss": 0.4592, + "step": 6845 + }, + { + "epoch": 1.5974813432835822, + "grad_norm": 0.33192379105590525, + "learning_rate": 4.164064134316428e-05, + "loss": 0.5028, + "step": 6850 + }, + { + "epoch": 1.5986473880597014, + "grad_norm": 0.3614568709330173, + "learning_rate": 4.162714006924362e-05, + "loss": 0.4897, + "step": 6855 + }, + { + "epoch": 1.599813432835821, + "grad_norm": 0.3339458864487026, + "learning_rate": 4.161363039304177e-05, + "loss": 0.4662, + "step": 6860 + }, + { + "epoch": 1.6009794776119404, + "grad_norm": 0.3420259560333856, + "learning_rate": 4.160011232259378e-05, + "loss": 0.4597, + "step": 6865 + }, + { + "epoch": 1.6021455223880596, + "grad_norm": 0.3563996763973991, + "learning_rate": 4.158658586593969e-05, + "loss": 0.4618, + "step": 6870 + }, + { + "epoch": 1.603311567164179, + "grad_norm": 0.3086604099147653, + "learning_rate": 4.1573051031124486e-05, + "loss": 0.4401, + "step": 6875 + }, + { + "epoch": 1.6044776119402986, + "grad_norm": 0.3699639063420792, + "learning_rate": 4.155950782619819e-05, + "loss": 0.4697, + "step": 6880 + }, + { + "epoch": 1.6056436567164178, + "grad_norm": 0.37077466980798357, + "learning_rate": 4.1545956259215776e-05, + "loss": 0.4575, + "step": 6885 + }, + { + "epoch": 1.6068097014925373, + "grad_norm": 0.37397215048587934, + "learning_rate": 4.153239633823721e-05, + "loss": 0.4818, + "step": 6890 + }, + { + "epoch": 1.6079757462686568, + "grad_norm": 0.3819042229679817, + "learning_rate": 4.151882807132739e-05, + "loss": 0.4834, + "step": 6895 + }, + { + "epoch": 1.609141791044776, + "grad_norm": 0.3420138491704207, + "learning_rate": 4.1505251466556206e-05, + "loss": 0.4735, + "step": 6900 + }, + { + "epoch": 1.6103078358208955, + "grad_norm": 0.34624964872120995, + "learning_rate": 4.149166653199852e-05, + "loss": 0.4724, + "step": 6905 + }, + { + "epoch": 1.611473880597015, + "grad_norm": 0.32999263610838986, + "learning_rate": 4.1478073275734105e-05, + "loss": 0.4617, + "step": 6910 + }, + { + "epoch": 1.6126399253731343, + "grad_norm": 0.3219821202086392, + "learning_rate": 4.146447170584772e-05, + "loss": 0.4409, + "step": 6915 + }, + { + "epoch": 1.6138059701492538, + "grad_norm": 0.31823919385054067, + "learning_rate": 4.145086183042907e-05, + "loss": 0.4772, + "step": 6920 + }, + { + "epoch": 1.6149720149253732, + "grad_norm": 0.3117042372128923, + "learning_rate": 4.143724365757275e-05, + "loss": 0.4473, + "step": 6925 + }, + { + "epoch": 1.6161380597014925, + "grad_norm": 0.3250330576185892, + "learning_rate": 4.142361719537838e-05, + "loss": 0.4579, + "step": 6930 + }, + { + "epoch": 1.617304104477612, + "grad_norm": 0.35348966460135667, + "learning_rate": 4.140998245195042e-05, + "loss": 0.4452, + "step": 6935 + }, + { + "epoch": 1.6184701492537314, + "grad_norm": 0.35601914592858, + "learning_rate": 4.13963394353983e-05, + "loss": 0.4819, + "step": 6940 + }, + { + "epoch": 1.6196361940298507, + "grad_norm": 0.34207907216373123, + "learning_rate": 4.138268815383636e-05, + "loss": 0.4659, + "step": 6945 + }, + { + "epoch": 1.6208022388059702, + "grad_norm": 0.35803264576282556, + "learning_rate": 4.136902861538387e-05, + "loss": 0.4813, + "step": 6950 + }, + { + "epoch": 1.6219682835820897, + "grad_norm": 0.3540499176018185, + "learning_rate": 4.135536082816499e-05, + "loss": 0.4772, + "step": 6955 + }, + { + "epoch": 1.623134328358209, + "grad_norm": 0.33817971497647364, + "learning_rate": 4.13416848003088e-05, + "loss": 0.4526, + "step": 6960 + }, + { + "epoch": 1.6243003731343284, + "grad_norm": 0.34797848678426574, + "learning_rate": 4.132800053994927e-05, + "loss": 0.4803, + "step": 6965 + }, + { + "epoch": 1.6254664179104479, + "grad_norm": 0.33500331225773355, + "learning_rate": 4.1314308055225295e-05, + "loss": 0.4799, + "step": 6970 + }, + { + "epoch": 1.6266324626865671, + "grad_norm": 0.34589069785663457, + "learning_rate": 4.1300607354280605e-05, + "loss": 0.4606, + "step": 6975 + }, + { + "epoch": 1.6277985074626866, + "grad_norm": 0.3083760808773547, + "learning_rate": 4.128689844526388e-05, + "loss": 0.441, + "step": 6980 + }, + { + "epoch": 1.628964552238806, + "grad_norm": 0.32415702677889685, + "learning_rate": 4.1273181336328646e-05, + "loss": 0.4725, + "step": 6985 + }, + { + "epoch": 1.6301305970149254, + "grad_norm": 0.34014109508651214, + "learning_rate": 4.125945603563331e-05, + "loss": 0.478, + "step": 6990 + }, + { + "epoch": 1.6312966417910446, + "grad_norm": 0.32538033857434384, + "learning_rate": 4.124572255134115e-05, + "loss": 0.4525, + "step": 6995 + }, + { + "epoch": 1.6324626865671643, + "grad_norm": 0.3208811617337528, + "learning_rate": 4.123198089162033e-05, + "loss": 0.4505, + "step": 7000 + }, + { + "epoch": 1.6336287313432836, + "grad_norm": 0.3325850879806714, + "learning_rate": 4.121823106464384e-05, + "loss": 0.4718, + "step": 7005 + }, + { + "epoch": 1.6347947761194028, + "grad_norm": 0.3614875622567189, + "learning_rate": 4.1204473078589575e-05, + "loss": 0.4744, + "step": 7010 + }, + { + "epoch": 1.6359608208955225, + "grad_norm": 0.3191733658518597, + "learning_rate": 4.119070694164024e-05, + "loss": 0.4725, + "step": 7015 + }, + { + "epoch": 1.6371268656716418, + "grad_norm": 0.33466736823296434, + "learning_rate": 4.117693266198342e-05, + "loss": 0.4542, + "step": 7020 + }, + { + "epoch": 1.638292910447761, + "grad_norm": 0.3336159641539354, + "learning_rate": 4.116315024781152e-05, + "loss": 0.4699, + "step": 7025 + }, + { + "epoch": 1.6394589552238807, + "grad_norm": 0.36793982561955546, + "learning_rate": 4.114935970732178e-05, + "loss": 0.4823, + "step": 7030 + }, + { + "epoch": 1.640625, + "grad_norm": 0.33120379255270743, + "learning_rate": 4.113556104871631e-05, + "loss": 0.4923, + "step": 7035 + }, + { + "epoch": 1.6417910447761193, + "grad_norm": 0.30729104833002724, + "learning_rate": 4.112175428020199e-05, + "loss": 0.4435, + "step": 7040 + }, + { + "epoch": 1.642957089552239, + "grad_norm": 0.34616218198056103, + "learning_rate": 4.110793940999059e-05, + "loss": 0.4516, + "step": 7045 + }, + { + "epoch": 1.6441231343283582, + "grad_norm": 0.3354519199641666, + "learning_rate": 4.1094116446298645e-05, + "loss": 0.4671, + "step": 7050 + }, + { + "epoch": 1.6452891791044775, + "grad_norm": 0.32236283925320086, + "learning_rate": 4.108028539734753e-05, + "loss": 0.4837, + "step": 7055 + }, + { + "epoch": 1.6464552238805972, + "grad_norm": 0.3116041047755905, + "learning_rate": 4.1066446271363426e-05, + "loss": 0.4591, + "step": 7060 + }, + { + "epoch": 1.6476212686567164, + "grad_norm": 0.3597117887276012, + "learning_rate": 4.1052599076577306e-05, + "loss": 0.482, + "step": 7065 + }, + { + "epoch": 1.6487873134328357, + "grad_norm": 0.3371049984339211, + "learning_rate": 4.103874382122496e-05, + "loss": 0.452, + "step": 7070 + }, + { + "epoch": 1.6499533582089554, + "grad_norm": 0.3342395269568629, + "learning_rate": 4.1024880513546955e-05, + "loss": 0.4554, + "step": 7075 + }, + { + "epoch": 1.6511194029850746, + "grad_norm": 0.3334503038716055, + "learning_rate": 4.1011009161788655e-05, + "loss": 0.4659, + "step": 7080 + }, + { + "epoch": 1.652285447761194, + "grad_norm": 0.32691988784391596, + "learning_rate": 4.099712977420021e-05, + "loss": 0.448, + "step": 7085 + }, + { + "epoch": 1.6534514925373134, + "grad_norm": 0.3189213068437785, + "learning_rate": 4.098324235903655e-05, + "loss": 0.442, + "step": 7090 + }, + { + "epoch": 1.6546175373134329, + "grad_norm": 0.34562830396228567, + "learning_rate": 4.0969346924557374e-05, + "loss": 0.4874, + "step": 7095 + }, + { + "epoch": 1.6557835820895521, + "grad_norm": 0.3433525287693214, + "learning_rate": 4.095544347902715e-05, + "loss": 0.4757, + "step": 7100 + }, + { + "epoch": 1.6569496268656716, + "grad_norm": 0.36953751999856715, + "learning_rate": 4.094153203071512e-05, + "loss": 0.51, + "step": 7105 + }, + { + "epoch": 1.658115671641791, + "grad_norm": 0.3274988234772176, + "learning_rate": 4.092761258789529e-05, + "loss": 0.4571, + "step": 7110 + }, + { + "epoch": 1.6592817164179103, + "grad_norm": 0.34823208824612917, + "learning_rate": 4.091368515884638e-05, + "loss": 0.4831, + "step": 7115 + }, + { + "epoch": 1.6604477611940298, + "grad_norm": 0.32536250915613824, + "learning_rate": 4.089974975185192e-05, + "loss": 0.4614, + "step": 7120 + }, + { + "epoch": 1.6616138059701493, + "grad_norm": 0.324795678268426, + "learning_rate": 4.088580637520015e-05, + "loss": 0.4551, + "step": 7125 + }, + { + "epoch": 1.6627798507462686, + "grad_norm": 0.3162172035843185, + "learning_rate": 4.087185503718404e-05, + "loss": 0.4443, + "step": 7130 + }, + { + "epoch": 1.663945895522388, + "grad_norm": 0.33952859510564576, + "learning_rate": 4.0857895746101335e-05, + "loss": 0.4717, + "step": 7135 + }, + { + "epoch": 1.6651119402985075, + "grad_norm": 0.3075133719483971, + "learning_rate": 4.084392851025447e-05, + "loss": 0.4339, + "step": 7140 + }, + { + "epoch": 1.6662779850746268, + "grad_norm": 0.33515643023992886, + "learning_rate": 4.082995333795063e-05, + "loss": 0.4632, + "step": 7145 + }, + { + "epoch": 1.6674440298507462, + "grad_norm": 0.3288379676558023, + "learning_rate": 4.081597023750169e-05, + "loss": 0.4654, + "step": 7150 + }, + { + "epoch": 1.6686100746268657, + "grad_norm": 0.30998910910125077, + "learning_rate": 4.0801979217224285e-05, + "loss": 0.454, + "step": 7155 + }, + { + "epoch": 1.669776119402985, + "grad_norm": 0.34437967384422724, + "learning_rate": 4.078798028543974e-05, + "loss": 0.4729, + "step": 7160 + }, + { + "epoch": 1.6709421641791045, + "grad_norm": 0.3829981647145245, + "learning_rate": 4.0773973450474055e-05, + "loss": 0.4737, + "step": 7165 + }, + { + "epoch": 1.672108208955224, + "grad_norm": 0.3530178173945503, + "learning_rate": 4.0759958720658e-05, + "loss": 0.4533, + "step": 7170 + }, + { + "epoch": 1.6732742537313432, + "grad_norm": 0.3179227326670143, + "learning_rate": 4.074593610432695e-05, + "loss": 0.4508, + "step": 7175 + }, + { + "epoch": 1.6744402985074627, + "grad_norm": 0.3146630519651914, + "learning_rate": 4.073190560982106e-05, + "loss": 0.4597, + "step": 7180 + }, + { + "epoch": 1.6756063432835822, + "grad_norm": 0.34642209234797366, + "learning_rate": 4.071786724548511e-05, + "loss": 0.4649, + "step": 7185 + }, + { + "epoch": 1.6767723880597014, + "grad_norm": 0.3706405830369706, + "learning_rate": 4.07038210196686e-05, + "loss": 0.4606, + "step": 7190 + }, + { + "epoch": 1.677938432835821, + "grad_norm": 0.3056476971166796, + "learning_rate": 4.068976694072565e-05, + "loss": 0.4578, + "step": 7195 + }, + { + "epoch": 1.6791044776119404, + "grad_norm": 0.3079763421623756, + "learning_rate": 4.067570501701513e-05, + "loss": 0.4642, + "step": 7200 + }, + { + "epoch": 1.6802705223880596, + "grad_norm": 0.3356571842787205, + "learning_rate": 4.0661635256900505e-05, + "loss": 0.4731, + "step": 7205 + }, + { + "epoch": 1.681436567164179, + "grad_norm": 0.31406788758525345, + "learning_rate": 4.064755766874993e-05, + "loss": 0.4426, + "step": 7210 + }, + { + "epoch": 1.6826026119402986, + "grad_norm": 0.3192951288730359, + "learning_rate": 4.0633472260936224e-05, + "loss": 0.4609, + "step": 7215 + }, + { + "epoch": 1.6837686567164178, + "grad_norm": 0.30323038618671533, + "learning_rate": 4.061937904183685e-05, + "loss": 0.4495, + "step": 7220 + }, + { + "epoch": 1.6849347014925373, + "grad_norm": 0.3603630192365742, + "learning_rate": 4.060527801983391e-05, + "loss": 0.4815, + "step": 7225 + }, + { + "epoch": 1.6861007462686568, + "grad_norm": 0.3552458403395982, + "learning_rate": 4.0591169203314145e-05, + "loss": 0.478, + "step": 7230 + }, + { + "epoch": 1.687266791044776, + "grad_norm": 0.3437733174212601, + "learning_rate": 4.057705260066894e-05, + "loss": 0.4813, + "step": 7235 + }, + { + "epoch": 1.6884328358208955, + "grad_norm": 0.37064293513083574, + "learning_rate": 4.056292822029432e-05, + "loss": 0.4797, + "step": 7240 + }, + { + "epoch": 1.689598880597015, + "grad_norm": 0.31622660412831927, + "learning_rate": 4.05487960705909e-05, + "loss": 0.4514, + "step": 7245 + }, + { + "epoch": 1.6907649253731343, + "grad_norm": 0.3237729621504635, + "learning_rate": 4.053465615996397e-05, + "loss": 0.4735, + "step": 7250 + }, + { + "epoch": 1.6919309701492538, + "grad_norm": 0.3487012541914169, + "learning_rate": 4.0520508496823395e-05, + "loss": 0.4722, + "step": 7255 + }, + { + "epoch": 1.6930970149253732, + "grad_norm": 0.33532018875590897, + "learning_rate": 4.050635308958366e-05, + "loss": 0.4768, + "step": 7260 + }, + { + "epoch": 1.6942630597014925, + "grad_norm": 0.3303916866276644, + "learning_rate": 4.0492189946663864e-05, + "loss": 0.4687, + "step": 7265 + }, + { + "epoch": 1.695429104477612, + "grad_norm": 0.32806496684644687, + "learning_rate": 4.047801907648769e-05, + "loss": 0.4472, + "step": 7270 + }, + { + "epoch": 1.6965951492537314, + "grad_norm": 0.339298332376045, + "learning_rate": 4.046384048748344e-05, + "loss": 0.4591, + "step": 7275 + }, + { + "epoch": 1.6977611940298507, + "grad_norm": 0.32851237253811666, + "learning_rate": 4.0449654188083985e-05, + "loss": 0.4603, + "step": 7280 + }, + { + "epoch": 1.6989272388059702, + "grad_norm": 0.34429588338857486, + "learning_rate": 4.04354601867268e-05, + "loss": 0.4545, + "step": 7285 + }, + { + "epoch": 1.7000932835820897, + "grad_norm": 0.3522229862796038, + "learning_rate": 4.042125849185394e-05, + "loss": 0.4896, + "step": 7290 + }, + { + "epoch": 1.701259328358209, + "grad_norm": 0.3304803396604614, + "learning_rate": 4.040704911191201e-05, + "loss": 0.4704, + "step": 7295 + }, + { + "epoch": 1.7024253731343284, + "grad_norm": 0.31746663711386225, + "learning_rate": 4.0392832055352205e-05, + "loss": 0.4668, + "step": 7300 + }, + { + "epoch": 1.7035914179104479, + "grad_norm": 0.312913030389991, + "learning_rate": 4.0378607330630304e-05, + "loss": 0.4743, + "step": 7305 + }, + { + "epoch": 1.7047574626865671, + "grad_norm": 0.3225342566376156, + "learning_rate": 4.036437494620661e-05, + "loss": 0.4494, + "step": 7310 + }, + { + "epoch": 1.7059235074626866, + "grad_norm": 0.3047737962305274, + "learning_rate": 4.0350134910546e-05, + "loss": 0.474, + "step": 7315 + }, + { + "epoch": 1.707089552238806, + "grad_norm": 0.3272477406317157, + "learning_rate": 4.033588723211793e-05, + "loss": 0.4638, + "step": 7320 + }, + { + "epoch": 1.7082555970149254, + "grad_norm": 0.31344877189254877, + "learning_rate": 4.032163191939633e-05, + "loss": 0.4781, + "step": 7325 + }, + { + "epoch": 1.7094216417910446, + "grad_norm": 0.3355640428877015, + "learning_rate": 4.030736898085974e-05, + "loss": 0.4558, + "step": 7330 + }, + { + "epoch": 1.7105876865671643, + "grad_norm": 0.3229925108692079, + "learning_rate": 4.02930984249912e-05, + "loss": 0.4648, + "step": 7335 + }, + { + "epoch": 1.7117537313432836, + "grad_norm": 0.3339328917463594, + "learning_rate": 4.02788202602783e-05, + "loss": 0.4693, + "step": 7340 + }, + { + "epoch": 1.7129197761194028, + "grad_norm": 0.32061837468394516, + "learning_rate": 4.026453449521313e-05, + "loss": 0.4707, + "step": 7345 + }, + { + "epoch": 1.7140858208955225, + "grad_norm": 0.3334566989161919, + "learning_rate": 4.025024113829233e-05, + "loss": 0.4691, + "step": 7350 + }, + { + "epoch": 1.7152518656716418, + "grad_norm": 0.3117933495011161, + "learning_rate": 4.023594019801702e-05, + "loss": 0.462, + "step": 7355 + }, + { + "epoch": 1.716417910447761, + "grad_norm": 0.31444912546892323, + "learning_rate": 4.022163168289287e-05, + "loss": 0.4534, + "step": 7360 + }, + { + "epoch": 1.7175839552238807, + "grad_norm": 0.33449115278274755, + "learning_rate": 4.020731560143002e-05, + "loss": 0.4544, + "step": 7365 + }, + { + "epoch": 1.71875, + "grad_norm": 0.32373835873579954, + "learning_rate": 4.019299196214315e-05, + "loss": 0.4833, + "step": 7370 + }, + { + "epoch": 1.7199160447761193, + "grad_norm": 0.3376817168174418, + "learning_rate": 4.017866077355139e-05, + "loss": 0.4646, + "step": 7375 + }, + { + "epoch": 1.721082089552239, + "grad_norm": 0.323632069545875, + "learning_rate": 4.016432204417839e-05, + "loss": 0.4735, + "step": 7380 + }, + { + "epoch": 1.7222481343283582, + "grad_norm": 0.3273976532971608, + "learning_rate": 4.014997578255227e-05, + "loss": 0.4599, + "step": 7385 + }, + { + "epoch": 1.7234141791044775, + "grad_norm": 0.31173763021502077, + "learning_rate": 4.0135621997205654e-05, + "loss": 0.4332, + "step": 7390 + }, + { + "epoch": 1.7245802238805972, + "grad_norm": 0.3418947089076045, + "learning_rate": 4.01212606966756e-05, + "loss": 0.4583, + "step": 7395 + }, + { + "epoch": 1.7257462686567164, + "grad_norm": 0.33546667435232946, + "learning_rate": 4.010689188950367e-05, + "loss": 0.4933, + "step": 7400 + }, + { + "epoch": 1.7269123134328357, + "grad_norm": 0.3431240770804782, + "learning_rate": 4.009251558423588e-05, + "loss": 0.4753, + "step": 7405 + }, + { + "epoch": 1.7280783582089554, + "grad_norm": 0.3333671988156517, + "learning_rate": 4.00781317894227e-05, + "loss": 0.4683, + "step": 7410 + }, + { + "epoch": 1.7292444029850746, + "grad_norm": 0.34563156857776767, + "learning_rate": 4.006374051361907e-05, + "loss": 0.4646, + "step": 7415 + }, + { + "epoch": 1.730410447761194, + "grad_norm": 0.3190618544335192, + "learning_rate": 4.004934176538436e-05, + "loss": 0.4568, + "step": 7420 + }, + { + "epoch": 1.7315764925373134, + "grad_norm": 0.32943852720887506, + "learning_rate": 4.0034935553282396e-05, + "loss": 0.4719, + "step": 7425 + }, + { + "epoch": 1.7327425373134329, + "grad_norm": 0.3352232407139199, + "learning_rate": 4.002052188588144e-05, + "loss": 0.4705, + "step": 7430 + }, + { + "epoch": 1.7339085820895521, + "grad_norm": 0.36728042330785804, + "learning_rate": 4.000610077175419e-05, + "loss": 0.4774, + "step": 7435 + }, + { + "epoch": 1.7350746268656716, + "grad_norm": 0.3469652200713558, + "learning_rate": 3.999167221947777e-05, + "loss": 0.4574, + "step": 7440 + }, + { + "epoch": 1.736240671641791, + "grad_norm": 0.3639547566058384, + "learning_rate": 3.997723623763372e-05, + "loss": 0.47, + "step": 7445 + }, + { + "epoch": 1.7374067164179103, + "grad_norm": 0.3348528872265908, + "learning_rate": 3.9962792834808034e-05, + "loss": 0.4826, + "step": 7450 + }, + { + "epoch": 1.7385727611940298, + "grad_norm": 0.33223482507466934, + "learning_rate": 3.9948342019591066e-05, + "loss": 0.4547, + "step": 7455 + }, + { + "epoch": 1.7397388059701493, + "grad_norm": 0.3577356441829403, + "learning_rate": 3.993388380057763e-05, + "loss": 0.4756, + "step": 7460 + }, + { + "epoch": 1.7409048507462686, + "grad_norm": 0.3314024852499628, + "learning_rate": 3.9919418186366905e-05, + "loss": 0.4574, + "step": 7465 + }, + { + "epoch": 1.742070895522388, + "grad_norm": 0.3065927663958129, + "learning_rate": 3.9904945185562484e-05, + "loss": 0.4627, + "step": 7470 + }, + { + "epoch": 1.7432369402985075, + "grad_norm": 0.29873041722286, + "learning_rate": 3.989046480677236e-05, + "loss": 0.4747, + "step": 7475 + }, + { + "epoch": 1.7444029850746268, + "grad_norm": 0.3320058004590895, + "learning_rate": 3.987597705860891e-05, + "loss": 0.461, + "step": 7480 + }, + { + "epoch": 1.7455690298507462, + "grad_norm": 0.3272721295880745, + "learning_rate": 3.986148194968888e-05, + "loss": 0.4703, + "step": 7485 + }, + { + "epoch": 1.7467350746268657, + "grad_norm": 0.3363991185235613, + "learning_rate": 3.9846979488633415e-05, + "loss": 0.466, + "step": 7490 + }, + { + "epoch": 1.747901119402985, + "grad_norm": 0.3401648724589398, + "learning_rate": 3.9832469684068007e-05, + "loss": 0.4508, + "step": 7495 + }, + { + "epoch": 1.7490671641791045, + "grad_norm": 0.32673520813583895, + "learning_rate": 3.9817952544622554e-05, + "loss": 0.4343, + "step": 7500 + }, + { + "epoch": 1.750233208955224, + "grad_norm": 0.33213225396645735, + "learning_rate": 3.9803428078931276e-05, + "loss": 0.5444, + "step": 7505 + }, + { + "epoch": 1.7513992537313432, + "grad_norm": 0.3102967200111759, + "learning_rate": 3.978889629563277e-05, + "loss": 0.4492, + "step": 7510 + }, + { + "epoch": 1.7525652985074627, + "grad_norm": 0.3292571258229434, + "learning_rate": 3.977435720337e-05, + "loss": 0.4734, + "step": 7515 + }, + { + "epoch": 1.7537313432835822, + "grad_norm": 0.3192233513183503, + "learning_rate": 3.9759810810790236e-05, + "loss": 0.4772, + "step": 7520 + }, + { + "epoch": 1.7548973880597014, + "grad_norm": 0.36079486925702725, + "learning_rate": 3.9745257126545146e-05, + "loss": 0.4743, + "step": 7525 + }, + { + "epoch": 1.756063432835821, + "grad_norm": 0.3504091032321048, + "learning_rate": 3.9730696159290656e-05, + "loss": 0.4524, + "step": 7530 + }, + { + "epoch": 1.7572294776119404, + "grad_norm": 0.32653255000606496, + "learning_rate": 3.971612791768712e-05, + "loss": 0.4497, + "step": 7535 + }, + { + "epoch": 1.7583955223880596, + "grad_norm": 0.3222864892540727, + "learning_rate": 3.970155241039914e-05, + "loss": 0.4691, + "step": 7540 + }, + { + "epoch": 1.759561567164179, + "grad_norm": 0.32822260638527023, + "learning_rate": 3.968696964609568e-05, + "loss": 0.4605, + "step": 7545 + }, + { + "epoch": 1.7607276119402986, + "grad_norm": 0.33368821658399866, + "learning_rate": 3.967237963345001e-05, + "loss": 0.473, + "step": 7550 + }, + { + "epoch": 1.7618936567164178, + "grad_norm": 0.3441315905330009, + "learning_rate": 3.9657782381139696e-05, + "loss": 0.4588, + "step": 7555 + }, + { + "epoch": 1.7630597014925373, + "grad_norm": 0.3386771796818688, + "learning_rate": 3.964317789784664e-05, + "loss": 0.4783, + "step": 7560 + }, + { + "epoch": 1.7642257462686568, + "grad_norm": 0.32364843769565194, + "learning_rate": 3.962856619225703e-05, + "loss": 0.4757, + "step": 7565 + }, + { + "epoch": 1.765391791044776, + "grad_norm": 0.3142847596998878, + "learning_rate": 3.961394727306133e-05, + "loss": 0.4478, + "step": 7570 + }, + { + "epoch": 1.7665578358208955, + "grad_norm": 0.3338652923785012, + "learning_rate": 3.9599321148954325e-05, + "loss": 0.4484, + "step": 7575 + }, + { + "epoch": 1.767723880597015, + "grad_norm": 0.31338477996541986, + "learning_rate": 3.958468782863508e-05, + "loss": 0.4564, + "step": 7580 + }, + { + "epoch": 1.7688899253731343, + "grad_norm": 0.31103805767415105, + "learning_rate": 3.9570047320806916e-05, + "loss": 0.4626, + "step": 7585 + }, + { + "epoch": 1.7700559701492538, + "grad_norm": 0.33954791983778504, + "learning_rate": 3.955539963417746e-05, + "loss": 0.4544, + "step": 7590 + }, + { + "epoch": 1.7712220149253732, + "grad_norm": 0.3377370902996918, + "learning_rate": 3.954074477745859e-05, + "loss": 0.4538, + "step": 7595 + }, + { + "epoch": 1.7723880597014925, + "grad_norm": 0.33682355542096076, + "learning_rate": 3.952608275936644e-05, + "loss": 0.4816, + "step": 7600 + }, + { + "epoch": 1.773554104477612, + "grad_norm": 0.3061278658569396, + "learning_rate": 3.9511413588621435e-05, + "loss": 0.4405, + "step": 7605 + }, + { + "epoch": 1.7747201492537314, + "grad_norm": 0.31574220409579423, + "learning_rate": 3.949673727394823e-05, + "loss": 0.4548, + "step": 7610 + }, + { + "epoch": 1.7758861940298507, + "grad_norm": 0.3777625297037683, + "learning_rate": 3.9482053824075716e-05, + "loss": 0.4639, + "step": 7615 + }, + { + "epoch": 1.7770522388059702, + "grad_norm": 0.3029999766007014, + "learning_rate": 3.946736324773707e-05, + "loss": 0.4643, + "step": 7620 + }, + { + "epoch": 1.7782182835820897, + "grad_norm": 0.3284031193787799, + "learning_rate": 3.945266555366968e-05, + "loss": 0.4519, + "step": 7625 + }, + { + "epoch": 1.779384328358209, + "grad_norm": 0.33364532903833377, + "learning_rate": 3.943796075061517e-05, + "loss": 0.4647, + "step": 7630 + }, + { + "epoch": 1.7805503731343284, + "grad_norm": 0.3077589950827104, + "learning_rate": 3.942324884731938e-05, + "loss": 0.4577, + "step": 7635 + }, + { + "epoch": 1.7817164179104479, + "grad_norm": 0.3317835039136894, + "learning_rate": 3.940852985253239e-05, + "loss": 0.4252, + "step": 7640 + }, + { + "epoch": 1.7828824626865671, + "grad_norm": 0.32259763132175034, + "learning_rate": 3.9393803775008506e-05, + "loss": 0.4722, + "step": 7645 + }, + { + "epoch": 1.7840485074626866, + "grad_norm": 0.3417293168109983, + "learning_rate": 3.937907062350622e-05, + "loss": 0.4616, + "step": 7650 + }, + { + "epoch": 1.785214552238806, + "grad_norm": 0.32573572772958925, + "learning_rate": 3.9364330406788265e-05, + "loss": 0.4643, + "step": 7655 + }, + { + "epoch": 1.7863805970149254, + "grad_norm": 0.34137362535900706, + "learning_rate": 3.9349583133621535e-05, + "loss": 0.4605, + "step": 7660 + }, + { + "epoch": 1.7875466417910446, + "grad_norm": 0.3111180946600651, + "learning_rate": 3.933482881277715e-05, + "loss": 0.4657, + "step": 7665 + }, + { + "epoch": 1.7887126865671643, + "grad_norm": 0.3291822142229409, + "learning_rate": 3.9320067453030415e-05, + "loss": 0.4516, + "step": 7670 + }, + { + "epoch": 1.7898787313432836, + "grad_norm": 0.34076675027870634, + "learning_rate": 3.930529906316083e-05, + "loss": 0.4669, + "step": 7675 + }, + { + "epoch": 1.7910447761194028, + "grad_norm": 0.33220385887575243, + "learning_rate": 3.9290523651952046e-05, + "loss": 0.4585, + "step": 7680 + }, + { + "epoch": 1.7922108208955225, + "grad_norm": 0.3597238957873434, + "learning_rate": 3.927574122819193e-05, + "loss": 0.451, + "step": 7685 + }, + { + "epoch": 1.7933768656716418, + "grad_norm": 0.30887795215161673, + "learning_rate": 3.926095180067249e-05, + "loss": 0.453, + "step": 7690 + }, + { + "epoch": 1.794542910447761, + "grad_norm": 0.3212052662395762, + "learning_rate": 3.924615537818992e-05, + "loss": 0.4713, + "step": 7695 + }, + { + "epoch": 1.7957089552238807, + "grad_norm": 0.31984487566276537, + "learning_rate": 3.923135196954456e-05, + "loss": 0.4872, + "step": 7700 + }, + { + "epoch": 1.796875, + "grad_norm": 0.32155777756878273, + "learning_rate": 3.92165415835409e-05, + "loss": 0.4576, + "step": 7705 + }, + { + "epoch": 1.7980410447761193, + "grad_norm": 0.31698010709092744, + "learning_rate": 3.92017242289876e-05, + "loss": 0.4618, + "step": 7710 + }, + { + "epoch": 1.799207089552239, + "grad_norm": 0.3350559563938395, + "learning_rate": 3.918689991469746e-05, + "loss": 0.451, + "step": 7715 + }, + { + "epoch": 1.8003731343283582, + "grad_norm": 0.32581123150131286, + "learning_rate": 3.9172068649487405e-05, + "loss": 0.4473, + "step": 7720 + }, + { + "epoch": 1.8015391791044775, + "grad_norm": 0.3499490459149918, + "learning_rate": 3.91572304421785e-05, + "loss": 0.464, + "step": 7725 + }, + { + "epoch": 1.8027052238805972, + "grad_norm": 0.32822280432312334, + "learning_rate": 3.914238530159595e-05, + "loss": 0.475, + "step": 7730 + }, + { + "epoch": 1.8038712686567164, + "grad_norm": 0.31935556956987293, + "learning_rate": 3.9127533236569077e-05, + "loss": 0.4712, + "step": 7735 + }, + { + "epoch": 1.8050373134328357, + "grad_norm": 0.34873770528962084, + "learning_rate": 3.9112674255931294e-05, + "loss": 0.4578, + "step": 7740 + }, + { + "epoch": 1.8062033582089554, + "grad_norm": 0.3455218187514769, + "learning_rate": 3.909780836852019e-05, + "loss": 0.4544, + "step": 7745 + }, + { + "epoch": 1.8073694029850746, + "grad_norm": 0.3292054740020448, + "learning_rate": 3.908293558317741e-05, + "loss": 0.4512, + "step": 7750 + }, + { + "epoch": 1.808535447761194, + "grad_norm": 0.3314842049495878, + "learning_rate": 3.9068055908748706e-05, + "loss": 0.4751, + "step": 7755 + }, + { + "epoch": 1.8097014925373134, + "grad_norm": 0.30938651566350633, + "learning_rate": 3.9053169354083946e-05, + "loss": 0.4331, + "step": 7760 + }, + { + "epoch": 1.8108675373134329, + "grad_norm": 0.32214239674628525, + "learning_rate": 3.903827592803708e-05, + "loss": 0.4728, + "step": 7765 + }, + { + "epoch": 1.8120335820895521, + "grad_norm": 0.3043180490770527, + "learning_rate": 3.9023375639466156e-05, + "loss": 0.4248, + "step": 7770 + }, + { + "epoch": 1.8131996268656716, + "grad_norm": 0.31143937820829876, + "learning_rate": 3.900846849723328e-05, + "loss": 0.4555, + "step": 7775 + }, + { + "epoch": 1.814365671641791, + "grad_norm": 0.30503553696644364, + "learning_rate": 3.8993554510204664e-05, + "loss": 0.4604, + "step": 7780 + }, + { + "epoch": 1.8155317164179103, + "grad_norm": 0.3334508226536671, + "learning_rate": 3.897863368725056e-05, + "loss": 0.4864, + "step": 7785 + }, + { + "epoch": 1.8166977611940298, + "grad_norm": 0.34169562186157326, + "learning_rate": 3.896370603724531e-05, + "loss": 0.4521, + "step": 7790 + }, + { + "epoch": 1.8178638059701493, + "grad_norm": 0.335094114071336, + "learning_rate": 3.8948771569067305e-05, + "loss": 0.4478, + "step": 7795 + }, + { + "epoch": 1.8190298507462686, + "grad_norm": 0.31585481265436866, + "learning_rate": 3.893383029159899e-05, + "loss": 0.4549, + "step": 7800 + }, + { + "epoch": 1.820195895522388, + "grad_norm": 0.32872682537120884, + "learning_rate": 3.891888221372688e-05, + "loss": 0.4638, + "step": 7805 + }, + { + "epoch": 1.8213619402985075, + "grad_norm": 0.33282921132719095, + "learning_rate": 3.89039273443415e-05, + "loss": 0.4446, + "step": 7810 + }, + { + "epoch": 1.8225279850746268, + "grad_norm": 0.31069348498645605, + "learning_rate": 3.888896569233744e-05, + "loss": 0.4558, + "step": 7815 + }, + { + "epoch": 1.8236940298507462, + "grad_norm": 0.3217055889621255, + "learning_rate": 3.887399726661332e-05, + "loss": 0.4628, + "step": 7820 + }, + { + "epoch": 1.8248600746268657, + "grad_norm": 0.33258933344430636, + "learning_rate": 3.885902207607178e-05, + "loss": 0.4593, + "step": 7825 + }, + { + "epoch": 1.826026119402985, + "grad_norm": 0.335713408666473, + "learning_rate": 3.88440401296195e-05, + "loss": 0.4758, + "step": 7830 + }, + { + "epoch": 1.8271921641791045, + "grad_norm": 0.346449949823291, + "learning_rate": 3.8829051436167144e-05, + "loss": 0.4683, + "step": 7835 + }, + { + "epoch": 1.828358208955224, + "grad_norm": 0.33516753990549714, + "learning_rate": 3.881405600462943e-05, + "loss": 0.4564, + "step": 7840 + }, + { + "epoch": 1.8295242537313432, + "grad_norm": 0.35138549675984454, + "learning_rate": 3.879905384392508e-05, + "loss": 0.4679, + "step": 7845 + }, + { + "epoch": 1.8306902985074627, + "grad_norm": 0.3509271341125894, + "learning_rate": 3.8784044962976776e-05, + "loss": 0.4537, + "step": 7850 + }, + { + "epoch": 1.8318563432835822, + "grad_norm": 0.3295852159389941, + "learning_rate": 3.8769029370711234e-05, + "loss": 0.4704, + "step": 7855 + }, + { + "epoch": 1.8330223880597014, + "grad_norm": 0.3219920585928317, + "learning_rate": 3.8754007076059155e-05, + "loss": 0.4721, + "step": 7860 + }, + { + "epoch": 1.834188432835821, + "grad_norm": 0.32036554569756065, + "learning_rate": 3.873897808795522e-05, + "loss": 0.463, + "step": 7865 + }, + { + "epoch": 1.8353544776119404, + "grad_norm": 0.3334846957626944, + "learning_rate": 3.8723942415338105e-05, + "loss": 0.4729, + "step": 7870 + }, + { + "epoch": 1.8365205223880596, + "grad_norm": 0.3254709598700866, + "learning_rate": 3.870890006715044e-05, + "loss": 0.4528, + "step": 7875 + }, + { + "epoch": 1.837686567164179, + "grad_norm": 0.35499842914666335, + "learning_rate": 3.869385105233884e-05, + "loss": 0.4649, + "step": 7880 + }, + { + "epoch": 1.8388526119402986, + "grad_norm": 0.34009809613132064, + "learning_rate": 3.867879537985388e-05, + "loss": 0.4697, + "step": 7885 + }, + { + "epoch": 1.8400186567164178, + "grad_norm": 0.3505457636640597, + "learning_rate": 3.8663733058650104e-05, + "loss": 0.4872, + "step": 7890 + }, + { + "epoch": 1.8411847014925373, + "grad_norm": 0.31285820850905105, + "learning_rate": 3.8648664097686e-05, + "loss": 0.439, + "step": 7895 + }, + { + "epoch": 1.8423507462686568, + "grad_norm": 0.330351773304493, + "learning_rate": 3.8633588505924e-05, + "loss": 0.473, + "step": 7900 + }, + { + "epoch": 1.843516791044776, + "grad_norm": 0.3278611547398983, + "learning_rate": 3.861850629233051e-05, + "loss": 0.4613, + "step": 7905 + }, + { + "epoch": 1.8446828358208955, + "grad_norm": 0.32658142113658106, + "learning_rate": 3.8603417465875816e-05, + "loss": 0.4573, + "step": 7910 + }, + { + "epoch": 1.845848880597015, + "grad_norm": 0.3485881700699174, + "learning_rate": 3.858832203553421e-05, + "loss": 0.4835, + "step": 7915 + }, + { + "epoch": 1.8470149253731343, + "grad_norm": 0.3083825708830578, + "learning_rate": 3.857322001028385e-05, + "loss": 0.4433, + "step": 7920 + }, + { + "epoch": 1.8481809701492538, + "grad_norm": 0.3503312124519673, + "learning_rate": 3.855811139910686e-05, + "loss": 0.4799, + "step": 7925 + }, + { + "epoch": 1.8493470149253732, + "grad_norm": 0.3234996296650612, + "learning_rate": 3.854299621098925e-05, + "loss": 0.4585, + "step": 7930 + }, + { + "epoch": 1.8505130597014925, + "grad_norm": 0.3052564357859432, + "learning_rate": 3.8527874454920955e-05, + "loss": 0.451, + "step": 7935 + }, + { + "epoch": 1.851679104477612, + "grad_norm": 0.3742671960794288, + "learning_rate": 3.851274613989582e-05, + "loss": 0.4654, + "step": 7940 + }, + { + "epoch": 1.8528451492537314, + "grad_norm": 0.3335244961793869, + "learning_rate": 3.849761127491158e-05, + "loss": 0.4474, + "step": 7945 + }, + { + "epoch": 1.8540111940298507, + "grad_norm": 0.34729167900291275, + "learning_rate": 3.848246986896989e-05, + "loss": 0.4757, + "step": 7950 + }, + { + "epoch": 1.8551772388059702, + "grad_norm": 0.32621784125350106, + "learning_rate": 3.8467321931076255e-05, + "loss": 0.4375, + "step": 7955 + }, + { + "epoch": 1.8563432835820897, + "grad_norm": 0.34002031048416315, + "learning_rate": 3.84521674702401e-05, + "loss": 0.4875, + "step": 7960 + }, + { + "epoch": 1.857509328358209, + "grad_norm": 0.33018372241070876, + "learning_rate": 3.8437006495474716e-05, + "loss": 0.451, + "step": 7965 + }, + { + "epoch": 1.8586753731343284, + "grad_norm": 0.3248254976028379, + "learning_rate": 3.8421839015797265e-05, + "loss": 0.467, + "step": 7970 + }, + { + "epoch": 1.8598414179104479, + "grad_norm": 0.31695297868444927, + "learning_rate": 3.840666504022879e-05, + "loss": 0.471, + "step": 7975 + }, + { + "epoch": 1.8610074626865671, + "grad_norm": 0.32814182269306824, + "learning_rate": 3.839148457779418e-05, + "loss": 0.4777, + "step": 7980 + }, + { + "epoch": 1.8621735074626866, + "grad_norm": 0.3188618300682253, + "learning_rate": 3.837629763752219e-05, + "loss": 0.4514, + "step": 7985 + }, + { + "epoch": 1.863339552238806, + "grad_norm": 0.3418727121371228, + "learning_rate": 3.8361104228445455e-05, + "loss": 0.4954, + "step": 7990 + }, + { + "epoch": 1.8645055970149254, + "grad_norm": 0.3348980968427248, + "learning_rate": 3.834590435960041e-05, + "loss": 0.4749, + "step": 7995 + }, + { + "epoch": 1.8656716417910446, + "grad_norm": 0.3469916419162627, + "learning_rate": 3.8330698040027345e-05, + "loss": 0.4855, + "step": 8000 + }, + { + "epoch": 1.8668376865671643, + "grad_norm": 0.3245130980042784, + "learning_rate": 3.8315485278770423e-05, + "loss": 0.4805, + "step": 8005 + }, + { + "epoch": 1.8680037313432836, + "grad_norm": 0.32669433732276504, + "learning_rate": 3.83002660848776e-05, + "loss": 0.4685, + "step": 8010 + }, + { + "epoch": 1.8691697761194028, + "grad_norm": 0.3371551045394974, + "learning_rate": 3.828504046740065e-05, + "loss": 0.4686, + "step": 8015 + }, + { + "epoch": 1.8703358208955225, + "grad_norm": 0.3351214970857238, + "learning_rate": 3.826980843539521e-05, + "loss": 0.4803, + "step": 8020 + }, + { + "epoch": 1.8715018656716418, + "grad_norm": 0.30946469782414815, + "learning_rate": 3.82545699979207e-05, + "loss": 0.439, + "step": 8025 + }, + { + "epoch": 1.872667910447761, + "grad_norm": 0.33302124613759193, + "learning_rate": 3.823932516404036e-05, + "loss": 0.4554, + "step": 8030 + }, + { + "epoch": 1.8738339552238807, + "grad_norm": 0.31750655079024775, + "learning_rate": 3.822407394282123e-05, + "loss": 0.4473, + "step": 8035 + }, + { + "epoch": 1.875, + "grad_norm": 0.35069118730521315, + "learning_rate": 3.8208816343334156e-05, + "loss": 0.4746, + "step": 8040 + }, + { + "epoch": 1.8761660447761193, + "grad_norm": 0.31257634947661017, + "learning_rate": 3.819355237465377e-05, + "loss": 0.4536, + "step": 8045 + }, + { + "epoch": 1.877332089552239, + "grad_norm": 0.3203627093879648, + "learning_rate": 3.81782820458585e-05, + "loss": 0.4595, + "step": 8050 + }, + { + "epoch": 1.8784981343283582, + "grad_norm": 0.30534528984636694, + "learning_rate": 3.816300536603054e-05, + "loss": 0.465, + "step": 8055 + }, + { + "epoch": 1.8796641791044775, + "grad_norm": 0.3235119118193064, + "learning_rate": 3.814772234425588e-05, + "loss": 0.4632, + "step": 8060 + }, + { + "epoch": 1.8808302238805972, + "grad_norm": 0.33739919698858545, + "learning_rate": 3.813243298962428e-05, + "loss": 0.4538, + "step": 8065 + }, + { + "epoch": 1.8819962686567164, + "grad_norm": 0.3325386291992744, + "learning_rate": 3.8117137311229255e-05, + "loss": 0.4593, + "step": 8070 + }, + { + "epoch": 1.8831623134328357, + "grad_norm": 0.3011728405166508, + "learning_rate": 3.81018353181681e-05, + "loss": 0.447, + "step": 8075 + }, + { + "epoch": 1.8843283582089554, + "grad_norm": 0.3212051882269613, + "learning_rate": 3.808652701954183e-05, + "loss": 0.4721, + "step": 8080 + }, + { + "epoch": 1.8854944029850746, + "grad_norm": 0.31184448824834854, + "learning_rate": 3.807121242445526e-05, + "loss": 0.4572, + "step": 8085 + }, + { + "epoch": 1.886660447761194, + "grad_norm": 0.3264791983669669, + "learning_rate": 3.805589154201691e-05, + "loss": 0.4543, + "step": 8090 + }, + { + "epoch": 1.8878264925373134, + "grad_norm": 0.3114795037694442, + "learning_rate": 3.804056438133905e-05, + "loss": 0.4514, + "step": 8095 + }, + { + "epoch": 1.8889925373134329, + "grad_norm": 0.3189561537443261, + "learning_rate": 3.80252309515377e-05, + "loss": 0.453, + "step": 8100 + }, + { + "epoch": 1.8901585820895521, + "grad_norm": 0.31776350942786574, + "learning_rate": 3.800989126173259e-05, + "loss": 0.4556, + "step": 8105 + }, + { + "epoch": 1.8913246268656716, + "grad_norm": 0.32079439958398626, + "learning_rate": 3.799454532104718e-05, + "loss": 0.4542, + "step": 8110 + }, + { + "epoch": 1.892490671641791, + "grad_norm": 0.33131265989782804, + "learning_rate": 3.7979193138608646e-05, + "loss": 0.4609, + "step": 8115 + }, + { + "epoch": 1.8936567164179103, + "grad_norm": 0.3327910149121253, + "learning_rate": 3.7963834723547866e-05, + "loss": 0.4765, + "step": 8120 + }, + { + "epoch": 1.8948227611940298, + "grad_norm": 0.3576971690086045, + "learning_rate": 3.794847008499946e-05, + "loss": 0.4605, + "step": 8125 + }, + { + "epoch": 1.8959888059701493, + "grad_norm": 0.31756641923535656, + "learning_rate": 3.793309923210171e-05, + "loss": 0.458, + "step": 8130 + }, + { + "epoch": 1.8971548507462686, + "grad_norm": 0.32523492172924273, + "learning_rate": 3.791772217399661e-05, + "loss": 0.4823, + "step": 8135 + }, + { + "epoch": 1.898320895522388, + "grad_norm": 0.32383831258621426, + "learning_rate": 3.7902338919829854e-05, + "loss": 0.4596, + "step": 8140 + }, + { + "epoch": 1.8994869402985075, + "grad_norm": 0.3391911198250996, + "learning_rate": 3.788694947875079e-05, + "loss": 0.4632, + "step": 8145 + }, + { + "epoch": 1.9006529850746268, + "grad_norm": 0.30137706635382955, + "learning_rate": 3.78715538599125e-05, + "loss": 0.4454, + "step": 8150 + }, + { + "epoch": 1.9018190298507462, + "grad_norm": 0.3459202673542257, + "learning_rate": 3.7856152072471686e-05, + "loss": 0.458, + "step": 8155 + }, + { + "epoch": 1.9029850746268657, + "grad_norm": 0.33169973302248834, + "learning_rate": 3.784074412558875e-05, + "loss": 0.4385, + "step": 8160 + }, + { + "epoch": 1.904151119402985, + "grad_norm": 0.34644032807933745, + "learning_rate": 3.782533002842773e-05, + "loss": 0.4877, + "step": 8165 + }, + { + "epoch": 1.9053171641791045, + "grad_norm": 0.3368976029041088, + "learning_rate": 3.7809909790156355e-05, + "loss": 0.4753, + "step": 8170 + }, + { + "epoch": 1.906483208955224, + "grad_norm": 0.3256484730335472, + "learning_rate": 3.7794483419946e-05, + "loss": 0.4467, + "step": 8175 + }, + { + "epoch": 1.9076492537313432, + "grad_norm": 0.3215817140612585, + "learning_rate": 3.777905092697166e-05, + "loss": 0.4572, + "step": 8180 + }, + { + "epoch": 1.9088152985074627, + "grad_norm": 0.3383505169421641, + "learning_rate": 3.7763612320412e-05, + "loss": 0.4629, + "step": 8185 + }, + { + "epoch": 1.9099813432835822, + "grad_norm": 0.32069100213684426, + "learning_rate": 3.77481676094493e-05, + "loss": 0.4607, + "step": 8190 + }, + { + "epoch": 1.9111473880597014, + "grad_norm": 0.32835821049312575, + "learning_rate": 3.77327168032695e-05, + "loss": 0.4694, + "step": 8195 + }, + { + "epoch": 1.912313432835821, + "grad_norm": 0.3467709020619544, + "learning_rate": 3.771725991106214e-05, + "loss": 0.4691, + "step": 8200 + }, + { + "epoch": 1.9134794776119404, + "grad_norm": 0.3306661408042111, + "learning_rate": 3.770179694202038e-05, + "loss": 0.4705, + "step": 8205 + }, + { + "epoch": 1.9146455223880596, + "grad_norm": 0.314401468880452, + "learning_rate": 3.7686327905341014e-05, + "loss": 0.4563, + "step": 8210 + }, + { + "epoch": 1.915811567164179, + "grad_norm": 0.31715617006599117, + "learning_rate": 3.767085281022441e-05, + "loss": 0.465, + "step": 8215 + }, + { + "epoch": 1.9169776119402986, + "grad_norm": 0.3366229097507516, + "learning_rate": 3.765537166587458e-05, + "loss": 0.4617, + "step": 8220 + }, + { + "epoch": 1.9181436567164178, + "grad_norm": 0.31505277674369897, + "learning_rate": 3.763988448149912e-05, + "loss": 0.4656, + "step": 8225 + }, + { + "epoch": 1.9193097014925373, + "grad_norm": 0.3136303836644364, + "learning_rate": 3.762439126630919e-05, + "loss": 0.4658, + "step": 8230 + }, + { + "epoch": 1.9204757462686568, + "grad_norm": 0.30999053723638365, + "learning_rate": 3.7608892029519576e-05, + "loss": 0.4512, + "step": 8235 + }, + { + "epoch": 1.921641791044776, + "grad_norm": 0.328613573645389, + "learning_rate": 3.7593386780348625e-05, + "loss": 0.469, + "step": 8240 + }, + { + "epoch": 1.9228078358208955, + "grad_norm": 0.36358357041271927, + "learning_rate": 3.757787552801827e-05, + "loss": 0.464, + "step": 8245 + }, + { + "epoch": 1.923973880597015, + "grad_norm": 0.3465509791167903, + "learning_rate": 3.756235828175401e-05, + "loss": 0.4683, + "step": 8250 + }, + { + "epoch": 1.9251399253731343, + "grad_norm": 0.3786901022074795, + "learning_rate": 3.75468350507849e-05, + "loss": 0.4672, + "step": 8255 + }, + { + "epoch": 1.9263059701492538, + "grad_norm": 0.3385674775790514, + "learning_rate": 3.753130584434357e-05, + "loss": 0.4714, + "step": 8260 + }, + { + "epoch": 1.9274720149253732, + "grad_norm": 0.3252345512535215, + "learning_rate": 3.7515770671666175e-05, + "loss": 0.4545, + "step": 8265 + }, + { + "epoch": 1.9286380597014925, + "grad_norm": 0.33098644442489994, + "learning_rate": 3.750022954199248e-05, + "loss": 0.4873, + "step": 8270 + }, + { + "epoch": 1.929804104477612, + "grad_norm": 0.33705559723146444, + "learning_rate": 3.748468246456572e-05, + "loss": 0.4582, + "step": 8275 + }, + { + "epoch": 1.9309701492537314, + "grad_norm": 0.3312102446660308, + "learning_rate": 3.7469129448632704e-05, + "loss": 0.4584, + "step": 8280 + }, + { + "epoch": 1.9321361940298507, + "grad_norm": 0.2985887156790144, + "learning_rate": 3.7453570503443785e-05, + "loss": 0.4569, + "step": 8285 + }, + { + "epoch": 1.9333022388059702, + "grad_norm": 0.31981170350527494, + "learning_rate": 3.743800563825283e-05, + "loss": 0.4751, + "step": 8290 + }, + { + "epoch": 1.9344682835820897, + "grad_norm": 0.31161099831445976, + "learning_rate": 3.742243486231719e-05, + "loss": 0.4641, + "step": 8295 + }, + { + "epoch": 1.935634328358209, + "grad_norm": 0.3093342472847783, + "learning_rate": 3.74068581848978e-05, + "loss": 0.4694, + "step": 8300 + }, + { + "epoch": 1.9368003731343284, + "grad_norm": 0.34209842644228594, + "learning_rate": 3.7391275615259065e-05, + "loss": 0.4671, + "step": 8305 + }, + { + "epoch": 1.9379664179104479, + "grad_norm": 0.31631473113092634, + "learning_rate": 3.737568716266888e-05, + "loss": 0.4518, + "step": 8310 + }, + { + "epoch": 1.9391324626865671, + "grad_norm": 0.31559245916026246, + "learning_rate": 3.7360092836398686e-05, + "loss": 0.458, + "step": 8315 + }, + { + "epoch": 1.9402985074626866, + "grad_norm": 0.3018699419745661, + "learning_rate": 3.734449264572336e-05, + "loss": 0.4533, + "step": 8320 + }, + { + "epoch": 1.941464552238806, + "grad_norm": 0.3242303966557626, + "learning_rate": 3.7328886599921327e-05, + "loss": 0.4561, + "step": 8325 + }, + { + "epoch": 1.9426305970149254, + "grad_norm": 0.32202208206224575, + "learning_rate": 3.7313274708274445e-05, + "loss": 0.4694, + "step": 8330 + }, + { + "epoch": 1.9437966417910446, + "grad_norm": 0.30990782532343536, + "learning_rate": 3.729765698006808e-05, + "loss": 0.4521, + "step": 8335 + }, + { + "epoch": 1.9449626865671643, + "grad_norm": 0.3048640975793819, + "learning_rate": 3.7282033424591043e-05, + "loss": 0.4341, + "step": 8340 + }, + { + "epoch": 1.9461287313432836, + "grad_norm": 0.3368657401620674, + "learning_rate": 3.726640405113564e-05, + "loss": 0.4547, + "step": 8345 + }, + { + "epoch": 1.9472947761194028, + "grad_norm": 0.3099933955455299, + "learning_rate": 3.725076886899763e-05, + "loss": 0.4381, + "step": 8350 + }, + { + "epoch": 1.9484608208955225, + "grad_norm": 0.3401686714270818, + "learning_rate": 3.723512788747619e-05, + "loss": 0.478, + "step": 8355 + }, + { + "epoch": 1.9496268656716418, + "grad_norm": 0.33096268165623416, + "learning_rate": 3.721948111587399e-05, + "loss": 0.5073, + "step": 8360 + }, + { + "epoch": 1.950792910447761, + "grad_norm": 0.3164863205631471, + "learning_rate": 3.720382856349715e-05, + "loss": 0.4411, + "step": 8365 + }, + { + "epoch": 1.9519589552238807, + "grad_norm": 0.31624211657132456, + "learning_rate": 3.718817023965519e-05, + "loss": 0.4354, + "step": 8370 + }, + { + "epoch": 1.953125, + "grad_norm": 0.33248567097779913, + "learning_rate": 3.717250615366108e-05, + "loss": 0.4831, + "step": 8375 + }, + { + "epoch": 1.9542910447761193, + "grad_norm": 0.3046710717227166, + "learning_rate": 3.715683631483121e-05, + "loss": 0.4536, + "step": 8380 + }, + { + "epoch": 1.955457089552239, + "grad_norm": 0.3083780305785333, + "learning_rate": 3.714116073248542e-05, + "loss": 0.4339, + "step": 8385 + }, + { + "epoch": 1.9566231343283582, + "grad_norm": 0.33455165027824996, + "learning_rate": 3.712547941594693e-05, + "loss": 0.4683, + "step": 8390 + }, + { + "epoch": 1.9577891791044775, + "grad_norm": 0.3426536588315946, + "learning_rate": 3.71097923745424e-05, + "loss": 0.4651, + "step": 8395 + }, + { + "epoch": 1.9589552238805972, + "grad_norm": 0.3129487990268488, + "learning_rate": 3.709409961760186e-05, + "loss": 0.4696, + "step": 8400 + }, + { + "epoch": 1.9601212686567164, + "grad_norm": 0.33330989701845404, + "learning_rate": 3.707840115445877e-05, + "loss": 0.4613, + "step": 8405 + }, + { + "epoch": 1.9612873134328357, + "grad_norm": 0.31936463039977786, + "learning_rate": 3.706269699444998e-05, + "loss": 0.4529, + "step": 8410 + }, + { + "epoch": 1.9624533582089554, + "grad_norm": 0.32365100398634133, + "learning_rate": 3.704698714691572e-05, + "loss": 0.4574, + "step": 8415 + }, + { + "epoch": 1.9636194029850746, + "grad_norm": 0.3401748878620013, + "learning_rate": 3.703127162119959e-05, + "loss": 0.4662, + "step": 8420 + }, + { + "epoch": 1.964785447761194, + "grad_norm": 0.296258052389048, + "learning_rate": 3.701555042664861e-05, + "loss": 0.4387, + "step": 8425 + }, + { + "epoch": 1.9659514925373134, + "grad_norm": 0.3487274914463436, + "learning_rate": 3.699982357261312e-05, + "loss": 0.4607, + "step": 8430 + }, + { + "epoch": 1.9671175373134329, + "grad_norm": 0.32753681353725883, + "learning_rate": 3.6984091068446855e-05, + "loss": 0.4655, + "step": 8435 + }, + { + "epoch": 1.9682835820895521, + "grad_norm": 0.3257070361701552, + "learning_rate": 3.69683529235069e-05, + "loss": 0.4757, + "step": 8440 + }, + { + "epoch": 1.9694496268656716, + "grad_norm": 0.350676554662175, + "learning_rate": 3.695260914715372e-05, + "loss": 0.4673, + "step": 8445 + }, + { + "epoch": 1.970615671641791, + "grad_norm": 0.33132191973635894, + "learning_rate": 3.693685974875109e-05, + "loss": 0.4778, + "step": 8450 + }, + { + "epoch": 1.9717817164179103, + "grad_norm": 0.3166516294323465, + "learning_rate": 3.692110473766616e-05, + "loss": 0.4779, + "step": 8455 + }, + { + "epoch": 1.9729477611940298, + "grad_norm": 0.3262098693791917, + "learning_rate": 3.69053441232694e-05, + "loss": 0.4403, + "step": 8460 + }, + { + "epoch": 1.9741138059701493, + "grad_norm": 0.35852094125486883, + "learning_rate": 3.688957791493462e-05, + "loss": 0.4638, + "step": 8465 + }, + { + "epoch": 1.9752798507462686, + "grad_norm": 0.33695176178664865, + "learning_rate": 3.6873806122038964e-05, + "loss": 0.4524, + "step": 8470 + }, + { + "epoch": 1.976445895522388, + "grad_norm": 0.33967072057618775, + "learning_rate": 3.685802875396287e-05, + "loss": 0.4879, + "step": 8475 + }, + { + "epoch": 1.9776119402985075, + "grad_norm": 0.3230470335042884, + "learning_rate": 3.684224582009014e-05, + "loss": 0.4743, + "step": 8480 + }, + { + "epoch": 1.9787779850746268, + "grad_norm": 0.32611382927003435, + "learning_rate": 3.682645732980783e-05, + "loss": 0.4547, + "step": 8485 + }, + { + "epoch": 1.9799440298507462, + "grad_norm": 0.32027423075272554, + "learning_rate": 3.6810663292506344e-05, + "loss": 0.4576, + "step": 8490 + }, + { + "epoch": 1.9811100746268657, + "grad_norm": 0.2998864123772078, + "learning_rate": 3.6794863717579365e-05, + "loss": 0.456, + "step": 8495 + }, + { + "epoch": 1.982276119402985, + "grad_norm": 0.3296524390065207, + "learning_rate": 3.677905861442387e-05, + "loss": 0.4539, + "step": 8500 + }, + { + "epoch": 1.9834421641791045, + "grad_norm": 0.320024303912779, + "learning_rate": 3.676324799244014e-05, + "loss": 0.4651, + "step": 8505 + }, + { + "epoch": 1.984608208955224, + "grad_norm": 0.3233521363547366, + "learning_rate": 3.6747431861031716e-05, + "loss": 0.4612, + "step": 8510 + }, + { + "epoch": 1.9857742537313432, + "grad_norm": 0.3382237291427512, + "learning_rate": 3.673161022960544e-05, + "loss": 0.4683, + "step": 8515 + }, + { + "epoch": 1.9869402985074627, + "grad_norm": 0.32855571390165056, + "learning_rate": 3.67157831075714e-05, + "loss": 0.4528, + "step": 8520 + }, + { + "epoch": 1.9881063432835822, + "grad_norm": 0.3331779819433144, + "learning_rate": 3.6699950504342954e-05, + "loss": 0.4485, + "step": 8525 + }, + { + "epoch": 1.9892723880597014, + "grad_norm": 0.3056847338421327, + "learning_rate": 3.6684112429336745e-05, + "loss": 0.4758, + "step": 8530 + }, + { + "epoch": 1.990438432835821, + "grad_norm": 0.32855409179827705, + "learning_rate": 3.666826889197265e-05, + "loss": 0.445, + "step": 8535 + }, + { + "epoch": 1.9916044776119404, + "grad_norm": 0.3757389752274244, + "learning_rate": 3.665241990167378e-05, + "loss": 0.4429, + "step": 8540 + }, + { + "epoch": 1.9927705223880596, + "grad_norm": 0.3212644203007692, + "learning_rate": 3.663656546786653e-05, + "loss": 0.4668, + "step": 8545 + }, + { + "epoch": 1.993936567164179, + "grad_norm": 0.3055540187125843, + "learning_rate": 3.6620705599980494e-05, + "loss": 0.4392, + "step": 8550 + }, + { + "epoch": 1.9951026119402986, + "grad_norm": 0.3111057920694736, + "learning_rate": 3.660484030744852e-05, + "loss": 0.4177, + "step": 8555 + }, + { + "epoch": 1.9962686567164178, + "grad_norm": 0.3085390728261077, + "learning_rate": 3.6588969599706665e-05, + "loss": 0.4631, + "step": 8560 + }, + { + "epoch": 1.9974347014925373, + "grad_norm": 0.3169719272451861, + "learning_rate": 3.6573093486194226e-05, + "loss": 0.46, + "step": 8565 + }, + { + "epoch": 1.9986007462686568, + "grad_norm": 0.3489380750224144, + "learning_rate": 3.655721197635371e-05, + "loss": 0.4561, + "step": 8570 + }, + { + "epoch": 1.999766791044776, + "grad_norm": 0.3167210678106308, + "learning_rate": 3.654132507963083e-05, + "loss": 0.4562, + "step": 8575 + }, + { + "epoch": 2.0009328358208953, + "grad_norm": 0.3145100839078455, + "learning_rate": 3.652543280547449e-05, + "loss": 0.402, + "step": 8580 + }, + { + "epoch": 2.002098880597015, + "grad_norm": 0.3739251067181522, + "learning_rate": 3.650953516333682e-05, + "loss": 0.398, + "step": 8585 + }, + { + "epoch": 2.0032649253731343, + "grad_norm": 0.33030851305774417, + "learning_rate": 3.6493632162673125e-05, + "loss": 0.388, + "step": 8590 + }, + { + "epoch": 2.0044309701492535, + "grad_norm": 0.32436345472003664, + "learning_rate": 3.647772381294189e-05, + "loss": 0.4005, + "step": 8595 + }, + { + "epoch": 2.0055970149253732, + "grad_norm": 0.3344649766185474, + "learning_rate": 3.6461810123604805e-05, + "loss": 0.3856, + "step": 8600 + }, + { + "epoch": 2.0067630597014925, + "grad_norm": 0.33728424503869586, + "learning_rate": 3.6445891104126714e-05, + "loss": 0.4365, + "step": 8605 + }, + { + "epoch": 2.0079291044776117, + "grad_norm": 0.3404316093009153, + "learning_rate": 3.6429966763975636e-05, + "loss": 0.3964, + "step": 8610 + }, + { + "epoch": 2.0090951492537314, + "grad_norm": 0.3378580339271899, + "learning_rate": 3.641403711262277e-05, + "loss": 0.3996, + "step": 8615 + }, + { + "epoch": 2.0102611940298507, + "grad_norm": 0.34431386908644757, + "learning_rate": 3.639810215954245e-05, + "loss": 0.4013, + "step": 8620 + }, + { + "epoch": 2.01142723880597, + "grad_norm": 0.3435701876332849, + "learning_rate": 3.638216191421218e-05, + "loss": 0.3887, + "step": 8625 + }, + { + "epoch": 2.0125932835820897, + "grad_norm": 0.3378758988158323, + "learning_rate": 3.6366216386112605e-05, + "loss": 0.398, + "step": 8630 + }, + { + "epoch": 2.013759328358209, + "grad_norm": 0.32610020212555607, + "learning_rate": 3.635026558472752e-05, + "loss": 0.4021, + "step": 8635 + }, + { + "epoch": 2.014925373134328, + "grad_norm": 0.3307637385854097, + "learning_rate": 3.633430951954383e-05, + "loss": 0.385, + "step": 8640 + }, + { + "epoch": 2.016091417910448, + "grad_norm": 0.32575664349785305, + "learning_rate": 3.631834820005163e-05, + "loss": 0.3881, + "step": 8645 + }, + { + "epoch": 2.017257462686567, + "grad_norm": 0.31862392750078533, + "learning_rate": 3.6302381635744056e-05, + "loss": 0.3739, + "step": 8650 + }, + { + "epoch": 2.0184235074626864, + "grad_norm": 0.3541599450516158, + "learning_rate": 3.628640983611744e-05, + "loss": 0.3957, + "step": 8655 + }, + { + "epoch": 2.019589552238806, + "grad_norm": 0.4047590068267803, + "learning_rate": 3.6270432810671176e-05, + "loss": 0.4079, + "step": 8660 + }, + { + "epoch": 2.0207555970149254, + "grad_norm": 0.32493835343321636, + "learning_rate": 3.62544505689078e-05, + "loss": 0.3984, + "step": 8665 + }, + { + "epoch": 2.0219216417910446, + "grad_norm": 0.34672647439953735, + "learning_rate": 3.623846312033294e-05, + "loss": 0.3915, + "step": 8670 + }, + { + "epoch": 2.0230876865671643, + "grad_norm": 0.3404090139625984, + "learning_rate": 3.622247047445529e-05, + "loss": 0.4031, + "step": 8675 + }, + { + "epoch": 2.0242537313432836, + "grad_norm": 0.36206886274537775, + "learning_rate": 3.6206472640786696e-05, + "loss": 0.4331, + "step": 8680 + }, + { + "epoch": 2.025419776119403, + "grad_norm": 0.3136468361598117, + "learning_rate": 3.619046962884204e-05, + "loss": 0.3877, + "step": 8685 + }, + { + "epoch": 2.0265858208955225, + "grad_norm": 0.3299506757137787, + "learning_rate": 3.617446144813929e-05, + "loss": 0.3894, + "step": 8690 + }, + { + "epoch": 2.027751865671642, + "grad_norm": 0.359656032726551, + "learning_rate": 3.6158448108199515e-05, + "loss": 0.3964, + "step": 8695 + }, + { + "epoch": 2.028917910447761, + "grad_norm": 0.3372126161755083, + "learning_rate": 3.614242961854683e-05, + "loss": 0.3876, + "step": 8700 + }, + { + "epoch": 2.0300839552238807, + "grad_norm": 0.37800743601174797, + "learning_rate": 3.6126405988708424e-05, + "loss": 0.3934, + "step": 8705 + }, + { + "epoch": 2.03125, + "grad_norm": 0.3296460466656033, + "learning_rate": 3.611037722821452e-05, + "loss": 0.3919, + "step": 8710 + }, + { + "epoch": 2.0324160447761193, + "grad_norm": 0.3577029577400965, + "learning_rate": 3.609434334659842e-05, + "loss": 0.387, + "step": 8715 + }, + { + "epoch": 2.033582089552239, + "grad_norm": 0.33699339824825153, + "learning_rate": 3.607830435339648e-05, + "loss": 0.4078, + "step": 8720 + }, + { + "epoch": 2.034748134328358, + "grad_norm": 0.37817467842658975, + "learning_rate": 3.606226025814805e-05, + "loss": 0.4156, + "step": 8725 + }, + { + "epoch": 2.0359141791044775, + "grad_norm": 0.39827449754645217, + "learning_rate": 3.604621107039555e-05, + "loss": 0.3958, + "step": 8730 + }, + { + "epoch": 2.037080223880597, + "grad_norm": 0.33064611675295624, + "learning_rate": 3.6030156799684435e-05, + "loss": 0.4066, + "step": 8735 + }, + { + "epoch": 2.0382462686567164, + "grad_norm": 0.34358252515801474, + "learning_rate": 3.601409745556315e-05, + "loss": 0.4023, + "step": 8740 + }, + { + "epoch": 2.0394123134328357, + "grad_norm": 0.3461697819524935, + "learning_rate": 3.5998033047583194e-05, + "loss": 0.3912, + "step": 8745 + }, + { + "epoch": 2.0405783582089554, + "grad_norm": 0.334638657383683, + "learning_rate": 3.598196358529906e-05, + "loss": 0.3813, + "step": 8750 + }, + { + "epoch": 2.0417444029850746, + "grad_norm": 0.3421329198507935, + "learning_rate": 3.596588907826824e-05, + "loss": 0.3862, + "step": 8755 + }, + { + "epoch": 2.042910447761194, + "grad_norm": 0.34729731676881953, + "learning_rate": 3.5949809536051235e-05, + "loss": 0.4017, + "step": 8760 + }, + { + "epoch": 2.0440764925373136, + "grad_norm": 0.32402886143278264, + "learning_rate": 3.593372496821154e-05, + "loss": 0.3999, + "step": 8765 + }, + { + "epoch": 2.045242537313433, + "grad_norm": 0.3485694446078643, + "learning_rate": 3.591763538431563e-05, + "loss": 0.3875, + "step": 8770 + }, + { + "epoch": 2.046408582089552, + "grad_norm": 0.3341445435700491, + "learning_rate": 3.5901540793933e-05, + "loss": 0.3863, + "step": 8775 + }, + { + "epoch": 2.047574626865672, + "grad_norm": 0.34310854212308417, + "learning_rate": 3.5885441206636065e-05, + "loss": 0.3893, + "step": 8780 + }, + { + "epoch": 2.048740671641791, + "grad_norm": 0.3595784233067295, + "learning_rate": 3.586933663200026e-05, + "loss": 0.4177, + "step": 8785 + }, + { + "epoch": 2.0499067164179103, + "grad_norm": 0.3301877722080081, + "learning_rate": 3.585322707960397e-05, + "loss": 0.3946, + "step": 8790 + }, + { + "epoch": 2.05107276119403, + "grad_norm": 0.32064347002971455, + "learning_rate": 3.583711255902853e-05, + "loss": 0.3778, + "step": 8795 + }, + { + "epoch": 2.0522388059701493, + "grad_norm": 0.36183342848231537, + "learning_rate": 3.5820993079858235e-05, + "loss": 0.4026, + "step": 8800 + }, + { + "epoch": 2.0534048507462686, + "grad_norm": 0.40243763729351273, + "learning_rate": 3.580486865168034e-05, + "loss": 0.4075, + "step": 8805 + }, + { + "epoch": 2.0545708955223883, + "grad_norm": 0.3356995650904215, + "learning_rate": 3.5788739284085044e-05, + "loss": 0.4015, + "step": 8810 + }, + { + "epoch": 2.0557369402985075, + "grad_norm": 0.35874221046646215, + "learning_rate": 3.577260498666546e-05, + "loss": 0.4047, + "step": 8815 + }, + { + "epoch": 2.0569029850746268, + "grad_norm": 0.32224871797620713, + "learning_rate": 3.575646576901767e-05, + "loss": 0.393, + "step": 8820 + }, + { + "epoch": 2.0580690298507465, + "grad_norm": 0.3144167366671361, + "learning_rate": 3.5740321640740646e-05, + "loss": 0.3954, + "step": 8825 + }, + { + "epoch": 2.0592350746268657, + "grad_norm": 0.36127999507923353, + "learning_rate": 3.57241726114363e-05, + "loss": 0.4041, + "step": 8830 + }, + { + "epoch": 2.060401119402985, + "grad_norm": 0.3582324707065836, + "learning_rate": 3.570801869070945e-05, + "loss": 0.4069, + "step": 8835 + }, + { + "epoch": 2.0615671641791047, + "grad_norm": 0.3152933437571003, + "learning_rate": 3.5691859888167846e-05, + "loss": 0.3726, + "step": 8840 + }, + { + "epoch": 2.062733208955224, + "grad_norm": 0.34062557594789267, + "learning_rate": 3.5675696213422105e-05, + "loss": 0.3802, + "step": 8845 + }, + { + "epoch": 2.063899253731343, + "grad_norm": 0.34342544413829695, + "learning_rate": 3.5659527676085774e-05, + "loss": 0.3866, + "step": 8850 + }, + { + "epoch": 2.065065298507463, + "grad_norm": 0.33213617451648014, + "learning_rate": 3.564335428577526e-05, + "loss": 0.4064, + "step": 8855 + }, + { + "epoch": 2.066231343283582, + "grad_norm": 0.31122497710003183, + "learning_rate": 3.56271760521099e-05, + "loss": 0.38, + "step": 8860 + }, + { + "epoch": 2.0673973880597014, + "grad_norm": 0.3247035766512988, + "learning_rate": 3.561099298471187e-05, + "loss": 0.393, + "step": 8865 + }, + { + "epoch": 2.0685634328358207, + "grad_norm": 0.35980760216477076, + "learning_rate": 3.559480509320625e-05, + "loss": 0.4043, + "step": 8870 + }, + { + "epoch": 2.0697294776119404, + "grad_norm": 0.345505798287271, + "learning_rate": 3.557861238722097e-05, + "loss": 0.3766, + "step": 8875 + }, + { + "epoch": 2.0708955223880596, + "grad_norm": 0.33397575478315844, + "learning_rate": 3.556241487638682e-05, + "loss": 0.4005, + "step": 8880 + }, + { + "epoch": 2.0720615671641793, + "grad_norm": 0.34144564488313844, + "learning_rate": 3.554621257033749e-05, + "loss": 0.4157, + "step": 8885 + }, + { + "epoch": 2.0732276119402986, + "grad_norm": 0.3425041520164869, + "learning_rate": 3.5530005478709446e-05, + "loss": 0.4074, + "step": 8890 + }, + { + "epoch": 2.074393656716418, + "grad_norm": 0.3324000766783016, + "learning_rate": 3.551379361114209e-05, + "loss": 0.3976, + "step": 8895 + }, + { + "epoch": 2.075559701492537, + "grad_norm": 0.3336250221139361, + "learning_rate": 3.549757697727759e-05, + "loss": 0.3874, + "step": 8900 + }, + { + "epoch": 2.076725746268657, + "grad_norm": 0.3262259512746147, + "learning_rate": 3.548135558676098e-05, + "loss": 0.3876, + "step": 8905 + }, + { + "epoch": 2.077891791044776, + "grad_norm": 0.353796191706053, + "learning_rate": 3.546512944924014e-05, + "loss": 0.4093, + "step": 8910 + }, + { + "epoch": 2.0790578358208953, + "grad_norm": 0.33076643233497205, + "learning_rate": 3.544889857436573e-05, + "loss": 0.3884, + "step": 8915 + }, + { + "epoch": 2.080223880597015, + "grad_norm": 0.3392843554673036, + "learning_rate": 3.5432662971791264e-05, + "loss": 0.3999, + "step": 8920 + }, + { + "epoch": 2.0813899253731343, + "grad_norm": 0.351654409620618, + "learning_rate": 3.541642265117306e-05, + "loss": 0.4009, + "step": 8925 + }, + { + "epoch": 2.0825559701492535, + "grad_norm": 0.323086059014892, + "learning_rate": 3.540017762217023e-05, + "loss": 0.3788, + "step": 8930 + }, + { + "epoch": 2.0837220149253732, + "grad_norm": 0.34201271685018747, + "learning_rate": 3.5383927894444694e-05, + "loss": 0.3927, + "step": 8935 + }, + { + "epoch": 2.0848880597014925, + "grad_norm": 0.3164595992648572, + "learning_rate": 3.5367673477661174e-05, + "loss": 0.3979, + "step": 8940 + }, + { + "epoch": 2.0860541044776117, + "grad_norm": 0.353067105386722, + "learning_rate": 3.535141438148717e-05, + "loss": 0.4043, + "step": 8945 + }, + { + "epoch": 2.0872201492537314, + "grad_norm": 0.32638760846277376, + "learning_rate": 3.533515061559297e-05, + "loss": 0.3935, + "step": 8950 + }, + { + "epoch": 2.0883861940298507, + "grad_norm": 0.33745419508434843, + "learning_rate": 3.5318882189651635e-05, + "loss": 0.3981, + "step": 8955 + }, + { + "epoch": 2.08955223880597, + "grad_norm": 0.3363813168728781, + "learning_rate": 3.5302609113339e-05, + "loss": 0.3857, + "step": 8960 + }, + { + "epoch": 2.0907182835820897, + "grad_norm": 0.3505872812787426, + "learning_rate": 3.5286331396333675e-05, + "loss": 0.3998, + "step": 8965 + }, + { + "epoch": 2.091884328358209, + "grad_norm": 0.3560302700326745, + "learning_rate": 3.5270049048317016e-05, + "loss": 0.4038, + "step": 8970 + }, + { + "epoch": 2.093050373134328, + "grad_norm": 0.3296137319524784, + "learning_rate": 3.525376207897314e-05, + "loss": 0.3855, + "step": 8975 + }, + { + "epoch": 2.094216417910448, + "grad_norm": 0.3257709438873327, + "learning_rate": 3.5237470497988905e-05, + "loss": 0.3851, + "step": 8980 + }, + { + "epoch": 2.095382462686567, + "grad_norm": 0.30911319531279924, + "learning_rate": 3.5221174315053935e-05, + "loss": 0.3838, + "step": 8985 + }, + { + "epoch": 2.0965485074626864, + "grad_norm": 0.32163696411856363, + "learning_rate": 3.520487353986056e-05, + "loss": 0.3876, + "step": 8990 + }, + { + "epoch": 2.097714552238806, + "grad_norm": 0.33223213332866475, + "learning_rate": 3.518856818210387e-05, + "loss": 0.3889, + "step": 8995 + }, + { + "epoch": 2.0988805970149254, + "grad_norm": 0.3368633401865806, + "learning_rate": 3.517225825148164e-05, + "loss": 0.4188, + "step": 9000 + }, + { + "epoch": 2.1000466417910446, + "grad_norm": 0.36498126923311375, + "learning_rate": 3.515594375769442e-05, + "loss": 0.4094, + "step": 9005 + }, + { + "epoch": 2.1012126865671643, + "grad_norm": 0.3357021014397493, + "learning_rate": 3.513962471044543e-05, + "loss": 0.4131, + "step": 9010 + }, + { + "epoch": 2.1023787313432836, + "grad_norm": 0.3413655532634832, + "learning_rate": 3.512330111944062e-05, + "loss": 0.4029, + "step": 9015 + }, + { + "epoch": 2.103544776119403, + "grad_norm": 0.35141018831844995, + "learning_rate": 3.510697299438864e-05, + "loss": 0.3926, + "step": 9020 + }, + { + "epoch": 2.1047108208955225, + "grad_norm": 0.3694729663325888, + "learning_rate": 3.509064034500082e-05, + "loss": 0.4102, + "step": 9025 + }, + { + "epoch": 2.105876865671642, + "grad_norm": 0.33303225255011376, + "learning_rate": 3.50743031809912e-05, + "loss": 0.4045, + "step": 9030 + }, + { + "epoch": 2.107042910447761, + "grad_norm": 0.3563933543982995, + "learning_rate": 3.505796151207651e-05, + "loss": 0.4373, + "step": 9035 + }, + { + "epoch": 2.1082089552238807, + "grad_norm": 0.33690360102280603, + "learning_rate": 3.504161534797612e-05, + "loss": 0.3864, + "step": 9040 + }, + { + "epoch": 2.109375, + "grad_norm": 0.3138880897274856, + "learning_rate": 3.5025264698412126e-05, + "loss": 0.4054, + "step": 9045 + }, + { + "epoch": 2.1105410447761193, + "grad_norm": 0.359526925655807, + "learning_rate": 3.500890957310926e-05, + "loss": 0.4184, + "step": 9050 + }, + { + "epoch": 2.111707089552239, + "grad_norm": 0.3357611988379197, + "learning_rate": 3.4992549981794915e-05, + "loss": 0.3932, + "step": 9055 + }, + { + "epoch": 2.112873134328358, + "grad_norm": 0.3585510708956351, + "learning_rate": 3.497618593419916e-05, + "loss": 0.397, + "step": 9060 + }, + { + "epoch": 2.1140391791044775, + "grad_norm": 0.35707551160540557, + "learning_rate": 3.495981744005471e-05, + "loss": 0.3925, + "step": 9065 + }, + { + "epoch": 2.115205223880597, + "grad_norm": 0.3460622434339384, + "learning_rate": 3.494344450909689e-05, + "loss": 0.4276, + "step": 9070 + }, + { + "epoch": 2.1163712686567164, + "grad_norm": 0.3354501658692491, + "learning_rate": 3.492706715106372e-05, + "loss": 0.3884, + "step": 9075 + }, + { + "epoch": 2.1175373134328357, + "grad_norm": 0.33888650995437664, + "learning_rate": 3.491068537569581e-05, + "loss": 0.404, + "step": 9080 + }, + { + "epoch": 2.1187033582089554, + "grad_norm": 0.3515085575344124, + "learning_rate": 3.489429919273642e-05, + "loss": 0.3982, + "step": 9085 + }, + { + "epoch": 2.1198694029850746, + "grad_norm": 0.3394430153326523, + "learning_rate": 3.4877908611931406e-05, + "loss": 0.4146, + "step": 9090 + }, + { + "epoch": 2.121035447761194, + "grad_norm": 0.32224241041683155, + "learning_rate": 3.486151364302928e-05, + "loss": 0.3914, + "step": 9095 + }, + { + "epoch": 2.1222014925373136, + "grad_norm": 0.3755355862884108, + "learning_rate": 3.484511429578113e-05, + "loss": 0.3956, + "step": 9100 + }, + { + "epoch": 2.123367537313433, + "grad_norm": 0.3395061826382033, + "learning_rate": 3.482871057994065e-05, + "loss": 0.3895, + "step": 9105 + }, + { + "epoch": 2.124533582089552, + "grad_norm": 0.3187421326522484, + "learning_rate": 3.481230250526416e-05, + "loss": 0.395, + "step": 9110 + }, + { + "epoch": 2.125699626865672, + "grad_norm": 0.32836308004681614, + "learning_rate": 3.479589008151054e-05, + "loss": 0.3867, + "step": 9115 + }, + { + "epoch": 2.126865671641791, + "grad_norm": 0.36740210947329216, + "learning_rate": 3.477947331844127e-05, + "loss": 0.4013, + "step": 9120 + }, + { + "epoch": 2.1280317164179103, + "grad_norm": 0.34743179920168893, + "learning_rate": 3.476305222582042e-05, + "loss": 0.3923, + "step": 9125 + }, + { + "epoch": 2.12919776119403, + "grad_norm": 0.35436895456317047, + "learning_rate": 3.4746626813414624e-05, + "loss": 0.406, + "step": 9130 + }, + { + "epoch": 2.1303638059701493, + "grad_norm": 0.33055851137695413, + "learning_rate": 3.4730197090993084e-05, + "loss": 0.3835, + "step": 9135 + }, + { + "epoch": 2.1315298507462686, + "grad_norm": 0.32502673775303126, + "learning_rate": 3.471376306832756e-05, + "loss": 0.3786, + "step": 9140 + }, + { + "epoch": 2.1326958955223883, + "grad_norm": 0.33239252261803837, + "learning_rate": 3.4697324755192387e-05, + "loss": 0.3877, + "step": 9145 + }, + { + "epoch": 2.1338619402985075, + "grad_norm": 0.3717027978325473, + "learning_rate": 3.468088216136445e-05, + "loss": 0.413, + "step": 9150 + }, + { + "epoch": 2.1350279850746268, + "grad_norm": 0.3536399748722503, + "learning_rate": 3.466443529662317e-05, + "loss": 0.4027, + "step": 9155 + }, + { + "epoch": 2.1361940298507465, + "grad_norm": 0.33164638266796576, + "learning_rate": 3.4647984170750506e-05, + "loss": 0.3925, + "step": 9160 + }, + { + "epoch": 2.1373600746268657, + "grad_norm": 0.351036727625949, + "learning_rate": 3.463152879353097e-05, + "loss": 0.4011, + "step": 9165 + }, + { + "epoch": 2.138526119402985, + "grad_norm": 0.3344343878280186, + "learning_rate": 3.4615069174751566e-05, + "loss": 0.3936, + "step": 9170 + }, + { + "epoch": 2.1396921641791047, + "grad_norm": 0.3284585539540017, + "learning_rate": 3.459860532420186e-05, + "loss": 0.3984, + "step": 9175 + }, + { + "epoch": 2.140858208955224, + "grad_norm": 0.32979577914435176, + "learning_rate": 3.4582137251673916e-05, + "loss": 0.3819, + "step": 9180 + }, + { + "epoch": 2.142024253731343, + "grad_norm": 0.346927762213058, + "learning_rate": 3.456566496696232e-05, + "loss": 0.406, + "step": 9185 + }, + { + "epoch": 2.143190298507463, + "grad_norm": 0.3234394629275177, + "learning_rate": 3.454918847986414e-05, + "loss": 0.3878, + "step": 9190 + }, + { + "epoch": 2.144356343283582, + "grad_norm": 0.3235265627151897, + "learning_rate": 3.453270780017897e-05, + "loss": 0.4144, + "step": 9195 + }, + { + "epoch": 2.1455223880597014, + "grad_norm": 0.3508045257734673, + "learning_rate": 3.451622293770889e-05, + "loss": 0.4115, + "step": 9200 + }, + { + "epoch": 2.1466884328358207, + "grad_norm": 0.3212937139834875, + "learning_rate": 3.4499733902258446e-05, + "loss": 0.3905, + "step": 9205 + }, + { + "epoch": 2.1478544776119404, + "grad_norm": 0.34576430727001417, + "learning_rate": 3.448324070363469e-05, + "loss": 0.3993, + "step": 9210 + }, + { + "epoch": 2.1490205223880596, + "grad_norm": 0.3617695153550067, + "learning_rate": 3.446674335164716e-05, + "loss": 0.4012, + "step": 9215 + }, + { + "epoch": 2.1501865671641793, + "grad_norm": 0.33404261776562705, + "learning_rate": 3.445024185610783e-05, + "loss": 0.3844, + "step": 9220 + }, + { + "epoch": 2.1513526119402986, + "grad_norm": 0.32591207490785173, + "learning_rate": 3.443373622683116e-05, + "loss": 0.399, + "step": 9225 + }, + { + "epoch": 2.152518656716418, + "grad_norm": 0.344817492879971, + "learning_rate": 3.441722647363408e-05, + "loss": 0.4041, + "step": 9230 + }, + { + "epoch": 2.153684701492537, + "grad_norm": 0.36815130827464715, + "learning_rate": 3.440071260633594e-05, + "loss": 0.4036, + "step": 9235 + }, + { + "epoch": 2.154850746268657, + "grad_norm": 0.3220001451072475, + "learning_rate": 3.438419463475857e-05, + "loss": 0.3942, + "step": 9240 + }, + { + "epoch": 2.156016791044776, + "grad_norm": 0.32337108417941435, + "learning_rate": 3.436767256872621e-05, + "loss": 0.4066, + "step": 9245 + }, + { + "epoch": 2.1571828358208953, + "grad_norm": 0.33426588397013024, + "learning_rate": 3.435114641806557e-05, + "loss": 0.3942, + "step": 9250 + }, + { + "epoch": 2.158348880597015, + "grad_norm": 0.3371874695263784, + "learning_rate": 3.433461619260575e-05, + "loss": 0.4081, + "step": 9255 + }, + { + "epoch": 2.1595149253731343, + "grad_norm": 0.34791760129153243, + "learning_rate": 3.43180819021783e-05, + "loss": 0.4131, + "step": 9260 + }, + { + "epoch": 2.1606809701492535, + "grad_norm": 0.342867541625907, + "learning_rate": 3.4301543556617206e-05, + "loss": 0.4001, + "step": 9265 + }, + { + "epoch": 2.1618470149253732, + "grad_norm": 0.3652891668487797, + "learning_rate": 3.428500116575881e-05, + "loss": 0.4084, + "step": 9270 + }, + { + "epoch": 2.1630130597014925, + "grad_norm": 0.33482197584024415, + "learning_rate": 3.42684547394419e-05, + "loss": 0.3982, + "step": 9275 + }, + { + "epoch": 2.1641791044776117, + "grad_norm": 0.3103298482949906, + "learning_rate": 3.425190428750767e-05, + "loss": 0.3843, + "step": 9280 + }, + { + "epoch": 2.1653451492537314, + "grad_norm": 0.32023976082837036, + "learning_rate": 3.423534981979968e-05, + "loss": 0.3807, + "step": 9285 + }, + { + "epoch": 2.1665111940298507, + "grad_norm": 0.3375338948737711, + "learning_rate": 3.4218791346163894e-05, + "loss": 0.406, + "step": 9290 + }, + { + "epoch": 2.16767723880597, + "grad_norm": 0.3391712756288744, + "learning_rate": 3.420222887644866e-05, + "loss": 0.4014, + "step": 9295 + }, + { + "epoch": 2.1688432835820897, + "grad_norm": 0.3264185092818876, + "learning_rate": 3.41856624205047e-05, + "loss": 0.4011, + "step": 9300 + }, + { + "epoch": 2.170009328358209, + "grad_norm": 0.3625198753398349, + "learning_rate": 3.4169091988185106e-05, + "loss": 0.4163, + "step": 9305 + }, + { + "epoch": 2.171175373134328, + "grad_norm": 0.3466832342198577, + "learning_rate": 3.415251758934534e-05, + "loss": 0.4123, + "step": 9310 + }, + { + "epoch": 2.172341417910448, + "grad_norm": 0.3274812401812581, + "learning_rate": 3.413593923384321e-05, + "loss": 0.3967, + "step": 9315 + }, + { + "epoch": 2.173507462686567, + "grad_norm": 0.3290803952497196, + "learning_rate": 3.4119356931538894e-05, + "loss": 0.3828, + "step": 9320 + }, + { + "epoch": 2.1746735074626864, + "grad_norm": 0.3727908774190808, + "learning_rate": 3.410277069229491e-05, + "loss": 0.4023, + "step": 9325 + }, + { + "epoch": 2.175839552238806, + "grad_norm": 0.3168087450759355, + "learning_rate": 3.408618052597611e-05, + "loss": 0.3939, + "step": 9330 + }, + { + "epoch": 2.1770055970149254, + "grad_norm": 0.3472772751440761, + "learning_rate": 3.4069586442449684e-05, + "loss": 0.41, + "step": 9335 + }, + { + "epoch": 2.1781716417910446, + "grad_norm": 0.34129902684683067, + "learning_rate": 3.405298845158518e-05, + "loss": 0.3956, + "step": 9340 + }, + { + "epoch": 2.1793376865671643, + "grad_norm": 0.34556466209948566, + "learning_rate": 3.403638656325442e-05, + "loss": 0.3915, + "step": 9345 + }, + { + "epoch": 2.1805037313432836, + "grad_norm": 0.3350484982276217, + "learning_rate": 3.4019780787331586e-05, + "loss": 0.4078, + "step": 9350 + }, + { + "epoch": 2.181669776119403, + "grad_norm": 0.35006707466476816, + "learning_rate": 3.4003171133693154e-05, + "loss": 0.4107, + "step": 9355 + }, + { + "epoch": 2.1828358208955225, + "grad_norm": 0.34231880530649994, + "learning_rate": 3.3986557612217904e-05, + "loss": 0.3896, + "step": 9360 + }, + { + "epoch": 2.184001865671642, + "grad_norm": 0.34337827685609007, + "learning_rate": 3.396994023278693e-05, + "loss": 0.4086, + "step": 9365 + }, + { + "epoch": 2.185167910447761, + "grad_norm": 0.3434139017017427, + "learning_rate": 3.3953319005283606e-05, + "loss": 0.3945, + "step": 9370 + }, + { + "epoch": 2.1863339552238807, + "grad_norm": 0.3263391562178993, + "learning_rate": 3.393669393959361e-05, + "loss": 0.407, + "step": 9375 + }, + { + "epoch": 2.1875, + "grad_norm": 0.35678583131187686, + "learning_rate": 3.392006504560487e-05, + "loss": 0.4079, + "step": 9380 + }, + { + "epoch": 2.1886660447761193, + "grad_norm": 0.35338585088865887, + "learning_rate": 3.390343233320764e-05, + "loss": 0.3887, + "step": 9385 + }, + { + "epoch": 2.189832089552239, + "grad_norm": 0.34589020653771424, + "learning_rate": 3.388679581229441e-05, + "loss": 0.404, + "step": 9390 + }, + { + "epoch": 2.190998134328358, + "grad_norm": 0.34053259727139085, + "learning_rate": 3.3870155492759936e-05, + "loss": 0.3934, + "step": 9395 + }, + { + "epoch": 2.1921641791044775, + "grad_norm": 0.3437211990982055, + "learning_rate": 3.3853511384501256e-05, + "loss": 0.3875, + "step": 9400 + }, + { + "epoch": 2.193330223880597, + "grad_norm": 0.3264115450415562, + "learning_rate": 3.3836863497417645e-05, + "loss": 0.3912, + "step": 9405 + }, + { + "epoch": 2.1944962686567164, + "grad_norm": 0.33324491583888916, + "learning_rate": 3.382021184141062e-05, + "loss": 0.3827, + "step": 9410 + }, + { + "epoch": 2.1956623134328357, + "grad_norm": 0.3512413108243904, + "learning_rate": 3.3803556426383954e-05, + "loss": 0.3853, + "step": 9415 + }, + { + "epoch": 2.1968283582089554, + "grad_norm": 0.3429981627700293, + "learning_rate": 3.378689726224364e-05, + "loss": 0.4035, + "step": 9420 + }, + { + "epoch": 2.1979944029850746, + "grad_norm": 0.33736698789697045, + "learning_rate": 3.3770234358897926e-05, + "loss": 0.3964, + "step": 9425 + }, + { + "epoch": 2.199160447761194, + "grad_norm": 0.3567326263315446, + "learning_rate": 3.3753567726257255e-05, + "loss": 0.4037, + "step": 9430 + }, + { + "epoch": 2.2003264925373136, + "grad_norm": 0.3468438640866795, + "learning_rate": 3.373689737423431e-05, + "loss": 0.3989, + "step": 9435 + }, + { + "epoch": 2.201492537313433, + "grad_norm": 0.33211326307873107, + "learning_rate": 3.372022331274397e-05, + "loss": 0.401, + "step": 9440 + }, + { + "epoch": 2.202658582089552, + "grad_norm": 0.3263460463032437, + "learning_rate": 3.3703545551703326e-05, + "loss": 0.3953, + "step": 9445 + }, + { + "epoch": 2.203824626865672, + "grad_norm": 0.3094804604925941, + "learning_rate": 3.368686410103167e-05, + "loss": 0.3878, + "step": 9450 + }, + { + "epoch": 2.204990671641791, + "grad_norm": 0.3312175611581137, + "learning_rate": 3.367017897065051e-05, + "loss": 0.3713, + "step": 9455 + }, + { + "epoch": 2.2061567164179103, + "grad_norm": 0.3311897369406946, + "learning_rate": 3.3653490170483485e-05, + "loss": 0.3948, + "step": 9460 + }, + { + "epoch": 2.20732276119403, + "grad_norm": 0.3395134715793302, + "learning_rate": 3.363679771045648e-05, + "loss": 0.3856, + "step": 9465 + }, + { + "epoch": 2.2084888059701493, + "grad_norm": 0.31358378364835615, + "learning_rate": 3.3620101600497526e-05, + "loss": 0.3981, + "step": 9470 + }, + { + "epoch": 2.2096548507462686, + "grad_norm": 0.32807405832671016, + "learning_rate": 3.360340185053683e-05, + "loss": 0.3768, + "step": 9475 + }, + { + "epoch": 2.2108208955223883, + "grad_norm": 0.3477184959298971, + "learning_rate": 3.358669847050676e-05, + "loss": 0.4165, + "step": 9480 + }, + { + "epoch": 2.2119869402985075, + "grad_norm": 0.32566328783386406, + "learning_rate": 3.356999147034184e-05, + "loss": 0.3902, + "step": 9485 + }, + { + "epoch": 2.2131529850746268, + "grad_norm": 0.3331101687492885, + "learning_rate": 3.355328085997876e-05, + "loss": 0.3905, + "step": 9490 + }, + { + "epoch": 2.2143190298507465, + "grad_norm": 0.3425856257792634, + "learning_rate": 3.3536566649356356e-05, + "loss": 0.3923, + "step": 9495 + }, + { + "epoch": 2.2154850746268657, + "grad_norm": 0.33919056245823614, + "learning_rate": 3.351984884841558e-05, + "loss": 0.3984, + "step": 9500 + }, + { + "epoch": 2.216651119402985, + "grad_norm": 0.3544595341566174, + "learning_rate": 3.350312746709956e-05, + "loss": 0.4016, + "step": 9505 + }, + { + "epoch": 2.2178171641791047, + "grad_norm": 0.3292091593354357, + "learning_rate": 3.348640251535352e-05, + "loss": 0.4102, + "step": 9510 + }, + { + "epoch": 2.218983208955224, + "grad_norm": 0.3485922388731389, + "learning_rate": 3.346967400312482e-05, + "loss": 0.4208, + "step": 9515 + }, + { + "epoch": 2.220149253731343, + "grad_norm": 0.32920454376985353, + "learning_rate": 3.3452941940362946e-05, + "loss": 0.4029, + "step": 9520 + }, + { + "epoch": 2.221315298507463, + "grad_norm": 0.33808076152472294, + "learning_rate": 3.343620633701948e-05, + "loss": 0.3867, + "step": 9525 + }, + { + "epoch": 2.222481343283582, + "grad_norm": 0.3576606952663629, + "learning_rate": 3.341946720304812e-05, + "loss": 0.404, + "step": 9530 + }, + { + "epoch": 2.2236473880597014, + "grad_norm": 0.31820374339825297, + "learning_rate": 3.340272454840466e-05, + "loss": 0.4155, + "step": 9535 + }, + { + "epoch": 2.2248134328358207, + "grad_norm": 0.30741877370719306, + "learning_rate": 3.3385978383046996e-05, + "loss": 0.3751, + "step": 9540 + }, + { + "epoch": 2.2259794776119404, + "grad_norm": 0.35617692539846185, + "learning_rate": 3.336922871693509e-05, + "loss": 0.4088, + "step": 9545 + }, + { + "epoch": 2.2271455223880596, + "grad_norm": 0.35323849990030076, + "learning_rate": 3.335247556003101e-05, + "loss": 0.4076, + "step": 9550 + }, + { + "epoch": 2.2283115671641793, + "grad_norm": 0.35950126223781953, + "learning_rate": 3.33357189222989e-05, + "loss": 0.3997, + "step": 9555 + }, + { + "epoch": 2.2294776119402986, + "grad_norm": 0.3304510599891847, + "learning_rate": 3.331895881370495e-05, + "loss": 0.3898, + "step": 9560 + }, + { + "epoch": 2.230643656716418, + "grad_norm": 0.34137056131587445, + "learning_rate": 3.3302195244217435e-05, + "loss": 0.4042, + "step": 9565 + }, + { + "epoch": 2.231809701492537, + "grad_norm": 0.34319746372534804, + "learning_rate": 3.32854282238067e-05, + "loss": 0.4064, + "step": 9570 + }, + { + "epoch": 2.232975746268657, + "grad_norm": 0.3383452593319318, + "learning_rate": 3.326865776244509e-05, + "loss": 0.4051, + "step": 9575 + }, + { + "epoch": 2.234141791044776, + "grad_norm": 0.3131809638477496, + "learning_rate": 3.3251883870107066e-05, + "loss": 0.386, + "step": 9580 + }, + { + "epoch": 2.2353078358208953, + "grad_norm": 0.33853554060608537, + "learning_rate": 3.323510655676906e-05, + "loss": 0.3966, + "step": 9585 + }, + { + "epoch": 2.236473880597015, + "grad_norm": 0.36488474678062477, + "learning_rate": 3.3218325832409616e-05, + "loss": 0.4282, + "step": 9590 + }, + { + "epoch": 2.2376399253731343, + "grad_norm": 0.3474887152647101, + "learning_rate": 3.320154170700925e-05, + "loss": 0.3923, + "step": 9595 + }, + { + "epoch": 2.2388059701492535, + "grad_norm": 0.340132436894961, + "learning_rate": 3.3184754190550506e-05, + "loss": 0.3976, + "step": 9600 + }, + { + "epoch": 2.2399720149253732, + "grad_norm": 0.35033706963579114, + "learning_rate": 3.316796329301796e-05, + "loss": 0.4187, + "step": 9605 + }, + { + "epoch": 2.2411380597014925, + "grad_norm": 0.31748039738442313, + "learning_rate": 3.31511690243982e-05, + "loss": 0.4008, + "step": 9610 + }, + { + "epoch": 2.2423041044776117, + "grad_norm": 0.32418023288216874, + "learning_rate": 3.3134371394679806e-05, + "loss": 0.3962, + "step": 9615 + }, + { + "epoch": 2.2434701492537314, + "grad_norm": 0.3481031422132577, + "learning_rate": 3.3117570413853373e-05, + "loss": 0.4048, + "step": 9620 + }, + { + "epoch": 2.2446361940298507, + "grad_norm": 0.3533859620834045, + "learning_rate": 3.3100766091911464e-05, + "loss": 0.4183, + "step": 9625 + }, + { + "epoch": 2.24580223880597, + "grad_norm": 0.322320201186157, + "learning_rate": 3.308395843884866e-05, + "loss": 0.3839, + "step": 9630 + }, + { + "epoch": 2.2469682835820897, + "grad_norm": 0.3439397217590057, + "learning_rate": 3.30671474646615e-05, + "loss": 0.3992, + "step": 9635 + }, + { + "epoch": 2.248134328358209, + "grad_norm": 0.34683716299409695, + "learning_rate": 3.305033317934852e-05, + "loss": 0.4004, + "step": 9640 + }, + { + "epoch": 2.249300373134328, + "grad_norm": 0.3572421224821308, + "learning_rate": 3.30335155929102e-05, + "loss": 0.4011, + "step": 9645 + }, + { + "epoch": 2.250466417910448, + "grad_norm": 0.32862119253173727, + "learning_rate": 3.301669471534899e-05, + "loss": 0.3959, + "step": 9650 + }, + { + "epoch": 2.251632462686567, + "grad_norm": 0.33086323543403445, + "learning_rate": 3.299987055666932e-05, + "loss": 0.4069, + "step": 9655 + }, + { + "epoch": 2.2527985074626864, + "grad_norm": 0.3373749666544192, + "learning_rate": 3.298304312687754e-05, + "loss": 0.4003, + "step": 9660 + }, + { + "epoch": 2.253964552238806, + "grad_norm": 0.3321302407645913, + "learning_rate": 3.2966212435981975e-05, + "loss": 0.4021, + "step": 9665 + }, + { + "epoch": 2.2551305970149254, + "grad_norm": 0.3616873156900573, + "learning_rate": 3.2949378493992854e-05, + "loss": 0.4113, + "step": 9670 + }, + { + "epoch": 2.2562966417910446, + "grad_norm": 0.3510310352513335, + "learning_rate": 3.293254131092238e-05, + "loss": 0.3978, + "step": 9675 + }, + { + "epoch": 2.2574626865671643, + "grad_norm": 0.34763794392877756, + "learning_rate": 3.2915700896784655e-05, + "loss": 0.4025, + "step": 9680 + }, + { + "epoch": 2.2586287313432836, + "grad_norm": 0.3547637725037518, + "learning_rate": 3.28988572615957e-05, + "loss": 0.409, + "step": 9685 + }, + { + "epoch": 2.259794776119403, + "grad_norm": 0.3326050672795035, + "learning_rate": 3.288201041537348e-05, + "loss": 0.4035, + "step": 9690 + }, + { + "epoch": 2.2609608208955225, + "grad_norm": 0.35370414029326885, + "learning_rate": 3.286516036813785e-05, + "loss": 0.4073, + "step": 9695 + }, + { + "epoch": 2.262126865671642, + "grad_norm": 0.33248837327294756, + "learning_rate": 3.284830712991057e-05, + "loss": 0.4089, + "step": 9700 + }, + { + "epoch": 2.263292910447761, + "grad_norm": 0.32410294453878474, + "learning_rate": 3.28314507107153e-05, + "loss": 0.3995, + "step": 9705 + }, + { + "epoch": 2.2644589552238807, + "grad_norm": 0.33391170367556255, + "learning_rate": 3.281459112057759e-05, + "loss": 0.3857, + "step": 9710 + }, + { + "epoch": 2.265625, + "grad_norm": 0.35798626988928706, + "learning_rate": 3.2797728369524875e-05, + "loss": 0.3893, + "step": 9715 + }, + { + "epoch": 2.2667910447761193, + "grad_norm": 0.3526529318202589, + "learning_rate": 3.2780862467586486e-05, + "loss": 0.3987, + "step": 9720 + }, + { + "epoch": 2.267957089552239, + "grad_norm": 0.3782225159728585, + "learning_rate": 3.2763993424793604e-05, + "loss": 0.4166, + "step": 9725 + }, + { + "epoch": 2.269123134328358, + "grad_norm": 0.35139966768672615, + "learning_rate": 3.2747121251179294e-05, + "loss": 0.392, + "step": 9730 + }, + { + "epoch": 2.2702891791044775, + "grad_norm": 0.3584291853868186, + "learning_rate": 3.273024595677846e-05, + "loss": 0.4138, + "step": 9735 + }, + { + "epoch": 2.271455223880597, + "grad_norm": 0.34507563866710766, + "learning_rate": 3.271336755162792e-05, + "loss": 0.3864, + "step": 9740 + }, + { + "epoch": 2.2726212686567164, + "grad_norm": 0.3710731561050386, + "learning_rate": 3.269648604576625e-05, + "loss": 0.4159, + "step": 9745 + }, + { + "epoch": 2.2737873134328357, + "grad_norm": 0.354054117064921, + "learning_rate": 3.267960144923397e-05, + "loss": 0.4014, + "step": 9750 + }, + { + "epoch": 2.2749533582089554, + "grad_norm": 0.3495485682998862, + "learning_rate": 3.266271377207335e-05, + "loss": 0.4154, + "step": 9755 + }, + { + "epoch": 2.2761194029850746, + "grad_norm": 0.35625687553791074, + "learning_rate": 3.264582302432856e-05, + "loss": 0.3961, + "step": 9760 + }, + { + "epoch": 2.277285447761194, + "grad_norm": 0.3515612044904509, + "learning_rate": 3.262892921604556e-05, + "loss": 0.388, + "step": 9765 + }, + { + "epoch": 2.2784514925373136, + "grad_norm": 0.33953461012900515, + "learning_rate": 3.261203235727214e-05, + "loss": 0.3935, + "step": 9770 + }, + { + "epoch": 2.279617537313433, + "grad_norm": 0.32714035351633375, + "learning_rate": 3.259513245805791e-05, + "loss": 0.3937, + "step": 9775 + }, + { + "epoch": 2.280783582089552, + "grad_norm": 0.36485978305336336, + "learning_rate": 3.2578229528454266e-05, + "loss": 0.4222, + "step": 9780 + }, + { + "epoch": 2.281949626865672, + "grad_norm": 0.3377764941517442, + "learning_rate": 3.256132357851445e-05, + "loss": 0.4027, + "step": 9785 + }, + { + "epoch": 2.283115671641791, + "grad_norm": 0.34529439774227333, + "learning_rate": 3.254441461829344e-05, + "loss": 0.426, + "step": 9790 + }, + { + "epoch": 2.2842817164179103, + "grad_norm": 0.32594908872759104, + "learning_rate": 3.252750265784806e-05, + "loss": 0.4017, + "step": 9795 + }, + { + "epoch": 2.28544776119403, + "grad_norm": 0.3334206547051278, + "learning_rate": 3.251058770723688e-05, + "loss": 0.3859, + "step": 9800 + }, + { + "epoch": 2.2866138059701493, + "grad_norm": 0.3277007088336736, + "learning_rate": 3.249366977652028e-05, + "loss": 0.3842, + "step": 9805 + }, + { + "epoch": 2.2877798507462686, + "grad_norm": 0.34033250680525806, + "learning_rate": 3.247674887576038e-05, + "loss": 0.3945, + "step": 9810 + }, + { + "epoch": 2.2889458955223883, + "grad_norm": 0.34437650469522774, + "learning_rate": 3.24598250150211e-05, + "loss": 0.3943, + "step": 9815 + }, + { + "epoch": 2.2901119402985075, + "grad_norm": 0.3316019832531891, + "learning_rate": 3.2442898204368086e-05, + "loss": 0.4009, + "step": 9820 + }, + { + "epoch": 2.2912779850746268, + "grad_norm": 0.3400697007911317, + "learning_rate": 3.242596845386878e-05, + "loss": 0.4052, + "step": 9825 + }, + { + "epoch": 2.2924440298507465, + "grad_norm": 0.3372049827411618, + "learning_rate": 3.240903577359232e-05, + "loss": 0.4138, + "step": 9830 + }, + { + "epoch": 2.2936100746268657, + "grad_norm": 0.3359693174749889, + "learning_rate": 3.239210017360963e-05, + "loss": 0.3929, + "step": 9835 + }, + { + "epoch": 2.294776119402985, + "grad_norm": 0.3419201946144107, + "learning_rate": 3.237516166399336e-05, + "loss": 0.4041, + "step": 9840 + }, + { + "epoch": 2.2959421641791042, + "grad_norm": 0.344251547989701, + "learning_rate": 3.2358220254817874e-05, + "loss": 0.393, + "step": 9845 + }, + { + "epoch": 2.297108208955224, + "grad_norm": 0.3452504852266753, + "learning_rate": 3.234127595615927e-05, + "loss": 0.3845, + "step": 9850 + }, + { + "epoch": 2.298274253731343, + "grad_norm": 0.346391228822272, + "learning_rate": 3.232432877809538e-05, + "loss": 0.4132, + "step": 9855 + }, + { + "epoch": 2.299440298507463, + "grad_norm": 0.34399752821195706, + "learning_rate": 3.230737873070574e-05, + "loss": 0.4211, + "step": 9860 + }, + { + "epoch": 2.300606343283582, + "grad_norm": 0.31554100422299874, + "learning_rate": 3.229042582407157e-05, + "loss": 0.4232, + "step": 9865 + }, + { + "epoch": 2.3017723880597014, + "grad_norm": 0.3470044349125867, + "learning_rate": 3.2273470068275816e-05, + "loss": 0.3889, + "step": 9870 + }, + { + "epoch": 2.3029384328358207, + "grad_norm": 0.3576250171073377, + "learning_rate": 3.225651147340312e-05, + "loss": 0.4068, + "step": 9875 + }, + { + "epoch": 2.3041044776119404, + "grad_norm": 0.33846918522103525, + "learning_rate": 3.223955004953979e-05, + "loss": 0.4041, + "step": 9880 + }, + { + "epoch": 2.3052705223880596, + "grad_norm": 0.33351911581021004, + "learning_rate": 3.222258580677385e-05, + "loss": 0.4118, + "step": 9885 + }, + { + "epoch": 2.3064365671641793, + "grad_norm": 0.33271982300791403, + "learning_rate": 3.220561875519495e-05, + "loss": 0.3925, + "step": 9890 + }, + { + "epoch": 2.3076026119402986, + "grad_norm": 0.3381145215479696, + "learning_rate": 3.218864890489446e-05, + "loss": 0.387, + "step": 9895 + }, + { + "epoch": 2.308768656716418, + "grad_norm": 0.32310480672472286, + "learning_rate": 3.2171676265965415e-05, + "loss": 0.3948, + "step": 9900 + }, + { + "epoch": 2.309934701492537, + "grad_norm": 0.3189292949217755, + "learning_rate": 3.2154700848502454e-05, + "loss": 0.402, + "step": 9905 + }, + { + "epoch": 2.311100746268657, + "grad_norm": 0.31740468766170504, + "learning_rate": 3.2137722662601934e-05, + "loss": 0.3799, + "step": 9910 + }, + { + "epoch": 2.312266791044776, + "grad_norm": 0.35015890204685557, + "learning_rate": 3.212074171836181e-05, + "loss": 0.4188, + "step": 9915 + }, + { + "epoch": 2.3134328358208958, + "grad_norm": 0.32065110431934984, + "learning_rate": 3.21037580258817e-05, + "loss": 0.3726, + "step": 9920 + }, + { + "epoch": 2.314598880597015, + "grad_norm": 0.3322442176622855, + "learning_rate": 3.208677159526287e-05, + "loss": 0.3967, + "step": 9925 + }, + { + "epoch": 2.3157649253731343, + "grad_norm": 0.32783982981262094, + "learning_rate": 3.206978243660817e-05, + "loss": 0.3906, + "step": 9930 + }, + { + "epoch": 2.3169309701492535, + "grad_norm": 0.32141862450794195, + "learning_rate": 3.205279056002212e-05, + "loss": 0.3816, + "step": 9935 + }, + { + "epoch": 2.3180970149253732, + "grad_norm": 0.3331699519161977, + "learning_rate": 3.203579597561082e-05, + "loss": 0.3759, + "step": 9940 + }, + { + "epoch": 2.3192630597014925, + "grad_norm": 0.37151609855873413, + "learning_rate": 3.2018798693482015e-05, + "loss": 0.3915, + "step": 9945 + }, + { + "epoch": 2.3204291044776117, + "grad_norm": 0.3185923320139489, + "learning_rate": 3.200179872374503e-05, + "loss": 0.3916, + "step": 9950 + }, + { + "epoch": 2.3215951492537314, + "grad_norm": 0.33891971937758, + "learning_rate": 3.198479607651079e-05, + "loss": 0.4028, + "step": 9955 + }, + { + "epoch": 2.3227611940298507, + "grad_norm": 0.34374813820010514, + "learning_rate": 3.1967790761891826e-05, + "loss": 0.396, + "step": 9960 + }, + { + "epoch": 2.32392723880597, + "grad_norm": 0.3279161974190552, + "learning_rate": 3.1950782790002236e-05, + "loss": 0.3876, + "step": 9965 + }, + { + "epoch": 2.3250932835820897, + "grad_norm": 0.32730251279290634, + "learning_rate": 3.1933772170957716e-05, + "loss": 0.4025, + "step": 9970 + }, + { + "epoch": 2.326259328358209, + "grad_norm": 0.3267150501264816, + "learning_rate": 3.191675891487554e-05, + "loss": 0.3877, + "step": 9975 + }, + { + "epoch": 2.327425373134328, + "grad_norm": 0.34114222456197135, + "learning_rate": 3.189974303187452e-05, + "loss": 0.3899, + "step": 9980 + }, + { + "epoch": 2.328591417910448, + "grad_norm": 0.32092975001225715, + "learning_rate": 3.188272453207507e-05, + "loss": 0.3781, + "step": 9985 + }, + { + "epoch": 2.329757462686567, + "grad_norm": 0.3920402189328833, + "learning_rate": 3.186570342559912e-05, + "loss": 0.3985, + "step": 9990 + }, + { + "epoch": 2.3309235074626864, + "grad_norm": 0.33039051930391516, + "learning_rate": 3.184867972257019e-05, + "loss": 0.3819, + "step": 9995 + }, + { + "epoch": 2.332089552238806, + "grad_norm": 0.31857875199394975, + "learning_rate": 3.1831653433113317e-05, + "loss": 0.4033, + "step": 10000 + }, + { + "epoch": 2.3332555970149254, + "grad_norm": 0.3476619668416482, + "learning_rate": 3.1814624567355087e-05, + "loss": 0.4156, + "step": 10005 + }, + { + "epoch": 2.3344216417910446, + "grad_norm": 0.34466914526162007, + "learning_rate": 3.179759313542362e-05, + "loss": 0.4056, + "step": 10010 + }, + { + "epoch": 2.3355876865671643, + "grad_norm": 0.319779769106911, + "learning_rate": 3.1780559147448554e-05, + "loss": 0.3944, + "step": 10015 + }, + { + "epoch": 2.3367537313432836, + "grad_norm": 0.334340407521021, + "learning_rate": 3.176352261356105e-05, + "loss": 0.3991, + "step": 10020 + }, + { + "epoch": 2.337919776119403, + "grad_norm": 0.3230954959705112, + "learning_rate": 3.17464835438938e-05, + "loss": 0.3871, + "step": 10025 + }, + { + "epoch": 2.3390858208955225, + "grad_norm": 0.3230271726785597, + "learning_rate": 3.172944194858096e-05, + "loss": 0.375, + "step": 10030 + }, + { + "epoch": 2.340251865671642, + "grad_norm": 0.3605855995169774, + "learning_rate": 3.171239783775825e-05, + "loss": 0.4095, + "step": 10035 + }, + { + "epoch": 2.341417910447761, + "grad_norm": 0.3426358748076512, + "learning_rate": 3.169535122156283e-05, + "loss": 0.4063, + "step": 10040 + }, + { + "epoch": 2.3425839552238807, + "grad_norm": 0.33561767090770944, + "learning_rate": 3.167830211013338e-05, + "loss": 0.4034, + "step": 10045 + }, + { + "epoch": 2.34375, + "grad_norm": 0.3375095406977493, + "learning_rate": 3.166125051361007e-05, + "loss": 0.3778, + "step": 10050 + }, + { + "epoch": 2.3449160447761193, + "grad_norm": 0.34540683433433367, + "learning_rate": 3.164419644213451e-05, + "loss": 0.3931, + "step": 10055 + }, + { + "epoch": 2.346082089552239, + "grad_norm": 0.3511959265083776, + "learning_rate": 3.162713990584983e-05, + "loss": 0.405, + "step": 10060 + }, + { + "epoch": 2.347248134328358, + "grad_norm": 0.331903263476702, + "learning_rate": 3.1610080914900604e-05, + "loss": 0.4143, + "step": 10065 + }, + { + "epoch": 2.3484141791044775, + "grad_norm": 0.32982825480967737, + "learning_rate": 3.159301947943285e-05, + "loss": 0.3965, + "step": 10070 + }, + { + "epoch": 2.349580223880597, + "grad_norm": 0.32711616689698986, + "learning_rate": 3.157595560959407e-05, + "loss": 0.3998, + "step": 10075 + }, + { + "epoch": 2.3507462686567164, + "grad_norm": 0.35263199055232375, + "learning_rate": 3.155888931553319e-05, + "loss": 0.4079, + "step": 10080 + }, + { + "epoch": 2.3519123134328357, + "grad_norm": 0.3569333029191892, + "learning_rate": 3.154182060740058e-05, + "loss": 0.3995, + "step": 10085 + }, + { + "epoch": 2.3530783582089554, + "grad_norm": 0.33012474548979087, + "learning_rate": 3.152474949534808e-05, + "loss": 0.3969, + "step": 10090 + }, + { + "epoch": 2.3542444029850746, + "grad_norm": 0.342346282324755, + "learning_rate": 3.1507675989528915e-05, + "loss": 0.3905, + "step": 10095 + }, + { + "epoch": 2.355410447761194, + "grad_norm": 0.31500124987977707, + "learning_rate": 3.1490600100097746e-05, + "loss": 0.384, + "step": 10100 + }, + { + "epoch": 2.3565764925373136, + "grad_norm": 0.33836453332412225, + "learning_rate": 3.147352183721067e-05, + "loss": 0.3856, + "step": 10105 + }, + { + "epoch": 2.357742537313433, + "grad_norm": 0.35235644603826194, + "learning_rate": 3.145644121102517e-05, + "loss": 0.409, + "step": 10110 + }, + { + "epoch": 2.358908582089552, + "grad_norm": 0.3402250310578, + "learning_rate": 3.1439358231700165e-05, + "loss": 0.3959, + "step": 10115 + }, + { + "epoch": 2.360074626865672, + "grad_norm": 0.31665747129753874, + "learning_rate": 3.142227290939595e-05, + "loss": 0.406, + "step": 10120 + }, + { + "epoch": 2.361240671641791, + "grad_norm": 0.3227769955380264, + "learning_rate": 3.14051852542742e-05, + "loss": 0.3836, + "step": 10125 + }, + { + "epoch": 2.3624067164179103, + "grad_norm": 0.33119680460644846, + "learning_rate": 3.1388095276498013e-05, + "loss": 0.4183, + "step": 10130 + }, + { + "epoch": 2.36357276119403, + "grad_norm": 0.3314793712422934, + "learning_rate": 3.1371002986231855e-05, + "loss": 0.384, + "step": 10135 + }, + { + "epoch": 2.3647388059701493, + "grad_norm": 0.32789918586111844, + "learning_rate": 3.1353908393641574e-05, + "loss": 0.4062, + "step": 10140 + }, + { + "epoch": 2.3659048507462686, + "grad_norm": 0.3257571322072482, + "learning_rate": 3.133681150889434e-05, + "loss": 0.3933, + "step": 10145 + }, + { + "epoch": 2.3670708955223883, + "grad_norm": 0.33231083324624255, + "learning_rate": 3.131971234215877e-05, + "loss": 0.4031, + "step": 10150 + }, + { + "epoch": 2.3682369402985075, + "grad_norm": 0.32134342411653466, + "learning_rate": 3.1302610903604775e-05, + "loss": 0.3977, + "step": 10155 + }, + { + "epoch": 2.3694029850746268, + "grad_norm": 0.3459994188256327, + "learning_rate": 3.128550720340362e-05, + "loss": 0.4047, + "step": 10160 + }, + { + "epoch": 2.3705690298507465, + "grad_norm": 0.34509381119124644, + "learning_rate": 3.126840125172795e-05, + "loss": 0.3998, + "step": 10165 + }, + { + "epoch": 2.3717350746268657, + "grad_norm": 0.3119082455680392, + "learning_rate": 3.125129305875172e-05, + "loss": 0.4072, + "step": 10170 + }, + { + "epoch": 2.372901119402985, + "grad_norm": 0.34586817402009273, + "learning_rate": 3.1234182634650234e-05, + "loss": 0.4088, + "step": 10175 + }, + { + "epoch": 2.3740671641791042, + "grad_norm": 0.3345551711929769, + "learning_rate": 3.1217069989600097e-05, + "loss": 0.4122, + "step": 10180 + }, + { + "epoch": 2.375233208955224, + "grad_norm": 0.3203957634261716, + "learning_rate": 3.119995513377928e-05, + "loss": 0.3928, + "step": 10185 + }, + { + "epoch": 2.376399253731343, + "grad_norm": 0.3596092995549471, + "learning_rate": 3.118283807736703e-05, + "loss": 0.4136, + "step": 10190 + }, + { + "epoch": 2.377565298507463, + "grad_norm": 0.3331826192089746, + "learning_rate": 3.1165718830543914e-05, + "loss": 0.4113, + "step": 10195 + }, + { + "epoch": 2.378731343283582, + "grad_norm": 0.35171452204060677, + "learning_rate": 3.1148597403491816e-05, + "loss": 0.4183, + "step": 10200 + }, + { + "epoch": 2.3798973880597014, + "grad_norm": 0.3284178824184799, + "learning_rate": 3.1131473806393876e-05, + "loss": 0.4003, + "step": 10205 + }, + { + "epoch": 2.3810634328358207, + "grad_norm": 0.32613482390213105, + "learning_rate": 3.1114348049434583e-05, + "loss": 0.3988, + "step": 10210 + }, + { + "epoch": 2.3822294776119404, + "grad_norm": 0.3308846819271684, + "learning_rate": 3.109722014279967e-05, + "loss": 0.3892, + "step": 10215 + }, + { + "epoch": 2.3833955223880596, + "grad_norm": 0.3285079817418403, + "learning_rate": 3.108009009667615e-05, + "loss": 0.379, + "step": 10220 + }, + { + "epoch": 2.3845615671641793, + "grad_norm": 0.34480978638571164, + "learning_rate": 3.106295792125233e-05, + "loss": 0.4196, + "step": 10225 + }, + { + "epoch": 2.3857276119402986, + "grad_norm": 1.8144949097090854, + "learning_rate": 3.104582362671778e-05, + "loss": 0.417, + "step": 10230 + }, + { + "epoch": 2.386893656716418, + "grad_norm": 0.3526169863481771, + "learning_rate": 3.102868722326328e-05, + "loss": 0.3867, + "step": 10235 + }, + { + "epoch": 2.388059701492537, + "grad_norm": 0.32196962856243905, + "learning_rate": 3.1011548721080955e-05, + "loss": 0.3958, + "step": 10240 + }, + { + "epoch": 2.389225746268657, + "grad_norm": 0.32620353229048504, + "learning_rate": 3.099440813036411e-05, + "loss": 0.4074, + "step": 10245 + }, + { + "epoch": 2.390391791044776, + "grad_norm": 0.3250314883921857, + "learning_rate": 3.097726546130729e-05, + "loss": 0.3851, + "step": 10250 + }, + { + "epoch": 2.3915578358208958, + "grad_norm": 0.3202608673120686, + "learning_rate": 3.096012072410633e-05, + "loss": 0.4024, + "step": 10255 + }, + { + "epoch": 2.392723880597015, + "grad_norm": 0.32410091788452705, + "learning_rate": 3.094297392895825e-05, + "loss": 0.3995, + "step": 10260 + }, + { + "epoch": 2.3938899253731343, + "grad_norm": 0.3577726965191297, + "learning_rate": 3.0925825086061295e-05, + "loss": 0.4185, + "step": 10265 + }, + { + "epoch": 2.3950559701492535, + "grad_norm": 0.3584149177157239, + "learning_rate": 3.090867420561495e-05, + "loss": 0.4111, + "step": 10270 + }, + { + "epoch": 2.3962220149253732, + "grad_norm": 0.36685961686960566, + "learning_rate": 3.0891521297819906e-05, + "loss": 0.4171, + "step": 10275 + }, + { + "epoch": 2.3973880597014925, + "grad_norm": 0.3391626954307687, + "learning_rate": 3.0874366372878036e-05, + "loss": 0.4219, + "step": 10280 + }, + { + "epoch": 2.3985541044776117, + "grad_norm": 0.3364139323696701, + "learning_rate": 3.085720944099246e-05, + "loss": 0.3884, + "step": 10285 + }, + { + "epoch": 2.3997201492537314, + "grad_norm": 0.3459153137425684, + "learning_rate": 3.0840050512367444e-05, + "loss": 0.3768, + "step": 10290 + }, + { + "epoch": 2.4008861940298507, + "grad_norm": 0.3554510164309164, + "learning_rate": 3.082288959720845e-05, + "loss": 0.3954, + "step": 10295 + }, + { + "epoch": 2.40205223880597, + "grad_norm": 0.5307041269625768, + "learning_rate": 3.0805726705722156e-05, + "loss": 0.405, + "step": 10300 + }, + { + "epoch": 2.4032182835820897, + "grad_norm": 0.31334252629145287, + "learning_rate": 3.078856184811638e-05, + "loss": 0.3797, + "step": 10305 + }, + { + "epoch": 2.404384328358209, + "grad_norm": 0.31746455307462684, + "learning_rate": 3.077139503460012e-05, + "loss": 0.4075, + "step": 10310 + }, + { + "epoch": 2.405550373134328, + "grad_norm": 0.35179501384941514, + "learning_rate": 3.0754226275383546e-05, + "loss": 0.4108, + "step": 10315 + }, + { + "epoch": 2.406716417910448, + "grad_norm": 0.32612692754775957, + "learning_rate": 3.073705558067797e-05, + "loss": 0.4048, + "step": 10320 + }, + { + "epoch": 2.407882462686567, + "grad_norm": 0.36856468182483, + "learning_rate": 3.071988296069586e-05, + "loss": 0.4091, + "step": 10325 + }, + { + "epoch": 2.4090485074626864, + "grad_norm": 0.33215179272632817, + "learning_rate": 3.070270842565084e-05, + "loss": 0.4033, + "step": 10330 + }, + { + "epoch": 2.410214552238806, + "grad_norm": 0.3529031418620512, + "learning_rate": 3.068553198575767e-05, + "loss": 0.4014, + "step": 10335 + }, + { + "epoch": 2.4113805970149254, + "grad_norm": 0.352076797924759, + "learning_rate": 3.0668353651232226e-05, + "loss": 0.399, + "step": 10340 + }, + { + "epoch": 2.4125466417910446, + "grad_norm": 0.32273091432723616, + "learning_rate": 3.065117343229153e-05, + "loss": 0.385, + "step": 10345 + }, + { + "epoch": 2.4137126865671643, + "grad_norm": 0.3336054787241504, + "learning_rate": 3.063399133915371e-05, + "loss": 0.3986, + "step": 10350 + }, + { + "epoch": 2.4148787313432836, + "grad_norm": 0.33073092455074937, + "learning_rate": 3.0616807382038016e-05, + "loss": 0.379, + "step": 10355 + }, + { + "epoch": 2.416044776119403, + "grad_norm": 0.3251397381722083, + "learning_rate": 3.059962157116481e-05, + "loss": 0.3833, + "step": 10360 + }, + { + "epoch": 2.4172108208955225, + "grad_norm": 0.3413498059059611, + "learning_rate": 3.058243391675557e-05, + "loss": 0.4056, + "step": 10365 + }, + { + "epoch": 2.418376865671642, + "grad_norm": 0.35337941213086, + "learning_rate": 3.056524442903282e-05, + "loss": 0.406, + "step": 10370 + }, + { + "epoch": 2.419542910447761, + "grad_norm": 0.34151696255758757, + "learning_rate": 3.054805311822023e-05, + "loss": 0.4164, + "step": 10375 + }, + { + "epoch": 2.4207089552238807, + "grad_norm": 0.3319208887764344, + "learning_rate": 3.053085999454254e-05, + "loss": 0.3897, + "step": 10380 + }, + { + "epoch": 2.421875, + "grad_norm": 0.3520706047391738, + "learning_rate": 3.051366506822554e-05, + "loss": 0.4153, + "step": 10385 + }, + { + "epoch": 2.4230410447761193, + "grad_norm": 0.31789560104516634, + "learning_rate": 3.0496468349496115e-05, + "loss": 0.3986, + "step": 10390 + }, + { + "epoch": 2.424207089552239, + "grad_norm": 0.3597622154424394, + "learning_rate": 3.047926984858223e-05, + "loss": 0.4146, + "step": 10395 + }, + { + "epoch": 2.425373134328358, + "grad_norm": 0.3332932526410318, + "learning_rate": 3.046206957571288e-05, + "loss": 0.3975, + "step": 10400 + }, + { + "epoch": 2.4265391791044775, + "grad_norm": 0.33580940086004796, + "learning_rate": 3.0444867541118145e-05, + "loss": 0.4031, + "step": 10405 + }, + { + "epoch": 2.427705223880597, + "grad_norm": 0.32027555259339785, + "learning_rate": 3.0427663755029108e-05, + "loss": 0.3869, + "step": 10410 + }, + { + "epoch": 2.4288712686567164, + "grad_norm": 0.322247385919075, + "learning_rate": 3.0410458227677934e-05, + "loss": 0.4043, + "step": 10415 + }, + { + "epoch": 2.4300373134328357, + "grad_norm": 0.3644410852880657, + "learning_rate": 3.0393250969297826e-05, + "loss": 0.399, + "step": 10420 + }, + { + "epoch": 2.4312033582089554, + "grad_norm": 0.33834561753799774, + "learning_rate": 3.0376041990122983e-05, + "loss": 0.4092, + "step": 10425 + }, + { + "epoch": 2.4323694029850746, + "grad_norm": 0.32506434525029393, + "learning_rate": 3.0358831300388657e-05, + "loss": 0.4118, + "step": 10430 + }, + { + "epoch": 2.433535447761194, + "grad_norm": 0.34663468361506883, + "learning_rate": 3.0341618910331093e-05, + "loss": 0.3959, + "step": 10435 + }, + { + "epoch": 2.4347014925373136, + "grad_norm": 0.3398568910309269, + "learning_rate": 3.0324404830187564e-05, + "loss": 0.4018, + "step": 10440 + }, + { + "epoch": 2.435867537313433, + "grad_norm": 0.3353124579925197, + "learning_rate": 3.0307189070196358e-05, + "loss": 0.3997, + "step": 10445 + }, + { + "epoch": 2.437033582089552, + "grad_norm": 0.35212485306323965, + "learning_rate": 3.0289971640596737e-05, + "loss": 0.3849, + "step": 10450 + }, + { + "epoch": 2.438199626865672, + "grad_norm": 0.33020930114740726, + "learning_rate": 3.0272752551628975e-05, + "loss": 0.3882, + "step": 10455 + }, + { + "epoch": 2.439365671641791, + "grad_norm": 0.3418629569518309, + "learning_rate": 3.0255531813534322e-05, + "loss": 0.3901, + "step": 10460 + }, + { + "epoch": 2.4405317164179103, + "grad_norm": 0.3467397178053059, + "learning_rate": 3.0238309436555e-05, + "loss": 0.408, + "step": 10465 + }, + { + "epoch": 2.44169776119403, + "grad_norm": 0.3325376652620171, + "learning_rate": 3.022108543093425e-05, + "loss": 0.4018, + "step": 10470 + }, + { + "epoch": 2.4428638059701493, + "grad_norm": 0.3527808062363941, + "learning_rate": 3.020385980691621e-05, + "loss": 0.4078, + "step": 10475 + }, + { + "epoch": 2.4440298507462686, + "grad_norm": 0.3377371410844193, + "learning_rate": 3.0186632574746055e-05, + "loss": 0.3948, + "step": 10480 + }, + { + "epoch": 2.4451958955223883, + "grad_norm": 0.34172950471986, + "learning_rate": 3.016940374466986e-05, + "loss": 0.3941, + "step": 10485 + }, + { + "epoch": 2.4463619402985075, + "grad_norm": 0.32838751269242406, + "learning_rate": 3.0152173326934692e-05, + "loss": 0.4048, + "step": 10490 + }, + { + "epoch": 2.4475279850746268, + "grad_norm": 0.31714363182431027, + "learning_rate": 3.0134941331788525e-05, + "loss": 0.3934, + "step": 10495 + }, + { + "epoch": 2.4486940298507465, + "grad_norm": 0.31738577198258777, + "learning_rate": 3.0117707769480285e-05, + "loss": 0.3772, + "step": 10500 + }, + { + "epoch": 2.4498600746268657, + "grad_norm": 0.3109178049328986, + "learning_rate": 3.0100472650259866e-05, + "loss": 0.3825, + "step": 10505 + }, + { + "epoch": 2.451026119402985, + "grad_norm": 0.32192043035374324, + "learning_rate": 3.008323598437802e-05, + "loss": 0.4201, + "step": 10510 + }, + { + "epoch": 2.4521921641791042, + "grad_norm": 0.33672228524159903, + "learning_rate": 3.006599778208647e-05, + "loss": 0.3952, + "step": 10515 + }, + { + "epoch": 2.453358208955224, + "grad_norm": 0.34256596840833037, + "learning_rate": 3.0048758053637844e-05, + "loss": 0.391, + "step": 10520 + }, + { + "epoch": 2.454524253731343, + "grad_norm": 0.3390966293245535, + "learning_rate": 3.0031516809285658e-05, + "loss": 0.4009, + "step": 10525 + }, + { + "epoch": 2.455690298507463, + "grad_norm": 0.322208018320275, + "learning_rate": 3.001427405928435e-05, + "loss": 0.3977, + "step": 10530 + }, + { + "epoch": 2.456856343283582, + "grad_norm": 0.33096617816553986, + "learning_rate": 2.999702981388925e-05, + "loss": 0.4029, + "step": 10535 + }, + { + "epoch": 2.4580223880597014, + "grad_norm": 0.32913530116371414, + "learning_rate": 2.9979784083356567e-05, + "loss": 0.4121, + "step": 10540 + }, + { + "epoch": 2.4591884328358207, + "grad_norm": 0.3324314228830138, + "learning_rate": 2.996253687794341e-05, + "loss": 0.4184, + "step": 10545 + }, + { + "epoch": 2.4603544776119404, + "grad_norm": 0.344369600694389, + "learning_rate": 2.994528820790774e-05, + "loss": 0.393, + "step": 10550 + }, + { + "epoch": 2.4615205223880596, + "grad_norm": 0.35216412788337015, + "learning_rate": 2.9928038083508415e-05, + "loss": 0.3957, + "step": 10555 + }, + { + "epoch": 2.4626865671641793, + "grad_norm": 0.3236085499304529, + "learning_rate": 2.9910786515005146e-05, + "loss": 0.3933, + "step": 10560 + }, + { + "epoch": 2.4638526119402986, + "grad_norm": 0.33114051183143334, + "learning_rate": 2.9893533512658507e-05, + "loss": 0.395, + "step": 10565 + }, + { + "epoch": 2.465018656716418, + "grad_norm": 0.32588187811400243, + "learning_rate": 2.987627908672992e-05, + "loss": 0.399, + "step": 10570 + }, + { + "epoch": 2.466184701492537, + "grad_norm": 0.31675091471450684, + "learning_rate": 2.9859023247481644e-05, + "loss": 0.4059, + "step": 10575 + }, + { + "epoch": 2.467350746268657, + "grad_norm": 0.34012484971873264, + "learning_rate": 2.9841766005176808e-05, + "loss": 0.3979, + "step": 10580 + }, + { + "epoch": 2.468516791044776, + "grad_norm": 0.32363133426231616, + "learning_rate": 2.982450737007935e-05, + "loss": 0.4072, + "step": 10585 + }, + { + "epoch": 2.4696828358208958, + "grad_norm": 0.31620916149814543, + "learning_rate": 2.9807247352454055e-05, + "loss": 0.3875, + "step": 10590 + }, + { + "epoch": 2.470848880597015, + "grad_norm": 0.3531241878315411, + "learning_rate": 2.9789985962566503e-05, + "loss": 0.4097, + "step": 10595 + }, + { + "epoch": 2.4720149253731343, + "grad_norm": 0.34753890425407236, + "learning_rate": 2.977272321068311e-05, + "loss": 0.397, + "step": 10600 + }, + { + "epoch": 2.4731809701492535, + "grad_norm": 0.32675131126217766, + "learning_rate": 2.975545910707111e-05, + "loss": 0.3778, + "step": 10605 + }, + { + "epoch": 2.4743470149253732, + "grad_norm": 0.3765561934154416, + "learning_rate": 2.9738193661998526e-05, + "loss": 0.4077, + "step": 10610 + }, + { + "epoch": 2.4755130597014925, + "grad_norm": 0.3358992863592674, + "learning_rate": 2.9720926885734167e-05, + "loss": 0.3992, + "step": 10615 + }, + { + "epoch": 2.4766791044776117, + "grad_norm": 0.32565241368168574, + "learning_rate": 2.9703658788547674e-05, + "loss": 0.3795, + "step": 10620 + }, + { + "epoch": 2.4778451492537314, + "grad_norm": 0.3255668406361268, + "learning_rate": 2.968638938070942e-05, + "loss": 0.4102, + "step": 10625 + }, + { + "epoch": 2.4790111940298507, + "grad_norm": 0.3255638278906826, + "learning_rate": 2.9669118672490627e-05, + "loss": 0.3916, + "step": 10630 + }, + { + "epoch": 2.48017723880597, + "grad_norm": 0.34354907528716916, + "learning_rate": 2.9651846674163208e-05, + "loss": 0.4237, + "step": 10635 + }, + { + "epoch": 2.4813432835820897, + "grad_norm": 0.3288688933933861, + "learning_rate": 2.9634573395999916e-05, + "loss": 0.4254, + "step": 10640 + }, + { + "epoch": 2.482509328358209, + "grad_norm": 0.3403301878304212, + "learning_rate": 2.9617298848274223e-05, + "loss": 0.381, + "step": 10645 + }, + { + "epoch": 2.483675373134328, + "grad_norm": 0.31440854403841334, + "learning_rate": 2.9600023041260355e-05, + "loss": 0.3835, + "step": 10650 + }, + { + "epoch": 2.484841417910448, + "grad_norm": 0.3521502758854088, + "learning_rate": 2.9582745985233312e-05, + "loss": 0.402, + "step": 10655 + }, + { + "epoch": 2.486007462686567, + "grad_norm": 0.3452838350317882, + "learning_rate": 2.9565467690468834e-05, + "loss": 0.3938, + "step": 10660 + }, + { + "epoch": 2.4871735074626864, + "grad_norm": 0.3161399712571175, + "learning_rate": 2.9548188167243372e-05, + "loss": 0.3829, + "step": 10665 + }, + { + "epoch": 2.488339552238806, + "grad_norm": 0.32358393403248104, + "learning_rate": 2.953090742583413e-05, + "loss": 0.4056, + "step": 10670 + }, + { + "epoch": 2.4895055970149254, + "grad_norm": 0.3217085579518673, + "learning_rate": 2.951362547651903e-05, + "loss": 0.4123, + "step": 10675 + }, + { + "epoch": 2.4906716417910446, + "grad_norm": 0.33999354835093465, + "learning_rate": 2.949634232957671e-05, + "loss": 0.4114, + "step": 10680 + }, + { + "epoch": 2.4918376865671643, + "grad_norm": 0.3346261826072672, + "learning_rate": 2.9479057995286528e-05, + "loss": 0.4058, + "step": 10685 + }, + { + "epoch": 2.4930037313432836, + "grad_norm": 0.35037223712728494, + "learning_rate": 2.9461772483928547e-05, + "loss": 0.3912, + "step": 10690 + }, + { + "epoch": 2.494169776119403, + "grad_norm": 0.3224419644716262, + "learning_rate": 2.944448580578351e-05, + "loss": 0.4016, + "step": 10695 + }, + { + "epoch": 2.4953358208955225, + "grad_norm": 0.3409582170593146, + "learning_rate": 2.9427197971132886e-05, + "loss": 0.3966, + "step": 10700 + }, + { + "epoch": 2.496501865671642, + "grad_norm": 0.32065587569769405, + "learning_rate": 2.9409908990258812e-05, + "loss": 0.3997, + "step": 10705 + }, + { + "epoch": 2.497667910447761, + "grad_norm": 0.31995324751754234, + "learning_rate": 2.9392618873444112e-05, + "loss": 0.4037, + "step": 10710 + }, + { + "epoch": 2.4988339552238807, + "grad_norm": 0.3498064138015017, + "learning_rate": 2.937532763097227e-05, + "loss": 0.4141, + "step": 10715 + }, + { + "epoch": 2.5, + "grad_norm": 0.3384621293378265, + "learning_rate": 2.9358035273127483e-05, + "loss": 0.3816, + "step": 10720 + }, + { + "epoch": 2.5011660447761193, + "grad_norm": 0.34214690141611453, + "learning_rate": 2.934074181019455e-05, + "loss": 0.3886, + "step": 10725 + }, + { + "epoch": 2.502332089552239, + "grad_norm": 0.33582287074367945, + "learning_rate": 2.9323447252458986e-05, + "loss": 0.3956, + "step": 10730 + }, + { + "epoch": 2.503498134328358, + "grad_norm": 0.3594059753627121, + "learning_rate": 2.9306151610206916e-05, + "loss": 0.4089, + "step": 10735 + }, + { + "epoch": 2.5046641791044775, + "grad_norm": 0.35619172065572746, + "learning_rate": 2.9288854893725128e-05, + "loss": 0.3992, + "step": 10740 + }, + { + "epoch": 2.505830223880597, + "grad_norm": 0.3268673798294333, + "learning_rate": 2.9271557113301047e-05, + "loss": 0.3965, + "step": 10745 + }, + { + "epoch": 2.5069962686567164, + "grad_norm": 0.3355143444336588, + "learning_rate": 2.9254258279222724e-05, + "loss": 0.3999, + "step": 10750 + }, + { + "epoch": 2.5081623134328357, + "grad_norm": 0.32549204242638863, + "learning_rate": 2.9236958401778854e-05, + "loss": 0.3998, + "step": 10755 + }, + { + "epoch": 2.5093283582089554, + "grad_norm": 0.34416651578044116, + "learning_rate": 2.921965749125873e-05, + "loss": 0.4179, + "step": 10760 + }, + { + "epoch": 2.5104944029850746, + "grad_norm": 0.3605924045910323, + "learning_rate": 2.920235555795227e-05, + "loss": 0.3957, + "step": 10765 + }, + { + "epoch": 2.511660447761194, + "grad_norm": 0.34790625946822856, + "learning_rate": 2.9185052612150004e-05, + "loss": 0.3936, + "step": 10770 + }, + { + "epoch": 2.5128264925373136, + "grad_norm": 0.32786396747662133, + "learning_rate": 2.9167748664143067e-05, + "loss": 0.3968, + "step": 10775 + }, + { + "epoch": 2.513992537313433, + "grad_norm": 0.4410945382251137, + "learning_rate": 2.9150443724223174e-05, + "loss": 0.3867, + "step": 10780 + }, + { + "epoch": 2.515158582089552, + "grad_norm": 0.33370436734201, + "learning_rate": 2.9133137802682646e-05, + "loss": 0.3982, + "step": 10785 + }, + { + "epoch": 2.5163246268656714, + "grad_norm": 0.32308352973684296, + "learning_rate": 2.9115830909814374e-05, + "loss": 0.4055, + "step": 10790 + }, + { + "epoch": 2.517490671641791, + "grad_norm": 0.3091339519323906, + "learning_rate": 2.909852305591184e-05, + "loss": 0.3917, + "step": 10795 + }, + { + "epoch": 2.5186567164179103, + "grad_norm": 0.3476782282482589, + "learning_rate": 2.9081214251269095e-05, + "loss": 0.4129, + "step": 10800 + }, + { + "epoch": 2.51982276119403, + "grad_norm": 0.3343136586693371, + "learning_rate": 2.9063904506180746e-05, + "loss": 0.3969, + "step": 10805 + }, + { + "epoch": 2.5209888059701493, + "grad_norm": 0.33647997245022454, + "learning_rate": 2.904659383094197e-05, + "loss": 0.4127, + "step": 10810 + }, + { + "epoch": 2.5221548507462686, + "grad_norm": 0.3597715766647747, + "learning_rate": 2.902928223584848e-05, + "loss": 0.3866, + "step": 10815 + }, + { + "epoch": 2.523320895522388, + "grad_norm": 0.3325219298612197, + "learning_rate": 2.9011969731196565e-05, + "loss": 0.3959, + "step": 10820 + }, + { + "epoch": 2.5244869402985075, + "grad_norm": 0.32310679698430905, + "learning_rate": 2.8994656327283036e-05, + "loss": 0.3814, + "step": 10825 + }, + { + "epoch": 2.5256529850746268, + "grad_norm": 0.3261197381788878, + "learning_rate": 2.897734203440524e-05, + "loss": 0.3921, + "step": 10830 + }, + { + "epoch": 2.5268190298507465, + "grad_norm": 0.33835951605396203, + "learning_rate": 2.8960026862861057e-05, + "loss": 0.4002, + "step": 10835 + }, + { + "epoch": 2.5279850746268657, + "grad_norm": 0.33359168610897627, + "learning_rate": 2.894271082294887e-05, + "loss": 0.4054, + "step": 10840 + }, + { + "epoch": 2.529151119402985, + "grad_norm": 0.30907345255010304, + "learning_rate": 2.8925393924967615e-05, + "loss": 0.3803, + "step": 10845 + }, + { + "epoch": 2.5303171641791042, + "grad_norm": 0.3252368104046308, + "learning_rate": 2.8908076179216715e-05, + "loss": 0.3909, + "step": 10850 + }, + { + "epoch": 2.531483208955224, + "grad_norm": 0.3292986540454282, + "learning_rate": 2.88907575959961e-05, + "loss": 0.4068, + "step": 10855 + }, + { + "epoch": 2.532649253731343, + "grad_norm": 0.32859313146762636, + "learning_rate": 2.8873438185606194e-05, + "loss": 0.3959, + "step": 10860 + }, + { + "epoch": 2.533815298507463, + "grad_norm": 0.33746489817943187, + "learning_rate": 2.8856117958347923e-05, + "loss": 0.4025, + "step": 10865 + }, + { + "epoch": 2.534981343283582, + "grad_norm": 0.3153155014164979, + "learning_rate": 2.8838796924522694e-05, + "loss": 0.3906, + "step": 10870 + }, + { + "epoch": 2.5361473880597014, + "grad_norm": 0.33923991365508954, + "learning_rate": 2.8821475094432393e-05, + "loss": 0.4081, + "step": 10875 + }, + { + "epoch": 2.5373134328358207, + "grad_norm": 0.3186874297831969, + "learning_rate": 2.8804152478379377e-05, + "loss": 0.3999, + "step": 10880 + }, + { + "epoch": 2.5384794776119404, + "grad_norm": 0.31733276566443414, + "learning_rate": 2.8786829086666483e-05, + "loss": 0.3825, + "step": 10885 + }, + { + "epoch": 2.5396455223880596, + "grad_norm": 0.33228644986673356, + "learning_rate": 2.8769504929596986e-05, + "loss": 0.4013, + "step": 10890 + }, + { + "epoch": 2.5408115671641793, + "grad_norm": 0.3366748932350541, + "learning_rate": 2.8752180017474646e-05, + "loss": 0.4124, + "step": 10895 + }, + { + "epoch": 2.5419776119402986, + "grad_norm": 0.32734712036855396, + "learning_rate": 2.8734854360603646e-05, + "loss": 0.408, + "step": 10900 + }, + { + "epoch": 2.543143656716418, + "grad_norm": 0.3181618803084983, + "learning_rate": 2.8717527969288632e-05, + "loss": 0.3816, + "step": 10905 + }, + { + "epoch": 2.544309701492537, + "grad_norm": 0.3361742979419798, + "learning_rate": 2.870020085383466e-05, + "loss": 0.4011, + "step": 10910 + }, + { + "epoch": 2.545475746268657, + "grad_norm": 0.3324159163938773, + "learning_rate": 2.868287302454725e-05, + "loss": 0.4101, + "step": 10915 + }, + { + "epoch": 2.546641791044776, + "grad_norm": 0.3277822050735822, + "learning_rate": 2.8665544491732315e-05, + "loss": 0.4003, + "step": 10920 + }, + { + "epoch": 2.5478078358208958, + "grad_norm": 0.3724442747185561, + "learning_rate": 2.8648215265696227e-05, + "loss": 0.4073, + "step": 10925 + }, + { + "epoch": 2.548973880597015, + "grad_norm": 0.32836599322795856, + "learning_rate": 2.8630885356745716e-05, + "loss": 0.41, + "step": 10930 + }, + { + "epoch": 2.5501399253731343, + "grad_norm": 0.33133400987168266, + "learning_rate": 2.8613554775187962e-05, + "loss": 0.38, + "step": 10935 + }, + { + "epoch": 2.5513059701492535, + "grad_norm": 0.3448970138977121, + "learning_rate": 2.859622353133054e-05, + "loss": 0.4016, + "step": 10940 + }, + { + "epoch": 2.5524720149253732, + "grad_norm": 0.35279193257714214, + "learning_rate": 2.8578891635481387e-05, + "loss": 0.4081, + "step": 10945 + }, + { + "epoch": 2.5536380597014925, + "grad_norm": 0.3243935245847756, + "learning_rate": 2.8561559097948863e-05, + "loss": 0.4039, + "step": 10950 + }, + { + "epoch": 2.554804104477612, + "grad_norm": 0.3248898066941752, + "learning_rate": 2.8544225929041697e-05, + "loss": 0.3902, + "step": 10955 + }, + { + "epoch": 2.5559701492537314, + "grad_norm": 0.32629310243655124, + "learning_rate": 2.852689213906899e-05, + "loss": 0.4117, + "step": 10960 + }, + { + "epoch": 2.5571361940298507, + "grad_norm": 0.3150391771824354, + "learning_rate": 2.850955773834022e-05, + "loss": 0.3839, + "step": 10965 + }, + { + "epoch": 2.55830223880597, + "grad_norm": 0.31956479026051104, + "learning_rate": 2.849222273716522e-05, + "loss": 0.3843, + "step": 10970 + }, + { + "epoch": 2.5594682835820897, + "grad_norm": 0.3367837851435414, + "learning_rate": 2.8474887145854183e-05, + "loss": 0.4005, + "step": 10975 + }, + { + "epoch": 2.560634328358209, + "grad_norm": 0.32271229350305947, + "learning_rate": 2.8457550974717655e-05, + "loss": 0.4051, + "step": 10980 + }, + { + "epoch": 2.5618003731343286, + "grad_norm": 0.3491479293926865, + "learning_rate": 2.8440214234066524e-05, + "loss": 0.3888, + "step": 10985 + }, + { + "epoch": 2.562966417910448, + "grad_norm": 0.3681124864150189, + "learning_rate": 2.8422876934212027e-05, + "loss": 0.4028, + "step": 10990 + }, + { + "epoch": 2.564132462686567, + "grad_norm": 0.35561386881539514, + "learning_rate": 2.8405539085465717e-05, + "loss": 0.415, + "step": 10995 + }, + { + "epoch": 2.5652985074626864, + "grad_norm": 0.34146255984065516, + "learning_rate": 2.8388200698139484e-05, + "loss": 0.4051, + "step": 11000 + }, + { + "epoch": 2.566464552238806, + "grad_norm": 0.34126450314668866, + "learning_rate": 2.8370861782545537e-05, + "loss": 0.3842, + "step": 11005 + }, + { + "epoch": 2.5676305970149254, + "grad_norm": 0.33846587831377495, + "learning_rate": 2.8353522348996388e-05, + "loss": 0.4034, + "step": 11010 + }, + { + "epoch": 2.5687966417910446, + "grad_norm": 0.33056921381933013, + "learning_rate": 2.8336182407804886e-05, + "loss": 0.4055, + "step": 11015 + }, + { + "epoch": 2.5699626865671643, + "grad_norm": 0.31770192838128103, + "learning_rate": 2.8318841969284145e-05, + "loss": 0.3969, + "step": 11020 + }, + { + "epoch": 2.5711287313432836, + "grad_norm": 0.3422252535274209, + "learning_rate": 2.8301501043747608e-05, + "loss": 0.3995, + "step": 11025 + }, + { + "epoch": 2.572294776119403, + "grad_norm": 0.33408660146302854, + "learning_rate": 2.8284159641508972e-05, + "loss": 0.3928, + "step": 11030 + }, + { + "epoch": 2.5734608208955225, + "grad_norm": 0.3056766347046411, + "learning_rate": 2.826681777288226e-05, + "loss": 0.3916, + "step": 11035 + }, + { + "epoch": 2.574626865671642, + "grad_norm": 0.33128605438275327, + "learning_rate": 2.824947544818175e-05, + "loss": 0.4001, + "step": 11040 + }, + { + "epoch": 2.575792910447761, + "grad_norm": 0.3184401376927514, + "learning_rate": 2.8232132677721972e-05, + "loss": 0.3925, + "step": 11045 + }, + { + "epoch": 2.5769589552238807, + "grad_norm": 0.3524473188642416, + "learning_rate": 2.8214789471817754e-05, + "loss": 0.3978, + "step": 11050 + }, + { + "epoch": 2.578125, + "grad_norm": 0.3258655834426742, + "learning_rate": 2.819744584078417e-05, + "loss": 0.4147, + "step": 11055 + }, + { + "epoch": 2.5792910447761193, + "grad_norm": 0.3258583122709031, + "learning_rate": 2.8180101794936542e-05, + "loss": 0.4042, + "step": 11060 + }, + { + "epoch": 2.580457089552239, + "grad_norm": 0.33982337808957086, + "learning_rate": 2.8162757344590445e-05, + "loss": 0.4083, + "step": 11065 + }, + { + "epoch": 2.581623134328358, + "grad_norm": 0.32406290550327665, + "learning_rate": 2.8145412500061702e-05, + "loss": 0.3791, + "step": 11070 + }, + { + "epoch": 2.5827891791044775, + "grad_norm": 0.3119583147853536, + "learning_rate": 2.812806727166635e-05, + "loss": 0.4092, + "step": 11075 + }, + { + "epoch": 2.583955223880597, + "grad_norm": 0.3196499398717737, + "learning_rate": 2.8110721669720663e-05, + "loss": 0.3857, + "step": 11080 + }, + { + "epoch": 2.5851212686567164, + "grad_norm": 0.32554825492053696, + "learning_rate": 2.8093375704541158e-05, + "loss": 0.3845, + "step": 11085 + }, + { + "epoch": 2.5862873134328357, + "grad_norm": 0.38251568384629714, + "learning_rate": 2.8076029386444524e-05, + "loss": 0.3994, + "step": 11090 + }, + { + "epoch": 2.5874533582089554, + "grad_norm": 0.3199323482279115, + "learning_rate": 2.805868272574771e-05, + "loss": 0.3858, + "step": 11095 + }, + { + "epoch": 2.5886194029850746, + "grad_norm": 0.34292229990871886, + "learning_rate": 2.804133573276783e-05, + "loss": 0.4276, + "step": 11100 + }, + { + "epoch": 2.589785447761194, + "grad_norm": 0.32460126706512665, + "learning_rate": 2.8023988417822222e-05, + "loss": 0.3921, + "step": 11105 + }, + { + "epoch": 2.5909514925373136, + "grad_norm": 0.32353862849431725, + "learning_rate": 2.800664079122839e-05, + "loss": 0.4138, + "step": 11110 + }, + { + "epoch": 2.592117537313433, + "grad_norm": 0.33458661213733015, + "learning_rate": 2.7989292863304045e-05, + "loss": 0.3962, + "step": 11115 + }, + { + "epoch": 2.593283582089552, + "grad_norm": 0.36347875244808336, + "learning_rate": 2.7971944644367066e-05, + "loss": 0.4038, + "step": 11120 + }, + { + "epoch": 2.5944496268656714, + "grad_norm": 0.316560823284915, + "learning_rate": 2.7954596144735512e-05, + "loss": 0.3906, + "step": 11125 + }, + { + "epoch": 2.595615671641791, + "grad_norm": 0.3498918106403532, + "learning_rate": 2.79372473747276e-05, + "loss": 0.3973, + "step": 11130 + }, + { + "epoch": 2.5967817164179103, + "grad_norm": 0.3469956114538937, + "learning_rate": 2.7919898344661723e-05, + "loss": 0.4115, + "step": 11135 + }, + { + "epoch": 2.59794776119403, + "grad_norm": 0.34961336786912744, + "learning_rate": 2.7902549064856405e-05, + "loss": 0.3979, + "step": 11140 + }, + { + "epoch": 2.5991138059701493, + "grad_norm": 0.3294875112462785, + "learning_rate": 2.7885199545630343e-05, + "loss": 0.4141, + "step": 11145 + }, + { + "epoch": 2.6002798507462686, + "grad_norm": 0.321734211773879, + "learning_rate": 2.7867849797302357e-05, + "loss": 0.4062, + "step": 11150 + }, + { + "epoch": 2.601445895522388, + "grad_norm": 0.3571711501997368, + "learning_rate": 2.785049983019143e-05, + "loss": 0.4036, + "step": 11155 + }, + { + "epoch": 2.6026119402985075, + "grad_norm": 0.34531340164768715, + "learning_rate": 2.7833149654616637e-05, + "loss": 0.4019, + "step": 11160 + }, + { + "epoch": 2.6037779850746268, + "grad_norm": 0.3255306297638485, + "learning_rate": 2.7815799280897202e-05, + "loss": 0.388, + "step": 11165 + }, + { + "epoch": 2.6049440298507465, + "grad_norm": 0.3481015414142927, + "learning_rate": 2.7798448719352467e-05, + "loss": 0.4003, + "step": 11170 + }, + { + "epoch": 2.6061100746268657, + "grad_norm": 0.31532168438580027, + "learning_rate": 2.7781097980301878e-05, + "loss": 0.3791, + "step": 11175 + }, + { + "epoch": 2.607276119402985, + "grad_norm": 0.3763784149688322, + "learning_rate": 2.7763747074065e-05, + "loss": 0.4038, + "step": 11180 + }, + { + "epoch": 2.6084421641791042, + "grad_norm": 0.31892940667405195, + "learning_rate": 2.7746396010961462e-05, + "loss": 0.3865, + "step": 11185 + }, + { + "epoch": 2.609608208955224, + "grad_norm": 0.3557532967762879, + "learning_rate": 2.7729044801311032e-05, + "loss": 0.3985, + "step": 11190 + }, + { + "epoch": 2.610774253731343, + "grad_norm": 0.3360995728639955, + "learning_rate": 2.7711693455433534e-05, + "loss": 0.3877, + "step": 11195 + }, + { + "epoch": 2.611940298507463, + "grad_norm": 0.3491493918558091, + "learning_rate": 2.7694341983648884e-05, + "loss": 0.4065, + "step": 11200 + }, + { + "epoch": 2.613106343283582, + "grad_norm": 0.34104265174432297, + "learning_rate": 2.7676990396277085e-05, + "loss": 0.4191, + "step": 11205 + }, + { + "epoch": 2.6142723880597014, + "grad_norm": 0.3343339198378052, + "learning_rate": 2.7659638703638173e-05, + "loss": 0.4, + "step": 11210 + }, + { + "epoch": 2.6154384328358207, + "grad_norm": 0.3677754643152509, + "learning_rate": 2.764228691605229e-05, + "loss": 0.4079, + "step": 11215 + }, + { + "epoch": 2.6166044776119404, + "grad_norm": 0.31721255439706836, + "learning_rate": 2.76249350438396e-05, + "loss": 0.3866, + "step": 11220 + }, + { + "epoch": 2.6177705223880596, + "grad_norm": 0.32511204163275603, + "learning_rate": 2.7607583097320345e-05, + "loss": 0.4018, + "step": 11225 + }, + { + "epoch": 2.6189365671641793, + "grad_norm": 0.3644919707231653, + "learning_rate": 2.7590231086814782e-05, + "loss": 0.395, + "step": 11230 + }, + { + "epoch": 2.6201026119402986, + "grad_norm": 0.3186136651250511, + "learning_rate": 2.7572879022643228e-05, + "loss": 0.3832, + "step": 11235 + }, + { + "epoch": 2.621268656716418, + "grad_norm": 0.3396535926877513, + "learning_rate": 2.7555526915126033e-05, + "loss": 0.3882, + "step": 11240 + }, + { + "epoch": 2.622434701492537, + "grad_norm": 0.3162307242754689, + "learning_rate": 2.7538174774583552e-05, + "loss": 0.3704, + "step": 11245 + }, + { + "epoch": 2.623600746268657, + "grad_norm": 0.33208467844912376, + "learning_rate": 2.7520822611336176e-05, + "loss": 0.4047, + "step": 11250 + }, + { + "epoch": 2.624766791044776, + "grad_norm": 0.32975040235303565, + "learning_rate": 2.7503470435704322e-05, + "loss": 0.3803, + "step": 11255 + }, + { + "epoch": 2.6259328358208958, + "grad_norm": 0.32702536417309136, + "learning_rate": 2.7486118258008374e-05, + "loss": 0.3966, + "step": 11260 + }, + { + "epoch": 2.627098880597015, + "grad_norm": 0.32603898335853465, + "learning_rate": 2.746876608856876e-05, + "loss": 0.3937, + "step": 11265 + }, + { + "epoch": 2.6282649253731343, + "grad_norm": 0.32761402145244256, + "learning_rate": 2.7451413937705878e-05, + "loss": 0.3789, + "step": 11270 + }, + { + "epoch": 2.6294309701492535, + "grad_norm": 0.3258060309573296, + "learning_rate": 2.743406181574012e-05, + "loss": 0.4005, + "step": 11275 + }, + { + "epoch": 2.6305970149253732, + "grad_norm": 0.3492841677010907, + "learning_rate": 2.7416709732991863e-05, + "loss": 0.4047, + "step": 11280 + }, + { + "epoch": 2.6317630597014925, + "grad_norm": 0.350072312532316, + "learning_rate": 2.7399357699781477e-05, + "loss": 0.4096, + "step": 11285 + }, + { + "epoch": 2.632929104477612, + "grad_norm": 0.30080685045019406, + "learning_rate": 2.7382005726429256e-05, + "loss": 0.382, + "step": 11290 + }, + { + "epoch": 2.6340951492537314, + "grad_norm": 0.33943411424315834, + "learning_rate": 2.736465382325551e-05, + "loss": 0.3991, + "step": 11295 + }, + { + "epoch": 2.6352611940298507, + "grad_norm": 0.33228527249028883, + "learning_rate": 2.7347302000580475e-05, + "loss": 0.4053, + "step": 11300 + }, + { + "epoch": 2.63642723880597, + "grad_norm": 0.40510683583736945, + "learning_rate": 2.7329950268724358e-05, + "loss": 0.3939, + "step": 11305 + }, + { + "epoch": 2.6375932835820897, + "grad_norm": 0.3317850795628135, + "learning_rate": 2.7312598638007308e-05, + "loss": 0.391, + "step": 11310 + }, + { + "epoch": 2.638759328358209, + "grad_norm": 0.34005872267908127, + "learning_rate": 2.7295247118749395e-05, + "loss": 0.4119, + "step": 11315 + }, + { + "epoch": 2.6399253731343286, + "grad_norm": 0.3534443990767683, + "learning_rate": 2.727789572127064e-05, + "loss": 0.4069, + "step": 11320 + }, + { + "epoch": 2.641091417910448, + "grad_norm": 0.3235925758605551, + "learning_rate": 2.7260544455890996e-05, + "loss": 0.3915, + "step": 11325 + }, + { + "epoch": 2.642257462686567, + "grad_norm": 0.30904204184094364, + "learning_rate": 2.724319333293033e-05, + "loss": 0.388, + "step": 11330 + }, + { + "epoch": 2.6434235074626864, + "grad_norm": 0.32952150866860735, + "learning_rate": 2.7225842362708427e-05, + "loss": 0.3904, + "step": 11335 + }, + { + "epoch": 2.644589552238806, + "grad_norm": 0.32308363663222794, + "learning_rate": 2.7208491555544964e-05, + "loss": 0.3914, + "step": 11340 + }, + { + "epoch": 2.6457555970149254, + "grad_norm": 0.320582004158062, + "learning_rate": 2.7191140921759546e-05, + "loss": 0.4147, + "step": 11345 + }, + { + "epoch": 2.6469216417910446, + "grad_norm": 0.3142131125665813, + "learning_rate": 2.7173790471671662e-05, + "loss": 0.3907, + "step": 11350 + }, + { + "epoch": 2.6480876865671643, + "grad_norm": 0.3613758839320645, + "learning_rate": 2.7156440215600703e-05, + "loss": 0.3989, + "step": 11355 + }, + { + "epoch": 2.6492537313432836, + "grad_norm": 0.3268037278177219, + "learning_rate": 2.7139090163865932e-05, + "loss": 0.4028, + "step": 11360 + }, + { + "epoch": 2.650419776119403, + "grad_norm": 0.34418869429663473, + "learning_rate": 2.712174032678648e-05, + "loss": 0.4006, + "step": 11365 + }, + { + "epoch": 2.6515858208955225, + "grad_norm": 0.34411292220486966, + "learning_rate": 2.7104390714681393e-05, + "loss": 0.4064, + "step": 11370 + }, + { + "epoch": 2.652751865671642, + "grad_norm": 0.31855509220399103, + "learning_rate": 2.7087041337869522e-05, + "loss": 0.4084, + "step": 11375 + }, + { + "epoch": 2.653917910447761, + "grad_norm": 0.39000246313061576, + "learning_rate": 2.7069692206669633e-05, + "loss": 0.4004, + "step": 11380 + }, + { + "epoch": 2.6550839552238807, + "grad_norm": 0.3520022397137946, + "learning_rate": 2.7052343331400322e-05, + "loss": 0.4011, + "step": 11385 + }, + { + "epoch": 2.65625, + "grad_norm": 0.31755249099356897, + "learning_rate": 2.7034994722380036e-05, + "loss": 0.4028, + "step": 11390 + }, + { + "epoch": 2.6574160447761193, + "grad_norm": 0.3261361077644442, + "learning_rate": 2.701764638992705e-05, + "loss": 0.3844, + "step": 11395 + }, + { + "epoch": 2.658582089552239, + "grad_norm": 0.3537403079466135, + "learning_rate": 2.7000298344359494e-05, + "loss": 0.3916, + "step": 11400 + }, + { + "epoch": 2.659748134328358, + "grad_norm": 0.32627948445058835, + "learning_rate": 2.6982950595995315e-05, + "loss": 0.4105, + "step": 11405 + }, + { + "epoch": 2.6609141791044775, + "grad_norm": 0.3389767530228438, + "learning_rate": 2.6965603155152302e-05, + "loss": 0.3882, + "step": 11410 + }, + { + "epoch": 2.662080223880597, + "grad_norm": 0.3322576814889697, + "learning_rate": 2.6948256032148052e-05, + "loss": 0.4068, + "step": 11415 + }, + { + "epoch": 2.6632462686567164, + "grad_norm": 0.3426517578214566, + "learning_rate": 2.6930909237299934e-05, + "loss": 0.4031, + "step": 11420 + }, + { + "epoch": 2.6644123134328357, + "grad_norm": 0.32799301808104175, + "learning_rate": 2.691356278092519e-05, + "loss": 0.3885, + "step": 11425 + }, + { + "epoch": 2.6655783582089554, + "grad_norm": 0.3233374574722326, + "learning_rate": 2.6896216673340814e-05, + "loss": 0.4049, + "step": 11430 + }, + { + "epoch": 2.6667444029850746, + "grad_norm": 0.3131013971210666, + "learning_rate": 2.687887092486361e-05, + "loss": 0.3908, + "step": 11435 + }, + { + "epoch": 2.667910447761194, + "grad_norm": 0.34802518428559803, + "learning_rate": 2.686152554581016e-05, + "loss": 0.4375, + "step": 11440 + }, + { + "epoch": 2.6690764925373136, + "grad_norm": 0.3352028409611966, + "learning_rate": 2.6844180546496833e-05, + "loss": 0.3954, + "step": 11445 + }, + { + "epoch": 2.670242537313433, + "grad_norm": 0.34116821999145536, + "learning_rate": 2.682683593723977e-05, + "loss": 0.4029, + "step": 11450 + }, + { + "epoch": 2.671408582089552, + "grad_norm": 0.31866260815082276, + "learning_rate": 2.680949172835487e-05, + "loss": 0.3812, + "step": 11455 + }, + { + "epoch": 2.6725746268656714, + "grad_norm": 0.3071887351168522, + "learning_rate": 2.6792147930157812e-05, + "loss": 0.4016, + "step": 11460 + }, + { + "epoch": 2.673740671641791, + "grad_norm": 0.35257332177523143, + "learning_rate": 2.6774804552964034e-05, + "loss": 0.4032, + "step": 11465 + }, + { + "epoch": 2.6749067164179103, + "grad_norm": 0.3303925221970838, + "learning_rate": 2.6757461607088692e-05, + "loss": 0.4003, + "step": 11470 + }, + { + "epoch": 2.67607276119403, + "grad_norm": 0.32544472846675615, + "learning_rate": 2.6740119102846707e-05, + "loss": 0.4123, + "step": 11475 + }, + { + "epoch": 2.6772388059701493, + "grad_norm": 0.33468831151919165, + "learning_rate": 2.6722777050552737e-05, + "loss": 0.4063, + "step": 11480 + }, + { + "epoch": 2.6784048507462686, + "grad_norm": 0.33652628557284775, + "learning_rate": 2.6705435460521177e-05, + "loss": 0.4002, + "step": 11485 + }, + { + "epoch": 2.679570895522388, + "grad_norm": 0.327341712433127, + "learning_rate": 2.668809434306615e-05, + "loss": 0.4015, + "step": 11490 + }, + { + "epoch": 2.6807369402985075, + "grad_norm": 0.3122521221151579, + "learning_rate": 2.6670753708501454e-05, + "loss": 0.3952, + "step": 11495 + }, + { + "epoch": 2.6819029850746268, + "grad_norm": 0.4599505403758927, + "learning_rate": 2.6653413567140668e-05, + "loss": 0.4086, + "step": 11500 + }, + { + "epoch": 2.6830690298507465, + "grad_norm": 0.3389549123423084, + "learning_rate": 2.6636073929297018e-05, + "loss": 0.3937, + "step": 11505 + }, + { + "epoch": 2.6842350746268657, + "grad_norm": 0.35693314172674717, + "learning_rate": 2.661873480528347e-05, + "loss": 0.39, + "step": 11510 + }, + { + "epoch": 2.685401119402985, + "grad_norm": 0.34909238230751594, + "learning_rate": 2.660139620541267e-05, + "loss": 0.4118, + "step": 11515 + }, + { + "epoch": 2.6865671641791042, + "grad_norm": 0.3398210153849959, + "learning_rate": 2.6584058139996942e-05, + "loss": 0.4042, + "step": 11520 + }, + { + "epoch": 2.687733208955224, + "grad_norm": 0.33475580925794657, + "learning_rate": 2.656672061934831e-05, + "loss": 0.3982, + "step": 11525 + }, + { + "epoch": 2.688899253731343, + "grad_norm": 0.31449648136065284, + "learning_rate": 2.654938365377847e-05, + "loss": 0.3907, + "step": 11530 + }, + { + "epoch": 2.690065298507463, + "grad_norm": 0.34235536529933425, + "learning_rate": 2.6532047253598776e-05, + "loss": 0.4042, + "step": 11535 + }, + { + "epoch": 2.691231343283582, + "grad_norm": 0.3391407943414869, + "learning_rate": 2.651471142912026e-05, + "loss": 0.3932, + "step": 11540 + }, + { + "epoch": 2.6923973880597014, + "grad_norm": 0.33897338575542646, + "learning_rate": 2.6497376190653607e-05, + "loss": 0.3924, + "step": 11545 + }, + { + "epoch": 2.6935634328358207, + "grad_norm": 0.3291808731993279, + "learning_rate": 2.6480041548509137e-05, + "loss": 0.3923, + "step": 11550 + }, + { + "epoch": 2.6947294776119404, + "grad_norm": 0.32127289283311683, + "learning_rate": 2.6462707512996847e-05, + "loss": 0.3873, + "step": 11555 + }, + { + "epoch": 2.6958955223880596, + "grad_norm": 0.3193042644399082, + "learning_rate": 2.644537409442635e-05, + "loss": 0.3831, + "step": 11560 + }, + { + "epoch": 2.6970615671641793, + "grad_norm": 0.3330610435175032, + "learning_rate": 2.642804130310691e-05, + "loss": 0.3932, + "step": 11565 + }, + { + "epoch": 2.6982276119402986, + "grad_norm": 0.3256751410441129, + "learning_rate": 2.6410709149347385e-05, + "loss": 0.398, + "step": 11570 + }, + { + "epoch": 2.699393656716418, + "grad_norm": 0.34017827821819047, + "learning_rate": 2.6393377643456284e-05, + "loss": 0.4003, + "step": 11575 + }, + { + "epoch": 2.700559701492537, + "grad_norm": 0.3422711435713736, + "learning_rate": 2.6376046795741733e-05, + "loss": 0.3884, + "step": 11580 + }, + { + "epoch": 2.701725746268657, + "grad_norm": 0.3557609765443232, + "learning_rate": 2.6358716616511446e-05, + "loss": 0.3985, + "step": 11585 + }, + { + "epoch": 2.702891791044776, + "grad_norm": 0.3294102070107835, + "learning_rate": 2.6341387116072763e-05, + "loss": 0.3767, + "step": 11590 + }, + { + "epoch": 2.7040578358208958, + "grad_norm": 0.3476481571980942, + "learning_rate": 2.6324058304732574e-05, + "loss": 0.3897, + "step": 11595 + }, + { + "epoch": 2.705223880597015, + "grad_norm": 0.3180216864445747, + "learning_rate": 2.630673019279742e-05, + "loss": 0.3939, + "step": 11600 + }, + { + "epoch": 2.7063899253731343, + "grad_norm": 0.3447200668148722, + "learning_rate": 2.6289402790573392e-05, + "loss": 0.3945, + "step": 11605 + }, + { + "epoch": 2.7075559701492535, + "grad_norm": 0.3169899581728641, + "learning_rate": 2.6272076108366163e-05, + "loss": 0.3975, + "step": 11610 + }, + { + "epoch": 2.7087220149253732, + "grad_norm": 0.35037596538415655, + "learning_rate": 2.6254750156480973e-05, + "loss": 0.4174, + "step": 11615 + }, + { + "epoch": 2.7098880597014925, + "grad_norm": 0.3295914190684624, + "learning_rate": 2.623742494522264e-05, + "loss": 0.3884, + "step": 11620 + }, + { + "epoch": 2.711054104477612, + "grad_norm": 0.3216119293129583, + "learning_rate": 2.6220100484895527e-05, + "loss": 0.3957, + "step": 11625 + }, + { + "epoch": 2.7122201492537314, + "grad_norm": 0.3322297246793832, + "learning_rate": 2.620277678580358e-05, + "loss": 0.4197, + "step": 11630 + }, + { + "epoch": 2.7133861940298507, + "grad_norm": 0.34768686891971406, + "learning_rate": 2.6185453858250242e-05, + "loss": 0.409, + "step": 11635 + }, + { + "epoch": 2.71455223880597, + "grad_norm": 0.334971956806543, + "learning_rate": 2.616813171253855e-05, + "loss": 0.3836, + "step": 11640 + }, + { + "epoch": 2.7157182835820897, + "grad_norm": 0.34111962346610814, + "learning_rate": 2.615081035897104e-05, + "loss": 0.3807, + "step": 11645 + }, + { + "epoch": 2.716884328358209, + "grad_norm": 0.3195901275859157, + "learning_rate": 2.6133489807849786e-05, + "loss": 0.3797, + "step": 11650 + }, + { + "epoch": 2.7180503731343286, + "grad_norm": 0.32013670350785867, + "learning_rate": 2.6116170069476397e-05, + "loss": 0.3788, + "step": 11655 + }, + { + "epoch": 2.719216417910448, + "grad_norm": 0.47011368906524376, + "learning_rate": 2.609885115415198e-05, + "loss": 0.3962, + "step": 11660 + }, + { + "epoch": 2.720382462686567, + "grad_norm": 0.32982538837259384, + "learning_rate": 2.6081533072177183e-05, + "loss": 0.4108, + "step": 11665 + }, + { + "epoch": 2.7215485074626864, + "grad_norm": 0.35363158156051727, + "learning_rate": 2.6064215833852113e-05, + "loss": 0.3957, + "step": 11670 + }, + { + "epoch": 2.722714552238806, + "grad_norm": 0.3168259313377346, + "learning_rate": 2.6046899449476397e-05, + "loss": 0.4056, + "step": 11675 + }, + { + "epoch": 2.7238805970149254, + "grad_norm": 0.31529162330079885, + "learning_rate": 2.602958392934917e-05, + "loss": 0.3823, + "step": 11680 + }, + { + "epoch": 2.7250466417910446, + "grad_norm": 0.34805762786353617, + "learning_rate": 2.601226928376904e-05, + "loss": 0.4105, + "step": 11685 + }, + { + "epoch": 2.7262126865671643, + "grad_norm": 0.3304407121033633, + "learning_rate": 2.5994955523034098e-05, + "loss": 0.3743, + "step": 11690 + }, + { + "epoch": 2.7273787313432836, + "grad_norm": 0.3513517133228261, + "learning_rate": 2.5977642657441893e-05, + "loss": 0.4106, + "step": 11695 + }, + { + "epoch": 2.728544776119403, + "grad_norm": 0.3403043760631628, + "learning_rate": 2.5960330697289447e-05, + "loss": 0.4063, + "step": 11700 + }, + { + "epoch": 2.7297108208955225, + "grad_norm": 0.31497780385390123, + "learning_rate": 2.5943019652873267e-05, + "loss": 0.4014, + "step": 11705 + }, + { + "epoch": 2.730876865671642, + "grad_norm": 0.3049103415259645, + "learning_rate": 2.5925709534489295e-05, + "loss": 0.371, + "step": 11710 + }, + { + "epoch": 2.732042910447761, + "grad_norm": 0.3444127101710014, + "learning_rate": 2.5908400352432927e-05, + "loss": 0.3947, + "step": 11715 + }, + { + "epoch": 2.7332089552238807, + "grad_norm": 0.3337247336672335, + "learning_rate": 2.589109211699899e-05, + "loss": 0.4007, + "step": 11720 + }, + { + "epoch": 2.734375, + "grad_norm": 0.32567021716465516, + "learning_rate": 2.5873784838481762e-05, + "loss": 0.3925, + "step": 11725 + }, + { + "epoch": 2.7355410447761193, + "grad_norm": 0.3260034023143692, + "learning_rate": 2.5856478527174955e-05, + "loss": 0.3907, + "step": 11730 + }, + { + "epoch": 2.736707089552239, + "grad_norm": 0.3481198984833565, + "learning_rate": 2.5839173193371697e-05, + "loss": 0.4046, + "step": 11735 + }, + { + "epoch": 2.737873134328358, + "grad_norm": 0.33180708473431747, + "learning_rate": 2.5821868847364534e-05, + "loss": 0.3944, + "step": 11740 + }, + { + "epoch": 2.7390391791044775, + "grad_norm": 0.37912682432222555, + "learning_rate": 2.5804565499445437e-05, + "loss": 0.4046, + "step": 11745 + }, + { + "epoch": 2.740205223880597, + "grad_norm": 0.34226470716209806, + "learning_rate": 2.578726315990576e-05, + "loss": 0.3985, + "step": 11750 + }, + { + "epoch": 2.7413712686567164, + "grad_norm": 0.32678430005638676, + "learning_rate": 2.5769961839036277e-05, + "loss": 0.3896, + "step": 11755 + }, + { + "epoch": 2.7425373134328357, + "grad_norm": 0.33792757870111595, + "learning_rate": 2.575266154712715e-05, + "loss": 0.4046, + "step": 11760 + }, + { + "epoch": 2.7437033582089554, + "grad_norm": 0.33741924695677405, + "learning_rate": 2.5735362294467928e-05, + "loss": 0.435, + "step": 11765 + }, + { + "epoch": 2.7448694029850746, + "grad_norm": 0.33921663121838525, + "learning_rate": 2.571806409134756e-05, + "loss": 0.3997, + "step": 11770 + }, + { + "epoch": 2.746035447761194, + "grad_norm": 0.31464119160756665, + "learning_rate": 2.570076694805432e-05, + "loss": 0.4003, + "step": 11775 + }, + { + "epoch": 2.7472014925373136, + "grad_norm": 0.3544414565417629, + "learning_rate": 2.5683470874875913e-05, + "loss": 0.412, + "step": 11780 + }, + { + "epoch": 2.748367537313433, + "grad_norm": 0.3167308552952958, + "learning_rate": 2.566617588209937e-05, + "loss": 0.3772, + "step": 11785 + }, + { + "epoch": 2.749533582089552, + "grad_norm": 0.3311358398906818, + "learning_rate": 2.564888198001109e-05, + "loss": 0.4004, + "step": 11790 + }, + { + "epoch": 2.7506996268656714, + "grad_norm": 0.3212483813375164, + "learning_rate": 2.563158917889683e-05, + "loss": 0.4057, + "step": 11795 + }, + { + "epoch": 2.751865671641791, + "grad_norm": 0.3479030879680861, + "learning_rate": 2.5614297489041673e-05, + "loss": 0.4092, + "step": 11800 + }, + { + "epoch": 2.7530317164179103, + "grad_norm": 0.33158240582876686, + "learning_rate": 2.559700692073006e-05, + "loss": 0.3842, + "step": 11805 + }, + { + "epoch": 2.75419776119403, + "grad_norm": 0.3286502900384725, + "learning_rate": 2.5579717484245756e-05, + "loss": 0.372, + "step": 11810 + }, + { + "epoch": 2.7553638059701493, + "grad_norm": 0.3454572183227963, + "learning_rate": 2.556242918987185e-05, + "loss": 0.4046, + "step": 11815 + }, + { + "epoch": 2.7565298507462686, + "grad_norm": 0.315371977694258, + "learning_rate": 2.554514204789078e-05, + "loss": 0.4007, + "step": 11820 + }, + { + "epoch": 2.757695895522388, + "grad_norm": 0.3151912960465065, + "learning_rate": 2.5527856068584244e-05, + "loss": 0.3869, + "step": 11825 + }, + { + "epoch": 2.7588619402985075, + "grad_norm": 0.3422704418232128, + "learning_rate": 2.551057126223329e-05, + "loss": 0.4056, + "step": 11830 + }, + { + "epoch": 2.7600279850746268, + "grad_norm": 0.3284554562452858, + "learning_rate": 2.5493287639118265e-05, + "loss": 0.4066, + "step": 11835 + }, + { + "epoch": 2.7611940298507465, + "grad_norm": 0.3115128050029284, + "learning_rate": 2.54760052095188e-05, + "loss": 0.3908, + "step": 11840 + }, + { + "epoch": 2.7623600746268657, + "grad_norm": 0.3423318131846034, + "learning_rate": 2.545872398371383e-05, + "loss": 0.4091, + "step": 11845 + }, + { + "epoch": 2.763526119402985, + "grad_norm": 0.3192794044388495, + "learning_rate": 2.544144397198155e-05, + "loss": 0.3883, + "step": 11850 + }, + { + "epoch": 2.7646921641791042, + "grad_norm": 0.3298892515466483, + "learning_rate": 2.5424165184599457e-05, + "loss": 0.4, + "step": 11855 + }, + { + "epoch": 2.765858208955224, + "grad_norm": 0.33594465530904233, + "learning_rate": 2.5406887631844312e-05, + "loss": 0.4043, + "step": 11860 + }, + { + "epoch": 2.767024253731343, + "grad_norm": 0.34840887315760244, + "learning_rate": 2.5389611323992134e-05, + "loss": 0.3991, + "step": 11865 + }, + { + "epoch": 2.768190298507463, + "grad_norm": 0.3659025813828882, + "learning_rate": 2.5372336271318225e-05, + "loss": 0.4036, + "step": 11870 + }, + { + "epoch": 2.769356343283582, + "grad_norm": 0.3292366496998942, + "learning_rate": 2.5355062484097103e-05, + "loss": 0.4026, + "step": 11875 + }, + { + "epoch": 2.7705223880597014, + "grad_norm": 0.3789236833035916, + "learning_rate": 2.5337789972602566e-05, + "loss": 0.397, + "step": 11880 + }, + { + "epoch": 2.7716884328358207, + "grad_norm": 0.3193885477040352, + "learning_rate": 2.5320518747107646e-05, + "loss": 0.4056, + "step": 11885 + }, + { + "epoch": 2.7728544776119404, + "grad_norm": 0.3452125973954222, + "learning_rate": 2.530324881788459e-05, + "loss": 0.4155, + "step": 11890 + }, + { + "epoch": 2.7740205223880596, + "grad_norm": 0.32392224808503434, + "learning_rate": 2.5285980195204906e-05, + "loss": 0.3839, + "step": 11895 + }, + { + "epoch": 2.7751865671641793, + "grad_norm": 0.3140824518357596, + "learning_rate": 2.5268712889339296e-05, + "loss": 0.3889, + "step": 11900 + }, + { + "epoch": 2.7763526119402986, + "grad_norm": 0.34023526978279334, + "learning_rate": 2.5251446910557704e-05, + "loss": 0.4018, + "step": 11905 + }, + { + "epoch": 2.777518656716418, + "grad_norm": 0.3163431386329222, + "learning_rate": 2.5234182269129253e-05, + "loss": 0.3931, + "step": 11910 + }, + { + "epoch": 2.778684701492537, + "grad_norm": 0.33335623057536234, + "learning_rate": 2.5216918975322303e-05, + "loss": 0.3951, + "step": 11915 + }, + { + "epoch": 2.779850746268657, + "grad_norm": 0.3358972294222299, + "learning_rate": 2.519965703940441e-05, + "loss": 0.3848, + "step": 11920 + }, + { + "epoch": 2.781016791044776, + "grad_norm": 0.33874256012388476, + "learning_rate": 2.5182396471642287e-05, + "loss": 0.4076, + "step": 11925 + }, + { + "epoch": 2.7821828358208958, + "grad_norm": 0.34139399168943446, + "learning_rate": 2.5165137282301877e-05, + "loss": 0.3993, + "step": 11930 + }, + { + "epoch": 2.783348880597015, + "grad_norm": 0.3175169310964827, + "learning_rate": 2.5147879481648266e-05, + "loss": 0.3871, + "step": 11935 + }, + { + "epoch": 2.7845149253731343, + "grad_norm": 0.32703895942532285, + "learning_rate": 2.5130623079945754e-05, + "loss": 0.3873, + "step": 11940 + }, + { + "epoch": 2.7856809701492535, + "grad_norm": 0.3147451559140355, + "learning_rate": 2.511336808745778e-05, + "loss": 0.3761, + "step": 11945 + }, + { + "epoch": 2.7868470149253732, + "grad_norm": 0.31512001651088684, + "learning_rate": 2.5096114514446934e-05, + "loss": 0.3929, + "step": 11950 + }, + { + "epoch": 2.7880130597014925, + "grad_norm": 0.3431873108310587, + "learning_rate": 2.5078862371175e-05, + "loss": 0.4068, + "step": 11955 + }, + { + "epoch": 2.789179104477612, + "grad_norm": 0.31343137159498574, + "learning_rate": 2.5061611667902878e-05, + "loss": 0.395, + "step": 11960 + }, + { + "epoch": 2.7903451492537314, + "grad_norm": 0.3516000367218387, + "learning_rate": 2.504436241489064e-05, + "loss": 0.3799, + "step": 11965 + }, + { + "epoch": 2.7915111940298507, + "grad_norm": 0.35051456582328566, + "learning_rate": 2.5027114622397473e-05, + "loss": 0.4243, + "step": 11970 + }, + { + "epoch": 2.79267723880597, + "grad_norm": 0.3315836718030881, + "learning_rate": 2.50098683006817e-05, + "loss": 0.4206, + "step": 11975 + }, + { + "epoch": 2.7938432835820897, + "grad_norm": 0.3141947828655213, + "learning_rate": 2.4992623460000763e-05, + "loss": 0.3982, + "step": 11980 + }, + { + "epoch": 2.795009328358209, + "grad_norm": 0.3471648042578197, + "learning_rate": 2.497538011061125e-05, + "loss": 0.3947, + "step": 11985 + }, + { + "epoch": 2.7961753731343286, + "grad_norm": 0.34129598811920847, + "learning_rate": 2.495813826276884e-05, + "loss": 0.4008, + "step": 11990 + }, + { + "epoch": 2.797341417910448, + "grad_norm": 0.3455030171856007, + "learning_rate": 2.4940897926728314e-05, + "loss": 0.3918, + "step": 11995 + }, + { + "epoch": 2.798507462686567, + "grad_norm": 0.34742618232423944, + "learning_rate": 2.4923659112743576e-05, + "loss": 0.3866, + "step": 12000 + }, + { + "epoch": 2.7996735074626864, + "grad_norm": 0.3362327819759096, + "learning_rate": 2.490642183106759e-05, + "loss": 0.3953, + "step": 12005 + }, + { + "epoch": 2.800839552238806, + "grad_norm": 0.3551627915697855, + "learning_rate": 2.4889186091952444e-05, + "loss": 0.4209, + "step": 12010 + }, + { + "epoch": 2.8020055970149254, + "grad_norm": 0.3575109929114036, + "learning_rate": 2.48719519056493e-05, + "loss": 0.3932, + "step": 12015 + }, + { + "epoch": 2.8031716417910446, + "grad_norm": 0.33500516760610627, + "learning_rate": 2.485471928240839e-05, + "loss": 0.4008, + "step": 12020 + }, + { + "epoch": 2.8043376865671643, + "grad_norm": 0.3216137890554855, + "learning_rate": 2.4837488232479005e-05, + "loss": 0.3753, + "step": 12025 + }, + { + "epoch": 2.8055037313432836, + "grad_norm": 0.3295186381540003, + "learning_rate": 2.4820258766109515e-05, + "loss": 0.4048, + "step": 12030 + }, + { + "epoch": 2.806669776119403, + "grad_norm": 0.3199776912817911, + "learning_rate": 2.4803030893547357e-05, + "loss": 0.4088, + "step": 12035 + }, + { + "epoch": 2.8078358208955225, + "grad_norm": 0.3404215529681031, + "learning_rate": 2.4785804625039005e-05, + "loss": 0.3856, + "step": 12040 + }, + { + "epoch": 2.809001865671642, + "grad_norm": 0.3225427308190754, + "learning_rate": 2.4768579970829985e-05, + "loss": 0.4016, + "step": 12045 + }, + { + "epoch": 2.810167910447761, + "grad_norm": 0.3278613222509628, + "learning_rate": 2.4751356941164855e-05, + "loss": 0.4032, + "step": 12050 + }, + { + "epoch": 2.8113339552238807, + "grad_norm": 0.3336860498574663, + "learning_rate": 2.4734135546287208e-05, + "loss": 0.3937, + "step": 12055 + }, + { + "epoch": 2.8125, + "grad_norm": 0.35002787355868725, + "learning_rate": 2.4716915796439678e-05, + "loss": 0.4071, + "step": 12060 + }, + { + "epoch": 2.8136660447761193, + "grad_norm": 0.36621963458977336, + "learning_rate": 2.4699697701863916e-05, + "loss": 0.413, + "step": 12065 + }, + { + "epoch": 2.814832089552239, + "grad_norm": 0.32839128654034394, + "learning_rate": 2.4682481272800572e-05, + "loss": 0.3838, + "step": 12070 + }, + { + "epoch": 2.815998134328358, + "grad_norm": 0.35798691442553493, + "learning_rate": 2.4665266519489328e-05, + "loss": 0.3981, + "step": 12075 + }, + { + "epoch": 2.8171641791044775, + "grad_norm": 0.3140940363462768, + "learning_rate": 2.4648053452168857e-05, + "loss": 0.3884, + "step": 12080 + }, + { + "epoch": 2.818330223880597, + "grad_norm": 0.324076070136119, + "learning_rate": 2.463084208107682e-05, + "loss": 0.3963, + "step": 12085 + }, + { + "epoch": 2.8194962686567164, + "grad_norm": 0.3330385699374476, + "learning_rate": 2.4613632416449893e-05, + "loss": 0.3858, + "step": 12090 + }, + { + "epoch": 2.8206623134328357, + "grad_norm": 0.31894826227742207, + "learning_rate": 2.4596424468523728e-05, + "loss": 0.3992, + "step": 12095 + }, + { + "epoch": 2.8218283582089554, + "grad_norm": 0.3192765644082724, + "learning_rate": 2.4579218247532947e-05, + "loss": 0.3828, + "step": 12100 + }, + { + "epoch": 2.8229944029850746, + "grad_norm": 0.346843156119822, + "learning_rate": 2.4562013763711145e-05, + "loss": 0.4018, + "step": 12105 + }, + { + "epoch": 2.824160447761194, + "grad_norm": 0.3221140934657376, + "learning_rate": 2.4544811027290893e-05, + "loss": 0.3958, + "step": 12110 + }, + { + "epoch": 2.8253264925373136, + "grad_norm": 0.3367028071026674, + "learning_rate": 2.452761004850371e-05, + "loss": 0.4, + "step": 12115 + }, + { + "epoch": 2.826492537313433, + "grad_norm": 0.33845487651179035, + "learning_rate": 2.4510410837580106e-05, + "loss": 0.3895, + "step": 12120 + }, + { + "epoch": 2.827658582089552, + "grad_norm": 0.33026623127228605, + "learning_rate": 2.4493213404749493e-05, + "loss": 0.3869, + "step": 12125 + }, + { + "epoch": 2.8288246268656714, + "grad_norm": 0.31860952569546147, + "learning_rate": 2.447601776024024e-05, + "loss": 0.3625, + "step": 12130 + }, + { + "epoch": 2.829990671641791, + "grad_norm": 0.32578484248594275, + "learning_rate": 2.4458823914279662e-05, + "loss": 0.3938, + "step": 12135 + }, + { + "epoch": 2.8311567164179103, + "grad_norm": 0.31136953954015467, + "learning_rate": 2.4441631877093995e-05, + "loss": 0.3861, + "step": 12140 + }, + { + "epoch": 2.83232276119403, + "grad_norm": 0.32698424639935214, + "learning_rate": 2.442444165890842e-05, + "loss": 0.411, + "step": 12145 + }, + { + "epoch": 2.8334888059701493, + "grad_norm": 0.3406999308412676, + "learning_rate": 2.4407253269947006e-05, + "loss": 0.4088, + "step": 12150 + }, + { + "epoch": 2.8346548507462686, + "grad_norm": 0.3480968262746312, + "learning_rate": 2.4390066720432746e-05, + "loss": 0.42, + "step": 12155 + }, + { + "epoch": 2.835820895522388, + "grad_norm": 0.3270986452676917, + "learning_rate": 2.437288202058755e-05, + "loss": 0.3996, + "step": 12160 + }, + { + "epoch": 2.8369869402985075, + "grad_norm": 0.362320092080957, + "learning_rate": 2.4355699180632207e-05, + "loss": 0.4108, + "step": 12165 + }, + { + "epoch": 2.8381529850746268, + "grad_norm": 0.3530087344249372, + "learning_rate": 2.4338518210786416e-05, + "loss": 0.412, + "step": 12170 + }, + { + "epoch": 2.8393190298507465, + "grad_norm": 0.32787449927078066, + "learning_rate": 2.4321339121268766e-05, + "loss": 0.4057, + "step": 12175 + }, + { + "epoch": 2.8404850746268657, + "grad_norm": 0.31801229519548774, + "learning_rate": 2.430416192229672e-05, + "loss": 0.4014, + "step": 12180 + }, + { + "epoch": 2.841651119402985, + "grad_norm": 0.3165809465068147, + "learning_rate": 2.42869866240866e-05, + "loss": 0.3957, + "step": 12185 + }, + { + "epoch": 2.8428171641791042, + "grad_norm": 0.33666006077250604, + "learning_rate": 2.4269813236853632e-05, + "loss": 0.3957, + "step": 12190 + }, + { + "epoch": 2.843983208955224, + "grad_norm": 0.3261104756662522, + "learning_rate": 2.4252641770811886e-05, + "loss": 0.3803, + "step": 12195 + }, + { + "epoch": 2.845149253731343, + "grad_norm": 0.32277494507890464, + "learning_rate": 2.423547223617429e-05, + "loss": 0.4117, + "step": 12200 + }, + { + "epoch": 2.846315298507463, + "grad_norm": 0.3406342653932348, + "learning_rate": 2.4218304643152617e-05, + "loss": 0.3917, + "step": 12205 + }, + { + "epoch": 2.847481343283582, + "grad_norm": 0.34454687543498197, + "learning_rate": 2.42011390019575e-05, + "loss": 0.3952, + "step": 12210 + }, + { + "epoch": 2.8486473880597014, + "grad_norm": 0.32368511001968725, + "learning_rate": 2.4183975322798407e-05, + "loss": 0.3792, + "step": 12215 + }, + { + "epoch": 2.8498134328358207, + "grad_norm": 0.3267063901512848, + "learning_rate": 2.4166813615883625e-05, + "loss": 0.3917, + "step": 12220 + }, + { + "epoch": 2.8509794776119404, + "grad_norm": 0.3126605529469473, + "learning_rate": 2.4149653891420304e-05, + "loss": 0.396, + "step": 12225 + }, + { + "epoch": 2.8521455223880596, + "grad_norm": 0.3394317315293222, + "learning_rate": 2.4132496159614366e-05, + "loss": 0.4072, + "step": 12230 + }, + { + "epoch": 2.8533115671641793, + "grad_norm": 0.30186806585986314, + "learning_rate": 2.4115340430670574e-05, + "loss": 0.3815, + "step": 12235 + }, + { + "epoch": 2.8544776119402986, + "grad_norm": 0.3381417026019833, + "learning_rate": 2.4098186714792504e-05, + "loss": 0.4056, + "step": 12240 + }, + { + "epoch": 2.855643656716418, + "grad_norm": 0.3279298294433154, + "learning_rate": 2.408103502218253e-05, + "loss": 0.412, + "step": 12245 + }, + { + "epoch": 2.856809701492537, + "grad_norm": 0.3324818038432356, + "learning_rate": 2.4063885363041822e-05, + "loss": 0.4107, + "step": 12250 + }, + { + "epoch": 2.857975746268657, + "grad_norm": 0.34387584154562095, + "learning_rate": 2.4046737747570326e-05, + "loss": 0.4024, + "step": 12255 + }, + { + "epoch": 2.859141791044776, + "grad_norm": 0.3295931675094607, + "learning_rate": 2.4029592185966804e-05, + "loss": 0.3976, + "step": 12260 + }, + { + "epoch": 2.8603078358208958, + "grad_norm": 0.32557698565481685, + "learning_rate": 2.4012448688428768e-05, + "loss": 0.3839, + "step": 12265 + }, + { + "epoch": 2.861473880597015, + "grad_norm": 0.3271252750956059, + "learning_rate": 2.399530726515251e-05, + "loss": 0.4011, + "step": 12270 + }, + { + "epoch": 2.8626399253731343, + "grad_norm": 0.3462489088571486, + "learning_rate": 2.397816792633311e-05, + "loss": 0.3965, + "step": 12275 + }, + { + "epoch": 2.8638059701492535, + "grad_norm": 0.3263109507849424, + "learning_rate": 2.396103068216437e-05, + "loss": 0.3985, + "step": 12280 + }, + { + "epoch": 2.8649720149253732, + "grad_norm": 0.3158163998008106, + "learning_rate": 2.3943895542838868e-05, + "loss": 0.3927, + "step": 12285 + }, + { + "epoch": 2.8661380597014925, + "grad_norm": 0.3210667559741464, + "learning_rate": 2.3926762518547928e-05, + "loss": 0.3769, + "step": 12290 + }, + { + "epoch": 2.867304104477612, + "grad_norm": 0.3491386828131719, + "learning_rate": 2.3909631619481626e-05, + "loss": 0.4006, + "step": 12295 + }, + { + "epoch": 2.8684701492537314, + "grad_norm": 0.3489130761037985, + "learning_rate": 2.3892502855828762e-05, + "loss": 0.3987, + "step": 12300 + }, + { + "epoch": 2.8696361940298507, + "grad_norm": 0.33401463597130787, + "learning_rate": 2.387537623777686e-05, + "loss": 0.4054, + "step": 12305 + }, + { + "epoch": 2.87080223880597, + "grad_norm": 0.3300099766666651, + "learning_rate": 2.3858251775512176e-05, + "loss": 0.4049, + "step": 12310 + }, + { + "epoch": 2.8719682835820897, + "grad_norm": 0.3626893710625864, + "learning_rate": 2.384112947921968e-05, + "loss": 0.4107, + "step": 12315 + }, + { + "epoch": 2.873134328358209, + "grad_norm": 0.31283099905413925, + "learning_rate": 2.3824009359083073e-05, + "loss": 0.3841, + "step": 12320 + }, + { + "epoch": 2.8743003731343286, + "grad_norm": 0.32022021248826466, + "learning_rate": 2.380689142528474e-05, + "loss": 0.404, + "step": 12325 + }, + { + "epoch": 2.875466417910448, + "grad_norm": 0.34838027513671244, + "learning_rate": 2.378977568800576e-05, + "loss": 0.409, + "step": 12330 + }, + { + "epoch": 2.876632462686567, + "grad_norm": 0.33376525286127284, + "learning_rate": 2.3772662157425925e-05, + "loss": 0.4148, + "step": 12335 + }, + { + "epoch": 2.8777985074626864, + "grad_norm": 0.32721898306258324, + "learning_rate": 2.375555084372371e-05, + "loss": 0.3961, + "step": 12340 + }, + { + "epoch": 2.878964552238806, + "grad_norm": 0.3293576017768263, + "learning_rate": 2.3738441757076268e-05, + "loss": 0.4084, + "step": 12345 + }, + { + "epoch": 2.8801305970149254, + "grad_norm": 0.3285917296102872, + "learning_rate": 2.3721334907659424e-05, + "loss": 0.3966, + "step": 12350 + }, + { + "epoch": 2.8812966417910446, + "grad_norm": 0.3231103566001575, + "learning_rate": 2.370423030564768e-05, + "loss": 0.3822, + "step": 12355 + }, + { + "epoch": 2.8824626865671643, + "grad_norm": 0.3291106258171459, + "learning_rate": 2.368712796121419e-05, + "loss": 0.3806, + "step": 12360 + }, + { + "epoch": 2.8836287313432836, + "grad_norm": 0.3173687904309937, + "learning_rate": 2.367002788453077e-05, + "loss": 0.3898, + "step": 12365 + }, + { + "epoch": 2.884794776119403, + "grad_norm": 0.4311246072339703, + "learning_rate": 2.3652930085767904e-05, + "loss": 0.3952, + "step": 12370 + }, + { + "epoch": 2.8859608208955225, + "grad_norm": 0.32551932885966506, + "learning_rate": 2.3635834575094705e-05, + "loss": 0.3914, + "step": 12375 + }, + { + "epoch": 2.887126865671642, + "grad_norm": 0.3262781870283349, + "learning_rate": 2.3618741362678915e-05, + "loss": 0.3928, + "step": 12380 + }, + { + "epoch": 2.888292910447761, + "grad_norm": 0.33000097131434913, + "learning_rate": 2.360165045868693e-05, + "loss": 0.3875, + "step": 12385 + }, + { + "epoch": 2.8894589552238807, + "grad_norm": 0.31939590414550273, + "learning_rate": 2.358456187328376e-05, + "loss": 0.3876, + "step": 12390 + }, + { + "epoch": 2.890625, + "grad_norm": 0.31138347582004905, + "learning_rate": 2.3567475616633046e-05, + "loss": 0.3728, + "step": 12395 + }, + { + "epoch": 2.8917910447761193, + "grad_norm": 0.33682073854992745, + "learning_rate": 2.355039169889704e-05, + "loss": 0.394, + "step": 12400 + }, + { + "epoch": 2.892957089552239, + "grad_norm": 0.3309306353993416, + "learning_rate": 2.3533310130236592e-05, + "loss": 0.3945, + "step": 12405 + }, + { + "epoch": 2.894123134328358, + "grad_norm": 0.32849882157788585, + "learning_rate": 2.3516230920811166e-05, + "loss": 0.3931, + "step": 12410 + }, + { + "epoch": 2.8952891791044775, + "grad_norm": 0.3273734084049458, + "learning_rate": 2.3499154080778823e-05, + "loss": 0.3834, + "step": 12415 + }, + { + "epoch": 2.896455223880597, + "grad_norm": 0.3053087709724216, + "learning_rate": 2.3482079620296223e-05, + "loss": 0.3837, + "step": 12420 + }, + { + "epoch": 2.8976212686567164, + "grad_norm": 0.3190287803135248, + "learning_rate": 2.3465007549518576e-05, + "loss": 0.3885, + "step": 12425 + }, + { + "epoch": 2.8987873134328357, + "grad_norm": 0.31900171760544377, + "learning_rate": 2.3447937878599725e-05, + "loss": 0.4102, + "step": 12430 + }, + { + "epoch": 2.8999533582089554, + "grad_norm": 0.32566002677049954, + "learning_rate": 2.343087061769203e-05, + "loss": 0.4016, + "step": 12435 + }, + { + "epoch": 2.9011194029850746, + "grad_norm": 0.3333006017751379, + "learning_rate": 2.3413805776946453e-05, + "loss": 0.4047, + "step": 12440 + }, + { + "epoch": 2.902285447761194, + "grad_norm": 0.325557748685479, + "learning_rate": 2.3396743366512508e-05, + "loss": 0.407, + "step": 12445 + }, + { + "epoch": 2.9034514925373136, + "grad_norm": 0.34387362835910973, + "learning_rate": 2.337968339653826e-05, + "loss": 0.3915, + "step": 12450 + }, + { + "epoch": 2.904617537313433, + "grad_norm": 0.3254231469793663, + "learning_rate": 2.3362625877170336e-05, + "loss": 0.4012, + "step": 12455 + }, + { + "epoch": 2.905783582089552, + "grad_norm": 0.31883782984302567, + "learning_rate": 2.3345570818553874e-05, + "loss": 0.3995, + "step": 12460 + }, + { + "epoch": 2.9069496268656714, + "grad_norm": 0.3322127939426441, + "learning_rate": 2.3328518230832587e-05, + "loss": 0.3991, + "step": 12465 + }, + { + "epoch": 2.908115671641791, + "grad_norm": 0.31989111332781434, + "learning_rate": 2.331146812414869e-05, + "loss": 0.3837, + "step": 12470 + }, + { + "epoch": 2.9092817164179103, + "grad_norm": 0.33879282085594653, + "learning_rate": 2.329442050864293e-05, + "loss": 0.4053, + "step": 12475 + }, + { + "epoch": 2.91044776119403, + "grad_norm": 0.3543375690832978, + "learning_rate": 2.3277375394454594e-05, + "loss": 0.4041, + "step": 12480 + }, + { + "epoch": 2.9116138059701493, + "grad_norm": 0.31999931712441865, + "learning_rate": 2.326033279172144e-05, + "loss": 0.3966, + "step": 12485 + }, + { + "epoch": 2.9127798507462686, + "grad_norm": 0.32076392516037394, + "learning_rate": 2.324329271057976e-05, + "loss": 0.397, + "step": 12490 + }, + { + "epoch": 2.913945895522388, + "grad_norm": 0.3247086331298665, + "learning_rate": 2.322625516116435e-05, + "loss": 0.3946, + "step": 12495 + }, + { + "epoch": 2.9151119402985075, + "grad_norm": 0.3281782323495906, + "learning_rate": 2.3209220153608486e-05, + "loss": 0.4137, + "step": 12500 + }, + { + "epoch": 2.9162779850746268, + "grad_norm": 0.3372712945851879, + "learning_rate": 2.3192187698043944e-05, + "loss": 0.3975, + "step": 12505 + }, + { + "epoch": 2.9174440298507465, + "grad_norm": 0.37702107211387365, + "learning_rate": 2.3175157804600954e-05, + "loss": 0.4143, + "step": 12510 + }, + { + "epoch": 2.9186100746268657, + "grad_norm": 0.310618204430658, + "learning_rate": 2.3158130483408262e-05, + "loss": 0.3862, + "step": 12515 + }, + { + "epoch": 2.919776119402985, + "grad_norm": 0.3346469519772596, + "learning_rate": 2.3141105744593065e-05, + "loss": 0.3826, + "step": 12520 + }, + { + "epoch": 2.9209421641791042, + "grad_norm": 0.3206023141307392, + "learning_rate": 2.3124083598281022e-05, + "loss": 0.3803, + "step": 12525 + }, + { + "epoch": 2.922108208955224, + "grad_norm": 0.3310996156602885, + "learning_rate": 2.310706405459625e-05, + "loss": 0.3753, + "step": 12530 + }, + { + "epoch": 2.923274253731343, + "grad_norm": 0.31499057177592116, + "learning_rate": 2.3090047123661324e-05, + "loss": 0.3889, + "step": 12535 + }, + { + "epoch": 2.924440298507463, + "grad_norm": 0.30367718744586075, + "learning_rate": 2.3073032815597263e-05, + "loss": 0.3849, + "step": 12540 + }, + { + "epoch": 2.925606343283582, + "grad_norm": 0.31983114832533827, + "learning_rate": 2.3056021140523516e-05, + "loss": 0.3924, + "step": 12545 + }, + { + "epoch": 2.9267723880597014, + "grad_norm": 0.31463552150541285, + "learning_rate": 2.3039012108557982e-05, + "loss": 0.3762, + "step": 12550 + }, + { + "epoch": 2.9279384328358207, + "grad_norm": 0.31553299625090286, + "learning_rate": 2.3022005729817e-05, + "loss": 0.3854, + "step": 12555 + }, + { + "epoch": 2.9291044776119404, + "grad_norm": 0.32415412932787335, + "learning_rate": 2.3005002014415274e-05, + "loss": 0.406, + "step": 12560 + }, + { + "epoch": 2.9302705223880596, + "grad_norm": 0.35082667598873624, + "learning_rate": 2.2988000972465978e-05, + "loss": 0.4063, + "step": 12565 + }, + { + "epoch": 2.9314365671641793, + "grad_norm": 0.34850139125998236, + "learning_rate": 2.297100261408069e-05, + "loss": 0.4172, + "step": 12570 + }, + { + "epoch": 2.9326026119402986, + "grad_norm": 0.3226814126905502, + "learning_rate": 2.295400694936937e-05, + "loss": 0.3868, + "step": 12575 + }, + { + "epoch": 2.933768656716418, + "grad_norm": 0.321302545691153, + "learning_rate": 2.2937013988440405e-05, + "loss": 0.4168, + "step": 12580 + }, + { + "epoch": 2.934934701492537, + "grad_norm": 0.3323426345500244, + "learning_rate": 2.2920023741400533e-05, + "loss": 0.4041, + "step": 12585 + }, + { + "epoch": 2.936100746268657, + "grad_norm": 0.3278335493510715, + "learning_rate": 2.2903036218354912e-05, + "loss": 0.3833, + "step": 12590 + }, + { + "epoch": 2.937266791044776, + "grad_norm": 0.3109046277534067, + "learning_rate": 2.288605142940707e-05, + "loss": 0.3929, + "step": 12595 + }, + { + "epoch": 2.9384328358208958, + "grad_norm": 0.3470122662986262, + "learning_rate": 2.2869069384658908e-05, + "loss": 0.4113, + "step": 12600 + }, + { + "epoch": 2.939598880597015, + "grad_norm": 0.33151628305856545, + "learning_rate": 2.2852090094210698e-05, + "loss": 0.4082, + "step": 12605 + }, + { + "epoch": 2.9407649253731343, + "grad_norm": 0.31165230323487975, + "learning_rate": 2.283511356816106e-05, + "loss": 0.4003, + "step": 12610 + }, + { + "epoch": 2.9419309701492535, + "grad_norm": 0.30091752240182185, + "learning_rate": 2.2818139816607e-05, + "loss": 0.3699, + "step": 12615 + }, + { + "epoch": 2.9430970149253732, + "grad_norm": 0.3510837756513753, + "learning_rate": 2.280116884964383e-05, + "loss": 0.3971, + "step": 12620 + }, + { + "epoch": 2.9442630597014925, + "grad_norm": 0.3570534670145446, + "learning_rate": 2.2784200677365242e-05, + "loss": 0.4046, + "step": 12625 + }, + { + "epoch": 2.945429104477612, + "grad_norm": 0.3462519120583106, + "learning_rate": 2.276723530986327e-05, + "loss": 0.4083, + "step": 12630 + }, + { + "epoch": 2.9465951492537314, + "grad_norm": 0.3155521881776295, + "learning_rate": 2.2750272757228235e-05, + "loss": 0.3898, + "step": 12635 + }, + { + "epoch": 2.9477611940298507, + "grad_norm": 0.3302676200847098, + "learning_rate": 2.273331302954883e-05, + "loss": 0.4021, + "step": 12640 + }, + { + "epoch": 2.94892723880597, + "grad_norm": 0.31531950202628056, + "learning_rate": 2.271635613691205e-05, + "loss": 0.3854, + "step": 12645 + }, + { + "epoch": 2.9500932835820897, + "grad_norm": 0.32218333777070857, + "learning_rate": 2.26994020894032e-05, + "loss": 0.4051, + "step": 12650 + }, + { + "epoch": 2.951259328358209, + "grad_norm": 0.33339429502269663, + "learning_rate": 2.2682450897105905e-05, + "loss": 0.3943, + "step": 12655 + }, + { + "epoch": 2.9524253731343286, + "grad_norm": 0.33577421197015894, + "learning_rate": 2.266550257010207e-05, + "loss": 0.3907, + "step": 12660 + }, + { + "epoch": 2.953591417910448, + "grad_norm": 0.33660080490133865, + "learning_rate": 2.2648557118471918e-05, + "loss": 0.3891, + "step": 12665 + }, + { + "epoch": 2.954757462686567, + "grad_norm": 0.3365030888007477, + "learning_rate": 2.2631614552293963e-05, + "loss": 0.3946, + "step": 12670 + }, + { + "epoch": 2.9559235074626864, + "grad_norm": 0.32265600033551056, + "learning_rate": 2.2614674881644974e-05, + "loss": 0.3942, + "step": 12675 + }, + { + "epoch": 2.957089552238806, + "grad_norm": 0.33902639321595024, + "learning_rate": 2.2597738116600048e-05, + "loss": 0.3942, + "step": 12680 + }, + { + "epoch": 2.9582555970149254, + "grad_norm": 0.3300590753897082, + "learning_rate": 2.2580804267232484e-05, + "loss": 0.3876, + "step": 12685 + }, + { + "epoch": 2.9594216417910446, + "grad_norm": 0.32527692274528264, + "learning_rate": 2.2563873343613916e-05, + "loss": 0.4014, + "step": 12690 + }, + { + "epoch": 2.9605876865671643, + "grad_norm": 0.320066216570011, + "learning_rate": 2.2546945355814196e-05, + "loss": 0.3916, + "step": 12695 + }, + { + "epoch": 2.9617537313432836, + "grad_norm": 0.34400766536803673, + "learning_rate": 2.2530020313901446e-05, + "loss": 0.4014, + "step": 12700 + }, + { + "epoch": 2.962919776119403, + "grad_norm": 0.32742320925968044, + "learning_rate": 2.2513098227942032e-05, + "loss": 0.3826, + "step": 12705 + }, + { + "epoch": 2.9640858208955225, + "grad_norm": 0.32644568730014994, + "learning_rate": 2.249617910800056e-05, + "loss": 0.3909, + "step": 12710 + }, + { + "epoch": 2.965251865671642, + "grad_norm": 0.32990731992647065, + "learning_rate": 2.2479262964139863e-05, + "loss": 0.3964, + "step": 12715 + }, + { + "epoch": 2.966417910447761, + "grad_norm": 0.3321003374534492, + "learning_rate": 2.2462349806421035e-05, + "loss": 0.3916, + "step": 12720 + }, + { + "epoch": 2.9675839552238807, + "grad_norm": 0.32661592271770257, + "learning_rate": 2.244543964490336e-05, + "loss": 0.3847, + "step": 12725 + }, + { + "epoch": 2.96875, + "grad_norm": 0.32407147265046626, + "learning_rate": 2.2428532489644368e-05, + "loss": 0.3937, + "step": 12730 + }, + { + "epoch": 2.9699160447761193, + "grad_norm": 0.351299879969938, + "learning_rate": 2.2411628350699766e-05, + "loss": 0.4001, + "step": 12735 + }, + { + "epoch": 2.971082089552239, + "grad_norm": 0.3420923695091976, + "learning_rate": 2.2394727238123497e-05, + "loss": 0.3976, + "step": 12740 + }, + { + "epoch": 2.972248134328358, + "grad_norm": 0.335331945197588, + "learning_rate": 2.23778291619677e-05, + "loss": 0.4026, + "step": 12745 + }, + { + "epoch": 2.9734141791044775, + "grad_norm": 0.33402675220345296, + "learning_rate": 2.236093413228269e-05, + "loss": 0.4088, + "step": 12750 + }, + { + "epoch": 2.974580223880597, + "grad_norm": 0.3473916621893903, + "learning_rate": 2.2344042159117006e-05, + "loss": 0.4159, + "step": 12755 + }, + { + "epoch": 2.9757462686567164, + "grad_norm": 0.31715982641176704, + "learning_rate": 2.2327153252517323e-05, + "loss": 0.3887, + "step": 12760 + }, + { + "epoch": 2.9769123134328357, + "grad_norm": 0.3302423283046164, + "learning_rate": 2.2310267422528523e-05, + "loss": 0.3945, + "step": 12765 + }, + { + "epoch": 2.9780783582089554, + "grad_norm": 0.3697981083704603, + "learning_rate": 2.2293384679193645e-05, + "loss": 0.4099, + "step": 12770 + }, + { + "epoch": 2.9792444029850746, + "grad_norm": 0.32605460793354085, + "learning_rate": 2.2276505032553912e-05, + "loss": 0.3994, + "step": 12775 + }, + { + "epoch": 2.980410447761194, + "grad_norm": 0.3375878303957479, + "learning_rate": 2.2259628492648676e-05, + "loss": 0.4165, + "step": 12780 + }, + { + "epoch": 2.9815764925373136, + "grad_norm": 0.3289720125129972, + "learning_rate": 2.224275506951547e-05, + "loss": 0.3867, + "step": 12785 + }, + { + "epoch": 2.982742537313433, + "grad_norm": 0.39439488786969035, + "learning_rate": 2.2225884773189936e-05, + "loss": 0.4027, + "step": 12790 + }, + { + "epoch": 2.983908582089552, + "grad_norm": 0.3167173881527927, + "learning_rate": 2.2209017613705908e-05, + "loss": 0.388, + "step": 12795 + }, + { + "epoch": 2.9850746268656714, + "grad_norm": 0.32674021695452415, + "learning_rate": 2.2192153601095293e-05, + "loss": 0.3981, + "step": 12800 + }, + { + "epoch": 2.986240671641791, + "grad_norm": 0.3220894508303537, + "learning_rate": 2.2175292745388186e-05, + "loss": 0.3843, + "step": 12805 + }, + { + "epoch": 2.9874067164179103, + "grad_norm": 0.32963873894719864, + "learning_rate": 2.2158435056612775e-05, + "loss": 0.4062, + "step": 12810 + }, + { + "epoch": 2.98857276119403, + "grad_norm": 0.3183732730239878, + "learning_rate": 2.2141580544795353e-05, + "loss": 0.3832, + "step": 12815 + }, + { + "epoch": 2.9897388059701493, + "grad_norm": 0.36222438252167755, + "learning_rate": 2.2124729219960343e-05, + "loss": 0.3861, + "step": 12820 + }, + { + "epoch": 2.9909048507462686, + "grad_norm": 0.344714460761376, + "learning_rate": 2.2107881092130266e-05, + "loss": 0.4102, + "step": 12825 + }, + { + "epoch": 2.992070895522388, + "grad_norm": 0.3350653668106831, + "learning_rate": 2.2091036171325754e-05, + "loss": 0.3987, + "step": 12830 + }, + { + "epoch": 2.9932369402985075, + "grad_norm": 0.3319939448431505, + "learning_rate": 2.2074194467565514e-05, + "loss": 0.3676, + "step": 12835 + }, + { + "epoch": 2.9944029850746268, + "grad_norm": 0.32887622190198984, + "learning_rate": 2.2057355990866328e-05, + "loss": 0.3873, + "step": 12840 + }, + { + "epoch": 2.9955690298507465, + "grad_norm": 0.32805591693365205, + "learning_rate": 2.2040520751243094e-05, + "loss": 0.4024, + "step": 12845 + }, + { + "epoch": 2.9967350746268657, + "grad_norm": 0.33828172614612584, + "learning_rate": 2.2023688758708767e-05, + "loss": 0.4147, + "step": 12850 + }, + { + "epoch": 2.997901119402985, + "grad_norm": 0.34288695869143143, + "learning_rate": 2.2006860023274363e-05, + "loss": 0.3981, + "step": 12855 + }, + { + "epoch": 2.9990671641791042, + "grad_norm": 0.3339388809033874, + "learning_rate": 2.199003455494898e-05, + "loss": 0.4067, + "step": 12860 + }, + { + "epoch": 3.000233208955224, + "grad_norm": 0.31480415784330323, + "learning_rate": 2.1973212363739747e-05, + "loss": 0.3647, + "step": 12865 + }, + { + "epoch": 3.001399253731343, + "grad_norm": 0.3415285396744332, + "learning_rate": 2.1956393459651864e-05, + "loss": 0.3147, + "step": 12870 + }, + { + "epoch": 3.002565298507463, + "grad_norm": 0.4424315410720294, + "learning_rate": 2.1939577852688576e-05, + "loss": 0.3333, + "step": 12875 + }, + { + "epoch": 3.003731343283582, + "grad_norm": 0.3576318000100144, + "learning_rate": 2.1922765552851155e-05, + "loss": 0.325, + "step": 12880 + }, + { + "epoch": 3.0048973880597014, + "grad_norm": 0.37283361438841267, + "learning_rate": 2.190595657013892e-05, + "loss": 0.3426, + "step": 12885 + }, + { + "epoch": 3.006063432835821, + "grad_norm": 0.3266554273224183, + "learning_rate": 2.1889150914549195e-05, + "loss": 0.32, + "step": 12890 + }, + { + "epoch": 3.0072294776119404, + "grad_norm": 0.3739654519269057, + "learning_rate": 2.1872348596077348e-05, + "loss": 0.3256, + "step": 12895 + }, + { + "epoch": 3.0083955223880596, + "grad_norm": 0.3711708858583731, + "learning_rate": 2.1855549624716755e-05, + "loss": 0.3373, + "step": 12900 + }, + { + "epoch": 3.009561567164179, + "grad_norm": 0.3469551419431101, + "learning_rate": 2.1838754010458796e-05, + "loss": 0.3399, + "step": 12905 + }, + { + "epoch": 3.0107276119402986, + "grad_norm": 0.3544564289921535, + "learning_rate": 2.182196176329287e-05, + "loss": 0.3246, + "step": 12910 + }, + { + "epoch": 3.011893656716418, + "grad_norm": 0.3769777201124636, + "learning_rate": 2.1805172893206342e-05, + "loss": 0.3297, + "step": 12915 + }, + { + "epoch": 3.013059701492537, + "grad_norm": 0.34419023106242463, + "learning_rate": 2.1788387410184603e-05, + "loss": 0.3127, + "step": 12920 + }, + { + "epoch": 3.014225746268657, + "grad_norm": 0.3656702030299312, + "learning_rate": 2.177160532421101e-05, + "loss": 0.329, + "step": 12925 + }, + { + "epoch": 3.015391791044776, + "grad_norm": 0.35183086668261326, + "learning_rate": 2.1754826645266895e-05, + "loss": 0.3194, + "step": 12930 + }, + { + "epoch": 3.0165578358208953, + "grad_norm": 0.37377436097190053, + "learning_rate": 2.1738051383331598e-05, + "loss": 0.3312, + "step": 12935 + }, + { + "epoch": 3.017723880597015, + "grad_norm": 0.3537758265486952, + "learning_rate": 2.172127954838238e-05, + "loss": 0.3337, + "step": 12940 + }, + { + "epoch": 3.0188899253731343, + "grad_norm": 0.3657259262696127, + "learning_rate": 2.1704511150394486e-05, + "loss": 0.3251, + "step": 12945 + }, + { + "epoch": 3.0200559701492535, + "grad_norm": 0.352442228486061, + "learning_rate": 2.1687746199341118e-05, + "loss": 0.3139, + "step": 12950 + }, + { + "epoch": 3.0212220149253732, + "grad_norm": 0.34137337379741683, + "learning_rate": 2.167098470519344e-05, + "loss": 0.3193, + "step": 12955 + }, + { + "epoch": 3.0223880597014925, + "grad_norm": 0.35424670829336125, + "learning_rate": 2.165422667792053e-05, + "loss": 0.3253, + "step": 12960 + }, + { + "epoch": 3.0235541044776117, + "grad_norm": 0.3417579101815207, + "learning_rate": 2.1637472127489427e-05, + "loss": 0.3284, + "step": 12965 + }, + { + "epoch": 3.0247201492537314, + "grad_norm": 0.3731817006318227, + "learning_rate": 2.162072106386509e-05, + "loss": 0.3305, + "step": 12970 + }, + { + "epoch": 3.0258861940298507, + "grad_norm": 0.3567048213584788, + "learning_rate": 2.1603973497010417e-05, + "loss": 0.3235, + "step": 12975 + }, + { + "epoch": 3.02705223880597, + "grad_norm": 0.37368030600959284, + "learning_rate": 2.158722943688621e-05, + "loss": 0.3304, + "step": 12980 + }, + { + "epoch": 3.0282182835820897, + "grad_norm": 0.3511157140059816, + "learning_rate": 2.1570488893451203e-05, + "loss": 0.3317, + "step": 12985 + }, + { + "epoch": 3.029384328358209, + "grad_norm": 0.3811281961954354, + "learning_rate": 2.1553751876662014e-05, + "loss": 0.3314, + "step": 12990 + }, + { + "epoch": 3.030550373134328, + "grad_norm": 0.3574592911407229, + "learning_rate": 2.1537018396473195e-05, + "loss": 0.3272, + "step": 12995 + }, + { + "epoch": 3.031716417910448, + "grad_norm": 0.369797368557322, + "learning_rate": 2.1520288462837175e-05, + "loss": 0.3351, + "step": 13000 + }, + { + "epoch": 3.032882462686567, + "grad_norm": 0.3582951305801054, + "learning_rate": 2.1503562085704265e-05, + "loss": 0.3179, + "step": 13005 + }, + { + "epoch": 3.0340485074626864, + "grad_norm": 0.3790297047857973, + "learning_rate": 2.148683927502269e-05, + "loss": 0.3338, + "step": 13010 + }, + { + "epoch": 3.035214552238806, + "grad_norm": 0.35676889097550846, + "learning_rate": 2.147012004073853e-05, + "loss": 0.3428, + "step": 13015 + }, + { + "epoch": 3.0363805970149254, + "grad_norm": 0.3333520567798483, + "learning_rate": 2.1453404392795735e-05, + "loss": 0.3155, + "step": 13020 + }, + { + "epoch": 3.0375466417910446, + "grad_norm": 0.3714709979317058, + "learning_rate": 2.143669234113614e-05, + "loss": 0.3156, + "step": 13025 + }, + { + "epoch": 3.0387126865671643, + "grad_norm": 0.3710941423544791, + "learning_rate": 2.1419983895699437e-05, + "loss": 0.3385, + "step": 13030 + }, + { + "epoch": 3.0398787313432836, + "grad_norm": 0.36832576543170387, + "learning_rate": 2.1403279066423166e-05, + "loss": 0.3314, + "step": 13035 + }, + { + "epoch": 3.041044776119403, + "grad_norm": 0.3669444577828884, + "learning_rate": 2.1386577863242708e-05, + "loss": 0.3377, + "step": 13040 + }, + { + "epoch": 3.0422108208955225, + "grad_norm": 0.37173717143067986, + "learning_rate": 2.136988029609131e-05, + "loss": 0.333, + "step": 13045 + }, + { + "epoch": 3.043376865671642, + "grad_norm": 0.34211398039343427, + "learning_rate": 2.135318637490004e-05, + "loss": 0.3222, + "step": 13050 + }, + { + "epoch": 3.044542910447761, + "grad_norm": 0.3488277941545808, + "learning_rate": 2.1336496109597804e-05, + "loss": 0.3187, + "step": 13055 + }, + { + "epoch": 3.0457089552238807, + "grad_norm": 0.3521515864556318, + "learning_rate": 2.131980951011134e-05, + "loss": 0.325, + "step": 13060 + }, + { + "epoch": 3.046875, + "grad_norm": 0.37878699689573786, + "learning_rate": 2.1303126586365175e-05, + "loss": 0.3321, + "step": 13065 + }, + { + "epoch": 3.0480410447761193, + "grad_norm": 0.36530897021315384, + "learning_rate": 2.1286447348281695e-05, + "loss": 0.3225, + "step": 13070 + }, + { + "epoch": 3.049207089552239, + "grad_norm": 0.3917845682734505, + "learning_rate": 2.126977180578106e-05, + "loss": 0.3389, + "step": 13075 + }, + { + "epoch": 3.050373134328358, + "grad_norm": 0.34326636319182824, + "learning_rate": 2.1253099968781237e-05, + "loss": 0.3263, + "step": 13080 + }, + { + "epoch": 3.0515391791044775, + "grad_norm": 0.357514903894581, + "learning_rate": 2.1236431847198017e-05, + "loss": 0.3252, + "step": 13085 + }, + { + "epoch": 3.052705223880597, + "grad_norm": 0.36977408550720303, + "learning_rate": 2.1219767450944938e-05, + "loss": 0.3336, + "step": 13090 + }, + { + "epoch": 3.0538712686567164, + "grad_norm": 0.3978530847172167, + "learning_rate": 2.1203106789933352e-05, + "loss": 0.3318, + "step": 13095 + }, + { + "epoch": 3.0550373134328357, + "grad_norm": 0.3854667714673586, + "learning_rate": 2.1186449874072385e-05, + "loss": 0.3326, + "step": 13100 + }, + { + "epoch": 3.0562033582089554, + "grad_norm": 0.3973655043830843, + "learning_rate": 2.116979671326892e-05, + "loss": 0.337, + "step": 13105 + }, + { + "epoch": 3.0573694029850746, + "grad_norm": 0.3453069582811405, + "learning_rate": 2.115314731742764e-05, + "loss": 0.3147, + "step": 13110 + }, + { + "epoch": 3.058535447761194, + "grad_norm": 0.3528439692816455, + "learning_rate": 2.1136501696450943e-05, + "loss": 0.3366, + "step": 13115 + }, + { + "epoch": 3.0597014925373136, + "grad_norm": 0.4015670467809074, + "learning_rate": 2.1119859860239023e-05, + "loss": 0.3229, + "step": 13120 + }, + { + "epoch": 3.060867537313433, + "grad_norm": 0.348396813775293, + "learning_rate": 2.1103221818689794e-05, + "loss": 0.3155, + "step": 13125 + }, + { + "epoch": 3.062033582089552, + "grad_norm": 0.3655626156571847, + "learning_rate": 2.108658758169893e-05, + "loss": 0.3222, + "step": 13130 + }, + { + "epoch": 3.063199626865672, + "grad_norm": 0.3933887786176633, + "learning_rate": 2.1069957159159848e-05, + "loss": 0.3288, + "step": 13135 + }, + { + "epoch": 3.064365671641791, + "grad_norm": 0.37811653007737567, + "learning_rate": 2.105333056096367e-05, + "loss": 0.3263, + "step": 13140 + }, + { + "epoch": 3.0655317164179103, + "grad_norm": 0.38015189790286996, + "learning_rate": 2.1036707796999267e-05, + "loss": 0.3174, + "step": 13145 + }, + { + "epoch": 3.06669776119403, + "grad_norm": 0.36729503197373997, + "learning_rate": 2.1020088877153215e-05, + "loss": 0.3205, + "step": 13150 + }, + { + "epoch": 3.0678638059701493, + "grad_norm": 0.38074724946063593, + "learning_rate": 2.100347381130982e-05, + "loss": 0.3256, + "step": 13155 + }, + { + "epoch": 3.0690298507462686, + "grad_norm": 0.3733607784160199, + "learning_rate": 2.0986862609351077e-05, + "loss": 0.3317, + "step": 13160 + }, + { + "epoch": 3.0701958955223883, + "grad_norm": 0.420440733695549, + "learning_rate": 2.09702552811567e-05, + "loss": 0.338, + "step": 13165 + }, + { + "epoch": 3.0713619402985075, + "grad_norm": 0.3821363536784176, + "learning_rate": 2.0953651836604083e-05, + "loss": 0.3314, + "step": 13170 + }, + { + "epoch": 3.0725279850746268, + "grad_norm": 0.35020554029833856, + "learning_rate": 2.093705228556832e-05, + "loss": 0.3294, + "step": 13175 + }, + { + "epoch": 3.0736940298507465, + "grad_norm": 0.3915225549020453, + "learning_rate": 2.0920456637922194e-05, + "loss": 0.3219, + "step": 13180 + }, + { + "epoch": 3.0748600746268657, + "grad_norm": 0.3792672546786865, + "learning_rate": 2.0903864903536147e-05, + "loss": 0.3337, + "step": 13185 + }, + { + "epoch": 3.076026119402985, + "grad_norm": 0.382816038533477, + "learning_rate": 2.088727709227833e-05, + "loss": 0.3239, + "step": 13190 + }, + { + "epoch": 3.0771921641791047, + "grad_norm": 0.36956181558759693, + "learning_rate": 2.087069321401451e-05, + "loss": 0.3203, + "step": 13195 + }, + { + "epoch": 3.078358208955224, + "grad_norm": 0.36696730415738354, + "learning_rate": 2.085411327860815e-05, + "loss": 0.342, + "step": 13200 + }, + { + "epoch": 3.079524253731343, + "grad_norm": 0.3479574920746276, + "learning_rate": 2.083753729592037e-05, + "loss": 0.3317, + "step": 13205 + }, + { + "epoch": 3.080690298507463, + "grad_norm": 0.3555964171988262, + "learning_rate": 2.0820965275809913e-05, + "loss": 0.3218, + "step": 13210 + }, + { + "epoch": 3.081856343283582, + "grad_norm": 0.3786019049736542, + "learning_rate": 2.0804397228133205e-05, + "loss": 0.332, + "step": 13215 + }, + { + "epoch": 3.0830223880597014, + "grad_norm": 0.3531465412610589, + "learning_rate": 2.0787833162744257e-05, + "loss": 0.3244, + "step": 13220 + }, + { + "epoch": 3.0841884328358207, + "grad_norm": 0.3603836800371123, + "learning_rate": 2.077127308949476e-05, + "loss": 0.3491, + "step": 13225 + }, + { + "epoch": 3.0853544776119404, + "grad_norm": 0.3456946918483755, + "learning_rate": 2.0754717018234003e-05, + "loss": 0.3201, + "step": 13230 + }, + { + "epoch": 3.0865205223880596, + "grad_norm": 0.3566261178734796, + "learning_rate": 2.0738164958808905e-05, + "loss": 0.322, + "step": 13235 + }, + { + "epoch": 3.0876865671641793, + "grad_norm": 0.40131521266042497, + "learning_rate": 2.072161692106399e-05, + "loss": 0.3576, + "step": 13240 + }, + { + "epoch": 3.0888526119402986, + "grad_norm": 0.3693815359775984, + "learning_rate": 2.0705072914841407e-05, + "loss": 0.3178, + "step": 13245 + }, + { + "epoch": 3.090018656716418, + "grad_norm": 0.39108908279940313, + "learning_rate": 2.0688532949980882e-05, + "loss": 0.3375, + "step": 13250 + }, + { + "epoch": 3.091184701492537, + "grad_norm": 0.3589102967846014, + "learning_rate": 2.0671997036319763e-05, + "loss": 0.3088, + "step": 13255 + }, + { + "epoch": 3.092350746268657, + "grad_norm": 0.384997470366685, + "learning_rate": 2.0655465183692972e-05, + "loss": 0.3265, + "step": 13260 + }, + { + "epoch": 3.093516791044776, + "grad_norm": 0.35614145612588777, + "learning_rate": 2.063893740193304e-05, + "loss": 0.3365, + "step": 13265 + }, + { + "epoch": 3.0946828358208953, + "grad_norm": 0.3682250788133163, + "learning_rate": 2.0622413700870026e-05, + "loss": 0.3444, + "step": 13270 + }, + { + "epoch": 3.095848880597015, + "grad_norm": 0.3612428170736634, + "learning_rate": 2.0605894090331607e-05, + "loss": 0.3322, + "step": 13275 + }, + { + "epoch": 3.0970149253731343, + "grad_norm": 0.36197909380447313, + "learning_rate": 2.0589378580143016e-05, + "loss": 0.3336, + "step": 13280 + }, + { + "epoch": 3.0981809701492535, + "grad_norm": 0.3606849185646576, + "learning_rate": 2.057286718012705e-05, + "loss": 0.3266, + "step": 13285 + }, + { + "epoch": 3.0993470149253732, + "grad_norm": 0.3946123516420972, + "learning_rate": 2.0556359900104054e-05, + "loss": 0.3356, + "step": 13290 + }, + { + "epoch": 3.1005130597014925, + "grad_norm": 0.3670960041616496, + "learning_rate": 2.0539856749891918e-05, + "loss": 0.3257, + "step": 13295 + }, + { + "epoch": 3.1016791044776117, + "grad_norm": 0.3694155072794769, + "learning_rate": 2.0523357739306087e-05, + "loss": 0.3399, + "step": 13300 + }, + { + "epoch": 3.1028451492537314, + "grad_norm": 0.3759644766045218, + "learning_rate": 2.050686287815954e-05, + "loss": 0.3426, + "step": 13305 + }, + { + "epoch": 3.1040111940298507, + "grad_norm": 0.3752222434874275, + "learning_rate": 2.049037217626279e-05, + "loss": 0.3398, + "step": 13310 + }, + { + "epoch": 3.10517723880597, + "grad_norm": 0.3740020967603994, + "learning_rate": 2.0473885643423885e-05, + "loss": 0.3271, + "step": 13315 + }, + { + "epoch": 3.1063432835820897, + "grad_norm": 0.34731913525042724, + "learning_rate": 2.0457403289448353e-05, + "loss": 0.3313, + "step": 13320 + }, + { + "epoch": 3.107509328358209, + "grad_norm": 0.3592863698837627, + "learning_rate": 2.0440925124139286e-05, + "loss": 0.336, + "step": 13325 + }, + { + "epoch": 3.108675373134328, + "grad_norm": 0.36920519073043123, + "learning_rate": 2.0424451157297264e-05, + "loss": 0.3206, + "step": 13330 + }, + { + "epoch": 3.109841417910448, + "grad_norm": 0.3744224947470079, + "learning_rate": 2.040798139872037e-05, + "loss": 0.3262, + "step": 13335 + }, + { + "epoch": 3.111007462686567, + "grad_norm": 0.3597885708249331, + "learning_rate": 2.0391515858204184e-05, + "loss": 0.3245, + "step": 13340 + }, + { + "epoch": 3.1121735074626864, + "grad_norm": 0.3731047113659824, + "learning_rate": 2.0375054545541776e-05, + "loss": 0.3266, + "step": 13345 + }, + { + "epoch": 3.113339552238806, + "grad_norm": 0.36423501844030926, + "learning_rate": 2.0358597470523706e-05, + "loss": 0.3351, + "step": 13350 + }, + { + "epoch": 3.1145055970149254, + "grad_norm": 0.3794624930929981, + "learning_rate": 2.034214464293801e-05, + "loss": 0.3256, + "step": 13355 + }, + { + "epoch": 3.1156716417910446, + "grad_norm": 0.34443932446845915, + "learning_rate": 2.0325696072570195e-05, + "loss": 0.3249, + "step": 13360 + }, + { + "epoch": 3.1168376865671643, + "grad_norm": 0.3759457556449976, + "learning_rate": 2.0309251769203252e-05, + "loss": 0.3231, + "step": 13365 + }, + { + "epoch": 3.1180037313432836, + "grad_norm": 0.3745091535962183, + "learning_rate": 2.0292811742617607e-05, + "loss": 0.3327, + "step": 13370 + }, + { + "epoch": 3.119169776119403, + "grad_norm": 0.41148298835812236, + "learning_rate": 2.0276376002591164e-05, + "loss": 0.3351, + "step": 13375 + }, + { + "epoch": 3.1203358208955225, + "grad_norm": 0.3903153953591576, + "learning_rate": 2.0259944558899274e-05, + "loss": 0.3311, + "step": 13380 + }, + { + "epoch": 3.121501865671642, + "grad_norm": 0.39583155887190763, + "learning_rate": 2.0243517421314727e-05, + "loss": 0.3307, + "step": 13385 + }, + { + "epoch": 3.122667910447761, + "grad_norm": 0.3540602512876891, + "learning_rate": 2.022709459960776e-05, + "loss": 0.333, + "step": 13390 + }, + { + "epoch": 3.1238339552238807, + "grad_norm": 0.349006419179924, + "learning_rate": 2.0210676103546028e-05, + "loss": 0.3305, + "step": 13395 + }, + { + "epoch": 3.125, + "grad_norm": 0.35353284890354925, + "learning_rate": 2.0194261942894628e-05, + "loss": 0.3267, + "step": 13400 + }, + { + "epoch": 3.1261660447761193, + "grad_norm": 0.3684268525794442, + "learning_rate": 2.0177852127416063e-05, + "loss": 0.3294, + "step": 13405 + }, + { + "epoch": 3.127332089552239, + "grad_norm": 0.3876907997700918, + "learning_rate": 2.016144666687029e-05, + "loss": 0.3324, + "step": 13410 + }, + { + "epoch": 3.128498134328358, + "grad_norm": 0.34843344551205363, + "learning_rate": 2.0145045571014614e-05, + "loss": 0.3048, + "step": 13415 + }, + { + "epoch": 3.1296641791044775, + "grad_norm": 0.3739906165940147, + "learning_rate": 2.0128648849603798e-05, + "loss": 0.3296, + "step": 13420 + }, + { + "epoch": 3.130830223880597, + "grad_norm": 0.369790686747009, + "learning_rate": 2.0112256512389976e-05, + "loss": 0.3247, + "step": 13425 + }, + { + "epoch": 3.1319962686567164, + "grad_norm": 0.4027240855788343, + "learning_rate": 2.009586856912269e-05, + "loss": 0.3233, + "step": 13430 + }, + { + "epoch": 3.1331623134328357, + "grad_norm": 0.3746132905195428, + "learning_rate": 2.0079485029548838e-05, + "loss": 0.3286, + "step": 13435 + }, + { + "epoch": 3.1343283582089554, + "grad_norm": 0.37255110913703593, + "learning_rate": 2.006310590341276e-05, + "loss": 0.3203, + "step": 13440 + }, + { + "epoch": 3.1354944029850746, + "grad_norm": 0.3971067568722652, + "learning_rate": 2.0046731200456097e-05, + "loss": 0.3438, + "step": 13445 + }, + { + "epoch": 3.136660447761194, + "grad_norm": 0.3763188179431443, + "learning_rate": 2.00303609304179e-05, + "loss": 0.3373, + "step": 13450 + }, + { + "epoch": 3.1378264925373136, + "grad_norm": 0.354426564175202, + "learning_rate": 2.0013995103034594e-05, + "loss": 0.3386, + "step": 13455 + }, + { + "epoch": 3.138992537313433, + "grad_norm": 0.3503828579014741, + "learning_rate": 1.9997633728039933e-05, + "loss": 0.3233, + "step": 13460 + }, + { + "epoch": 3.140158582089552, + "grad_norm": 0.3647250356826776, + "learning_rate": 1.9981276815165046e-05, + "loss": 0.3281, + "step": 13465 + }, + { + "epoch": 3.141324626865672, + "grad_norm": 0.3430909864290912, + "learning_rate": 1.996492437413838e-05, + "loss": 0.3175, + "step": 13470 + }, + { + "epoch": 3.142490671641791, + "grad_norm": 0.35006325560037993, + "learning_rate": 1.994857641468575e-05, + "loss": 0.3365, + "step": 13475 + }, + { + "epoch": 3.1436567164179103, + "grad_norm": 0.36103268334665484, + "learning_rate": 1.99322329465303e-05, + "loss": 0.3163, + "step": 13480 + }, + { + "epoch": 3.14482276119403, + "grad_norm": 0.39489082780521373, + "learning_rate": 1.9915893979392492e-05, + "loss": 0.3421, + "step": 13485 + }, + { + "epoch": 3.1459888059701493, + "grad_norm": 0.371311627980263, + "learning_rate": 1.989955952299012e-05, + "loss": 0.3235, + "step": 13490 + }, + { + "epoch": 3.1471548507462686, + "grad_norm": 0.38942356459182614, + "learning_rate": 1.9883229587038287e-05, + "loss": 0.328, + "step": 13495 + }, + { + "epoch": 3.1483208955223883, + "grad_norm": 0.37027465368557755, + "learning_rate": 1.986690418124942e-05, + "loss": 0.334, + "step": 13500 + }, + { + "epoch": 3.1494869402985075, + "grad_norm": 0.3749228281321373, + "learning_rate": 1.9850583315333242e-05, + "loss": 0.3288, + "step": 13505 + }, + { + "epoch": 3.1506529850746268, + "grad_norm": 0.3605921528491555, + "learning_rate": 1.983426699899677e-05, + "loss": 0.3235, + "step": 13510 + }, + { + "epoch": 3.1518190298507465, + "grad_norm": 0.366589498485244, + "learning_rate": 1.9817955241944335e-05, + "loss": 0.3331, + "step": 13515 + }, + { + "epoch": 3.1529850746268657, + "grad_norm": 0.3722581231316989, + "learning_rate": 1.9801648053877548e-05, + "loss": 0.3325, + "step": 13520 + }, + { + "epoch": 3.154151119402985, + "grad_norm": 0.3624795413286155, + "learning_rate": 1.978534544449528e-05, + "loss": 0.3062, + "step": 13525 + }, + { + "epoch": 3.1553171641791047, + "grad_norm": 0.3702713213461391, + "learning_rate": 1.9769047423493707e-05, + "loss": 0.3318, + "step": 13530 + }, + { + "epoch": 3.156483208955224, + "grad_norm": 0.3854460558686183, + "learning_rate": 1.975275400056627e-05, + "loss": 0.3413, + "step": 13535 + }, + { + "epoch": 3.157649253731343, + "grad_norm": 0.35754585019696894, + "learning_rate": 1.9736465185403675e-05, + "loss": 0.3174, + "step": 13540 + }, + { + "epoch": 3.158815298507463, + "grad_norm": 0.35173201839919066, + "learning_rate": 1.9720180987693888e-05, + "loss": 0.3359, + "step": 13545 + }, + { + "epoch": 3.159981343283582, + "grad_norm": 0.3679370893478662, + "learning_rate": 1.9703901417122106e-05, + "loss": 0.331, + "step": 13550 + }, + { + "epoch": 3.1611473880597014, + "grad_norm": 0.3535033269618664, + "learning_rate": 1.968762648337081e-05, + "loss": 0.3173, + "step": 13555 + }, + { + "epoch": 3.1623134328358207, + "grad_norm": 0.38482761857804487, + "learning_rate": 1.96713561961197e-05, + "loss": 0.3436, + "step": 13560 + }, + { + "epoch": 3.1634794776119404, + "grad_norm": 0.37373056008546923, + "learning_rate": 1.9655090565045718e-05, + "loss": 0.3415, + "step": 13565 + }, + { + "epoch": 3.1646455223880596, + "grad_norm": 0.352634120020291, + "learning_rate": 1.9638829599823056e-05, + "loss": 0.328, + "step": 13570 + }, + { + "epoch": 3.1658115671641793, + "grad_norm": 0.3719863213467069, + "learning_rate": 1.9622573310123082e-05, + "loss": 0.34, + "step": 13575 + }, + { + "epoch": 3.1669776119402986, + "grad_norm": 0.3775757810642568, + "learning_rate": 1.9606321705614427e-05, + "loss": 0.3407, + "step": 13580 + }, + { + "epoch": 3.168143656716418, + "grad_norm": 0.3517808938109586, + "learning_rate": 1.9590074795962925e-05, + "loss": 0.3173, + "step": 13585 + }, + { + "epoch": 3.169309701492537, + "grad_norm": 0.3673636906191626, + "learning_rate": 1.957383259083162e-05, + "loss": 0.3378, + "step": 13590 + }, + { + "epoch": 3.170475746268657, + "grad_norm": 0.3773964702378201, + "learning_rate": 1.955759509988075e-05, + "loss": 0.331, + "step": 13595 + }, + { + "epoch": 3.171641791044776, + "grad_norm": 0.36583522899610693, + "learning_rate": 1.9541362332767737e-05, + "loss": 0.3374, + "step": 13600 + }, + { + "epoch": 3.1728078358208953, + "grad_norm": 0.35882066116368944, + "learning_rate": 1.952513429914723e-05, + "loss": 0.3386, + "step": 13605 + }, + { + "epoch": 3.173973880597015, + "grad_norm": 0.3522475379983531, + "learning_rate": 1.950891100867102e-05, + "loss": 0.3319, + "step": 13610 + }, + { + "epoch": 3.1751399253731343, + "grad_norm": 0.3380721645634281, + "learning_rate": 1.9492692470988115e-05, + "loss": 0.3172, + "step": 13615 + }, + { + "epoch": 3.1763059701492535, + "grad_norm": 0.3638642550275009, + "learning_rate": 1.9476478695744683e-05, + "loss": 0.326, + "step": 13620 + }, + { + "epoch": 3.1774720149253732, + "grad_norm": 0.34490985996230167, + "learning_rate": 1.9460269692584034e-05, + "loss": 0.311, + "step": 13625 + }, + { + "epoch": 3.1786380597014925, + "grad_norm": 0.3293630559420069, + "learning_rate": 1.944406547114667e-05, + "loss": 0.3175, + "step": 13630 + }, + { + "epoch": 3.1798041044776117, + "grad_norm": 0.36214515194537317, + "learning_rate": 1.9427866041070254e-05, + "loss": 0.3425, + "step": 13635 + }, + { + "epoch": 3.1809701492537314, + "grad_norm": 0.4074765117307699, + "learning_rate": 1.9411671411989568e-05, + "loss": 0.3347, + "step": 13640 + }, + { + "epoch": 3.1821361940298507, + "grad_norm": 0.3690374728241686, + "learning_rate": 1.9395481593536575e-05, + "loss": 0.3376, + "step": 13645 + }, + { + "epoch": 3.18330223880597, + "grad_norm": 0.3598588307706936, + "learning_rate": 1.937929659534034e-05, + "loss": 0.3135, + "step": 13650 + }, + { + "epoch": 3.1844682835820897, + "grad_norm": 0.37614810654671105, + "learning_rate": 1.9363116427027084e-05, + "loss": 0.3171, + "step": 13655 + }, + { + "epoch": 3.185634328358209, + "grad_norm": 0.3652088800983655, + "learning_rate": 1.9346941098220157e-05, + "loss": 0.3451, + "step": 13660 + }, + { + "epoch": 3.186800373134328, + "grad_norm": 0.37026072957166567, + "learning_rate": 1.933077061854002e-05, + "loss": 0.337, + "step": 13665 + }, + { + "epoch": 3.187966417910448, + "grad_norm": 0.3674339134285864, + "learning_rate": 1.931460499760426e-05, + "loss": 0.3209, + "step": 13670 + }, + { + "epoch": 3.189132462686567, + "grad_norm": 0.3787792050932168, + "learning_rate": 1.929844424502755e-05, + "loss": 0.3329, + "step": 13675 + }, + { + "epoch": 3.1902985074626864, + "grad_norm": 0.3464872434870928, + "learning_rate": 1.9282288370421708e-05, + "loss": 0.3274, + "step": 13680 + }, + { + "epoch": 3.191464552238806, + "grad_norm": 0.35433237253231414, + "learning_rate": 1.9266137383395626e-05, + "loss": 0.3194, + "step": 13685 + }, + { + "epoch": 3.1926305970149254, + "grad_norm": 0.36665862738024346, + "learning_rate": 1.9249991293555276e-05, + "loss": 0.3229, + "step": 13690 + }, + { + "epoch": 3.1937966417910446, + "grad_norm": 0.35818228941602226, + "learning_rate": 1.9233850110503748e-05, + "loss": 0.308, + "step": 13695 + }, + { + "epoch": 3.1949626865671643, + "grad_norm": 0.3818773569995412, + "learning_rate": 1.9217713843841195e-05, + "loss": 0.3242, + "step": 13700 + }, + { + "epoch": 3.1961287313432836, + "grad_norm": 0.3686870097052765, + "learning_rate": 1.9201582503164845e-05, + "loss": 0.3335, + "step": 13705 + }, + { + "epoch": 3.197294776119403, + "grad_norm": 0.39478332509122016, + "learning_rate": 1.9185456098068998e-05, + "loss": 0.3191, + "step": 13710 + }, + { + "epoch": 3.1984608208955225, + "grad_norm": 0.4369403040320393, + "learning_rate": 1.9169334638145037e-05, + "loss": 0.3074, + "step": 13715 + }, + { + "epoch": 3.199626865671642, + "grad_norm": 0.36890272116814343, + "learning_rate": 1.9153218132981375e-05, + "loss": 0.33, + "step": 13720 + }, + { + "epoch": 3.200792910447761, + "grad_norm": 0.3692134632105836, + "learning_rate": 1.9137106592163495e-05, + "loss": 0.3273, + "step": 13725 + }, + { + "epoch": 3.2019589552238807, + "grad_norm": 0.33604994736255905, + "learning_rate": 1.912100002527392e-05, + "loss": 0.323, + "step": 13730 + }, + { + "epoch": 3.203125, + "grad_norm": 0.38695669374062913, + "learning_rate": 1.9104898441892222e-05, + "loss": 0.3373, + "step": 13735 + }, + { + "epoch": 3.2042910447761193, + "grad_norm": 0.37765927910733466, + "learning_rate": 1.9088801851595008e-05, + "loss": 0.3259, + "step": 13740 + }, + { + "epoch": 3.205457089552239, + "grad_norm": 0.35284623104031987, + "learning_rate": 1.907271026395592e-05, + "loss": 0.3056, + "step": 13745 + }, + { + "epoch": 3.206623134328358, + "grad_norm": 0.3600030937462303, + "learning_rate": 1.9056623688545588e-05, + "loss": 0.3377, + "step": 13750 + }, + { + "epoch": 3.2077891791044775, + "grad_norm": 0.35080121298899186, + "learning_rate": 1.9040542134931715e-05, + "loss": 0.3265, + "step": 13755 + }, + { + "epoch": 3.208955223880597, + "grad_norm": 0.3678894513431948, + "learning_rate": 1.9024465612678993e-05, + "loss": 0.3368, + "step": 13760 + }, + { + "epoch": 3.2101212686567164, + "grad_norm": 0.37001400720834304, + "learning_rate": 1.900839413134911e-05, + "loss": 0.3402, + "step": 13765 + }, + { + "epoch": 3.2112873134328357, + "grad_norm": 0.3684583095084399, + "learning_rate": 1.8992327700500772e-05, + "loss": 0.3279, + "step": 13770 + }, + { + "epoch": 3.2124533582089554, + "grad_norm": 0.3395411991593244, + "learning_rate": 1.897626632968968e-05, + "loss": 0.3115, + "step": 13775 + }, + { + "epoch": 3.2136194029850746, + "grad_norm": 0.36901019403122015, + "learning_rate": 1.8960210028468512e-05, + "loss": 0.3372, + "step": 13780 + }, + { + "epoch": 3.214785447761194, + "grad_norm": 0.37786966826385215, + "learning_rate": 1.8944158806386942e-05, + "loss": 0.3278, + "step": 13785 + }, + { + "epoch": 3.2159514925373136, + "grad_norm": 0.3484072803507474, + "learning_rate": 1.8928112672991626e-05, + "loss": 0.3227, + "step": 13790 + }, + { + "epoch": 3.217117537313433, + "grad_norm": 0.3813917097052821, + "learning_rate": 1.8912071637826196e-05, + "loss": 0.3182, + "step": 13795 + }, + { + "epoch": 3.218283582089552, + "grad_norm": 0.40392844230661523, + "learning_rate": 1.8896035710431225e-05, + "loss": 0.3535, + "step": 13800 + }, + { + "epoch": 3.219449626865672, + "grad_norm": 0.3711458054470723, + "learning_rate": 1.8880004900344283e-05, + "loss": 0.3377, + "step": 13805 + }, + { + "epoch": 3.220615671641791, + "grad_norm": 0.3489947468412329, + "learning_rate": 1.8863979217099874e-05, + "loss": 0.3422, + "step": 13810 + }, + { + "epoch": 3.2217817164179103, + "grad_norm": 0.4039440749796976, + "learning_rate": 1.8847958670229465e-05, + "loss": 0.3308, + "step": 13815 + }, + { + "epoch": 3.22294776119403, + "grad_norm": 0.35899557282466393, + "learning_rate": 1.8831943269261467e-05, + "loss": 0.3309, + "step": 13820 + }, + { + "epoch": 3.2241138059701493, + "grad_norm": 0.35455566040198766, + "learning_rate": 1.8815933023721206e-05, + "loss": 0.3344, + "step": 13825 + }, + { + "epoch": 3.2252798507462686, + "grad_norm": 0.3588685079349673, + "learning_rate": 1.8799927943130986e-05, + "loss": 0.3329, + "step": 13830 + }, + { + "epoch": 3.2264458955223883, + "grad_norm": 0.35914473477906345, + "learning_rate": 1.878392803701e-05, + "loss": 0.3316, + "step": 13835 + }, + { + "epoch": 3.2276119402985075, + "grad_norm": 0.3668416379119767, + "learning_rate": 1.8767933314874382e-05, + "loss": 0.3403, + "step": 13840 + }, + { + "epoch": 3.2287779850746268, + "grad_norm": 0.36205186453945915, + "learning_rate": 1.875194378623718e-05, + "loss": 0.3425, + "step": 13845 + }, + { + "epoch": 3.2299440298507465, + "grad_norm": 0.3760203722085514, + "learning_rate": 1.8735959460608364e-05, + "loss": 0.3423, + "step": 13850 + }, + { + "epoch": 3.2311100746268657, + "grad_norm": 0.37574713190429826, + "learning_rate": 1.871998034749478e-05, + "loss": 0.3337, + "step": 13855 + }, + { + "epoch": 3.232276119402985, + "grad_norm": 0.3630560486674643, + "learning_rate": 1.8704006456400202e-05, + "loss": 0.3449, + "step": 13860 + }, + { + "epoch": 3.2334421641791047, + "grad_norm": 0.38382066549657823, + "learning_rate": 1.8688037796825285e-05, + "loss": 0.3317, + "step": 13865 + }, + { + "epoch": 3.234608208955224, + "grad_norm": 0.36720684573288553, + "learning_rate": 1.8672074378267573e-05, + "loss": 0.3285, + "step": 13870 + }, + { + "epoch": 3.235774253731343, + "grad_norm": 0.3637815439353751, + "learning_rate": 1.8656116210221502e-05, + "loss": 0.3313, + "step": 13875 + }, + { + "epoch": 3.236940298507463, + "grad_norm": 0.35665712066317706, + "learning_rate": 1.8640163302178377e-05, + "loss": 0.3395, + "step": 13880 + }, + { + "epoch": 3.238106343283582, + "grad_norm": 0.37234806881985544, + "learning_rate": 1.8624215663626365e-05, + "loss": 0.3274, + "step": 13885 + }, + { + "epoch": 3.2392723880597014, + "grad_norm": 0.35997104573000205, + "learning_rate": 1.8608273304050515e-05, + "loss": 0.3452, + "step": 13890 + }, + { + "epoch": 3.2404384328358207, + "grad_norm": 0.35646213305760127, + "learning_rate": 1.859233623293274e-05, + "loss": 0.3272, + "step": 13895 + }, + { + "epoch": 3.2416044776119404, + "grad_norm": 0.3717220228017601, + "learning_rate": 1.8576404459751796e-05, + "loss": 0.3344, + "step": 13900 + }, + { + "epoch": 3.2427705223880596, + "grad_norm": 0.39226176760700243, + "learning_rate": 1.8560477993983284e-05, + "loss": 0.3373, + "step": 13905 + }, + { + "epoch": 3.2439365671641793, + "grad_norm": 0.3708386290612459, + "learning_rate": 1.8544556845099657e-05, + "loss": 0.3304, + "step": 13910 + }, + { + "epoch": 3.2451026119402986, + "grad_norm": 0.3632392546028572, + "learning_rate": 1.8528641022570202e-05, + "loss": 0.3355, + "step": 13915 + }, + { + "epoch": 3.246268656716418, + "grad_norm": 0.3662832048203213, + "learning_rate": 1.851273053586105e-05, + "loss": 0.3252, + "step": 13920 + }, + { + "epoch": 3.247434701492537, + "grad_norm": 0.3599674150678043, + "learning_rate": 1.8496825394435146e-05, + "loss": 0.3366, + "step": 13925 + }, + { + "epoch": 3.248600746268657, + "grad_norm": 0.38699543809394027, + "learning_rate": 1.8480925607752248e-05, + "loss": 0.3378, + "step": 13930 + }, + { + "epoch": 3.249766791044776, + "grad_norm": 0.3904572616104557, + "learning_rate": 1.8465031185268943e-05, + "loss": 0.3481, + "step": 13935 + }, + { + "epoch": 3.2509328358208958, + "grad_norm": 0.35402925173137356, + "learning_rate": 1.8449142136438628e-05, + "loss": 0.3236, + "step": 13940 + }, + { + "epoch": 3.252098880597015, + "grad_norm": 0.36346030275637714, + "learning_rate": 1.84332584707115e-05, + "loss": 0.3374, + "step": 13945 + }, + { + "epoch": 3.2532649253731343, + "grad_norm": 0.37169783700709885, + "learning_rate": 1.8417380197534558e-05, + "loss": 0.3308, + "step": 13950 + }, + { + "epoch": 3.2544309701492535, + "grad_norm": 0.3460841700376648, + "learning_rate": 1.8401507326351575e-05, + "loss": 0.3326, + "step": 13955 + }, + { + "epoch": 3.2555970149253732, + "grad_norm": 0.3530027735330055, + "learning_rate": 1.8385639866603144e-05, + "loss": 0.3353, + "step": 13960 + }, + { + "epoch": 3.2567630597014925, + "grad_norm": 0.37620468595081824, + "learning_rate": 1.836977782772661e-05, + "loss": 0.3357, + "step": 13965 + }, + { + "epoch": 3.2579291044776117, + "grad_norm": 0.3784865153952274, + "learning_rate": 1.8353921219156102e-05, + "loss": 0.3381, + "step": 13970 + }, + { + "epoch": 3.2590951492537314, + "grad_norm": 0.3667850525620458, + "learning_rate": 1.8338070050322544e-05, + "loss": 0.3329, + "step": 13975 + }, + { + "epoch": 3.2602611940298507, + "grad_norm": 0.3596704409553893, + "learning_rate": 1.8322224330653576e-05, + "loss": 0.3328, + "step": 13980 + }, + { + "epoch": 3.26142723880597, + "grad_norm": 0.36071354726025173, + "learning_rate": 1.830638406957364e-05, + "loss": 0.332, + "step": 13985 + }, + { + "epoch": 3.2625932835820897, + "grad_norm": 0.38004524875147305, + "learning_rate": 1.8290549276503915e-05, + "loss": 0.342, + "step": 13990 + }, + { + "epoch": 3.263759328358209, + "grad_norm": 0.35262133682833335, + "learning_rate": 1.8274719960862325e-05, + "loss": 0.3354, + "step": 13995 + }, + { + "epoch": 3.264925373134328, + "grad_norm": 0.3668641053269362, + "learning_rate": 1.825889613206355e-05, + "loss": 0.3286, + "step": 14000 + }, + { + "epoch": 3.266091417910448, + "grad_norm": 0.37265888531935687, + "learning_rate": 1.824307779951898e-05, + "loss": 0.3212, + "step": 14005 + }, + { + "epoch": 3.267257462686567, + "grad_norm": 0.34176171445637105, + "learning_rate": 1.8227264972636758e-05, + "loss": 0.3134, + "step": 14010 + }, + { + "epoch": 3.2684235074626864, + "grad_norm": 0.37134680617034843, + "learning_rate": 1.821145766082176e-05, + "loss": 0.3366, + "step": 14015 + }, + { + "epoch": 3.269589552238806, + "grad_norm": 0.3605056947908637, + "learning_rate": 1.8195655873475554e-05, + "loss": 0.346, + "step": 14020 + }, + { + "epoch": 3.2707555970149254, + "grad_norm": 0.36325907950139397, + "learning_rate": 1.8179859619996448e-05, + "loss": 0.3271, + "step": 14025 + }, + { + "epoch": 3.2719216417910446, + "grad_norm": 0.354427230774899, + "learning_rate": 1.8164068909779437e-05, + "loss": 0.3309, + "step": 14030 + }, + { + "epoch": 3.2730876865671643, + "grad_norm": 0.40211130887509683, + "learning_rate": 1.814828375221623e-05, + "loss": 0.336, + "step": 14035 + }, + { + "epoch": 3.2742537313432836, + "grad_norm": 0.3669552473450502, + "learning_rate": 1.8132504156695245e-05, + "loss": 0.3413, + "step": 14040 + }, + { + "epoch": 3.275419776119403, + "grad_norm": 0.35626016714325476, + "learning_rate": 1.8116730132601565e-05, + "loss": 0.3337, + "step": 14045 + }, + { + "epoch": 3.2765858208955225, + "grad_norm": 0.36791701144465083, + "learning_rate": 1.8100961689317003e-05, + "loss": 0.3265, + "step": 14050 + }, + { + "epoch": 3.277751865671642, + "grad_norm": 0.3641957621004843, + "learning_rate": 1.808519883621999e-05, + "loss": 0.319, + "step": 14055 + }, + { + "epoch": 3.278917910447761, + "grad_norm": 0.36880246236354486, + "learning_rate": 1.806944158268568e-05, + "loss": 0.3309, + "step": 14060 + }, + { + "epoch": 3.2800839552238807, + "grad_norm": 0.3459063020550023, + "learning_rate": 1.805368993808589e-05, + "loss": 0.321, + "step": 14065 + }, + { + "epoch": 3.28125, + "grad_norm": 0.35941020884127856, + "learning_rate": 1.803794391178908e-05, + "loss": 0.3347, + "step": 14070 + }, + { + "epoch": 3.2824160447761193, + "grad_norm": 0.33881347076600704, + "learning_rate": 1.8022203513160406e-05, + "loss": 0.3275, + "step": 14075 + }, + { + "epoch": 3.283582089552239, + "grad_norm": 0.36013411302573745, + "learning_rate": 1.8006468751561628e-05, + "loss": 0.3299, + "step": 14080 + }, + { + "epoch": 3.284748134328358, + "grad_norm": 0.3605260553175594, + "learning_rate": 1.7990739636351188e-05, + "loss": 0.329, + "step": 14085 + }, + { + "epoch": 3.2859141791044775, + "grad_norm": 0.3379452521638893, + "learning_rate": 1.797501617688417e-05, + "loss": 0.3124, + "step": 14090 + }, + { + "epoch": 3.287080223880597, + "grad_norm": 0.34755508575552396, + "learning_rate": 1.795929838251227e-05, + "loss": 0.3293, + "step": 14095 + }, + { + "epoch": 3.2882462686567164, + "grad_norm": 0.36215468157393715, + "learning_rate": 1.7943586262583846e-05, + "loss": 0.3346, + "step": 14100 + }, + { + "epoch": 3.2894123134328357, + "grad_norm": 0.35680337811185614, + "learning_rate": 1.7927879826443844e-05, + "loss": 0.3395, + "step": 14105 + }, + { + "epoch": 3.2905783582089554, + "grad_norm": 0.37236710880652174, + "learning_rate": 1.791217908343386e-05, + "loss": 0.3352, + "step": 14110 + }, + { + "epoch": 3.2917444029850746, + "grad_norm": 0.3748908586072519, + "learning_rate": 1.78964840428921e-05, + "loss": 0.3328, + "step": 14115 + }, + { + "epoch": 3.292910447761194, + "grad_norm": 0.3696905878621036, + "learning_rate": 1.7880794714153366e-05, + "loss": 0.3277, + "step": 14120 + }, + { + "epoch": 3.2940764925373136, + "grad_norm": 0.3925177280795479, + "learning_rate": 1.786511110654907e-05, + "loss": 0.3359, + "step": 14125 + }, + { + "epoch": 3.295242537313433, + "grad_norm": 0.3573280326662546, + "learning_rate": 1.784943322940722e-05, + "loss": 0.3326, + "step": 14130 + }, + { + "epoch": 3.296408582089552, + "grad_norm": 0.36097554010255833, + "learning_rate": 1.7833761092052415e-05, + "loss": 0.3288, + "step": 14135 + }, + { + "epoch": 3.297574626865672, + "grad_norm": 0.3395814953744775, + "learning_rate": 1.7818094703805837e-05, + "loss": 0.3206, + "step": 14140 + }, + { + "epoch": 3.298740671641791, + "grad_norm": 0.3846515758263603, + "learning_rate": 1.780243407398527e-05, + "loss": 0.3327, + "step": 14145 + }, + { + "epoch": 3.2999067164179103, + "grad_norm": 0.3686113299692695, + "learning_rate": 1.7786779211905048e-05, + "loss": 0.3516, + "step": 14150 + }, + { + "epoch": 3.30107276119403, + "grad_norm": 0.37897026986716464, + "learning_rate": 1.7771130126876068e-05, + "loss": 0.3365, + "step": 14155 + }, + { + "epoch": 3.3022388059701493, + "grad_norm": 0.37926004560560306, + "learning_rate": 1.775548682820582e-05, + "loss": 0.3267, + "step": 14160 + }, + { + "epoch": 3.3034048507462686, + "grad_norm": 0.3532865663444608, + "learning_rate": 1.7739849325198334e-05, + "loss": 0.318, + "step": 14165 + }, + { + "epoch": 3.3045708955223883, + "grad_norm": 0.375951212959595, + "learning_rate": 1.7724217627154204e-05, + "loss": 0.3153, + "step": 14170 + }, + { + "epoch": 3.3057369402985075, + "grad_norm": 0.3892275901278885, + "learning_rate": 1.7708591743370555e-05, + "loss": 0.3423, + "step": 14175 + }, + { + "epoch": 3.3069029850746268, + "grad_norm": 0.352642314062309, + "learning_rate": 1.7692971683141063e-05, + "loss": 0.3203, + "step": 14180 + }, + { + "epoch": 3.3080690298507465, + "grad_norm": 0.3608644544656739, + "learning_rate": 1.7677357455755954e-05, + "loss": 0.3149, + "step": 14185 + }, + { + "epoch": 3.3092350746268657, + "grad_norm": 0.37383509533418063, + "learning_rate": 1.766174907050196e-05, + "loss": 0.3326, + "step": 14190 + }, + { + "epoch": 3.310401119402985, + "grad_norm": 0.35408016367951, + "learning_rate": 1.764614653666235e-05, + "loss": 0.3354, + "step": 14195 + }, + { + "epoch": 3.3115671641791042, + "grad_norm": 0.4073773632420507, + "learning_rate": 1.7630549863516914e-05, + "loss": 0.3315, + "step": 14200 + }, + { + "epoch": 3.312733208955224, + "grad_norm": 0.36025295017030395, + "learning_rate": 1.7614959060341968e-05, + "loss": 0.3269, + "step": 14205 + }, + { + "epoch": 3.313899253731343, + "grad_norm": 0.36990534903869327, + "learning_rate": 1.75993741364103e-05, + "loss": 0.3233, + "step": 14210 + }, + { + "epoch": 3.315065298507463, + "grad_norm": 0.3549222879114956, + "learning_rate": 1.7583795100991246e-05, + "loss": 0.3324, + "step": 14215 + }, + { + "epoch": 3.316231343283582, + "grad_norm": 0.35963908710499526, + "learning_rate": 1.7568221963350605e-05, + "loss": 0.3214, + "step": 14220 + }, + { + "epoch": 3.3173973880597014, + "grad_norm": 0.37972969127051454, + "learning_rate": 1.755265473275069e-05, + "loss": 0.3447, + "step": 14225 + }, + { + "epoch": 3.3185634328358207, + "grad_norm": 0.37379510903005386, + "learning_rate": 1.7537093418450294e-05, + "loss": 0.3335, + "step": 14230 + }, + { + "epoch": 3.3197294776119404, + "grad_norm": 0.3712018006629852, + "learning_rate": 1.7521538029704682e-05, + "loss": 0.3302, + "step": 14235 + }, + { + "epoch": 3.3208955223880596, + "grad_norm": 0.3499709201063348, + "learning_rate": 1.750598857576561e-05, + "loss": 0.3331, + "step": 14240 + }, + { + "epoch": 3.3220615671641793, + "grad_norm": 0.34800044920430206, + "learning_rate": 1.749044506588129e-05, + "loss": 0.324, + "step": 14245 + }, + { + "epoch": 3.3232276119402986, + "grad_norm": 0.3755558003097278, + "learning_rate": 1.7474907509296412e-05, + "loss": 0.3388, + "step": 14250 + }, + { + "epoch": 3.324393656716418, + "grad_norm": 0.38141193719906546, + "learning_rate": 1.7459375915252123e-05, + "loss": 0.347, + "step": 14255 + }, + { + "epoch": 3.325559701492537, + "grad_norm": 0.4026915264621621, + "learning_rate": 1.7443850292986007e-05, + "loss": 0.3266, + "step": 14260 + }, + { + "epoch": 3.326725746268657, + "grad_norm": 0.3627612966851823, + "learning_rate": 1.742833065173212e-05, + "loss": 0.3362, + "step": 14265 + }, + { + "epoch": 3.327891791044776, + "grad_norm": 0.3754425378380735, + "learning_rate": 1.7412817000720937e-05, + "loss": 0.3488, + "step": 14270 + }, + { + "epoch": 3.3290578358208958, + "grad_norm": 0.35308948266583345, + "learning_rate": 1.7397309349179393e-05, + "loss": 0.3157, + "step": 14275 + }, + { + "epoch": 3.330223880597015, + "grad_norm": 0.363506854612107, + "learning_rate": 1.738180770633085e-05, + "loss": 0.335, + "step": 14280 + }, + { + "epoch": 3.3313899253731343, + "grad_norm": 0.36242269890457657, + "learning_rate": 1.7366312081395075e-05, + "loss": 0.342, + "step": 14285 + }, + { + "epoch": 3.3325559701492535, + "grad_norm": 0.3699237198400984, + "learning_rate": 1.7350822483588277e-05, + "loss": 0.3402, + "step": 14290 + }, + { + "epoch": 3.3337220149253732, + "grad_norm": 0.3705774296824844, + "learning_rate": 1.7335338922123076e-05, + "loss": 0.3261, + "step": 14295 + }, + { + "epoch": 3.3348880597014925, + "grad_norm": 0.3595502541334972, + "learning_rate": 1.7319861406208504e-05, + "loss": 0.3191, + "step": 14300 + }, + { + "epoch": 3.3360541044776117, + "grad_norm": 0.3647020771322686, + "learning_rate": 1.7304389945050004e-05, + "loss": 0.3314, + "step": 14305 + }, + { + "epoch": 3.3372201492537314, + "grad_norm": 0.37062679555831696, + "learning_rate": 1.728892454784938e-05, + "loss": 0.3338, + "step": 14310 + }, + { + "epoch": 3.3383861940298507, + "grad_norm": 0.37268020664483076, + "learning_rate": 1.7273465223804876e-05, + "loss": 0.3324, + "step": 14315 + }, + { + "epoch": 3.33955223880597, + "grad_norm": 0.3651408537647329, + "learning_rate": 1.7258011982111094e-05, + "loss": 0.329, + "step": 14320 + }, + { + "epoch": 3.3407182835820897, + "grad_norm": 0.351578371283549, + "learning_rate": 1.7242564831959045e-05, + "loss": 0.3321, + "step": 14325 + }, + { + "epoch": 3.341884328358209, + "grad_norm": 0.35842571012922647, + "learning_rate": 1.72271237825361e-05, + "loss": 0.3188, + "step": 14330 + }, + { + "epoch": 3.343050373134328, + "grad_norm": 0.3719815758057306, + "learning_rate": 1.7211688843025987e-05, + "loss": 0.3315, + "step": 14335 + }, + { + "epoch": 3.344216417910448, + "grad_norm": 0.3492881141302099, + "learning_rate": 1.7196260022608828e-05, + "loss": 0.3322, + "step": 14340 + }, + { + "epoch": 3.345382462686567, + "grad_norm": 0.3467557284343145, + "learning_rate": 1.7180837330461093e-05, + "loss": 0.3301, + "step": 14345 + }, + { + "epoch": 3.3465485074626864, + "grad_norm": 0.3490059049398232, + "learning_rate": 1.716542077575561e-05, + "loss": 0.3403, + "step": 14350 + }, + { + "epoch": 3.347714552238806, + "grad_norm": 0.3637037891945392, + "learning_rate": 1.7150010367661546e-05, + "loss": 0.3421, + "step": 14355 + }, + { + "epoch": 3.3488805970149254, + "grad_norm": 0.3819779828956117, + "learning_rate": 1.7134606115344427e-05, + "loss": 0.3379, + "step": 14360 + }, + { + "epoch": 3.3500466417910446, + "grad_norm": 0.3480677049117983, + "learning_rate": 1.7119208027966116e-05, + "loss": 0.3146, + "step": 14365 + }, + { + "epoch": 3.3512126865671643, + "grad_norm": 0.3588782011005385, + "learning_rate": 1.710381611468479e-05, + "loss": 0.3308, + "step": 14370 + }, + { + "epoch": 3.3523787313432836, + "grad_norm": 0.36499620650935155, + "learning_rate": 1.7088430384654984e-05, + "loss": 0.333, + "step": 14375 + }, + { + "epoch": 3.353544776119403, + "grad_norm": 0.34008861921924666, + "learning_rate": 1.7073050847027537e-05, + "loss": 0.3198, + "step": 14380 + }, + { + "epoch": 3.3547108208955225, + "grad_norm": 0.3565514773656197, + "learning_rate": 1.7057677510949598e-05, + "loss": 0.3331, + "step": 14385 + }, + { + "epoch": 3.355876865671642, + "grad_norm": 0.3611844622144962, + "learning_rate": 1.704231038556465e-05, + "loss": 0.3339, + "step": 14390 + }, + { + "epoch": 3.357042910447761, + "grad_norm": 0.3899193141653559, + "learning_rate": 1.702694948001246e-05, + "loss": 0.3434, + "step": 14395 + }, + { + "epoch": 3.3582089552238807, + "grad_norm": 0.3725681162182271, + "learning_rate": 1.701159480342911e-05, + "loss": 0.3278, + "step": 14400 + }, + { + "epoch": 3.359375, + "grad_norm": 0.36021694811227956, + "learning_rate": 1.6996246364946985e-05, + "loss": 0.3244, + "step": 14405 + }, + { + "epoch": 3.3605410447761193, + "grad_norm": 0.3569267961249343, + "learning_rate": 1.6980904173694727e-05, + "loss": 0.3345, + "step": 14410 + }, + { + "epoch": 3.361707089552239, + "grad_norm": 0.35999107607516434, + "learning_rate": 1.69655682387973e-05, + "loss": 0.3326, + "step": 14415 + }, + { + "epoch": 3.362873134328358, + "grad_norm": 0.3780637462882897, + "learning_rate": 1.695023856937591e-05, + "loss": 0.3358, + "step": 14420 + }, + { + "epoch": 3.3640391791044775, + "grad_norm": 0.3686828628333255, + "learning_rate": 1.6934915174548073e-05, + "loss": 0.3273, + "step": 14425 + }, + { + "epoch": 3.365205223880597, + "grad_norm": 0.3279985473768487, + "learning_rate": 1.691959806342756e-05, + "loss": 0.3301, + "step": 14430 + }, + { + "epoch": 3.3663712686567164, + "grad_norm": 0.35865782560224624, + "learning_rate": 1.690428724512439e-05, + "loss": 0.3384, + "step": 14435 + }, + { + "epoch": 3.3675373134328357, + "grad_norm": 0.3557517735777206, + "learning_rate": 1.688898272874485e-05, + "loss": 0.3114, + "step": 14440 + }, + { + "epoch": 3.3687033582089554, + "grad_norm": 0.40655515559650046, + "learning_rate": 1.6873684523391487e-05, + "loss": 0.3285, + "step": 14445 + }, + { + "epoch": 3.3698694029850746, + "grad_norm": 0.3690029448888416, + "learning_rate": 1.685839263816308e-05, + "loss": 0.3355, + "step": 14450 + }, + { + "epoch": 3.371035447761194, + "grad_norm": 0.34192650742954583, + "learning_rate": 1.6843107082154675e-05, + "loss": 0.3196, + "step": 14455 + }, + { + "epoch": 3.3722014925373136, + "grad_norm": 0.36763451883757187, + "learning_rate": 1.68278278644575e-05, + "loss": 0.3354, + "step": 14460 + }, + { + "epoch": 3.373367537313433, + "grad_norm": 0.37204877793428426, + "learning_rate": 1.6812554994159073e-05, + "loss": 0.337, + "step": 14465 + }, + { + "epoch": 3.374533582089552, + "grad_norm": 0.3783992424160964, + "learning_rate": 1.679728848034311e-05, + "loss": 0.3528, + "step": 14470 + }, + { + "epoch": 3.375699626865672, + "grad_norm": 0.38689274924933764, + "learning_rate": 1.678202833208954e-05, + "loss": 0.3365, + "step": 14475 + }, + { + "epoch": 3.376865671641791, + "grad_norm": 0.3841655946617686, + "learning_rate": 1.6766774558474523e-05, + "loss": 0.3312, + "step": 14480 + }, + { + "epoch": 3.3780317164179103, + "grad_norm": 0.3593124300222711, + "learning_rate": 1.675152716857041e-05, + "loss": 0.3264, + "step": 14485 + }, + { + "epoch": 3.37919776119403, + "grad_norm": 0.3486205976615116, + "learning_rate": 1.6736286171445763e-05, + "loss": 0.3254, + "step": 14490 + }, + { + "epoch": 3.3803638059701493, + "grad_norm": 0.3540705212510821, + "learning_rate": 1.672105157616535e-05, + "loss": 0.3173, + "step": 14495 + }, + { + "epoch": 3.3815298507462686, + "grad_norm": 0.3752893939402229, + "learning_rate": 1.670582339179012e-05, + "loss": 0.3419, + "step": 14500 + }, + { + "epoch": 3.3826958955223883, + "grad_norm": 0.36620118750667346, + "learning_rate": 1.669060162737722e-05, + "loss": 0.3332, + "step": 14505 + }, + { + "epoch": 3.3838619402985075, + "grad_norm": 0.34932427427599044, + "learning_rate": 1.667538629197996e-05, + "loss": 0.3279, + "step": 14510 + }, + { + "epoch": 3.3850279850746268, + "grad_norm": 0.3775048502619263, + "learning_rate": 1.666017739464784e-05, + "loss": 0.3449, + "step": 14515 + }, + { + "epoch": 3.3861940298507465, + "grad_norm": 0.3945710780938654, + "learning_rate": 1.664497494442654e-05, + "loss": 0.3357, + "step": 14520 + }, + { + "epoch": 3.3873600746268657, + "grad_norm": 0.3668755523994854, + "learning_rate": 1.6629778950357883e-05, + "loss": 0.3339, + "step": 14525 + }, + { + "epoch": 3.388526119402985, + "grad_norm": 0.3681631742381257, + "learning_rate": 1.6614589421479876e-05, + "loss": 0.3278, + "step": 14530 + }, + { + "epoch": 3.3896921641791042, + "grad_norm": 0.38126690337022373, + "learning_rate": 1.6599406366826648e-05, + "loss": 0.3387, + "step": 14535 + }, + { + "epoch": 3.390858208955224, + "grad_norm": 0.3805957320880095, + "learning_rate": 1.6584229795428514e-05, + "loss": 0.3412, + "step": 14540 + }, + { + "epoch": 3.392024253731343, + "grad_norm": 0.3455529416980486, + "learning_rate": 1.656905971631192e-05, + "loss": 0.3278, + "step": 14545 + }, + { + "epoch": 3.393190298507463, + "grad_norm": 0.3666042009298596, + "learning_rate": 1.655389613849943e-05, + "loss": 0.3305, + "step": 14550 + }, + { + "epoch": 3.394356343283582, + "grad_norm": 0.3562036770529133, + "learning_rate": 1.653873907100977e-05, + "loss": 0.3271, + "step": 14555 + }, + { + "epoch": 3.3955223880597014, + "grad_norm": 0.351897081751981, + "learning_rate": 1.6523588522857784e-05, + "loss": 0.3261, + "step": 14560 + }, + { + "epoch": 3.3966884328358207, + "grad_norm": 0.3711425099386066, + "learning_rate": 1.6508444503054432e-05, + "loss": 0.3299, + "step": 14565 + }, + { + "epoch": 3.3978544776119404, + "grad_norm": 0.3688138996565365, + "learning_rate": 1.6493307020606796e-05, + "loss": 0.3348, + "step": 14570 + }, + { + "epoch": 3.3990205223880596, + "grad_norm": 0.36755301611238034, + "learning_rate": 1.647817608451807e-05, + "loss": 0.3416, + "step": 14575 + }, + { + "epoch": 3.4001865671641793, + "grad_norm": 0.37049370498932566, + "learning_rate": 1.6463051703787557e-05, + "loss": 0.334, + "step": 14580 + }, + { + "epoch": 3.4013526119402986, + "grad_norm": 0.3516671218129951, + "learning_rate": 1.644793388741067e-05, + "loss": 0.3265, + "step": 14585 + }, + { + "epoch": 3.402518656716418, + "grad_norm": 0.3601317188695524, + "learning_rate": 1.6432822644378888e-05, + "loss": 0.332, + "step": 14590 + }, + { + "epoch": 3.403684701492537, + "grad_norm": 0.36753741291718534, + "learning_rate": 1.64177179836798e-05, + "loss": 0.3289, + "step": 14595 + }, + { + "epoch": 3.404850746268657, + "grad_norm": 0.35386825988075743, + "learning_rate": 1.6402619914297087e-05, + "loss": 0.3389, + "step": 14600 + }, + { + "epoch": 3.406016791044776, + "grad_norm": 0.3667412175060357, + "learning_rate": 1.6387528445210497e-05, + "loss": 0.3307, + "step": 14605 + }, + { + "epoch": 3.4071828358208958, + "grad_norm": 0.4038785901545273, + "learning_rate": 1.6372443585395875e-05, + "loss": 0.3265, + "step": 14610 + }, + { + "epoch": 3.408348880597015, + "grad_norm": 0.3550492244513922, + "learning_rate": 1.6357365343825088e-05, + "loss": 0.3177, + "step": 14615 + }, + { + "epoch": 3.4095149253731343, + "grad_norm": 0.3613251240825706, + "learning_rate": 1.634229372946611e-05, + "loss": 0.3288, + "step": 14620 + }, + { + "epoch": 3.4106809701492535, + "grad_norm": 0.36542787637025925, + "learning_rate": 1.632722875128296e-05, + "loss": 0.3361, + "step": 14625 + }, + { + "epoch": 3.4118470149253732, + "grad_norm": 0.35927686580214985, + "learning_rate": 1.6312170418235705e-05, + "loss": 0.3517, + "step": 14630 + }, + { + "epoch": 3.4130130597014925, + "grad_norm": 0.35177798901385426, + "learning_rate": 1.6297118739280483e-05, + "loss": 0.3277, + "step": 14635 + }, + { + "epoch": 3.4141791044776117, + "grad_norm": 0.3452142196151901, + "learning_rate": 1.6282073723369427e-05, + "loss": 0.3231, + "step": 14640 + }, + { + "epoch": 3.4153451492537314, + "grad_norm": 0.3680101886199691, + "learning_rate": 1.6267035379450744e-05, + "loss": 0.321, + "step": 14645 + }, + { + "epoch": 3.4165111940298507, + "grad_norm": 0.35988407445514975, + "learning_rate": 1.625200371646867e-05, + "loss": 0.327, + "step": 14650 + }, + { + "epoch": 3.41767723880597, + "grad_norm": 0.37105421667334054, + "learning_rate": 1.6236978743363464e-05, + "loss": 0.3309, + "step": 14655 + }, + { + "epoch": 3.4188432835820897, + "grad_norm": 0.3798918900036447, + "learning_rate": 1.622196046907141e-05, + "loss": 0.3392, + "step": 14660 + }, + { + "epoch": 3.420009328358209, + "grad_norm": 0.3446234975186816, + "learning_rate": 1.6206948902524783e-05, + "loss": 0.3126, + "step": 14665 + }, + { + "epoch": 3.421175373134328, + "grad_norm": 0.358649061311015, + "learning_rate": 1.619194405265189e-05, + "loss": 0.3277, + "step": 14670 + }, + { + "epoch": 3.422341417910448, + "grad_norm": 0.3493127914792498, + "learning_rate": 1.617694592837705e-05, + "loss": 0.3162, + "step": 14675 + }, + { + "epoch": 3.423507462686567, + "grad_norm": 0.3720671187559675, + "learning_rate": 1.616195453862057e-05, + "loss": 0.3185, + "step": 14680 + }, + { + "epoch": 3.4246735074626864, + "grad_norm": 0.3829346481868671, + "learning_rate": 1.614696989229876e-05, + "loss": 0.3313, + "step": 14685 + }, + { + "epoch": 3.425839552238806, + "grad_norm": 0.36026012633648186, + "learning_rate": 1.6131991998323893e-05, + "loss": 0.3159, + "step": 14690 + }, + { + "epoch": 3.4270055970149254, + "grad_norm": 0.364732664708826, + "learning_rate": 1.611702086560426e-05, + "loss": 0.3321, + "step": 14695 + }, + { + "epoch": 3.4281716417910446, + "grad_norm": 0.3733393386423135, + "learning_rate": 1.6102056503044115e-05, + "loss": 0.3243, + "step": 14700 + }, + { + "epoch": 3.4293376865671643, + "grad_norm": 0.36131975570955943, + "learning_rate": 1.6087098919543696e-05, + "loss": 0.3292, + "step": 14705 + }, + { + "epoch": 3.4305037313432836, + "grad_norm": 0.35939910986677887, + "learning_rate": 1.6072148123999182e-05, + "loss": 0.3394, + "step": 14710 + }, + { + "epoch": 3.431669776119403, + "grad_norm": 0.3447149523770174, + "learning_rate": 1.605720412530274e-05, + "loss": 0.3312, + "step": 14715 + }, + { + "epoch": 3.4328358208955225, + "grad_norm": 0.42216881380869536, + "learning_rate": 1.6042266932342498e-05, + "loss": 0.3379, + "step": 14720 + }, + { + "epoch": 3.434001865671642, + "grad_norm": 0.3607212441952194, + "learning_rate": 1.6027336554002512e-05, + "loss": 0.3359, + "step": 14725 + }, + { + "epoch": 3.435167910447761, + "grad_norm": 0.38130759153704746, + "learning_rate": 1.60124129991628e-05, + "loss": 0.3257, + "step": 14730 + }, + { + "epoch": 3.4363339552238807, + "grad_norm": 0.33928036201086126, + "learning_rate": 1.599749627669933e-05, + "loss": 0.3157, + "step": 14735 + }, + { + "epoch": 3.4375, + "grad_norm": 0.36133861610151197, + "learning_rate": 1.5982586395483983e-05, + "loss": 0.3357, + "step": 14740 + }, + { + "epoch": 3.4386660447761193, + "grad_norm": 0.35291428627048993, + "learning_rate": 1.5967683364384595e-05, + "loss": 0.3238, + "step": 14745 + }, + { + "epoch": 3.439832089552239, + "grad_norm": 0.3682635737404115, + "learning_rate": 1.595278719226491e-05, + "loss": 0.329, + "step": 14750 + }, + { + "epoch": 3.440998134328358, + "grad_norm": 0.35753795407437755, + "learning_rate": 1.5937897887984605e-05, + "loss": 0.3387, + "step": 14755 + }, + { + "epoch": 3.4421641791044775, + "grad_norm": 0.36141855960515507, + "learning_rate": 1.5923015460399277e-05, + "loss": 0.3375, + "step": 14760 + }, + { + "epoch": 3.443330223880597, + "grad_norm": 0.36290759516276055, + "learning_rate": 1.59081399183604e-05, + "loss": 0.333, + "step": 14765 + }, + { + "epoch": 3.4444962686567164, + "grad_norm": 0.3498827290295448, + "learning_rate": 1.589327127071539e-05, + "loss": 0.3405, + "step": 14770 + }, + { + "epoch": 3.4456623134328357, + "grad_norm": 0.3596383278188398, + "learning_rate": 1.587840952630755e-05, + "loss": 0.3203, + "step": 14775 + }, + { + "epoch": 3.4468283582089554, + "grad_norm": 0.3664597378303038, + "learning_rate": 1.5863554693976065e-05, + "loss": 0.3392, + "step": 14780 + }, + { + "epoch": 3.4479944029850746, + "grad_norm": 0.37031740975092664, + "learning_rate": 1.584870678255604e-05, + "loss": 0.3306, + "step": 14785 + }, + { + "epoch": 3.449160447761194, + "grad_norm": 0.34070326780599375, + "learning_rate": 1.5833865800878422e-05, + "loss": 0.3355, + "step": 14790 + }, + { + "epoch": 3.4503264925373136, + "grad_norm": 0.36613699523160836, + "learning_rate": 1.5819031757770064e-05, + "loss": 0.336, + "step": 14795 + }, + { + "epoch": 3.451492537313433, + "grad_norm": 0.36801957707677047, + "learning_rate": 1.580420466205369e-05, + "loss": 0.3344, + "step": 14800 + }, + { + "epoch": 3.452658582089552, + "grad_norm": 0.3858393377866959, + "learning_rate": 1.5789384522547888e-05, + "loss": 0.3338, + "step": 14805 + }, + { + "epoch": 3.453824626865672, + "grad_norm": 0.3503525325976741, + "learning_rate": 1.577457134806711e-05, + "loss": 0.3108, + "step": 14810 + }, + { + "epoch": 3.454990671641791, + "grad_norm": 0.36296555377155126, + "learning_rate": 1.5759765147421658e-05, + "loss": 0.3296, + "step": 14815 + }, + { + "epoch": 3.4561567164179103, + "grad_norm": 0.37633455287935386, + "learning_rate": 1.5744965929417693e-05, + "loss": 0.3206, + "step": 14820 + }, + { + "epoch": 3.45732276119403, + "grad_norm": 0.35934618176243993, + "learning_rate": 1.573017370285722e-05, + "loss": 0.319, + "step": 14825 + }, + { + "epoch": 3.4584888059701493, + "grad_norm": 0.3653365171269483, + "learning_rate": 1.57153884765381e-05, + "loss": 0.3343, + "step": 14830 + }, + { + "epoch": 3.4596548507462686, + "grad_norm": 0.3769542476376287, + "learning_rate": 1.5700610259254018e-05, + "loss": 0.3351, + "step": 14835 + }, + { + "epoch": 3.4608208955223883, + "grad_norm": 0.3669040001478793, + "learning_rate": 1.5685839059794476e-05, + "loss": 0.3368, + "step": 14840 + }, + { + "epoch": 3.4619869402985075, + "grad_norm": 0.35911218432643416, + "learning_rate": 1.5671074886944823e-05, + "loss": 0.323, + "step": 14845 + }, + { + "epoch": 3.4631529850746268, + "grad_norm": 0.3523318873019422, + "learning_rate": 1.5656317749486225e-05, + "loss": 0.3286, + "step": 14850 + }, + { + "epoch": 3.4643190298507465, + "grad_norm": 0.37422689672709303, + "learning_rate": 1.5641567656195664e-05, + "loss": 0.3324, + "step": 14855 + }, + { + "epoch": 3.4654850746268657, + "grad_norm": 0.38766511982005936, + "learning_rate": 1.562682461584594e-05, + "loss": 0.3503, + "step": 14860 + }, + { + "epoch": 3.466651119402985, + "grad_norm": 0.35505242648315294, + "learning_rate": 1.561208863720562e-05, + "loss": 0.3299, + "step": 14865 + }, + { + "epoch": 3.4678171641791042, + "grad_norm": 0.3611680837469734, + "learning_rate": 1.559735972903912e-05, + "loss": 0.3274, + "step": 14870 + }, + { + "epoch": 3.468983208955224, + "grad_norm": 0.34972098864033974, + "learning_rate": 1.5582637900106622e-05, + "loss": 0.339, + "step": 14875 + }, + { + "epoch": 3.470149253731343, + "grad_norm": 0.34847286933699, + "learning_rate": 1.5567923159164108e-05, + "loss": 0.3343, + "step": 14880 + }, + { + "epoch": 3.471315298507463, + "grad_norm": 0.36757621767472476, + "learning_rate": 1.555321551496335e-05, + "loss": 0.3425, + "step": 14885 + }, + { + "epoch": 3.472481343283582, + "grad_norm": 0.3793889665690012, + "learning_rate": 1.553851497625187e-05, + "loss": 0.3483, + "step": 14890 + }, + { + "epoch": 3.4736473880597014, + "grad_norm": 0.3711182127309298, + "learning_rate": 1.5523821551773006e-05, + "loss": 0.3325, + "step": 14895 + }, + { + "epoch": 3.4748134328358207, + "grad_norm": 0.36533654574408975, + "learning_rate": 1.5509135250265835e-05, + "loss": 0.3284, + "step": 14900 + }, + { + "epoch": 3.4759794776119404, + "grad_norm": 0.37700089004250487, + "learning_rate": 1.5494456080465198e-05, + "loss": 0.3178, + "step": 14905 + }, + { + "epoch": 3.4771455223880596, + "grad_norm": 0.36278063063044996, + "learning_rate": 1.547978405110171e-05, + "loss": 0.3312, + "step": 14910 + }, + { + "epoch": 3.4783115671641793, + "grad_norm": 0.3493431734159773, + "learning_rate": 1.5465119170901742e-05, + "loss": 0.334, + "step": 14915 + }, + { + "epoch": 3.4794776119402986, + "grad_norm": 0.3655611583068637, + "learning_rate": 1.545046144858738e-05, + "loss": 0.3389, + "step": 14920 + }, + { + "epoch": 3.480643656716418, + "grad_norm": 0.3805565658243217, + "learning_rate": 1.543581089287649e-05, + "loss": 0.3273, + "step": 14925 + }, + { + "epoch": 3.481809701492537, + "grad_norm": 0.3627125661528815, + "learning_rate": 1.5421167512482655e-05, + "loss": 0.3422, + "step": 14930 + }, + { + "epoch": 3.482975746268657, + "grad_norm": 0.3678410497823603, + "learning_rate": 1.5406531316115197e-05, + "loss": 0.3257, + "step": 14935 + }, + { + "epoch": 3.484141791044776, + "grad_norm": 0.3654476965843805, + "learning_rate": 1.539190231247917e-05, + "loss": 0.3241, + "step": 14940 + }, + { + "epoch": 3.4853078358208958, + "grad_norm": 0.3530117766823089, + "learning_rate": 1.5377280510275342e-05, + "loss": 0.3183, + "step": 14945 + }, + { + "epoch": 3.486473880597015, + "grad_norm": 0.36856292967098087, + "learning_rate": 1.5362665918200193e-05, + "loss": 0.3299, + "step": 14950 + }, + { + "epoch": 3.4876399253731343, + "grad_norm": 0.3686830865840385, + "learning_rate": 1.534805854494593e-05, + "loss": 0.3297, + "step": 14955 + }, + { + "epoch": 3.4888059701492535, + "grad_norm": 0.3494305535754593, + "learning_rate": 1.533345839920045e-05, + "loss": 0.3263, + "step": 14960 + }, + { + "epoch": 3.4899720149253732, + "grad_norm": 0.3753474242982273, + "learning_rate": 1.5318865489647383e-05, + "loss": 0.3263, + "step": 14965 + }, + { + "epoch": 3.4911380597014925, + "grad_norm": 0.36408345590552543, + "learning_rate": 1.5304279824966e-05, + "loss": 0.3363, + "step": 14970 + }, + { + "epoch": 3.4923041044776117, + "grad_norm": 0.37622496051105103, + "learning_rate": 1.528970141383131e-05, + "loss": 0.3534, + "step": 14975 + }, + { + "epoch": 3.4934701492537314, + "grad_norm": 0.3794392803979275, + "learning_rate": 1.5275130264913994e-05, + "loss": 0.3351, + "step": 14980 + }, + { + "epoch": 3.4946361940298507, + "grad_norm": 0.3859595842099221, + "learning_rate": 1.5260566386880413e-05, + "loss": 0.3314, + "step": 14985 + }, + { + "epoch": 3.49580223880597, + "grad_norm": 0.34602364135429065, + "learning_rate": 1.5246009788392606e-05, + "loss": 0.3209, + "step": 14990 + }, + { + "epoch": 3.4969682835820897, + "grad_norm": 0.3816376672551873, + "learning_rate": 1.5231460478108268e-05, + "loss": 0.3357, + "step": 14995 + }, + { + "epoch": 3.498134328358209, + "grad_norm": 0.3717461736813935, + "learning_rate": 1.5216918464680776e-05, + "loss": 0.3324, + "step": 15000 + }, + { + "epoch": 3.499300373134328, + "grad_norm": 0.33178416765381397, + "learning_rate": 1.520238375675917e-05, + "loss": 0.3181, + "step": 15005 + }, + { + "epoch": 3.500466417910448, + "grad_norm": 0.3903633798359936, + "learning_rate": 1.5187856362988123e-05, + "loss": 0.3407, + "step": 15010 + }, + { + "epoch": 3.501632462686567, + "grad_norm": 0.3367713124282168, + "learning_rate": 1.5173336292007994e-05, + "loss": 0.338, + "step": 15015 + }, + { + "epoch": 3.5027985074626864, + "grad_norm": 0.3544053714592569, + "learning_rate": 1.5158823552454737e-05, + "loss": 0.328, + "step": 15020 + }, + { + "epoch": 3.503964552238806, + "grad_norm": 0.35941095782907484, + "learning_rate": 1.5144318152959985e-05, + "loss": 0.3404, + "step": 15025 + }, + { + "epoch": 3.5051305970149254, + "grad_norm": 0.35440758227539454, + "learning_rate": 1.5129820102151e-05, + "loss": 0.3297, + "step": 15030 + }, + { + "epoch": 3.5062966417910446, + "grad_norm": 0.3591423084644365, + "learning_rate": 1.5115329408650658e-05, + "loss": 0.3252, + "step": 15035 + }, + { + "epoch": 3.5074626865671643, + "grad_norm": 0.3622317501692589, + "learning_rate": 1.5100846081077479e-05, + "loss": 0.3259, + "step": 15040 + }, + { + "epoch": 3.5086287313432836, + "grad_norm": 0.3758616159252475, + "learning_rate": 1.5086370128045574e-05, + "loss": 0.3331, + "step": 15045 + }, + { + "epoch": 3.509794776119403, + "grad_norm": 0.3474888575260031, + "learning_rate": 1.5071901558164692e-05, + "loss": 0.3435, + "step": 15050 + }, + { + "epoch": 3.5109608208955225, + "grad_norm": 0.35899589947351973, + "learning_rate": 1.5057440380040184e-05, + "loss": 0.3183, + "step": 15055 + }, + { + "epoch": 3.512126865671642, + "grad_norm": 0.38714229369622266, + "learning_rate": 1.5042986602273017e-05, + "loss": 0.3395, + "step": 15060 + }, + { + "epoch": 3.513292910447761, + "grad_norm": 0.3657904168301437, + "learning_rate": 1.502854023345972e-05, + "loss": 0.34, + "step": 15065 + }, + { + "epoch": 3.5144589552238807, + "grad_norm": 0.38156129937719535, + "learning_rate": 1.5014101282192452e-05, + "loss": 0.325, + "step": 15070 + }, + { + "epoch": 3.515625, + "grad_norm": 0.371536919613815, + "learning_rate": 1.4999669757058956e-05, + "loss": 0.3461, + "step": 15075 + }, + { + "epoch": 3.5167910447761193, + "grad_norm": 0.35176434893868985, + "learning_rate": 1.498524566664253e-05, + "loss": 0.3142, + "step": 15080 + }, + { + "epoch": 3.517957089552239, + "grad_norm": 0.33867449955543255, + "learning_rate": 1.4970829019522083e-05, + "loss": 0.312, + "step": 15085 + }, + { + "epoch": 3.519123134328358, + "grad_norm": 0.3640566878166673, + "learning_rate": 1.4956419824272083e-05, + "loss": 0.3373, + "step": 15090 + }, + { + "epoch": 3.5202891791044775, + "grad_norm": 0.3868816339197881, + "learning_rate": 1.4942018089462567e-05, + "loss": 0.3332, + "step": 15095 + }, + { + "epoch": 3.521455223880597, + "grad_norm": 0.3680703668899198, + "learning_rate": 1.4927623823659126e-05, + "loss": 0.3226, + "step": 15100 + }, + { + "epoch": 3.5226212686567164, + "grad_norm": 0.3745327814610825, + "learning_rate": 1.4913237035422926e-05, + "loss": 0.3312, + "step": 15105 + }, + { + "epoch": 3.5237873134328357, + "grad_norm": 0.3615791358363849, + "learning_rate": 1.4898857733310673e-05, + "loss": 0.3322, + "step": 15110 + }, + { + "epoch": 3.5249533582089554, + "grad_norm": 0.38463221506641937, + "learning_rate": 1.4884485925874634e-05, + "loss": 0.3372, + "step": 15115 + }, + { + "epoch": 3.5261194029850746, + "grad_norm": 0.4251087903244426, + "learning_rate": 1.4870121621662594e-05, + "loss": 0.3422, + "step": 15120 + }, + { + "epoch": 3.527285447761194, + "grad_norm": 0.38294678196384374, + "learning_rate": 1.4855764829217894e-05, + "loss": 0.3285, + "step": 15125 + }, + { + "epoch": 3.5284514925373136, + "grad_norm": 0.3728894580708877, + "learning_rate": 1.4841415557079413e-05, + "loss": 0.3304, + "step": 15130 + }, + { + "epoch": 3.529617537313433, + "grad_norm": 0.3578395421103677, + "learning_rate": 1.482707381378154e-05, + "loss": 0.3469, + "step": 15135 + }, + { + "epoch": 3.530783582089552, + "grad_norm": 0.3685878381954686, + "learning_rate": 1.4812739607854199e-05, + "loss": 0.3351, + "step": 15140 + }, + { + "epoch": 3.5319496268656714, + "grad_norm": 0.33687964250194624, + "learning_rate": 1.479841294782282e-05, + "loss": 0.3138, + "step": 15145 + }, + { + "epoch": 3.533115671641791, + "grad_norm": 0.362544293885602, + "learning_rate": 1.4784093842208351e-05, + "loss": 0.3316, + "step": 15150 + }, + { + "epoch": 3.5342817164179103, + "grad_norm": 0.3548155459638462, + "learning_rate": 1.4769782299527252e-05, + "loss": 0.3386, + "step": 15155 + }, + { + "epoch": 3.53544776119403, + "grad_norm": 0.359730220407117, + "learning_rate": 1.4755478328291476e-05, + "loss": 0.3228, + "step": 15160 + }, + { + "epoch": 3.5366138059701493, + "grad_norm": 0.3670487320757613, + "learning_rate": 1.4741181937008485e-05, + "loss": 0.3143, + "step": 15165 + }, + { + "epoch": 3.5377798507462686, + "grad_norm": 0.3607450050322974, + "learning_rate": 1.4726893134181214e-05, + "loss": 0.3259, + "step": 15170 + }, + { + "epoch": 3.538945895522388, + "grad_norm": 0.3534104666076903, + "learning_rate": 1.4712611928308095e-05, + "loss": 0.3348, + "step": 15175 + }, + { + "epoch": 3.5401119402985075, + "grad_norm": 0.37555662273732066, + "learning_rate": 1.4698338327883044e-05, + "loss": 0.3283, + "step": 15180 + }, + { + "epoch": 3.5412779850746268, + "grad_norm": 0.3586619929521861, + "learning_rate": 1.4684072341395454e-05, + "loss": 0.3267, + "step": 15185 + }, + { + "epoch": 3.5424440298507465, + "grad_norm": 0.3739340903164566, + "learning_rate": 1.4669813977330193e-05, + "loss": 0.3187, + "step": 15190 + }, + { + "epoch": 3.5436100746268657, + "grad_norm": 0.3765370086078261, + "learning_rate": 1.4655563244167572e-05, + "loss": 0.3175, + "step": 15195 + }, + { + "epoch": 3.544776119402985, + "grad_norm": 0.361939433787893, + "learning_rate": 1.4641320150383391e-05, + "loss": 0.3279, + "step": 15200 + }, + { + "epoch": 3.5459421641791042, + "grad_norm": 0.33771998888651455, + "learning_rate": 1.4627084704448895e-05, + "loss": 0.3279, + "step": 15205 + }, + { + "epoch": 3.547108208955224, + "grad_norm": 0.3679615023683131, + "learning_rate": 1.461285691483078e-05, + "loss": 0.3289, + "step": 15210 + }, + { + "epoch": 3.548274253731343, + "grad_norm": 0.35695586140050806, + "learning_rate": 1.4598636789991199e-05, + "loss": 0.3318, + "step": 15215 + }, + { + "epoch": 3.549440298507463, + "grad_norm": 0.3470314610034076, + "learning_rate": 1.458442433838772e-05, + "loss": 0.3303, + "step": 15220 + }, + { + "epoch": 3.550606343283582, + "grad_norm": 0.35117993739354486, + "learning_rate": 1.4570219568473376e-05, + "loss": 0.3244, + "step": 15225 + }, + { + "epoch": 3.5517723880597014, + "grad_norm": 0.35359037370879304, + "learning_rate": 1.4556022488696614e-05, + "loss": 0.3231, + "step": 15230 + }, + { + "epoch": 3.5529384328358207, + "grad_norm": 0.36067931917478974, + "learning_rate": 1.4541833107501312e-05, + "loss": 0.3332, + "step": 15235 + }, + { + "epoch": 3.5541044776119404, + "grad_norm": 0.3711918788865033, + "learning_rate": 1.4527651433326786e-05, + "loss": 0.334, + "step": 15240 + }, + { + "epoch": 3.5552705223880596, + "grad_norm": 0.3638690629653978, + "learning_rate": 1.4513477474607729e-05, + "loss": 0.3253, + "step": 15245 + }, + { + "epoch": 3.5564365671641793, + "grad_norm": 0.34811799023017653, + "learning_rate": 1.4499311239774277e-05, + "loss": 0.3351, + "step": 15250 + }, + { + "epoch": 3.5576026119402986, + "grad_norm": 0.37106387121399587, + "learning_rate": 1.4485152737251972e-05, + "loss": 0.339, + "step": 15255 + }, + { + "epoch": 3.558768656716418, + "grad_norm": 0.361600670652692, + "learning_rate": 1.4471001975461735e-05, + "loss": 0.3438, + "step": 15260 + }, + { + "epoch": 3.559934701492537, + "grad_norm": 0.3637800703363995, + "learning_rate": 1.4456858962819897e-05, + "loss": 0.3368, + "step": 15265 + }, + { + "epoch": 3.561100746268657, + "grad_norm": 0.3796780001513974, + "learning_rate": 1.4442723707738199e-05, + "loss": 0.328, + "step": 15270 + }, + { + "epoch": 3.562266791044776, + "grad_norm": 0.36835448631835566, + "learning_rate": 1.4428596218623722e-05, + "loss": 0.3448, + "step": 15275 + }, + { + "epoch": 3.5634328358208958, + "grad_norm": 0.3577276120644672, + "learning_rate": 1.4414476503878968e-05, + "loss": 0.3372, + "step": 15280 + }, + { + "epoch": 3.564598880597015, + "grad_norm": 0.3613767996407675, + "learning_rate": 1.4400364571901803e-05, + "loss": 0.3307, + "step": 15285 + }, + { + "epoch": 3.5657649253731343, + "grad_norm": 0.3677552637842849, + "learning_rate": 1.4386260431085457e-05, + "loss": 0.338, + "step": 15290 + }, + { + "epoch": 3.5669309701492535, + "grad_norm": 0.356908452833887, + "learning_rate": 1.4372164089818546e-05, + "loss": 0.3427, + "step": 15295 + }, + { + "epoch": 3.5680970149253732, + "grad_norm": 0.36206965948095843, + "learning_rate": 1.4358075556485016e-05, + "loss": 0.3376, + "step": 15300 + }, + { + "epoch": 3.5692630597014925, + "grad_norm": 0.3624327624630281, + "learning_rate": 1.4343994839464192e-05, + "loss": 0.3206, + "step": 15305 + }, + { + "epoch": 3.570429104477612, + "grad_norm": 0.38006948309906924, + "learning_rate": 1.4329921947130748e-05, + "loss": 0.3212, + "step": 15310 + }, + { + "epoch": 3.5715951492537314, + "grad_norm": 0.38480622442127016, + "learning_rate": 1.43158568878547e-05, + "loss": 0.3241, + "step": 15315 + }, + { + "epoch": 3.5727611940298507, + "grad_norm": 0.34484025763709397, + "learning_rate": 1.430179967000141e-05, + "loss": 0.3299, + "step": 15320 + }, + { + "epoch": 3.57392723880597, + "grad_norm": 0.3499694142962118, + "learning_rate": 1.4287750301931557e-05, + "loss": 0.3321, + "step": 15325 + }, + { + "epoch": 3.5750932835820897, + "grad_norm": 0.3398013700826958, + "learning_rate": 1.4273708792001182e-05, + "loss": 0.3316, + "step": 15330 + }, + { + "epoch": 3.576259328358209, + "grad_norm": 0.3760552529788722, + "learning_rate": 1.4259675148561627e-05, + "loss": 0.3492, + "step": 15335 + }, + { + "epoch": 3.5774253731343286, + "grad_norm": 0.38859124118217825, + "learning_rate": 1.424564937995957e-05, + "loss": 0.3479, + "step": 15340 + }, + { + "epoch": 3.578591417910448, + "grad_norm": 0.35942793969687326, + "learning_rate": 1.4231631494537007e-05, + "loss": 0.3322, + "step": 15345 + }, + { + "epoch": 3.579757462686567, + "grad_norm": 0.36194333425110714, + "learning_rate": 1.4217621500631222e-05, + "loss": 0.3354, + "step": 15350 + }, + { + "epoch": 3.5809235074626864, + "grad_norm": 0.3459759448708475, + "learning_rate": 1.4203619406574833e-05, + "loss": 0.3231, + "step": 15355 + }, + { + "epoch": 3.582089552238806, + "grad_norm": 0.3589072630519182, + "learning_rate": 1.4189625220695746e-05, + "loss": 0.3348, + "step": 15360 + }, + { + "epoch": 3.5832555970149254, + "grad_norm": 0.3677150312874064, + "learning_rate": 1.4175638951317166e-05, + "loss": 0.3326, + "step": 15365 + }, + { + "epoch": 3.5844216417910446, + "grad_norm": 0.3579009984342167, + "learning_rate": 1.41616606067576e-05, + "loss": 0.3427, + "step": 15370 + }, + { + "epoch": 3.5855876865671643, + "grad_norm": 0.411591501910976, + "learning_rate": 1.4147690195330815e-05, + "loss": 0.3271, + "step": 15375 + }, + { + "epoch": 3.5867537313432836, + "grad_norm": 0.38433124951876196, + "learning_rate": 1.413372772534588e-05, + "loss": 0.3272, + "step": 15380 + }, + { + "epoch": 3.587919776119403, + "grad_norm": 0.3634833176942538, + "learning_rate": 1.411977320510714e-05, + "loss": 0.3432, + "step": 15385 + }, + { + "epoch": 3.5890858208955225, + "grad_norm": 0.3668720363682665, + "learning_rate": 1.410582664291421e-05, + "loss": 0.3276, + "step": 15390 + }, + { + "epoch": 3.590251865671642, + "grad_norm": 0.3638320437184362, + "learning_rate": 1.4091888047061974e-05, + "loss": 0.3343, + "step": 15395 + }, + { + "epoch": 3.591417910447761, + "grad_norm": 0.3524707695867182, + "learning_rate": 1.4077957425840563e-05, + "loss": 0.3213, + "step": 15400 + }, + { + "epoch": 3.5925839552238807, + "grad_norm": 0.3621898628153211, + "learning_rate": 1.406403478753538e-05, + "loss": 0.3194, + "step": 15405 + }, + { + "epoch": 3.59375, + "grad_norm": 0.3590742712912799, + "learning_rate": 1.4050120140427081e-05, + "loss": 0.3261, + "step": 15410 + }, + { + "epoch": 3.5949160447761193, + "grad_norm": 0.3450912029468673, + "learning_rate": 1.4036213492791561e-05, + "loss": 0.3256, + "step": 15415 + }, + { + "epoch": 3.596082089552239, + "grad_norm": 0.35088638446113596, + "learning_rate": 1.4022314852899968e-05, + "loss": 0.3297, + "step": 15420 + }, + { + "epoch": 3.597248134328358, + "grad_norm": 0.3472203270848133, + "learning_rate": 1.4008424229018668e-05, + "loss": 0.337, + "step": 15425 + }, + { + "epoch": 3.5984141791044775, + "grad_norm": 0.3589158404874403, + "learning_rate": 1.3994541629409275e-05, + "loss": 0.3109, + "step": 15430 + }, + { + "epoch": 3.599580223880597, + "grad_norm": 0.36760153151119807, + "learning_rate": 1.398066706232864e-05, + "loss": 0.3336, + "step": 15435 + }, + { + "epoch": 3.6007462686567164, + "grad_norm": 0.3960640249827398, + "learning_rate": 1.3966800536028802e-05, + "loss": 0.3493, + "step": 15440 + }, + { + "epoch": 3.6019123134328357, + "grad_norm": 0.3460082205141992, + "learning_rate": 1.395294205875705e-05, + "loss": 0.3, + "step": 15445 + }, + { + "epoch": 3.6030783582089554, + "grad_norm": 0.36391806609117766, + "learning_rate": 1.3939091638755882e-05, + "loss": 0.3232, + "step": 15450 + }, + { + "epoch": 3.6042444029850746, + "grad_norm": 0.36817682739749436, + "learning_rate": 1.3925249284262984e-05, + "loss": 0.3389, + "step": 15455 + }, + { + "epoch": 3.605410447761194, + "grad_norm": 0.3970887012135433, + "learning_rate": 1.3911415003511258e-05, + "loss": 0.3331, + "step": 15460 + }, + { + "epoch": 3.6065764925373136, + "grad_norm": 0.3598222387254086, + "learning_rate": 1.3897588804728818e-05, + "loss": 0.3395, + "step": 15465 + }, + { + "epoch": 3.607742537313433, + "grad_norm": 0.39486262469076977, + "learning_rate": 1.3883770696138946e-05, + "loss": 0.3502, + "step": 15470 + }, + { + "epoch": 3.608908582089552, + "grad_norm": 0.35882559754039134, + "learning_rate": 1.3869960685960118e-05, + "loss": 0.3341, + "step": 15475 + }, + { + "epoch": 3.6100746268656714, + "grad_norm": 0.36429890692316313, + "learning_rate": 1.3856158782406007e-05, + "loss": 0.3272, + "step": 15480 + }, + { + "epoch": 3.611240671641791, + "grad_norm": 0.35308271094610133, + "learning_rate": 1.3842364993685453e-05, + "loss": 0.3314, + "step": 15485 + }, + { + "epoch": 3.6124067164179103, + "grad_norm": 0.33863993816585075, + "learning_rate": 1.3828579328002473e-05, + "loss": 0.3325, + "step": 15490 + }, + { + "epoch": 3.61357276119403, + "grad_norm": 0.363113691762699, + "learning_rate": 1.3814801793556264e-05, + "loss": 0.3276, + "step": 15495 + }, + { + "epoch": 3.6147388059701493, + "grad_norm": 0.3417010384915563, + "learning_rate": 1.3801032398541153e-05, + "loss": 0.3263, + "step": 15500 + }, + { + "epoch": 3.6159048507462686, + "grad_norm": 0.35067492562279406, + "learning_rate": 1.3787271151146658e-05, + "loss": 0.3253, + "step": 15505 + }, + { + "epoch": 3.617070895522388, + "grad_norm": 0.3608403842297616, + "learning_rate": 1.3773518059557445e-05, + "loss": 0.3314, + "step": 15510 + }, + { + "epoch": 3.6182369402985075, + "grad_norm": 0.3371418126128464, + "learning_rate": 1.3759773131953321e-05, + "loss": 0.3263, + "step": 15515 + }, + { + "epoch": 3.6194029850746268, + "grad_norm": 0.3573099675524598, + "learning_rate": 1.3746036376509252e-05, + "loss": 0.3277, + "step": 15520 + }, + { + "epoch": 3.6205690298507465, + "grad_norm": 0.3757913562251166, + "learning_rate": 1.3732307801395322e-05, + "loss": 0.3222, + "step": 15525 + }, + { + "epoch": 3.6217350746268657, + "grad_norm": 0.3903498408572106, + "learning_rate": 1.3718587414776756e-05, + "loss": 0.3253, + "step": 15530 + }, + { + "epoch": 3.622901119402985, + "grad_norm": 0.373151775416442, + "learning_rate": 1.3704875224813928e-05, + "loss": 0.344, + "step": 15535 + }, + { + "epoch": 3.6240671641791042, + "grad_norm": 0.3573542232540521, + "learning_rate": 1.3691171239662315e-05, + "loss": 0.3349, + "step": 15540 + }, + { + "epoch": 3.625233208955224, + "grad_norm": 0.35338449471090716, + "learning_rate": 1.3677475467472534e-05, + "loss": 0.3384, + "step": 15545 + }, + { + "epoch": 3.626399253731343, + "grad_norm": 0.37427275029898005, + "learning_rate": 1.366378791639028e-05, + "loss": 0.3157, + "step": 15550 + }, + { + "epoch": 3.627565298507463, + "grad_norm": 0.38765675807836114, + "learning_rate": 1.3650108594556404e-05, + "loss": 0.345, + "step": 15555 + }, + { + "epoch": 3.628731343283582, + "grad_norm": 0.3533464330030186, + "learning_rate": 1.3636437510106836e-05, + "loss": 0.3269, + "step": 15560 + }, + { + "epoch": 3.6298973880597014, + "grad_norm": 0.3417358657942666, + "learning_rate": 1.362277467117261e-05, + "loss": 0.3244, + "step": 15565 + }, + { + "epoch": 3.6310634328358207, + "grad_norm": 0.3715215147670424, + "learning_rate": 1.3609120085879872e-05, + "loss": 0.3395, + "step": 15570 + }, + { + "epoch": 3.6322294776119404, + "grad_norm": 0.35037124144158394, + "learning_rate": 1.3595473762349825e-05, + "loss": 0.3173, + "step": 15575 + }, + { + "epoch": 3.6333955223880596, + "grad_norm": 0.35952911712874575, + "learning_rate": 1.3581835708698796e-05, + "loss": 0.3454, + "step": 15580 + }, + { + "epoch": 3.6345615671641793, + "grad_norm": 0.37880670504603425, + "learning_rate": 1.3568205933038164e-05, + "loss": 0.3403, + "step": 15585 + }, + { + "epoch": 3.6357276119402986, + "grad_norm": 0.35633675803326076, + "learning_rate": 1.3554584443474405e-05, + "loss": 0.333, + "step": 15590 + }, + { + "epoch": 3.636893656716418, + "grad_norm": 0.34864687126539257, + "learning_rate": 1.3540971248109063e-05, + "loss": 0.317, + "step": 15595 + }, + { + "epoch": 3.638059701492537, + "grad_norm": 0.3613190677360087, + "learning_rate": 1.352736635503873e-05, + "loss": 0.3223, + "step": 15600 + }, + { + "epoch": 3.639225746268657, + "grad_norm": 0.3493317201849097, + "learning_rate": 1.3513769772355083e-05, + "loss": 0.328, + "step": 15605 + }, + { + "epoch": 3.640391791044776, + "grad_norm": 0.36571371883536613, + "learning_rate": 1.3500181508144855e-05, + "loss": 0.3287, + "step": 15610 + }, + { + "epoch": 3.6415578358208958, + "grad_norm": 0.3535576982310634, + "learning_rate": 1.3486601570489809e-05, + "loss": 0.3372, + "step": 15615 + }, + { + "epoch": 3.642723880597015, + "grad_norm": 0.34118310103297367, + "learning_rate": 1.3473029967466779e-05, + "loss": 0.3278, + "step": 15620 + }, + { + "epoch": 3.6438899253731343, + "grad_norm": 0.38546871733126326, + "learning_rate": 1.3459466707147644e-05, + "loss": 0.3391, + "step": 15625 + }, + { + "epoch": 3.6450559701492535, + "grad_norm": 0.37287157064773213, + "learning_rate": 1.3445911797599293e-05, + "loss": 0.3387, + "step": 15630 + }, + { + "epoch": 3.6462220149253732, + "grad_norm": 0.36069452698571924, + "learning_rate": 1.3432365246883671e-05, + "loss": 0.3313, + "step": 15635 + }, + { + "epoch": 3.6473880597014925, + "grad_norm": 0.3605317083094569, + "learning_rate": 1.3418827063057754e-05, + "loss": 0.3184, + "step": 15640 + }, + { + "epoch": 3.648554104477612, + "grad_norm": 0.35748846974735504, + "learning_rate": 1.3405297254173532e-05, + "loss": 0.3337, + "step": 15645 + }, + { + "epoch": 3.6497201492537314, + "grad_norm": 0.42971535301545016, + "learning_rate": 1.3391775828278023e-05, + "loss": 0.3193, + "step": 15650 + }, + { + "epoch": 3.6508861940298507, + "grad_norm": 0.3833379583017852, + "learning_rate": 1.3378262793413237e-05, + "loss": 0.3307, + "step": 15655 + }, + { + "epoch": 3.65205223880597, + "grad_norm": 0.36711703155663405, + "learning_rate": 1.3364758157616219e-05, + "loss": 0.3193, + "step": 15660 + }, + { + "epoch": 3.6532182835820897, + "grad_norm": 0.40412254990119817, + "learning_rate": 1.3351261928919007e-05, + "loss": 0.3557, + "step": 15665 + }, + { + "epoch": 3.654384328358209, + "grad_norm": 0.3453863264301336, + "learning_rate": 1.3337774115348639e-05, + "loss": 0.3169, + "step": 15670 + }, + { + "epoch": 3.6555503731343286, + "grad_norm": 0.35121559249784134, + "learning_rate": 1.3324294724927155e-05, + "loss": 0.309, + "step": 15675 + }, + { + "epoch": 3.656716417910448, + "grad_norm": 0.3930320470487703, + "learning_rate": 1.3310823765671571e-05, + "loss": 0.3465, + "step": 15680 + }, + { + "epoch": 3.657882462686567, + "grad_norm": 0.35668151675568066, + "learning_rate": 1.32973612455939e-05, + "loss": 0.3269, + "step": 15685 + }, + { + "epoch": 3.6590485074626864, + "grad_norm": 0.41385254819495504, + "learning_rate": 1.3283907172701135e-05, + "loss": 0.3423, + "step": 15690 + }, + { + "epoch": 3.660214552238806, + "grad_norm": 0.3705111186390367, + "learning_rate": 1.3270461554995243e-05, + "loss": 0.3237, + "step": 15695 + }, + { + "epoch": 3.6613805970149254, + "grad_norm": 0.36339732325505775, + "learning_rate": 1.3257024400473162e-05, + "loss": 0.3204, + "step": 15700 + }, + { + "epoch": 3.6625466417910446, + "grad_norm": 0.3860933890392395, + "learning_rate": 1.3243595717126792e-05, + "loss": 0.3197, + "step": 15705 + }, + { + "epoch": 3.6637126865671643, + "grad_norm": 0.3520029127768971, + "learning_rate": 1.3230175512943e-05, + "loss": 0.318, + "step": 15710 + }, + { + "epoch": 3.6648787313432836, + "grad_norm": 0.34843853869679225, + "learning_rate": 1.3216763795903608e-05, + "loss": 0.3147, + "step": 15715 + }, + { + "epoch": 3.666044776119403, + "grad_norm": 0.36568601479642543, + "learning_rate": 1.3203360573985394e-05, + "loss": 0.3323, + "step": 15720 + }, + { + "epoch": 3.6672108208955225, + "grad_norm": 0.3542524262376503, + "learning_rate": 1.3189965855160088e-05, + "loss": 0.3172, + "step": 15725 + }, + { + "epoch": 3.668376865671642, + "grad_norm": 0.36937279614828306, + "learning_rate": 1.3176579647394338e-05, + "loss": 0.3225, + "step": 15730 + }, + { + "epoch": 3.669542910447761, + "grad_norm": 0.36723823203508654, + "learning_rate": 1.3163201958649757e-05, + "loss": 0.3244, + "step": 15735 + }, + { + "epoch": 3.6707089552238807, + "grad_norm": 0.39009194370239136, + "learning_rate": 1.314983279688288e-05, + "loss": 0.329, + "step": 15740 + }, + { + "epoch": 3.671875, + "grad_norm": 0.34495145793866044, + "learning_rate": 1.3136472170045171e-05, + "loss": 0.3367, + "step": 15745 + }, + { + "epoch": 3.6730410447761193, + "grad_norm": 0.3759980412237486, + "learning_rate": 1.3123120086083026e-05, + "loss": 0.3303, + "step": 15750 + }, + { + "epoch": 3.674207089552239, + "grad_norm": 0.382340831760479, + "learning_rate": 1.3109776552937742e-05, + "loss": 0.3374, + "step": 15755 + }, + { + "epoch": 3.675373134328358, + "grad_norm": 0.3767650864889917, + "learning_rate": 1.3096441578545544e-05, + "loss": 0.3368, + "step": 15760 + }, + { + "epoch": 3.6765391791044775, + "grad_norm": 0.3589395179024901, + "learning_rate": 1.308311517083756e-05, + "loss": 0.3263, + "step": 15765 + }, + { + "epoch": 3.677705223880597, + "grad_norm": 0.37011970417314655, + "learning_rate": 1.306979733773983e-05, + "loss": 0.3298, + "step": 15770 + }, + { + "epoch": 3.6788712686567164, + "grad_norm": 0.353542681220899, + "learning_rate": 1.3056488087173302e-05, + "loss": 0.3264, + "step": 15775 + }, + { + "epoch": 3.6800373134328357, + "grad_norm": 0.3554306940645688, + "learning_rate": 1.3043187427053788e-05, + "loss": 0.3228, + "step": 15780 + }, + { + "epoch": 3.6812033582089554, + "grad_norm": 0.36167627559379484, + "learning_rate": 1.3029895365292018e-05, + "loss": 0.3357, + "step": 15785 + }, + { + "epoch": 3.6823694029850746, + "grad_norm": 0.36949305735522564, + "learning_rate": 1.3016611909793613e-05, + "loss": 0.3346, + "step": 15790 + }, + { + "epoch": 3.683535447761194, + "grad_norm": 0.34689154298138003, + "learning_rate": 1.3003337068459037e-05, + "loss": 0.3222, + "step": 15795 + }, + { + "epoch": 3.6847014925373136, + "grad_norm": 0.3565317678427899, + "learning_rate": 1.2990070849183678e-05, + "loss": 0.3395, + "step": 15800 + }, + { + "epoch": 3.685867537313433, + "grad_norm": 0.34608924921795814, + "learning_rate": 1.2976813259857773e-05, + "loss": 0.3159, + "step": 15805 + }, + { + "epoch": 3.687033582089552, + "grad_norm": 0.36119933659445547, + "learning_rate": 1.2963564308366416e-05, + "loss": 0.3329, + "step": 15810 + }, + { + "epoch": 3.6881996268656714, + "grad_norm": 0.3777273397001088, + "learning_rate": 1.295032400258958e-05, + "loss": 0.3437, + "step": 15815 + }, + { + "epoch": 3.689365671641791, + "grad_norm": 0.36791880504170366, + "learning_rate": 1.2937092350402097e-05, + "loss": 0.3274, + "step": 15820 + }, + { + "epoch": 3.6905317164179103, + "grad_norm": 0.3650755251064847, + "learning_rate": 1.2923869359673646e-05, + "loss": 0.335, + "step": 15825 + }, + { + "epoch": 3.69169776119403, + "grad_norm": 0.3518973143656496, + "learning_rate": 1.2910655038268749e-05, + "loss": 0.3229, + "step": 15830 + }, + { + "epoch": 3.6928638059701493, + "grad_norm": 0.35874834227369784, + "learning_rate": 1.2897449394046773e-05, + "loss": 0.3326, + "step": 15835 + }, + { + "epoch": 3.6940298507462686, + "grad_norm": 0.35772673982122344, + "learning_rate": 1.2884252434861938e-05, + "loss": 0.3148, + "step": 15840 + }, + { + "epoch": 3.695195895522388, + "grad_norm": 0.3625737095208401, + "learning_rate": 1.2871064168563291e-05, + "loss": 0.3218, + "step": 15845 + }, + { + "epoch": 3.6963619402985075, + "grad_norm": 0.36284227434342214, + "learning_rate": 1.2857884602994706e-05, + "loss": 0.33, + "step": 15850 + }, + { + "epoch": 3.6975279850746268, + "grad_norm": 0.39173976995634996, + "learning_rate": 1.2844713745994871e-05, + "loss": 0.3414, + "step": 15855 + }, + { + "epoch": 3.6986940298507465, + "grad_norm": 0.3815036748658974, + "learning_rate": 1.2831551605397321e-05, + "loss": 0.3295, + "step": 15860 + }, + { + "epoch": 3.6998600746268657, + "grad_norm": 0.35444807700065517, + "learning_rate": 1.2818398189030383e-05, + "loss": 0.3137, + "step": 15865 + }, + { + "epoch": 3.701026119402985, + "grad_norm": 0.3524164632489728, + "learning_rate": 1.2805253504717213e-05, + "loss": 0.3244, + "step": 15870 + }, + { + "epoch": 3.7021921641791042, + "grad_norm": 0.3783306387762941, + "learning_rate": 1.2792117560275766e-05, + "loss": 0.3336, + "step": 15875 + }, + { + "epoch": 3.703358208955224, + "grad_norm": 0.37788785199661423, + "learning_rate": 1.2778990363518785e-05, + "loss": 0.3377, + "step": 15880 + }, + { + "epoch": 3.704524253731343, + "grad_norm": 0.38682248210669895, + "learning_rate": 1.2765871922253835e-05, + "loss": 0.3418, + "step": 15885 + }, + { + "epoch": 3.705690298507463, + "grad_norm": 0.36074312860848057, + "learning_rate": 1.2752762244283255e-05, + "loss": 0.3426, + "step": 15890 + }, + { + "epoch": 3.706856343283582, + "grad_norm": 0.3610923629832659, + "learning_rate": 1.2739661337404185e-05, + "loss": 0.333, + "step": 15895 + }, + { + "epoch": 3.7080223880597014, + "grad_norm": 0.3665839019632394, + "learning_rate": 1.2726569209408545e-05, + "loss": 0.3212, + "step": 15900 + }, + { + "epoch": 3.7091884328358207, + "grad_norm": 0.36768178961895553, + "learning_rate": 1.2713485868083014e-05, + "loss": 0.3409, + "step": 15905 + }, + { + "epoch": 3.7103544776119404, + "grad_norm": 0.3849882925587323, + "learning_rate": 1.2700411321209078e-05, + "loss": 0.3311, + "step": 15910 + }, + { + "epoch": 3.7115205223880596, + "grad_norm": 0.37112550059375615, + "learning_rate": 1.2687345576562965e-05, + "loss": 0.3433, + "step": 15915 + }, + { + "epoch": 3.7126865671641793, + "grad_norm": 0.34565291867874093, + "learning_rate": 1.2674288641915688e-05, + "loss": 0.3118, + "step": 15920 + }, + { + "epoch": 3.7138526119402986, + "grad_norm": 0.37740723081110616, + "learning_rate": 1.2661240525033016e-05, + "loss": 0.3371, + "step": 15925 + }, + { + "epoch": 3.715018656716418, + "grad_norm": 0.3655720709530991, + "learning_rate": 1.264820123367545e-05, + "loss": 0.3408, + "step": 15930 + }, + { + "epoch": 3.716184701492537, + "grad_norm": 0.3616587077451775, + "learning_rate": 1.2635170775598271e-05, + "loss": 0.3221, + "step": 15935 + }, + { + "epoch": 3.717350746268657, + "grad_norm": 0.37889750941780176, + "learning_rate": 1.26221491585515e-05, + "loss": 0.3298, + "step": 15940 + }, + { + "epoch": 3.718516791044776, + "grad_norm": 0.41774288823596173, + "learning_rate": 1.2609136390279886e-05, + "loss": 0.3328, + "step": 15945 + }, + { + "epoch": 3.7196828358208958, + "grad_norm": 0.3780901090472747, + "learning_rate": 1.2596132478522938e-05, + "loss": 0.3375, + "step": 15950 + }, + { + "epoch": 3.720848880597015, + "grad_norm": 0.3557886701922332, + "learning_rate": 1.258313743101487e-05, + "loss": 0.3282, + "step": 15955 + }, + { + "epoch": 3.7220149253731343, + "grad_norm": 0.3581564980288118, + "learning_rate": 1.2570151255484639e-05, + "loss": 0.3329, + "step": 15960 + }, + { + "epoch": 3.7231809701492535, + "grad_norm": 0.36383138312350627, + "learning_rate": 1.2557173959655932e-05, + "loss": 0.3251, + "step": 15965 + }, + { + "epoch": 3.7243470149253732, + "grad_norm": 0.3675587335921228, + "learning_rate": 1.2544205551247148e-05, + "loss": 0.3313, + "step": 15970 + }, + { + "epoch": 3.7255130597014925, + "grad_norm": 0.35992698182812694, + "learning_rate": 1.253124603797139e-05, + "loss": 0.3217, + "step": 15975 + }, + { + "epoch": 3.726679104477612, + "grad_norm": 0.36245092199613566, + "learning_rate": 1.251829542753648e-05, + "loss": 0.3327, + "step": 15980 + }, + { + "epoch": 3.7278451492537314, + "grad_norm": 0.36628656492309, + "learning_rate": 1.2505353727644958e-05, + "loss": 0.3277, + "step": 15985 + }, + { + "epoch": 3.7290111940298507, + "grad_norm": 0.3764789038085975, + "learning_rate": 1.249242094599404e-05, + "loss": 0.3411, + "step": 15990 + }, + { + "epoch": 3.73017723880597, + "grad_norm": 0.3551201730830275, + "learning_rate": 1.2479497090275643e-05, + "loss": 0.3267, + "step": 15995 + }, + { + "epoch": 3.7313432835820897, + "grad_norm": 0.37070614924990003, + "learning_rate": 1.246658216817639e-05, + "loss": 0.3459, + "step": 16000 + }, + { + "epoch": 3.732509328358209, + "grad_norm": 0.38709253175875064, + "learning_rate": 1.2453676187377588e-05, + "loss": 0.3208, + "step": 16005 + }, + { + "epoch": 3.7336753731343286, + "grad_norm": 0.3655769446848934, + "learning_rate": 1.2440779155555202e-05, + "loss": 0.329, + "step": 16010 + }, + { + "epoch": 3.734841417910448, + "grad_norm": 0.33993270878017057, + "learning_rate": 1.24278910803799e-05, + "loss": 0.3296, + "step": 16015 + }, + { + "epoch": 3.736007462686567, + "grad_norm": 0.36181420058602254, + "learning_rate": 1.2415011969517016e-05, + "loss": 0.3464, + "step": 16020 + }, + { + "epoch": 3.7371735074626864, + "grad_norm": 0.3663382714754168, + "learning_rate": 1.2402141830626547e-05, + "loss": 0.3276, + "step": 16025 + }, + { + "epoch": 3.738339552238806, + "grad_norm": 0.3866828361307577, + "learning_rate": 1.2389280671363175e-05, + "loss": 0.3224, + "step": 16030 + }, + { + "epoch": 3.7395055970149254, + "grad_norm": 0.36179016855606844, + "learning_rate": 1.2376428499376201e-05, + "loss": 0.3252, + "step": 16035 + }, + { + "epoch": 3.7406716417910446, + "grad_norm": 0.3524771337224453, + "learning_rate": 1.2363585322309615e-05, + "loss": 0.3273, + "step": 16040 + }, + { + "epoch": 3.7418376865671643, + "grad_norm": 0.35460626714970217, + "learning_rate": 1.2350751147802047e-05, + "loss": 0.3315, + "step": 16045 + }, + { + "epoch": 3.7430037313432836, + "grad_norm": 0.3697576134505472, + "learning_rate": 1.2337925983486768e-05, + "loss": 0.3229, + "step": 16050 + }, + { + "epoch": 3.744169776119403, + "grad_norm": 0.3510243176952665, + "learning_rate": 1.2325109836991703e-05, + "loss": 0.3233, + "step": 16055 + }, + { + "epoch": 3.7453358208955225, + "grad_norm": 0.3606194166769512, + "learning_rate": 1.2312302715939394e-05, + "loss": 0.327, + "step": 16060 + }, + { + "epoch": 3.746501865671642, + "grad_norm": 0.3766537672012198, + "learning_rate": 1.2299504627947029e-05, + "loss": 0.3225, + "step": 16065 + }, + { + "epoch": 3.747667910447761, + "grad_norm": 0.3601936128603242, + "learning_rate": 1.2286715580626418e-05, + "loss": 0.3367, + "step": 16070 + }, + { + "epoch": 3.7488339552238807, + "grad_norm": 0.36101305872400397, + "learning_rate": 1.2273935581584e-05, + "loss": 0.3334, + "step": 16075 + }, + { + "epoch": 3.75, + "grad_norm": 0.3606697833578396, + "learning_rate": 1.2261164638420832e-05, + "loss": 0.3201, + "step": 16080 + }, + { + "epoch": 3.7511660447761193, + "grad_norm": 0.36056632999574745, + "learning_rate": 1.2248402758732568e-05, + "loss": 0.3292, + "step": 16085 + }, + { + "epoch": 3.752332089552239, + "grad_norm": 0.3570610193262246, + "learning_rate": 1.2235649950109492e-05, + "loss": 0.3354, + "step": 16090 + }, + { + "epoch": 3.753498134328358, + "grad_norm": 0.3569585132664504, + "learning_rate": 1.222290622013649e-05, + "loss": 0.3258, + "step": 16095 + }, + { + "epoch": 3.7546641791044775, + "grad_norm": 0.38364762652881995, + "learning_rate": 1.2210171576393037e-05, + "loss": 0.343, + "step": 16100 + }, + { + "epoch": 3.755830223880597, + "grad_norm": 0.3619653811600626, + "learning_rate": 1.2197446026453219e-05, + "loss": 0.3295, + "step": 16105 + }, + { + "epoch": 3.7569962686567164, + "grad_norm": 0.360349124413514, + "learning_rate": 1.2184729577885695e-05, + "loss": 0.3502, + "step": 16110 + }, + { + "epoch": 3.7581623134328357, + "grad_norm": 0.37133569103651815, + "learning_rate": 1.2172022238253727e-05, + "loss": 0.327, + "step": 16115 + }, + { + "epoch": 3.7593283582089554, + "grad_norm": 0.3792583674612013, + "learning_rate": 1.2159324015115148e-05, + "loss": 0.3421, + "step": 16120 + }, + { + "epoch": 3.7604944029850746, + "grad_norm": 0.36459360623121073, + "learning_rate": 1.2146634916022383e-05, + "loss": 0.3453, + "step": 16125 + }, + { + "epoch": 3.761660447761194, + "grad_norm": 0.3839162151450258, + "learning_rate": 1.2133954948522423e-05, + "loss": 0.3321, + "step": 16130 + }, + { + "epoch": 3.7628264925373136, + "grad_norm": 0.358302189283745, + "learning_rate": 1.2121284120156812e-05, + "loss": 0.3243, + "step": 16135 + }, + { + "epoch": 3.763992537313433, + "grad_norm": 0.36844151802635366, + "learning_rate": 1.210862243846168e-05, + "loss": 0.3294, + "step": 16140 + }, + { + "epoch": 3.765158582089552, + "grad_norm": 0.37835947628768996, + "learning_rate": 1.209596991096772e-05, + "loss": 0.3324, + "step": 16145 + }, + { + "epoch": 3.7663246268656714, + "grad_norm": 0.3985861036005318, + "learning_rate": 1.2083326545200154e-05, + "loss": 0.3464, + "step": 16150 + }, + { + "epoch": 3.767490671641791, + "grad_norm": 0.3860859803367432, + "learning_rate": 1.2070692348678776e-05, + "loss": 0.3297, + "step": 16155 + }, + { + "epoch": 3.7686567164179103, + "grad_norm": 0.3720051991308574, + "learning_rate": 1.205806732891793e-05, + "loss": 0.3236, + "step": 16160 + }, + { + "epoch": 3.76982276119403, + "grad_norm": 0.357634493214289, + "learning_rate": 1.2045451493426483e-05, + "loss": 0.3293, + "step": 16165 + }, + { + "epoch": 3.7709888059701493, + "grad_norm": 0.3478269872758963, + "learning_rate": 1.2032844849707853e-05, + "loss": 0.3133, + "step": 16170 + }, + { + "epoch": 3.7721548507462686, + "grad_norm": 0.3721615636676857, + "learning_rate": 1.202024740525999e-05, + "loss": 0.3353, + "step": 16175 + }, + { + "epoch": 3.773320895522388, + "grad_norm": 0.3429143624732521, + "learning_rate": 1.2007659167575377e-05, + "loss": 0.3142, + "step": 16180 + }, + { + "epoch": 3.7744869402985075, + "grad_norm": 0.35599678689392994, + "learning_rate": 1.1995080144141004e-05, + "loss": 0.3247, + "step": 16185 + }, + { + "epoch": 3.7756529850746268, + "grad_norm": 0.36873530185602066, + "learning_rate": 1.1982510342438395e-05, + "loss": 0.3289, + "step": 16190 + }, + { + "epoch": 3.7768190298507465, + "grad_norm": 0.34625541721057895, + "learning_rate": 1.1969949769943587e-05, + "loss": 0.3184, + "step": 16195 + }, + { + "epoch": 3.7779850746268657, + "grad_norm": 0.3757368747073787, + "learning_rate": 1.195739843412713e-05, + "loss": 0.3274, + "step": 16200 + }, + { + "epoch": 3.779151119402985, + "grad_norm": 0.3673938662320257, + "learning_rate": 1.1944856342454078e-05, + "loss": 0.3309, + "step": 16205 + }, + { + "epoch": 3.7803171641791042, + "grad_norm": 0.37229335411824105, + "learning_rate": 1.1932323502383978e-05, + "loss": 0.329, + "step": 16210 + }, + { + "epoch": 3.781483208955224, + "grad_norm": 0.38713291988033305, + "learning_rate": 1.1919799921370888e-05, + "loss": 0.333, + "step": 16215 + }, + { + "epoch": 3.782649253731343, + "grad_norm": 0.3637043924495309, + "learning_rate": 1.1907285606863351e-05, + "loss": 0.3226, + "step": 16220 + }, + { + "epoch": 3.783815298507463, + "grad_norm": 0.3474172831422929, + "learning_rate": 1.1894780566304406e-05, + "loss": 0.3314, + "step": 16225 + }, + { + "epoch": 3.784981343283582, + "grad_norm": 0.36606885899211933, + "learning_rate": 1.1882284807131576e-05, + "loss": 0.3375, + "step": 16230 + }, + { + "epoch": 3.7861473880597014, + "grad_norm": 0.3662270801385073, + "learning_rate": 1.1869798336776845e-05, + "loss": 0.3257, + "step": 16235 + }, + { + "epoch": 3.7873134328358207, + "grad_norm": 0.35687393795731764, + "learning_rate": 1.1857321162666692e-05, + "loss": 0.3295, + "step": 16240 + }, + { + "epoch": 3.7884794776119404, + "grad_norm": 0.3431823692654768, + "learning_rate": 1.1844853292222066e-05, + "loss": 0.3318, + "step": 16245 + }, + { + "epoch": 3.7896455223880596, + "grad_norm": 0.32490377735150194, + "learning_rate": 1.1832394732858377e-05, + "loss": 0.3114, + "step": 16250 + }, + { + "epoch": 3.7908115671641793, + "grad_norm": 0.3743464264848783, + "learning_rate": 1.1819945491985504e-05, + "loss": 0.3513, + "step": 16255 + }, + { + "epoch": 3.7919776119402986, + "grad_norm": 0.37000283732742667, + "learning_rate": 1.1807505577007765e-05, + "loss": 0.3303, + "step": 16260 + }, + { + "epoch": 3.793143656716418, + "grad_norm": 0.3928333375430292, + "learning_rate": 1.179507499532395e-05, + "loss": 0.3459, + "step": 16265 + }, + { + "epoch": 3.794309701492537, + "grad_norm": 0.3536567995936902, + "learning_rate": 1.1782653754327295e-05, + "loss": 0.3365, + "step": 16270 + }, + { + "epoch": 3.795475746268657, + "grad_norm": 0.3625995531356676, + "learning_rate": 1.1770241861405475e-05, + "loss": 0.3329, + "step": 16275 + }, + { + "epoch": 3.796641791044776, + "grad_norm": 0.38334973081198714, + "learning_rate": 1.1757839323940616e-05, + "loss": 0.3325, + "step": 16280 + }, + { + "epoch": 3.7978078358208958, + "grad_norm": 0.34318316196978654, + "learning_rate": 1.1745446149309257e-05, + "loss": 0.326, + "step": 16285 + }, + { + "epoch": 3.798973880597015, + "grad_norm": 0.3501659069979656, + "learning_rate": 1.1733062344882396e-05, + "loss": 0.3428, + "step": 16290 + }, + { + "epoch": 3.8001399253731343, + "grad_norm": 0.3467977209951471, + "learning_rate": 1.1720687918025434e-05, + "loss": 0.3259, + "step": 16295 + }, + { + "epoch": 3.8013059701492535, + "grad_norm": 0.3628241490196815, + "learning_rate": 1.1708322876098215e-05, + "loss": 0.3284, + "step": 16300 + }, + { + "epoch": 3.8024720149253732, + "grad_norm": 0.34558505589509175, + "learning_rate": 1.1695967226454996e-05, + "loss": 0.3331, + "step": 16305 + }, + { + "epoch": 3.8036380597014925, + "grad_norm": 0.36797179727050766, + "learning_rate": 1.1683620976444426e-05, + "loss": 0.3382, + "step": 16310 + }, + { + "epoch": 3.804804104477612, + "grad_norm": 0.34973063617918587, + "learning_rate": 1.1671284133409592e-05, + "loss": 0.3321, + "step": 16315 + }, + { + "epoch": 3.8059701492537314, + "grad_norm": 0.3450380100052671, + "learning_rate": 1.1658956704687974e-05, + "loss": 0.3257, + "step": 16320 + }, + { + "epoch": 3.8071361940298507, + "grad_norm": 0.3710667274629841, + "learning_rate": 1.1646638697611462e-05, + "loss": 0.3464, + "step": 16325 + }, + { + "epoch": 3.80830223880597, + "grad_norm": 0.3729847127011631, + "learning_rate": 1.1634330119506317e-05, + "loss": 0.3402, + "step": 16330 + }, + { + "epoch": 3.8094682835820897, + "grad_norm": 0.35671892051640974, + "learning_rate": 1.1622030977693221e-05, + "loss": 0.3253, + "step": 16335 + }, + { + "epoch": 3.810634328358209, + "grad_norm": 0.36766966266855905, + "learning_rate": 1.1609741279487236e-05, + "loss": 0.3491, + "step": 16340 + }, + { + "epoch": 3.8118003731343286, + "grad_norm": 0.3835837834330343, + "learning_rate": 1.1597461032197788e-05, + "loss": 0.3219, + "step": 16345 + }, + { + "epoch": 3.812966417910448, + "grad_norm": 0.3650261367830838, + "learning_rate": 1.1585190243128707e-05, + "loss": 0.3342, + "step": 16350 + }, + { + "epoch": 3.814132462686567, + "grad_norm": 0.37029816407186616, + "learning_rate": 1.1572928919578186e-05, + "loss": 0.3174, + "step": 16355 + }, + { + "epoch": 3.8152985074626864, + "grad_norm": 0.36217941719615987, + "learning_rate": 1.15606770688388e-05, + "loss": 0.3289, + "step": 16360 + }, + { + "epoch": 3.816464552238806, + "grad_norm": 0.36013492606117087, + "learning_rate": 1.154843469819746e-05, + "loss": 0.3203, + "step": 16365 + }, + { + "epoch": 3.8176305970149254, + "grad_norm": 0.3864651132792706, + "learning_rate": 1.1536201814935473e-05, + "loss": 0.3164, + "step": 16370 + }, + { + "epoch": 3.8187966417910446, + "grad_norm": 0.35905870938331036, + "learning_rate": 1.152397842632848e-05, + "loss": 0.3278, + "step": 16375 + }, + { + "epoch": 3.8199626865671643, + "grad_norm": 0.3649833454823798, + "learning_rate": 1.1511764539646494e-05, + "loss": 0.3351, + "step": 16380 + }, + { + "epoch": 3.8211287313432836, + "grad_norm": 0.37236860547218065, + "learning_rate": 1.1499560162153866e-05, + "loss": 0.324, + "step": 16385 + }, + { + "epoch": 3.822294776119403, + "grad_norm": 0.37017362807070964, + "learning_rate": 1.1487365301109281e-05, + "loss": 0.338, + "step": 16390 + }, + { + "epoch": 3.8234608208955225, + "grad_norm": 0.35689226631205556, + "learning_rate": 1.147517996376578e-05, + "loss": 0.326, + "step": 16395 + }, + { + "epoch": 3.824626865671642, + "grad_norm": 0.3628218576176336, + "learning_rate": 1.1463004157370735e-05, + "loss": 0.32, + "step": 16400 + }, + { + "epoch": 3.825792910447761, + "grad_norm": 0.3578471571354439, + "learning_rate": 1.1450837889165852e-05, + "loss": 0.3315, + "step": 16405 + }, + { + "epoch": 3.8269589552238807, + "grad_norm": 0.3848440765017413, + "learning_rate": 1.1438681166387162e-05, + "loss": 0.3512, + "step": 16410 + }, + { + "epoch": 3.828125, + "grad_norm": 0.3571541370538188, + "learning_rate": 1.1426533996265008e-05, + "loss": 0.3242, + "step": 16415 + }, + { + "epoch": 3.8292910447761193, + "grad_norm": 0.3418990113679379, + "learning_rate": 1.1414396386024064e-05, + "loss": 0.3209, + "step": 16420 + }, + { + "epoch": 3.830457089552239, + "grad_norm": 0.33975118139796545, + "learning_rate": 1.140226834288332e-05, + "loss": 0.3358, + "step": 16425 + }, + { + "epoch": 3.831623134328358, + "grad_norm": 0.33976282322002593, + "learning_rate": 1.1390149874056065e-05, + "loss": 0.3429, + "step": 16430 + }, + { + "epoch": 3.8327891791044775, + "grad_norm": 0.34259215704067036, + "learning_rate": 1.1378040986749912e-05, + "loss": 0.3282, + "step": 16435 + }, + { + "epoch": 3.833955223880597, + "grad_norm": 0.3688992447689341, + "learning_rate": 1.1365941688166747e-05, + "loss": 0.3359, + "step": 16440 + }, + { + "epoch": 3.8351212686567164, + "grad_norm": 0.36270140916201354, + "learning_rate": 1.1353851985502777e-05, + "loss": 0.3314, + "step": 16445 + }, + { + "epoch": 3.8362873134328357, + "grad_norm": 0.344950401242646, + "learning_rate": 1.134177188594849e-05, + "loss": 0.3303, + "step": 16450 + }, + { + "epoch": 3.8374533582089554, + "grad_norm": 0.36785239477332343, + "learning_rate": 1.1329701396688669e-05, + "loss": 0.3516, + "step": 16455 + }, + { + "epoch": 3.8386194029850746, + "grad_norm": 0.38894096599593125, + "learning_rate": 1.1317640524902383e-05, + "loss": 0.3384, + "step": 16460 + }, + { + "epoch": 3.839785447761194, + "grad_norm": 0.36061789289459095, + "learning_rate": 1.1305589277762965e-05, + "loss": 0.3385, + "step": 16465 + }, + { + "epoch": 3.8409514925373136, + "grad_norm": 0.36253449553220574, + "learning_rate": 1.129354766243804e-05, + "loss": 0.3184, + "step": 16470 + }, + { + "epoch": 3.842117537313433, + "grad_norm": 0.37415484991471054, + "learning_rate": 1.1281515686089497e-05, + "loss": 0.3161, + "step": 16475 + }, + { + "epoch": 3.843283582089552, + "grad_norm": 0.4816301426672368, + "learning_rate": 1.1269493355873498e-05, + "loss": 0.3343, + "step": 16480 + }, + { + "epoch": 3.8444496268656714, + "grad_norm": 0.3447241723165985, + "learning_rate": 1.1257480678940469e-05, + "loss": 0.3168, + "step": 16485 + }, + { + "epoch": 3.845615671641791, + "grad_norm": 0.34973382369570744, + "learning_rate": 1.1245477662435076e-05, + "loss": 0.3167, + "step": 16490 + }, + { + "epoch": 3.8467817164179103, + "grad_norm": 0.35499020069663667, + "learning_rate": 1.123348431349626e-05, + "loss": 0.3287, + "step": 16495 + }, + { + "epoch": 3.84794776119403, + "grad_norm": 0.36365685080246757, + "learning_rate": 1.1221500639257204e-05, + "loss": 0.3285, + "step": 16500 + }, + { + "epoch": 3.8491138059701493, + "grad_norm": 0.3643265001488134, + "learning_rate": 1.1209526646845346e-05, + "loss": 0.3374, + "step": 16505 + }, + { + "epoch": 3.8502798507462686, + "grad_norm": 0.3673120965924131, + "learning_rate": 1.1197562343382341e-05, + "loss": 0.3283, + "step": 16510 + }, + { + "epoch": 3.851445895522388, + "grad_norm": 0.35885175027255184, + "learning_rate": 1.118560773598411e-05, + "loss": 0.3242, + "step": 16515 + }, + { + "epoch": 3.8526119402985075, + "grad_norm": 0.3722870845157102, + "learning_rate": 1.1173662831760798e-05, + "loss": 0.3436, + "step": 16520 + }, + { + "epoch": 3.8537779850746268, + "grad_norm": 0.3632231569664917, + "learning_rate": 1.1161727637816762e-05, + "loss": 0.3339, + "step": 16525 + }, + { + "epoch": 3.8549440298507465, + "grad_norm": 0.35643835047349254, + "learning_rate": 1.1149802161250607e-05, + "loss": 0.3343, + "step": 16530 + }, + { + "epoch": 3.8561100746268657, + "grad_norm": 0.362687010349951, + "learning_rate": 1.1137886409155158e-05, + "loss": 0.3254, + "step": 16535 + }, + { + "epoch": 3.857276119402985, + "grad_norm": 0.38194800139357077, + "learning_rate": 1.1125980388617425e-05, + "loss": 0.3464, + "step": 16540 + }, + { + "epoch": 3.8584421641791042, + "grad_norm": 0.37894934633642446, + "learning_rate": 1.1114084106718667e-05, + "loss": 0.3354, + "step": 16545 + }, + { + "epoch": 3.859608208955224, + "grad_norm": 0.3505124849219401, + "learning_rate": 1.1102197570534334e-05, + "loss": 0.3312, + "step": 16550 + }, + { + "epoch": 3.860774253731343, + "grad_norm": 0.3756841556190372, + "learning_rate": 1.1090320787134085e-05, + "loss": 0.3332, + "step": 16555 + }, + { + "epoch": 3.861940298507463, + "grad_norm": 0.36218154459593793, + "learning_rate": 1.1078453763581776e-05, + "loss": 0.3237, + "step": 16560 + }, + { + "epoch": 3.863106343283582, + "grad_norm": 0.3822273203480594, + "learning_rate": 1.1066596506935447e-05, + "loss": 0.3286, + "step": 16565 + }, + { + "epoch": 3.8642723880597014, + "grad_norm": 0.3670961602710075, + "learning_rate": 1.1054749024247348e-05, + "loss": 0.3312, + "step": 16570 + }, + { + "epoch": 3.8654384328358207, + "grad_norm": 0.35519039241521144, + "learning_rate": 1.1042911322563903e-05, + "loss": 0.3213, + "step": 16575 + }, + { + "epoch": 3.8666044776119404, + "grad_norm": 0.36159604405217605, + "learning_rate": 1.103108340892573e-05, + "loss": 0.3437, + "step": 16580 + }, + { + "epoch": 3.8677705223880596, + "grad_norm": 0.36936968190453584, + "learning_rate": 1.1019265290367616e-05, + "loss": 0.3329, + "step": 16585 + }, + { + "epoch": 3.8689365671641793, + "grad_norm": 0.34637316043003397, + "learning_rate": 1.100745697391852e-05, + "loss": 0.3095, + "step": 16590 + }, + { + "epoch": 3.8701026119402986, + "grad_norm": 0.3584097955671261, + "learning_rate": 1.099565846660158e-05, + "loss": 0.3254, + "step": 16595 + }, + { + "epoch": 3.871268656716418, + "grad_norm": 0.36626316083434135, + "learning_rate": 1.0983869775434091e-05, + "loss": 0.3384, + "step": 16600 + }, + { + "epoch": 3.872434701492537, + "grad_norm": 0.3433672160631697, + "learning_rate": 1.097209090742752e-05, + "loss": 0.3154, + "step": 16605 + }, + { + "epoch": 3.873600746268657, + "grad_norm": 0.36423084520484045, + "learning_rate": 1.096032186958749e-05, + "loss": 0.3298, + "step": 16610 + }, + { + "epoch": 3.874766791044776, + "grad_norm": 0.369634546025693, + "learning_rate": 1.0948562668913763e-05, + "loss": 0.3319, + "step": 16615 + }, + { + "epoch": 3.8759328358208958, + "grad_norm": 0.3581641937544491, + "learning_rate": 1.0936813312400263e-05, + "loss": 0.3448, + "step": 16620 + }, + { + "epoch": 3.877098880597015, + "grad_norm": 0.3447436422193593, + "learning_rate": 1.092507380703506e-05, + "loss": 0.3361, + "step": 16625 + }, + { + "epoch": 3.8782649253731343, + "grad_norm": 0.3642745484519943, + "learning_rate": 1.091334415980036e-05, + "loss": 0.3167, + "step": 16630 + }, + { + "epoch": 3.8794309701492535, + "grad_norm": 0.35772294652482234, + "learning_rate": 1.0901624377672513e-05, + "loss": 0.3161, + "step": 16635 + }, + { + "epoch": 3.8805970149253732, + "grad_norm": 0.35915974216675167, + "learning_rate": 1.0889914467621986e-05, + "loss": 0.3372, + "step": 16640 + }, + { + "epoch": 3.8817630597014925, + "grad_norm": 0.35121936939843745, + "learning_rate": 1.0878214436613387e-05, + "loss": 0.3234, + "step": 16645 + }, + { + "epoch": 3.882929104477612, + "grad_norm": 0.35372520150892794, + "learning_rate": 1.0866524291605452e-05, + "loss": 0.3323, + "step": 16650 + }, + { + "epoch": 3.8840951492537314, + "grad_norm": 0.3466279598128762, + "learning_rate": 1.0854844039551023e-05, + "loss": 0.3326, + "step": 16655 + }, + { + "epoch": 3.8852611940298507, + "grad_norm": 0.3580657133906849, + "learning_rate": 1.0843173687397079e-05, + "loss": 0.3216, + "step": 16660 + }, + { + "epoch": 3.88642723880597, + "grad_norm": 0.36608447027340973, + "learning_rate": 1.0831513242084681e-05, + "loss": 0.3398, + "step": 16665 + }, + { + "epoch": 3.8875932835820897, + "grad_norm": 0.3518514817912954, + "learning_rate": 1.0819862710549025e-05, + "loss": 0.3211, + "step": 16670 + }, + { + "epoch": 3.888759328358209, + "grad_norm": 0.3780953494971563, + "learning_rate": 1.0808222099719396e-05, + "loss": 0.3238, + "step": 16675 + }, + { + "epoch": 3.8899253731343286, + "grad_norm": 0.34541646400879394, + "learning_rate": 1.0796591416519192e-05, + "loss": 0.3244, + "step": 16680 + }, + { + "epoch": 3.891091417910448, + "grad_norm": 0.3710284220371676, + "learning_rate": 1.0784970667865882e-05, + "loss": 0.3321, + "step": 16685 + }, + { + "epoch": 3.892257462686567, + "grad_norm": 0.3494964492431871, + "learning_rate": 1.0773359860671054e-05, + "loss": 0.3256, + "step": 16690 + }, + { + "epoch": 3.8934235074626864, + "grad_norm": 0.3959199222512189, + "learning_rate": 1.0761759001840371e-05, + "loss": 0.339, + "step": 16695 + }, + { + "epoch": 3.894589552238806, + "grad_norm": 0.3682203271829885, + "learning_rate": 1.0750168098273569e-05, + "loss": 0.3258, + "step": 16700 + }, + { + "epoch": 3.8957555970149254, + "grad_norm": 0.36868967803369485, + "learning_rate": 1.073858715686448e-05, + "loss": 0.3446, + "step": 16705 + }, + { + "epoch": 3.8969216417910446, + "grad_norm": 0.38765007689293923, + "learning_rate": 1.0727016184501e-05, + "loss": 0.3419, + "step": 16710 + }, + { + "epoch": 3.8980876865671643, + "grad_norm": 0.36932697842412554, + "learning_rate": 1.0715455188065112e-05, + "loss": 0.3262, + "step": 16715 + }, + { + "epoch": 3.8992537313432836, + "grad_norm": 0.350074039553313, + "learning_rate": 1.0703904174432836e-05, + "loss": 0.3417, + "step": 16720 + }, + { + "epoch": 3.900419776119403, + "grad_norm": 0.3695761815222959, + "learning_rate": 1.069236315047428e-05, + "loss": 0.344, + "step": 16725 + }, + { + "epoch": 3.9015858208955225, + "grad_norm": 0.3730850744905737, + "learning_rate": 1.0680832123053603e-05, + "loss": 0.3328, + "step": 16730 + }, + { + "epoch": 3.902751865671642, + "grad_norm": 0.3766990919413252, + "learning_rate": 1.0669311099029014e-05, + "loss": 0.3539, + "step": 16735 + }, + { + "epoch": 3.903917910447761, + "grad_norm": 0.37727879408409504, + "learning_rate": 1.0657800085252789e-05, + "loss": 0.3434, + "step": 16740 + }, + { + "epoch": 3.9050839552238807, + "grad_norm": 0.38143995339382847, + "learning_rate": 1.064629908857122e-05, + "loss": 0.3311, + "step": 16745 + }, + { + "epoch": 3.90625, + "grad_norm": 0.34587440774360995, + "learning_rate": 1.0634808115824668e-05, + "loss": 0.3353, + "step": 16750 + }, + { + "epoch": 3.9074160447761193, + "grad_norm": 0.3496412727509482, + "learning_rate": 1.062332717384752e-05, + "loss": 0.3436, + "step": 16755 + }, + { + "epoch": 3.908582089552239, + "grad_norm": 0.40704435544810874, + "learning_rate": 1.0611856269468203e-05, + "loss": 0.3451, + "step": 16760 + }, + { + "epoch": 3.909748134328358, + "grad_norm": 0.3655776721725987, + "learning_rate": 1.0600395409509177e-05, + "loss": 0.3385, + "step": 16765 + }, + { + "epoch": 3.9109141791044775, + "grad_norm": 0.3569723384561409, + "learning_rate": 1.0588944600786907e-05, + "loss": 0.3056, + "step": 16770 + }, + { + "epoch": 3.912080223880597, + "grad_norm": 0.3677148317383257, + "learning_rate": 1.0577503850111903e-05, + "loss": 0.3172, + "step": 16775 + }, + { + "epoch": 3.9132462686567164, + "grad_norm": 0.3679093419193611, + "learning_rate": 1.0566073164288687e-05, + "loss": 0.3393, + "step": 16780 + }, + { + "epoch": 3.9144123134328357, + "grad_norm": 0.34795094404353005, + "learning_rate": 1.0554652550115788e-05, + "loss": 0.3246, + "step": 16785 + }, + { + "epoch": 3.9155783582089554, + "grad_norm": 0.37333639101376587, + "learning_rate": 1.0543242014385758e-05, + "loss": 0.3101, + "step": 16790 + }, + { + "epoch": 3.9167444029850746, + "grad_norm": 0.36861435656773384, + "learning_rate": 1.0531841563885134e-05, + "loss": 0.3241, + "step": 16795 + }, + { + "epoch": 3.917910447761194, + "grad_norm": 0.3542847891851535, + "learning_rate": 1.052045120539447e-05, + "loss": 0.3246, + "step": 16800 + }, + { + "epoch": 3.9190764925373136, + "grad_norm": 0.3726980335684039, + "learning_rate": 1.050907094568832e-05, + "loss": 0.3548, + "step": 16805 + }, + { + "epoch": 3.920242537313433, + "grad_norm": 0.3719498959276835, + "learning_rate": 1.0497700791535221e-05, + "loss": 0.3345, + "step": 16810 + }, + { + "epoch": 3.921408582089552, + "grad_norm": 0.3777979555885601, + "learning_rate": 1.0486340749697716e-05, + "loss": 0.3507, + "step": 16815 + }, + { + "epoch": 3.9225746268656714, + "grad_norm": 0.352674645064866, + "learning_rate": 1.0474990826932301e-05, + "loss": 0.33, + "step": 16820 + }, + { + "epoch": 3.923740671641791, + "grad_norm": 0.35508721372875535, + "learning_rate": 1.0463651029989492e-05, + "loss": 0.3431, + "step": 16825 + }, + { + "epoch": 3.9249067164179103, + "grad_norm": 0.41400108346055425, + "learning_rate": 1.0452321365613758e-05, + "loss": 0.3654, + "step": 16830 + }, + { + "epoch": 3.92607276119403, + "grad_norm": 0.3598885605631565, + "learning_rate": 1.0441001840543548e-05, + "loss": 0.3387, + "step": 16835 + }, + { + "epoch": 3.9272388059701493, + "grad_norm": 0.39152450463817323, + "learning_rate": 1.0429692461511298e-05, + "loss": 0.3262, + "step": 16840 + }, + { + "epoch": 3.9284048507462686, + "grad_norm": 0.36601038358208293, + "learning_rate": 1.041839323524337e-05, + "loss": 0.3324, + "step": 16845 + }, + { + "epoch": 3.929570895522388, + "grad_norm": 0.35711964271930985, + "learning_rate": 1.0407104168460116e-05, + "loss": 0.3293, + "step": 16850 + }, + { + "epoch": 3.9307369402985075, + "grad_norm": 0.33704013731851895, + "learning_rate": 1.0395825267875846e-05, + "loss": 0.3301, + "step": 16855 + }, + { + "epoch": 3.9319029850746268, + "grad_norm": 0.3412339094158753, + "learning_rate": 1.0384556540198825e-05, + "loss": 0.3257, + "step": 16860 + }, + { + "epoch": 3.9330690298507465, + "grad_norm": 0.3487698052040988, + "learning_rate": 1.0373297992131242e-05, + "loss": 0.3242, + "step": 16865 + }, + { + "epoch": 3.9342350746268657, + "grad_norm": 0.3398157030416527, + "learning_rate": 1.0362049630369259e-05, + "loss": 0.3136, + "step": 16870 + }, + { + "epoch": 3.935401119402985, + "grad_norm": 0.34943895508449124, + "learning_rate": 1.0350811461602974e-05, + "loss": 0.3341, + "step": 16875 + }, + { + "epoch": 3.9365671641791042, + "grad_norm": 0.34294387406591686, + "learning_rate": 1.033958349251641e-05, + "loss": 0.3284, + "step": 16880 + }, + { + "epoch": 3.937733208955224, + "grad_norm": 0.38092359504875767, + "learning_rate": 1.0328365729787536e-05, + "loss": 0.3376, + "step": 16885 + }, + { + "epoch": 3.938899253731343, + "grad_norm": 0.39175322808446295, + "learning_rate": 1.0317158180088254e-05, + "loss": 0.3427, + "step": 16890 + }, + { + "epoch": 3.940065298507463, + "grad_norm": 0.3642002771961428, + "learning_rate": 1.0305960850084373e-05, + "loss": 0.3311, + "step": 16895 + }, + { + "epoch": 3.941231343283582, + "grad_norm": 0.4044208411505232, + "learning_rate": 1.0294773746435638e-05, + "loss": 0.3531, + "step": 16900 + }, + { + "epoch": 3.9423973880597014, + "grad_norm": 0.3835140241070223, + "learning_rate": 1.0283596875795718e-05, + "loss": 0.346, + "step": 16905 + }, + { + "epoch": 3.9435634328358207, + "grad_norm": 0.3570930841402473, + "learning_rate": 1.0272430244812175e-05, + "loss": 0.3411, + "step": 16910 + }, + { + "epoch": 3.9447294776119404, + "grad_norm": 0.3360779667826955, + "learning_rate": 1.0261273860126514e-05, + "loss": 0.3252, + "step": 16915 + }, + { + "epoch": 3.9458955223880596, + "grad_norm": 0.3637844641033224, + "learning_rate": 1.0250127728374098e-05, + "loss": 0.322, + "step": 16920 + }, + { + "epoch": 3.9470615671641793, + "grad_norm": 0.370577876806029, + "learning_rate": 1.023899185618423e-05, + "loss": 0.3219, + "step": 16925 + }, + { + "epoch": 3.9482276119402986, + "grad_norm": 0.37931647342956126, + "learning_rate": 1.0227866250180105e-05, + "loss": 0.3302, + "step": 16930 + }, + { + "epoch": 3.949393656716418, + "grad_norm": 0.3518036227852333, + "learning_rate": 1.02167509169788e-05, + "loss": 0.3291, + "step": 16935 + }, + { + "epoch": 3.950559701492537, + "grad_norm": 0.3523764144241545, + "learning_rate": 1.02056458631913e-05, + "loss": 0.3423, + "step": 16940 + }, + { + "epoch": 3.951725746268657, + "grad_norm": 0.3593927237913662, + "learning_rate": 1.0194551095422447e-05, + "loss": 0.3388, + "step": 16945 + }, + { + "epoch": 3.952891791044776, + "grad_norm": 0.3664733667070302, + "learning_rate": 1.0183466620270996e-05, + "loss": 0.3357, + "step": 16950 + }, + { + "epoch": 3.9540578358208958, + "grad_norm": 0.3487621273512764, + "learning_rate": 1.0172392444329561e-05, + "loss": 0.3325, + "step": 16955 + }, + { + "epoch": 3.955223880597015, + "grad_norm": 0.3641172967334953, + "learning_rate": 1.0161328574184645e-05, + "loss": 0.3241, + "step": 16960 + }, + { + "epoch": 3.9563899253731343, + "grad_norm": 0.3767742238835195, + "learning_rate": 1.0150275016416613e-05, + "loss": 0.3205, + "step": 16965 + }, + { + "epoch": 3.9575559701492535, + "grad_norm": 0.3389958994897356, + "learning_rate": 1.0139231777599689e-05, + "loss": 0.3405, + "step": 16970 + }, + { + "epoch": 3.9587220149253732, + "grad_norm": 0.37157193171540465, + "learning_rate": 1.0128198864301976e-05, + "loss": 0.3334, + "step": 16975 + }, + { + "epoch": 3.9598880597014925, + "grad_norm": 0.3458020925348863, + "learning_rate": 1.0117176283085419e-05, + "loss": 0.3257, + "step": 16980 + }, + { + "epoch": 3.961054104477612, + "grad_norm": 0.36863265137328216, + "learning_rate": 1.0106164040505835e-05, + "loss": 0.3364, + "step": 16985 + }, + { + "epoch": 3.9622201492537314, + "grad_norm": 0.3808840293256123, + "learning_rate": 1.009516214311289e-05, + "loss": 0.3403, + "step": 16990 + }, + { + "epoch": 3.9633861940298507, + "grad_norm": 0.35766163428437153, + "learning_rate": 1.0084170597450073e-05, + "loss": 0.3224, + "step": 16995 + }, + { + "epoch": 3.96455223880597, + "grad_norm": 0.3469163213748774, + "learning_rate": 1.0073189410054742e-05, + "loss": 0.3327, + "step": 17000 + }, + { + "epoch": 3.9657182835820897, + "grad_norm": 0.3607061724727493, + "learning_rate": 1.0062218587458085e-05, + "loss": 0.3211, + "step": 17005 + }, + { + "epoch": 3.966884328358209, + "grad_norm": 0.36323009345054685, + "learning_rate": 1.0051258136185132e-05, + "loss": 0.3293, + "step": 17010 + }, + { + "epoch": 3.9680503731343286, + "grad_norm": 0.36402290316596186, + "learning_rate": 1.0040308062754738e-05, + "loss": 0.3341, + "step": 17015 + }, + { + "epoch": 3.969216417910448, + "grad_norm": 0.38681597015686653, + "learning_rate": 1.0029368373679583e-05, + "loss": 0.3323, + "step": 17020 + }, + { + "epoch": 3.970382462686567, + "grad_norm": 0.34623211602067594, + "learning_rate": 1.001843907546617e-05, + "loss": 0.3175, + "step": 17025 + }, + { + "epoch": 3.9715485074626864, + "grad_norm": 0.3499523440752185, + "learning_rate": 1.0007520174614836e-05, + "loss": 0.3237, + "step": 17030 + }, + { + "epoch": 3.972714552238806, + "grad_norm": 0.3605004792956158, + "learning_rate": 9.996611677619719e-06, + "loss": 0.3266, + "step": 17035 + }, + { + "epoch": 3.9738805970149254, + "grad_norm": 0.35267364742934776, + "learning_rate": 9.98571359096878e-06, + "loss": 0.3244, + "step": 17040 + }, + { + "epoch": 3.9750466417910446, + "grad_norm": 0.3708483035028362, + "learning_rate": 9.97482592114378e-06, + "loss": 0.3422, + "step": 17045 + }, + { + "epoch": 3.9762126865671643, + "grad_norm": 0.3648063731433488, + "learning_rate": 9.96394867462028e-06, + "loss": 0.3392, + "step": 17050 + }, + { + "epoch": 3.9773787313432836, + "grad_norm": 0.3458415816651768, + "learning_rate": 9.953081857867665e-06, + "loss": 0.3268, + "step": 17055 + }, + { + "epoch": 3.978544776119403, + "grad_norm": 0.38164120855475553, + "learning_rate": 9.94222547734909e-06, + "loss": 0.3112, + "step": 17060 + }, + { + "epoch": 3.9797108208955225, + "grad_norm": 0.3684841691014534, + "learning_rate": 9.93137953952151e-06, + "loss": 0.3326, + "step": 17065 + }, + { + "epoch": 3.980876865671642, + "grad_norm": 0.37748506035923324, + "learning_rate": 9.92054405083569e-06, + "loss": 0.3333, + "step": 17070 + }, + { + "epoch": 3.982042910447761, + "grad_norm": 0.35884362385574936, + "learning_rate": 9.90971901773614e-06, + "loss": 0.3334, + "step": 17075 + }, + { + "epoch": 3.9832089552238807, + "grad_norm": 0.349232885564346, + "learning_rate": 9.898904446661188e-06, + "loss": 0.3244, + "step": 17080 + }, + { + "epoch": 3.984375, + "grad_norm": 0.3803573479542466, + "learning_rate": 9.888100344042926e-06, + "loss": 0.3293, + "step": 17085 + }, + { + "epoch": 3.9855410447761193, + "grad_norm": 0.36835281448664914, + "learning_rate": 9.87730671630722e-06, + "loss": 0.3189, + "step": 17090 + }, + { + "epoch": 3.986707089552239, + "grad_norm": 0.3633115584459684, + "learning_rate": 9.866523569873708e-06, + "loss": 0.3342, + "step": 17095 + }, + { + "epoch": 3.987873134328358, + "grad_norm": 0.355961031124995, + "learning_rate": 9.855750911155784e-06, + "loss": 0.3202, + "step": 17100 + }, + { + "epoch": 3.9890391791044775, + "grad_norm": 0.36014989217850457, + "learning_rate": 9.844988746560615e-06, + "loss": 0.3189, + "step": 17105 + }, + { + "epoch": 3.990205223880597, + "grad_norm": 0.37534210089790565, + "learning_rate": 9.834237082489126e-06, + "loss": 0.3255, + "step": 17110 + }, + { + "epoch": 3.9913712686567164, + "grad_norm": 0.3688245738811151, + "learning_rate": 9.823495925335995e-06, + "loss": 0.3284, + "step": 17115 + }, + { + "epoch": 3.9925373134328357, + "grad_norm": 0.37833927184877747, + "learning_rate": 9.812765281489655e-06, + "loss": 0.3496, + "step": 17120 + }, + { + "epoch": 3.9937033582089554, + "grad_norm": 0.3692126708342929, + "learning_rate": 9.802045157332269e-06, + "loss": 0.3269, + "step": 17125 + }, + { + "epoch": 3.9948694029850746, + "grad_norm": 0.39539934417641975, + "learning_rate": 9.79133555923976e-06, + "loss": 0.3285, + "step": 17130 + }, + { + "epoch": 3.996035447761194, + "grad_norm": 0.37023818018411236, + "learning_rate": 9.780636493581797e-06, + "loss": 0.3383, + "step": 17135 + }, + { + "epoch": 3.9972014925373136, + "grad_norm": 0.36515860019550433, + "learning_rate": 9.76994796672176e-06, + "loss": 0.3269, + "step": 17140 + }, + { + "epoch": 3.998367537313433, + "grad_norm": 0.38032836933336583, + "learning_rate": 9.759269985016786e-06, + "loss": 0.3435, + "step": 17145 + }, + { + "epoch": 3.999533582089552, + "grad_norm": 0.34346026570957655, + "learning_rate": 9.748602554817721e-06, + "loss": 0.3335, + "step": 17150 + }, + { + "epoch": 4.000699626865671, + "grad_norm": 0.34961007074318995, + "learning_rate": 9.737945682469145e-06, + "loss": 0.2843, + "step": 17155 + }, + { + "epoch": 4.001865671641791, + "grad_norm": 0.3595797350406245, + "learning_rate": 9.72729937430936e-06, + "loss": 0.2724, + "step": 17160 + }, + { + "epoch": 4.003031716417911, + "grad_norm": 0.3757878590900649, + "learning_rate": 9.716663636670375e-06, + "loss": 0.2671, + "step": 17165 + }, + { + "epoch": 4.00419776119403, + "grad_norm": 0.4232679939550836, + "learning_rate": 9.706038475877938e-06, + "loss": 0.2768, + "step": 17170 + }, + { + "epoch": 4.005363805970149, + "grad_norm": 0.3853101808137293, + "learning_rate": 9.69542389825146e-06, + "loss": 0.278, + "step": 17175 + }, + { + "epoch": 4.0065298507462686, + "grad_norm": 0.38065011179529434, + "learning_rate": 9.6848199101041e-06, + "loss": 0.2743, + "step": 17180 + }, + { + "epoch": 4.007695895522388, + "grad_norm": 0.37438551006463544, + "learning_rate": 9.674226517742705e-06, + "loss": 0.2639, + "step": 17185 + }, + { + "epoch": 4.008861940298507, + "grad_norm": 0.37899651523451733, + "learning_rate": 9.66364372746781e-06, + "loss": 0.2697, + "step": 17190 + }, + { + "epoch": 4.010027985074627, + "grad_norm": 0.4365087875180718, + "learning_rate": 9.653071545573667e-06, + "loss": 0.2803, + "step": 17195 + }, + { + "epoch": 4.0111940298507465, + "grad_norm": 0.3721769349613592, + "learning_rate": 9.64250997834819e-06, + "loss": 0.2696, + "step": 17200 + }, + { + "epoch": 4.012360074626866, + "grad_norm": 0.37729886286105113, + "learning_rate": 9.631959032072997e-06, + "loss": 0.2667, + "step": 17205 + }, + { + "epoch": 4.013526119402985, + "grad_norm": 0.3903095843544387, + "learning_rate": 9.621418713023389e-06, + "loss": 0.2736, + "step": 17210 + }, + { + "epoch": 4.014692164179104, + "grad_norm": 0.38610021114229004, + "learning_rate": 9.61088902746835e-06, + "loss": 0.2657, + "step": 17215 + }, + { + "epoch": 4.0158582089552235, + "grad_norm": 0.3667713690500915, + "learning_rate": 9.60036998167052e-06, + "loss": 0.2486, + "step": 17220 + }, + { + "epoch": 4.017024253731344, + "grad_norm": 0.40451194763613085, + "learning_rate": 9.589861581886232e-06, + "loss": 0.2895, + "step": 17225 + }, + { + "epoch": 4.018190298507463, + "grad_norm": 0.38119312707674224, + "learning_rate": 9.579363834365484e-06, + "loss": 0.2578, + "step": 17230 + }, + { + "epoch": 4.019356343283582, + "grad_norm": 0.40688848855625515, + "learning_rate": 9.568876745351919e-06, + "loss": 0.2736, + "step": 17235 + }, + { + "epoch": 4.020522388059701, + "grad_norm": 0.3778376679826051, + "learning_rate": 9.558400321082863e-06, + "loss": 0.2754, + "step": 17240 + }, + { + "epoch": 4.021688432835821, + "grad_norm": 0.3899220307228565, + "learning_rate": 9.547934567789302e-06, + "loss": 0.2799, + "step": 17245 + }, + { + "epoch": 4.02285447761194, + "grad_norm": 0.4087913552661455, + "learning_rate": 9.537479491695845e-06, + "loss": 0.2848, + "step": 17250 + }, + { + "epoch": 4.02402052238806, + "grad_norm": 0.38211948963355963, + "learning_rate": 9.527035099020784e-06, + "loss": 0.2715, + "step": 17255 + }, + { + "epoch": 4.025186567164179, + "grad_norm": 0.39987617273591775, + "learning_rate": 9.516601395976038e-06, + "loss": 0.2812, + "step": 17260 + }, + { + "epoch": 4.026352611940299, + "grad_norm": 0.3746875540320509, + "learning_rate": 9.506178388767176e-06, + "loss": 0.2702, + "step": 17265 + }, + { + "epoch": 4.027518656716418, + "grad_norm": 0.39832317079454277, + "learning_rate": 9.495766083593407e-06, + "loss": 0.2637, + "step": 17270 + }, + { + "epoch": 4.028684701492537, + "grad_norm": 0.41295342388894385, + "learning_rate": 9.485364486647561e-06, + "loss": 0.2731, + "step": 17275 + }, + { + "epoch": 4.029850746268656, + "grad_norm": 0.40496088844307826, + "learning_rate": 9.474973604116112e-06, + "loss": 0.2705, + "step": 17280 + }, + { + "epoch": 4.0310167910447765, + "grad_norm": 0.38663525321521314, + "learning_rate": 9.464593442179162e-06, + "loss": 0.27, + "step": 17285 + }, + { + "epoch": 4.032182835820896, + "grad_norm": 0.4299620384176467, + "learning_rate": 9.454224007010428e-06, + "loss": 0.3039, + "step": 17290 + }, + { + "epoch": 4.033348880597015, + "grad_norm": 0.3728216932152512, + "learning_rate": 9.443865304777266e-06, + "loss": 0.2649, + "step": 17295 + }, + { + "epoch": 4.034514925373134, + "grad_norm": 0.3899351037140187, + "learning_rate": 9.433517341640621e-06, + "loss": 0.2695, + "step": 17300 + }, + { + "epoch": 4.0356809701492535, + "grad_norm": 0.4000363248231797, + "learning_rate": 9.423180123755064e-06, + "loss": 0.2696, + "step": 17305 + }, + { + "epoch": 4.036847014925373, + "grad_norm": 0.41445350192630664, + "learning_rate": 9.41285365726878e-06, + "loss": 0.2743, + "step": 17310 + }, + { + "epoch": 4.038013059701493, + "grad_norm": 0.41177140790199745, + "learning_rate": 9.40253794832356e-06, + "loss": 0.2861, + "step": 17315 + }, + { + "epoch": 4.039179104477612, + "grad_norm": 0.39093016206560616, + "learning_rate": 9.39223300305479e-06, + "loss": 0.2695, + "step": 17320 + }, + { + "epoch": 4.0403451492537314, + "grad_norm": 0.43359088740355056, + "learning_rate": 9.381938827591447e-06, + "loss": 0.2828, + "step": 17325 + }, + { + "epoch": 4.041511194029851, + "grad_norm": 0.3861700543791944, + "learning_rate": 9.371655428056122e-06, + "loss": 0.2701, + "step": 17330 + }, + { + "epoch": 4.04267723880597, + "grad_norm": 0.39946561197399155, + "learning_rate": 9.361382810564984e-06, + "loss": 0.2752, + "step": 17335 + }, + { + "epoch": 4.043843283582089, + "grad_norm": 0.38194852995606676, + "learning_rate": 9.351120981227788e-06, + "loss": 0.2748, + "step": 17340 + }, + { + "epoch": 4.045009328358209, + "grad_norm": 0.37606470497937694, + "learning_rate": 9.34086994614789e-06, + "loss": 0.2704, + "step": 17345 + }, + { + "epoch": 4.046175373134329, + "grad_norm": 0.38728957201754266, + "learning_rate": 9.330629711422196e-06, + "loss": 0.2714, + "step": 17350 + }, + { + "epoch": 4.047341417910448, + "grad_norm": 0.41514043457818217, + "learning_rate": 9.320400283141208e-06, + "loss": 0.2843, + "step": 17355 + }, + { + "epoch": 4.048507462686567, + "grad_norm": 0.4022222643818692, + "learning_rate": 9.310181667389003e-06, + "loss": 0.279, + "step": 17360 + }, + { + "epoch": 4.049673507462686, + "grad_norm": 0.38707845189965684, + "learning_rate": 9.299973870243222e-06, + "loss": 0.2662, + "step": 17365 + }, + { + "epoch": 4.050839552238806, + "grad_norm": 0.380700902845872, + "learning_rate": 9.289776897775074e-06, + "loss": 0.2764, + "step": 17370 + }, + { + "epoch": 4.052005597014926, + "grad_norm": 0.38072165881104764, + "learning_rate": 9.279590756049316e-06, + "loss": 0.2707, + "step": 17375 + }, + { + "epoch": 4.053171641791045, + "grad_norm": 0.3980632672807575, + "learning_rate": 9.269415451124283e-06, + "loss": 0.2724, + "step": 17380 + }, + { + "epoch": 4.054337686567164, + "grad_norm": 0.39328929956905995, + "learning_rate": 9.25925098905185e-06, + "loss": 0.2647, + "step": 17385 + }, + { + "epoch": 4.055503731343284, + "grad_norm": 0.3783499102829366, + "learning_rate": 9.249097375877458e-06, + "loss": 0.265, + "step": 17390 + }, + { + "epoch": 4.056669776119403, + "grad_norm": 0.3944809621780304, + "learning_rate": 9.23895461764009e-06, + "loss": 0.2648, + "step": 17395 + }, + { + "epoch": 4.057835820895522, + "grad_norm": 0.4155514147390988, + "learning_rate": 9.22882272037225e-06, + "loss": 0.2806, + "step": 17400 + }, + { + "epoch": 4.059001865671642, + "grad_norm": 0.40146981171808716, + "learning_rate": 9.218701690100017e-06, + "loss": 0.2842, + "step": 17405 + }, + { + "epoch": 4.0601679104477615, + "grad_norm": 0.42473378469312006, + "learning_rate": 9.208591532842995e-06, + "loss": 0.2695, + "step": 17410 + }, + { + "epoch": 4.061333955223881, + "grad_norm": 0.4132395156270794, + "learning_rate": 9.198492254614302e-06, + "loss": 0.2754, + "step": 17415 + }, + { + "epoch": 4.0625, + "grad_norm": 0.38857253688506155, + "learning_rate": 9.188403861420615e-06, + "loss": 0.2864, + "step": 17420 + }, + { + "epoch": 4.063666044776119, + "grad_norm": 0.3858487720074873, + "learning_rate": 9.178326359262124e-06, + "loss": 0.275, + "step": 17425 + }, + { + "epoch": 4.0648320895522385, + "grad_norm": 0.37561090884474024, + "learning_rate": 9.16825975413253e-06, + "loss": 0.2817, + "step": 17430 + }, + { + "epoch": 4.065998134328359, + "grad_norm": 0.40132068910283547, + "learning_rate": 9.158204052019069e-06, + "loss": 0.2781, + "step": 17435 + }, + { + "epoch": 4.067164179104478, + "grad_norm": 0.39343345573457644, + "learning_rate": 9.148159258902488e-06, + "loss": 0.2761, + "step": 17440 + }, + { + "epoch": 4.068330223880597, + "grad_norm": 0.3989119946989191, + "learning_rate": 9.138125380757046e-06, + "loss": 0.2675, + "step": 17445 + }, + { + "epoch": 4.069496268656716, + "grad_norm": 0.41008492201275865, + "learning_rate": 9.128102423550511e-06, + "loss": 0.2699, + "step": 17450 + }, + { + "epoch": 4.070662313432836, + "grad_norm": 0.42224674882948077, + "learning_rate": 9.118090393244147e-06, + "loss": 0.2849, + "step": 17455 + }, + { + "epoch": 4.071828358208955, + "grad_norm": 0.41417025410187047, + "learning_rate": 9.108089295792726e-06, + "loss": 0.2791, + "step": 17460 + }, + { + "epoch": 4.072994402985074, + "grad_norm": 0.38992394241051687, + "learning_rate": 9.098099137144522e-06, + "loss": 0.2682, + "step": 17465 + }, + { + "epoch": 4.074160447761194, + "grad_norm": 0.4002085567371614, + "learning_rate": 9.088119923241295e-06, + "loss": 0.2848, + "step": 17470 + }, + { + "epoch": 4.075326492537314, + "grad_norm": 0.4008502225666991, + "learning_rate": 9.07815166001831e-06, + "loss": 0.2786, + "step": 17475 + }, + { + "epoch": 4.076492537313433, + "grad_norm": 0.4198649960906934, + "learning_rate": 9.068194353404288e-06, + "loss": 0.2836, + "step": 17480 + }, + { + "epoch": 4.077658582089552, + "grad_norm": 0.38919466382635604, + "learning_rate": 9.058248009321464e-06, + "loss": 0.2759, + "step": 17485 + }, + { + "epoch": 4.078824626865671, + "grad_norm": 0.4040898916629864, + "learning_rate": 9.04831263368554e-06, + "loss": 0.2639, + "step": 17490 + }, + { + "epoch": 4.079990671641791, + "grad_norm": 0.4061639016868157, + "learning_rate": 9.038388232405699e-06, + "loss": 0.2686, + "step": 17495 + }, + { + "epoch": 4.081156716417911, + "grad_norm": 0.38873144956384365, + "learning_rate": 9.028474811384597e-06, + "loss": 0.2739, + "step": 17500 + }, + { + "epoch": 4.08232276119403, + "grad_norm": 0.3907113401610684, + "learning_rate": 9.01857237651835e-06, + "loss": 0.2719, + "step": 17505 + }, + { + "epoch": 4.083488805970149, + "grad_norm": 0.43037381088785187, + "learning_rate": 9.008680933696545e-06, + "loss": 0.2903, + "step": 17510 + }, + { + "epoch": 4.0846548507462686, + "grad_norm": 0.4153131214146846, + "learning_rate": 8.998800488802239e-06, + "loss": 0.2802, + "step": 17515 + }, + { + "epoch": 4.085820895522388, + "grad_norm": 0.4254650531632462, + "learning_rate": 8.98893104771194e-06, + "loss": 0.2836, + "step": 17520 + }, + { + "epoch": 4.086986940298507, + "grad_norm": 0.41021442918425227, + "learning_rate": 8.979072616295616e-06, + "loss": 0.2776, + "step": 17525 + }, + { + "epoch": 4.088152985074627, + "grad_norm": 0.4199890447183592, + "learning_rate": 8.969225200416678e-06, + "loss": 0.2781, + "step": 17530 + }, + { + "epoch": 4.0893190298507465, + "grad_norm": 0.4224494686795398, + "learning_rate": 8.959388805931993e-06, + "loss": 0.2829, + "step": 17535 + }, + { + "epoch": 4.090485074626866, + "grad_norm": 0.3965658671662444, + "learning_rate": 8.94956343869187e-06, + "loss": 0.2775, + "step": 17540 + }, + { + "epoch": 4.091651119402985, + "grad_norm": 0.3845067592761444, + "learning_rate": 8.939749104540065e-06, + "loss": 0.2519, + "step": 17545 + }, + { + "epoch": 4.092817164179104, + "grad_norm": 0.42278234806318643, + "learning_rate": 8.929945809313773e-06, + "loss": 0.2732, + "step": 17550 + }, + { + "epoch": 4.0939832089552235, + "grad_norm": 0.40257751752804294, + "learning_rate": 8.9201535588436e-06, + "loss": 0.2657, + "step": 17555 + }, + { + "epoch": 4.095149253731344, + "grad_norm": 0.3905140218344296, + "learning_rate": 8.910372358953614e-06, + "loss": 0.2659, + "step": 17560 + }, + { + "epoch": 4.096315298507463, + "grad_norm": 0.3970352823295486, + "learning_rate": 8.900602215461297e-06, + "loss": 0.2608, + "step": 17565 + }, + { + "epoch": 4.097481343283582, + "grad_norm": 0.4211184463964193, + "learning_rate": 8.890843134177555e-06, + "loss": 0.2796, + "step": 17570 + }, + { + "epoch": 4.098647388059701, + "grad_norm": 0.4148157751306683, + "learning_rate": 8.881095120906716e-06, + "loss": 0.2755, + "step": 17575 + }, + { + "epoch": 4.099813432835821, + "grad_norm": 0.3834600079099796, + "learning_rate": 8.871358181446519e-06, + "loss": 0.2617, + "step": 17580 + }, + { + "epoch": 4.10097947761194, + "grad_norm": 0.3839034265657398, + "learning_rate": 8.861632321588126e-06, + "loss": 0.2679, + "step": 17585 + }, + { + "epoch": 4.10214552238806, + "grad_norm": 0.3763802424840373, + "learning_rate": 8.851917547116111e-06, + "loss": 0.2804, + "step": 17590 + }, + { + "epoch": 4.103311567164179, + "grad_norm": 0.4062580262689815, + "learning_rate": 8.842213863808439e-06, + "loss": 0.2905, + "step": 17595 + }, + { + "epoch": 4.104477611940299, + "grad_norm": 0.39543484011614194, + "learning_rate": 8.83252127743649e-06, + "loss": 0.2653, + "step": 17600 + }, + { + "epoch": 4.105643656716418, + "grad_norm": 0.3950151746012995, + "learning_rate": 8.822839793765056e-06, + "loss": 0.2776, + "step": 17605 + }, + { + "epoch": 4.106809701492537, + "grad_norm": 0.42042659879200683, + "learning_rate": 8.813169418552294e-06, + "loss": 0.2794, + "step": 17610 + }, + { + "epoch": 4.107975746268656, + "grad_norm": 0.4085981463591945, + "learning_rate": 8.803510157549785e-06, + "loss": 0.2793, + "step": 17615 + }, + { + "epoch": 4.1091417910447765, + "grad_norm": 0.3962803355524766, + "learning_rate": 8.793862016502477e-06, + "loss": 0.271, + "step": 17620 + }, + { + "epoch": 4.110307835820896, + "grad_norm": 0.4046974830505113, + "learning_rate": 8.78422500114873e-06, + "loss": 0.2687, + "step": 17625 + }, + { + "epoch": 4.111473880597015, + "grad_norm": 0.409597263803634, + "learning_rate": 8.774599117220254e-06, + "loss": 0.2855, + "step": 17630 + }, + { + "epoch": 4.112639925373134, + "grad_norm": 0.416872029353015, + "learning_rate": 8.764984370442166e-06, + "loss": 0.2819, + "step": 17635 + }, + { + "epoch": 4.1138059701492535, + "grad_norm": 0.3727123254002717, + "learning_rate": 8.755380766532945e-06, + "loss": 0.2634, + "step": 17640 + }, + { + "epoch": 4.114972014925373, + "grad_norm": 0.38964046030172467, + "learning_rate": 8.745788311204444e-06, + "loss": 0.2758, + "step": 17645 + }, + { + "epoch": 4.116138059701493, + "grad_norm": 0.41981137533908836, + "learning_rate": 8.736207010161899e-06, + "loss": 0.2806, + "step": 17650 + }, + { + "epoch": 4.117304104477612, + "grad_norm": 0.39914056124124697, + "learning_rate": 8.726636869103884e-06, + "loss": 0.2712, + "step": 17655 + }, + { + "epoch": 4.1184701492537314, + "grad_norm": 0.3914937415659602, + "learning_rate": 8.71707789372236e-06, + "loss": 0.2678, + "step": 17660 + }, + { + "epoch": 4.119636194029851, + "grad_norm": 0.42513322775911844, + "learning_rate": 8.70753008970264e-06, + "loss": 0.2737, + "step": 17665 + }, + { + "epoch": 4.12080223880597, + "grad_norm": 0.3925064576087041, + "learning_rate": 8.697993462723392e-06, + "loss": 0.2845, + "step": 17670 + }, + { + "epoch": 4.121968283582089, + "grad_norm": 0.4068642934868272, + "learning_rate": 8.688468018456639e-06, + "loss": 0.2617, + "step": 17675 + }, + { + "epoch": 4.123134328358209, + "grad_norm": 0.39727368108579314, + "learning_rate": 8.678953762567739e-06, + "loss": 0.2743, + "step": 17680 + }, + { + "epoch": 4.124300373134329, + "grad_norm": 0.3928667850605243, + "learning_rate": 8.669450700715414e-06, + "loss": 0.2697, + "step": 17685 + }, + { + "epoch": 4.125466417910448, + "grad_norm": 0.4019362126336018, + "learning_rate": 8.659958838551722e-06, + "loss": 0.2643, + "step": 17690 + }, + { + "epoch": 4.126632462686567, + "grad_norm": 0.4272157250003413, + "learning_rate": 8.650478181722055e-06, + "loss": 0.2757, + "step": 17695 + }, + { + "epoch": 4.127798507462686, + "grad_norm": 0.4214354240713484, + "learning_rate": 8.641008735865153e-06, + "loss": 0.2791, + "step": 17700 + }, + { + "epoch": 4.128964552238806, + "grad_norm": 0.40725072931202166, + "learning_rate": 8.631550506613062e-06, + "loss": 0.2729, + "step": 17705 + }, + { + "epoch": 4.130130597014926, + "grad_norm": 0.3779088159590965, + "learning_rate": 8.62210349959119e-06, + "loss": 0.2689, + "step": 17710 + }, + { + "epoch": 4.131296641791045, + "grad_norm": 0.4156240483038565, + "learning_rate": 8.612667720418243e-06, + "loss": 0.2809, + "step": 17715 + }, + { + "epoch": 4.132462686567164, + "grad_norm": 0.40249218473646325, + "learning_rate": 8.60324317470627e-06, + "loss": 0.2738, + "step": 17720 + }, + { + "epoch": 4.133628731343284, + "grad_norm": 0.3934841325796097, + "learning_rate": 8.593829868060632e-06, + "loss": 0.2674, + "step": 17725 + }, + { + "epoch": 4.134794776119403, + "grad_norm": 0.4007634368816624, + "learning_rate": 8.584427806079988e-06, + "loss": 0.2751, + "step": 17730 + }, + { + "epoch": 4.135960820895522, + "grad_norm": 0.4138556471912693, + "learning_rate": 8.575036994356334e-06, + "loss": 0.2787, + "step": 17735 + }, + { + "epoch": 4.137126865671641, + "grad_norm": 0.40615935492996275, + "learning_rate": 8.565657438474963e-06, + "loss": 0.2776, + "step": 17740 + }, + { + "epoch": 4.1382929104477615, + "grad_norm": 0.3867587031964204, + "learning_rate": 8.556289144014474e-06, + "loss": 0.2638, + "step": 17745 + }, + { + "epoch": 4.139458955223881, + "grad_norm": 0.430155591472759, + "learning_rate": 8.546932116546775e-06, + "loss": 0.2793, + "step": 17750 + }, + { + "epoch": 4.140625, + "grad_norm": 0.40298935237119554, + "learning_rate": 8.53758636163706e-06, + "loss": 0.2644, + "step": 17755 + }, + { + "epoch": 4.141791044776119, + "grad_norm": 0.41138868654844857, + "learning_rate": 8.528251884843829e-06, + "loss": 0.2759, + "step": 17760 + }, + { + "epoch": 4.1429570895522385, + "grad_norm": 0.41005162569903975, + "learning_rate": 8.518928691718872e-06, + "loss": 0.2892, + "step": 17765 + }, + { + "epoch": 4.144123134328359, + "grad_norm": 0.38415735441945087, + "learning_rate": 8.509616787807263e-06, + "loss": 0.2731, + "step": 17770 + }, + { + "epoch": 4.145289179104478, + "grad_norm": 0.40812940858209573, + "learning_rate": 8.500316178647366e-06, + "loss": 0.2771, + "step": 17775 + }, + { + "epoch": 4.146455223880597, + "grad_norm": 0.4229080087039047, + "learning_rate": 8.491026869770832e-06, + "loss": 0.2775, + "step": 17780 + }, + { + "epoch": 4.147621268656716, + "grad_norm": 0.4162540906219848, + "learning_rate": 8.48174886670258e-06, + "loss": 0.2678, + "step": 17785 + }, + { + "epoch": 4.148787313432836, + "grad_norm": 0.4096473455980717, + "learning_rate": 8.472482174960808e-06, + "loss": 0.2834, + "step": 17790 + }, + { + "epoch": 4.149953358208955, + "grad_norm": 0.4024387626065952, + "learning_rate": 8.463226800056995e-06, + "loss": 0.2663, + "step": 17795 + }, + { + "epoch": 4.151119402985074, + "grad_norm": 0.39489254863081147, + "learning_rate": 8.453982747495881e-06, + "loss": 0.2764, + "step": 17800 + }, + { + "epoch": 4.152285447761194, + "grad_norm": 0.4185944125024017, + "learning_rate": 8.44475002277548e-06, + "loss": 0.2736, + "step": 17805 + }, + { + "epoch": 4.153451492537314, + "grad_norm": 0.38401269701391966, + "learning_rate": 8.435528631387052e-06, + "loss": 0.2761, + "step": 17810 + }, + { + "epoch": 4.154617537313433, + "grad_norm": 0.433321075218259, + "learning_rate": 8.426318578815128e-06, + "loss": 0.2767, + "step": 17815 + }, + { + "epoch": 4.155783582089552, + "grad_norm": 0.39581775967542454, + "learning_rate": 8.417119870537503e-06, + "loss": 0.2875, + "step": 17820 + }, + { + "epoch": 4.156949626865671, + "grad_norm": 0.38618524509810154, + "learning_rate": 8.407932512025207e-06, + "loss": 0.2662, + "step": 17825 + }, + { + "epoch": 4.158115671641791, + "grad_norm": 0.3984497079434106, + "learning_rate": 8.398756508742536e-06, + "loss": 0.2633, + "step": 17830 + }, + { + "epoch": 4.159281716417911, + "grad_norm": 0.3982649240862175, + "learning_rate": 8.38959186614702e-06, + "loss": 0.2698, + "step": 17835 + }, + { + "epoch": 4.16044776119403, + "grad_norm": 0.40275814991539843, + "learning_rate": 8.380438589689438e-06, + "loss": 0.2777, + "step": 17840 + }, + { + "epoch": 4.161613805970149, + "grad_norm": 0.40564226347858123, + "learning_rate": 8.371296684813806e-06, + "loss": 0.2688, + "step": 17845 + }, + { + "epoch": 4.1627798507462686, + "grad_norm": 0.40044806911939124, + "learning_rate": 8.36216615695738e-06, + "loss": 0.281, + "step": 17850 + }, + { + "epoch": 4.163945895522388, + "grad_norm": 0.3777701394706234, + "learning_rate": 8.353047011550654e-06, + "loss": 0.2792, + "step": 17855 + }, + { + "epoch": 4.165111940298507, + "grad_norm": 0.4127086292962352, + "learning_rate": 8.343939254017336e-06, + "loss": 0.2784, + "step": 17860 + }, + { + "epoch": 4.166277985074627, + "grad_norm": 0.40379573089311377, + "learning_rate": 8.334842889774374e-06, + "loss": 0.2722, + "step": 17865 + }, + { + "epoch": 4.1674440298507465, + "grad_norm": 0.4277490808244483, + "learning_rate": 8.325757924231938e-06, + "loss": 0.2775, + "step": 17870 + }, + { + "epoch": 4.168610074626866, + "grad_norm": 0.38276952284794696, + "learning_rate": 8.31668436279342e-06, + "loss": 0.2679, + "step": 17875 + }, + { + "epoch": 4.169776119402985, + "grad_norm": 0.42944563393474117, + "learning_rate": 8.307622210855425e-06, + "loss": 0.2613, + "step": 17880 + }, + { + "epoch": 4.170942164179104, + "grad_norm": 0.40801134956065815, + "learning_rate": 8.298571473807767e-06, + "loss": 0.2761, + "step": 17885 + }, + { + "epoch": 4.1721082089552235, + "grad_norm": 0.4112413108518129, + "learning_rate": 8.289532157033481e-06, + "loss": 0.2863, + "step": 17890 + }, + { + "epoch": 4.173274253731344, + "grad_norm": 0.4048818565106462, + "learning_rate": 8.28050426590881e-06, + "loss": 0.267, + "step": 17895 + }, + { + "epoch": 4.174440298507463, + "grad_norm": 0.38934026131417887, + "learning_rate": 8.271487805803193e-06, + "loss": 0.2641, + "step": 17900 + }, + { + "epoch": 4.175606343283582, + "grad_norm": 0.3953189315405114, + "learning_rate": 8.262482782079281e-06, + "loss": 0.27, + "step": 17905 + }, + { + "epoch": 4.176772388059701, + "grad_norm": 0.39999437963549983, + "learning_rate": 8.253489200092912e-06, + "loss": 0.261, + "step": 17910 + }, + { + "epoch": 4.177938432835821, + "grad_norm": 0.386670395609924, + "learning_rate": 8.244507065193117e-06, + "loss": 0.2658, + "step": 17915 + }, + { + "epoch": 4.17910447761194, + "grad_norm": 0.3856087444116748, + "learning_rate": 8.235536382722133e-06, + "loss": 0.2663, + "step": 17920 + }, + { + "epoch": 4.18027052238806, + "grad_norm": 0.4004318455532569, + "learning_rate": 8.226577158015383e-06, + "loss": 0.2701, + "step": 17925 + }, + { + "epoch": 4.181436567164179, + "grad_norm": 0.4029934263843612, + "learning_rate": 8.217629396401465e-06, + "loss": 0.2914, + "step": 17930 + }, + { + "epoch": 4.182602611940299, + "grad_norm": 0.4297519561669356, + "learning_rate": 8.208693103202158e-06, + "loss": 0.2676, + "step": 17935 + }, + { + "epoch": 4.183768656716418, + "grad_norm": 0.41071873659549063, + "learning_rate": 8.199768283732432e-06, + "loss": 0.2699, + "step": 17940 + }, + { + "epoch": 4.184934701492537, + "grad_norm": 0.3927487458988259, + "learning_rate": 8.190854943300436e-06, + "loss": 0.2783, + "step": 17945 + }, + { + "epoch": 4.186100746268656, + "grad_norm": 0.39544062944596514, + "learning_rate": 8.181953087207467e-06, + "loss": 0.2655, + "step": 17950 + }, + { + "epoch": 4.1872667910447765, + "grad_norm": 0.3880874898146546, + "learning_rate": 8.17306272074802e-06, + "loss": 0.2674, + "step": 17955 + }, + { + "epoch": 4.188432835820896, + "grad_norm": 0.3958773152588233, + "learning_rate": 8.164183849209741e-06, + "loss": 0.2629, + "step": 17960 + }, + { + "epoch": 4.189598880597015, + "grad_norm": 0.37874451988691205, + "learning_rate": 8.155316477873438e-06, + "loss": 0.2792, + "step": 17965 + }, + { + "epoch": 4.190764925373134, + "grad_norm": 0.42105218931902155, + "learning_rate": 8.146460612013083e-06, + "loss": 0.2722, + "step": 17970 + }, + { + "epoch": 4.1919309701492535, + "grad_norm": 0.4367285017785108, + "learning_rate": 8.137616256895811e-06, + "loss": 0.2752, + "step": 17975 + }, + { + "epoch": 4.193097014925373, + "grad_norm": 0.3927999057775576, + "learning_rate": 8.128783417781909e-06, + "loss": 0.2702, + "step": 17980 + }, + { + "epoch": 4.194263059701493, + "grad_norm": 0.3967807633098899, + "learning_rate": 8.119962099924797e-06, + "loss": 0.269, + "step": 17985 + }, + { + "epoch": 4.195429104477612, + "grad_norm": 0.39769987661014367, + "learning_rate": 8.111152308571065e-06, + "loss": 0.2753, + "step": 17990 + }, + { + "epoch": 4.1965951492537314, + "grad_norm": 0.412846110018462, + "learning_rate": 8.10235404896044e-06, + "loss": 0.2772, + "step": 17995 + }, + { + "epoch": 4.197761194029851, + "grad_norm": 0.38426551987470575, + "learning_rate": 8.09356732632579e-06, + "loss": 0.2733, + "step": 18000 + }, + { + "epoch": 4.19892723880597, + "grad_norm": 0.3668974286942403, + "learning_rate": 8.084792145893122e-06, + "loss": 0.2588, + "step": 18005 + }, + { + "epoch": 4.200093283582089, + "grad_norm": 0.41338717569233074, + "learning_rate": 8.07602851288157e-06, + "loss": 0.2754, + "step": 18010 + }, + { + "epoch": 4.201259328358209, + "grad_norm": 0.3986613892467767, + "learning_rate": 8.067276432503406e-06, + "loss": 0.2809, + "step": 18015 + }, + { + "epoch": 4.202425373134329, + "grad_norm": 0.4229143083671442, + "learning_rate": 8.058535909964041e-06, + "loss": 0.2701, + "step": 18020 + }, + { + "epoch": 4.203591417910448, + "grad_norm": 0.4225483163449904, + "learning_rate": 8.049806950461996e-06, + "loss": 0.2765, + "step": 18025 + }, + { + "epoch": 4.204757462686567, + "grad_norm": 0.40954725712677237, + "learning_rate": 8.041089559188929e-06, + "loss": 0.2627, + "step": 18030 + }, + { + "epoch": 4.205923507462686, + "grad_norm": 0.3928293796239581, + "learning_rate": 8.032383741329598e-06, + "loss": 0.2677, + "step": 18035 + }, + { + "epoch": 4.207089552238806, + "grad_norm": 0.4052297327521418, + "learning_rate": 8.023689502061897e-06, + "loss": 0.2567, + "step": 18040 + }, + { + "epoch": 4.208255597014926, + "grad_norm": 0.393476277539444, + "learning_rate": 8.015006846556825e-06, + "loss": 0.2636, + "step": 18045 + }, + { + "epoch": 4.209421641791045, + "grad_norm": 0.3923784394452194, + "learning_rate": 8.006335779978494e-06, + "loss": 0.2804, + "step": 18050 + }, + { + "epoch": 4.210587686567164, + "grad_norm": 0.36991243854819816, + "learning_rate": 7.997676307484123e-06, + "loss": 0.2691, + "step": 18055 + }, + { + "epoch": 4.211753731343284, + "grad_norm": 0.43169159772478505, + "learning_rate": 7.989028434224028e-06, + "loss": 0.2811, + "step": 18060 + }, + { + "epoch": 4.212919776119403, + "grad_norm": 0.40923562740502417, + "learning_rate": 7.980392165341636e-06, + "loss": 0.2758, + "step": 18065 + }, + { + "epoch": 4.214085820895522, + "grad_norm": 0.422018177827729, + "learning_rate": 7.971767505973468e-06, + "loss": 0.2789, + "step": 18070 + }, + { + "epoch": 4.215251865671641, + "grad_norm": 0.39494749417562225, + "learning_rate": 7.963154461249143e-06, + "loss": 0.2776, + "step": 18075 + }, + { + "epoch": 4.2164179104477615, + "grad_norm": 0.3867227498750476, + "learning_rate": 7.95455303629137e-06, + "loss": 0.2763, + "step": 18080 + }, + { + "epoch": 4.217583955223881, + "grad_norm": 0.3753961520495688, + "learning_rate": 7.945963236215944e-06, + "loss": 0.272, + "step": 18085 + }, + { + "epoch": 4.21875, + "grad_norm": 0.42936881893191087, + "learning_rate": 7.937385066131745e-06, + "loss": 0.2913, + "step": 18090 + }, + { + "epoch": 4.219916044776119, + "grad_norm": 0.41154807923770514, + "learning_rate": 7.928818531140748e-06, + "loss": 0.2728, + "step": 18095 + }, + { + "epoch": 4.2210820895522385, + "grad_norm": 0.3877003504655198, + "learning_rate": 7.920263636337994e-06, + "loss": 0.2697, + "step": 18100 + }, + { + "epoch": 4.222248134328359, + "grad_norm": 0.39692175480407094, + "learning_rate": 7.911720386811613e-06, + "loss": 0.2758, + "step": 18105 + }, + { + "epoch": 4.223414179104478, + "grad_norm": 0.426405668197753, + "learning_rate": 7.90318878764279e-06, + "loss": 0.2775, + "step": 18110 + }, + { + "epoch": 4.224580223880597, + "grad_norm": 0.42296221155951175, + "learning_rate": 7.894668843905803e-06, + "loss": 0.2783, + "step": 18115 + }, + { + "epoch": 4.225746268656716, + "grad_norm": 0.38304370256201864, + "learning_rate": 7.886160560667984e-06, + "loss": 0.2827, + "step": 18120 + }, + { + "epoch": 4.226912313432836, + "grad_norm": 0.4243619646936963, + "learning_rate": 7.87766394298974e-06, + "loss": 0.274, + "step": 18125 + }, + { + "epoch": 4.228078358208955, + "grad_norm": 0.3908501604076978, + "learning_rate": 7.869178995924525e-06, + "loss": 0.2762, + "step": 18130 + }, + { + "epoch": 4.229244402985074, + "grad_norm": 0.39394592450348964, + "learning_rate": 7.860705724518857e-06, + "loss": 0.2782, + "step": 18135 + }, + { + "epoch": 4.230410447761194, + "grad_norm": 0.40271481185017766, + "learning_rate": 7.852244133812332e-06, + "loss": 0.2774, + "step": 18140 + }, + { + "epoch": 4.231576492537314, + "grad_norm": 0.39968102282688084, + "learning_rate": 7.843794228837556e-06, + "loss": 0.2853, + "step": 18145 + }, + { + "epoch": 4.232742537313433, + "grad_norm": 0.4247008418363225, + "learning_rate": 7.83535601462022e-06, + "loss": 0.2669, + "step": 18150 + }, + { + "epoch": 4.233908582089552, + "grad_norm": 0.39448119459676123, + "learning_rate": 7.82692949617905e-06, + "loss": 0.2877, + "step": 18155 + }, + { + "epoch": 4.235074626865671, + "grad_norm": 0.41898670602918947, + "learning_rate": 7.818514678525822e-06, + "loss": 0.2665, + "step": 18160 + }, + { + "epoch": 4.236240671641791, + "grad_norm": 0.41629875977458675, + "learning_rate": 7.810111566665333e-06, + "loss": 0.2764, + "step": 18165 + }, + { + "epoch": 4.237406716417911, + "grad_norm": 0.41336606695873085, + "learning_rate": 7.80172016559544e-06, + "loss": 0.2739, + "step": 18170 + }, + { + "epoch": 4.23857276119403, + "grad_norm": 0.4229416876666708, + "learning_rate": 7.793340480307027e-06, + "loss": 0.259, + "step": 18175 + }, + { + "epoch": 4.239738805970149, + "grad_norm": 0.41907162173078616, + "learning_rate": 7.784972515784004e-06, + "loss": 0.2752, + "step": 18180 + }, + { + "epoch": 4.2409048507462686, + "grad_norm": 0.41674551957571665, + "learning_rate": 7.776616277003328e-06, + "loss": 0.2805, + "step": 18185 + }, + { + "epoch": 4.242070895522388, + "grad_norm": 0.410588553164536, + "learning_rate": 7.768271768934955e-06, + "loss": 0.2722, + "step": 18190 + }, + { + "epoch": 4.243236940298507, + "grad_norm": 0.402796268396602, + "learning_rate": 7.759938996541886e-06, + "loss": 0.2858, + "step": 18195 + }, + { + "epoch": 4.244402985074627, + "grad_norm": 0.3850130874319362, + "learning_rate": 7.751617964780131e-06, + "loss": 0.2754, + "step": 18200 + }, + { + "epoch": 4.2455690298507465, + "grad_norm": 0.42831256969779924, + "learning_rate": 7.743308678598722e-06, + "loss": 0.2821, + "step": 18205 + }, + { + "epoch": 4.246735074626866, + "grad_norm": 0.424981782694087, + "learning_rate": 7.73501114293971e-06, + "loss": 0.2783, + "step": 18210 + }, + { + "epoch": 4.247901119402985, + "grad_norm": 0.4510353226283879, + "learning_rate": 7.726725362738141e-06, + "loss": 0.2805, + "step": 18215 + }, + { + "epoch": 4.249067164179104, + "grad_norm": 0.3677540093562044, + "learning_rate": 7.71845134292208e-06, + "loss": 0.2747, + "step": 18220 + }, + { + "epoch": 4.2502332089552235, + "grad_norm": 0.4041495223777481, + "learning_rate": 7.710189088412604e-06, + "loss": 0.2792, + "step": 18225 + }, + { + "epoch": 4.251399253731344, + "grad_norm": 0.4209873163499664, + "learning_rate": 7.70193860412378e-06, + "loss": 0.2791, + "step": 18230 + }, + { + "epoch": 4.252565298507463, + "grad_norm": 0.4050461190548646, + "learning_rate": 7.693699894962686e-06, + "loss": 0.2754, + "step": 18235 + }, + { + "epoch": 4.253731343283582, + "grad_norm": 0.40044208229826345, + "learning_rate": 7.68547296582938e-06, + "loss": 0.2754, + "step": 18240 + }, + { + "epoch": 4.254897388059701, + "grad_norm": 0.3806130309779204, + "learning_rate": 7.67725782161693e-06, + "loss": 0.273, + "step": 18245 + }, + { + "epoch": 4.256063432835821, + "grad_norm": 0.4071739565270072, + "learning_rate": 7.669054467211388e-06, + "loss": 0.2669, + "step": 18250 + }, + { + "epoch": 4.25722947761194, + "grad_norm": 0.4102686853577913, + "learning_rate": 7.660862907491795e-06, + "loss": 0.2792, + "step": 18255 + }, + { + "epoch": 4.25839552238806, + "grad_norm": 0.3934025175959887, + "learning_rate": 7.652683147330177e-06, + "loss": 0.2782, + "step": 18260 + }, + { + "epoch": 4.259561567164179, + "grad_norm": 0.39507155314076864, + "learning_rate": 7.644515191591542e-06, + "loss": 0.2813, + "step": 18265 + }, + { + "epoch": 4.260727611940299, + "grad_norm": 0.3890717689329465, + "learning_rate": 7.636359045133873e-06, + "loss": 0.2705, + "step": 18270 + }, + { + "epoch": 4.261893656716418, + "grad_norm": 0.3933419963216611, + "learning_rate": 7.6282147128081364e-06, + "loss": 0.2705, + "step": 18275 + }, + { + "epoch": 4.263059701492537, + "grad_norm": 0.40378936435653107, + "learning_rate": 7.620082199458269e-06, + "loss": 0.2808, + "step": 18280 + }, + { + "epoch": 4.264225746268656, + "grad_norm": 0.4218348692990293, + "learning_rate": 7.611961509921182e-06, + "loss": 0.2744, + "step": 18285 + }, + { + "epoch": 4.2653917910447765, + "grad_norm": 0.4371185608735183, + "learning_rate": 7.603852649026738e-06, + "loss": 0.2695, + "step": 18290 + }, + { + "epoch": 4.266557835820896, + "grad_norm": 0.36353804865112804, + "learning_rate": 7.595755621597788e-06, + "loss": 0.2701, + "step": 18295 + }, + { + "epoch": 4.267723880597015, + "grad_norm": 0.4031347424289463, + "learning_rate": 7.587670432450131e-06, + "loss": 0.2743, + "step": 18300 + }, + { + "epoch": 4.268889925373134, + "grad_norm": 0.40866978560488426, + "learning_rate": 7.57959708639252e-06, + "loss": 0.279, + "step": 18305 + }, + { + "epoch": 4.2700559701492535, + "grad_norm": 0.41098549774145526, + "learning_rate": 7.5715355882266815e-06, + "loss": 0.2735, + "step": 18310 + }, + { + "epoch": 4.271222014925373, + "grad_norm": 0.40197967026229475, + "learning_rate": 7.5634859427472835e-06, + "loss": 0.2718, + "step": 18315 + }, + { + "epoch": 4.272388059701493, + "grad_norm": 0.4326629450762017, + "learning_rate": 7.5554481547419395e-06, + "loss": 0.2817, + "step": 18320 + }, + { + "epoch": 4.273554104477612, + "grad_norm": 0.4164172872124524, + "learning_rate": 7.547422228991223e-06, + "loss": 0.2845, + "step": 18325 + }, + { + "epoch": 4.2747201492537314, + "grad_norm": 0.4115319235447634, + "learning_rate": 7.539408170268644e-06, + "loss": 0.28, + "step": 18330 + }, + { + "epoch": 4.275886194029851, + "grad_norm": 0.3922002276692683, + "learning_rate": 7.531405983340668e-06, + "loss": 0.2681, + "step": 18335 + }, + { + "epoch": 4.27705223880597, + "grad_norm": 0.4006349977272167, + "learning_rate": 7.523415672966675e-06, + "loss": 0.2613, + "step": 18340 + }, + { + "epoch": 4.278218283582089, + "grad_norm": 0.42867198696884184, + "learning_rate": 7.515437243898998e-06, + "loss": 0.2716, + "step": 18345 + }, + { + "epoch": 4.279384328358209, + "grad_norm": 0.395475700228639, + "learning_rate": 7.507470700882905e-06, + "loss": 0.2728, + "step": 18350 + }, + { + "epoch": 4.280550373134329, + "grad_norm": 0.4105932715424159, + "learning_rate": 7.499516048656589e-06, + "loss": 0.267, + "step": 18355 + }, + { + "epoch": 4.281716417910448, + "grad_norm": 0.402741507907578, + "learning_rate": 7.491573291951176e-06, + "loss": 0.2593, + "step": 18360 + }, + { + "epoch": 4.282882462686567, + "grad_norm": 0.40645558368972784, + "learning_rate": 7.483642435490706e-06, + "loss": 0.2966, + "step": 18365 + }, + { + "epoch": 4.284048507462686, + "grad_norm": 0.417345801951849, + "learning_rate": 7.475723483992149e-06, + "loss": 0.28, + "step": 18370 + }, + { + "epoch": 4.285214552238806, + "grad_norm": 0.42859196731371385, + "learning_rate": 7.467816442165397e-06, + "loss": 0.2736, + "step": 18375 + }, + { + "epoch": 4.286380597014926, + "grad_norm": 0.42671733070056267, + "learning_rate": 7.459921314713253e-06, + "loss": 0.2799, + "step": 18380 + }, + { + "epoch": 4.287546641791045, + "grad_norm": 0.39637314816055846, + "learning_rate": 7.452038106331442e-06, + "loss": 0.263, + "step": 18385 + }, + { + "epoch": 4.288712686567164, + "grad_norm": 0.4060549873320143, + "learning_rate": 7.444166821708584e-06, + "loss": 0.2742, + "step": 18390 + }, + { + "epoch": 4.289878731343284, + "grad_norm": 0.4248193501855419, + "learning_rate": 7.436307465526224e-06, + "loss": 0.2769, + "step": 18395 + }, + { + "epoch": 4.291044776119403, + "grad_norm": 0.4000788833508795, + "learning_rate": 7.4284600424588045e-06, + "loss": 0.275, + "step": 18400 + }, + { + "epoch": 4.292210820895522, + "grad_norm": 0.40681180347843243, + "learning_rate": 7.42062455717367e-06, + "loss": 0.2672, + "step": 18405 + }, + { + "epoch": 4.293376865671641, + "grad_norm": 0.3794132400613795, + "learning_rate": 7.412801014331075e-06, + "loss": 0.2658, + "step": 18410 + }, + { + "epoch": 4.2945429104477615, + "grad_norm": 0.4195999135179553, + "learning_rate": 7.4049894185841476e-06, + "loss": 0.275, + "step": 18415 + }, + { + "epoch": 4.295708955223881, + "grad_norm": 0.3891850858333817, + "learning_rate": 7.397189774578939e-06, + "loss": 0.2684, + "step": 18420 + }, + { + "epoch": 4.296875, + "grad_norm": 0.42414664545538083, + "learning_rate": 7.389402086954368e-06, + "loss": 0.2736, + "step": 18425 + }, + { + "epoch": 4.298041044776119, + "grad_norm": 0.3932808123921889, + "learning_rate": 7.38162636034226e-06, + "loss": 0.2751, + "step": 18430 + }, + { + "epoch": 4.2992070895522385, + "grad_norm": 0.40286976532965957, + "learning_rate": 7.373862599367316e-06, + "loss": 0.2787, + "step": 18435 + }, + { + "epoch": 4.300373134328359, + "grad_norm": 0.4191664541492445, + "learning_rate": 7.366110808647128e-06, + "loss": 0.2796, + "step": 18440 + }, + { + "epoch": 4.301539179104478, + "grad_norm": 0.380688004195661, + "learning_rate": 7.3583709927921574e-06, + "loss": 0.2714, + "step": 18445 + }, + { + "epoch": 4.302705223880597, + "grad_norm": 0.38275380874369747, + "learning_rate": 7.350643156405751e-06, + "loss": 0.2652, + "step": 18450 + }, + { + "epoch": 4.303871268656716, + "grad_norm": 0.3953094826945292, + "learning_rate": 7.342927304084132e-06, + "loss": 0.2748, + "step": 18455 + }, + { + "epoch": 4.305037313432836, + "grad_norm": 0.40109883831737486, + "learning_rate": 7.335223440416391e-06, + "loss": 0.263, + "step": 18460 + }, + { + "epoch": 4.306203358208955, + "grad_norm": 0.41201085772616464, + "learning_rate": 7.327531569984497e-06, + "loss": 0.2746, + "step": 18465 + }, + { + "epoch": 4.307369402985074, + "grad_norm": 0.4032558562628837, + "learning_rate": 7.319851697363271e-06, + "loss": 0.2694, + "step": 18470 + }, + { + "epoch": 4.308535447761194, + "grad_norm": 0.40220772868725246, + "learning_rate": 7.31218382712041e-06, + "loss": 0.2648, + "step": 18475 + }, + { + "epoch": 4.309701492537314, + "grad_norm": 0.39557134116257103, + "learning_rate": 7.304527963816472e-06, + "loss": 0.2719, + "step": 18480 + }, + { + "epoch": 4.310867537313433, + "grad_norm": 0.3955770890075112, + "learning_rate": 7.2968841120048666e-06, + "loss": 0.2711, + "step": 18485 + }, + { + "epoch": 4.312033582089552, + "grad_norm": 0.4401251018553915, + "learning_rate": 7.289252276231863e-06, + "loss": 0.2701, + "step": 18490 + }, + { + "epoch": 4.313199626865671, + "grad_norm": 0.40667506023810096, + "learning_rate": 7.281632461036594e-06, + "loss": 0.2676, + "step": 18495 + }, + { + "epoch": 4.314365671641791, + "grad_norm": 0.42782409846654357, + "learning_rate": 7.27402467095102e-06, + "loss": 0.2837, + "step": 18500 + }, + { + "epoch": 4.315531716417911, + "grad_norm": 0.4158211862037495, + "learning_rate": 7.266428910499971e-06, + "loss": 0.2888, + "step": 18505 + }, + { + "epoch": 4.31669776119403, + "grad_norm": 0.40022210349031667, + "learning_rate": 7.258845184201111e-06, + "loss": 0.2791, + "step": 18510 + }, + { + "epoch": 4.317863805970149, + "grad_norm": 0.39224874072895266, + "learning_rate": 7.251273496564957e-06, + "loss": 0.2735, + "step": 18515 + }, + { + "epoch": 4.3190298507462686, + "grad_norm": 0.3828169151730118, + "learning_rate": 7.243713852094848e-06, + "loss": 0.2712, + "step": 18520 + }, + { + "epoch": 4.320195895522388, + "grad_norm": 0.3785602543827901, + "learning_rate": 7.2361662552869734e-06, + "loss": 0.2757, + "step": 18525 + }, + { + "epoch": 4.321361940298507, + "grad_norm": 0.40368842673001093, + "learning_rate": 7.228630710630356e-06, + "loss": 0.2813, + "step": 18530 + }, + { + "epoch": 4.322527985074627, + "grad_norm": 0.3966995045907521, + "learning_rate": 7.221107222606851e-06, + "loss": 0.2683, + "step": 18535 + }, + { + "epoch": 4.3236940298507465, + "grad_norm": 0.3881458660154604, + "learning_rate": 7.21359579569114e-06, + "loss": 0.2741, + "step": 18540 + }, + { + "epoch": 4.324860074626866, + "grad_norm": 0.38040306667446083, + "learning_rate": 7.206096434350728e-06, + "loss": 0.276, + "step": 18545 + }, + { + "epoch": 4.326026119402985, + "grad_norm": 0.4210535710387319, + "learning_rate": 7.198609143045948e-06, + "loss": 0.2746, + "step": 18550 + }, + { + "epoch": 4.327192164179104, + "grad_norm": 0.3957509557638466, + "learning_rate": 7.191133926229957e-06, + "loss": 0.2694, + "step": 18555 + }, + { + "epoch": 4.3283582089552235, + "grad_norm": 0.3868809353549212, + "learning_rate": 7.183670788348726e-06, + "loss": 0.2641, + "step": 18560 + }, + { + "epoch": 4.329524253731344, + "grad_norm": 0.39010445915469755, + "learning_rate": 7.176219733841047e-06, + "loss": 0.2538, + "step": 18565 + }, + { + "epoch": 4.330690298507463, + "grad_norm": 0.396442626935043, + "learning_rate": 7.168780767138512e-06, + "loss": 0.2606, + "step": 18570 + }, + { + "epoch": 4.331856343283582, + "grad_norm": 0.4006272597680199, + "learning_rate": 7.161353892665538e-06, + "loss": 0.2704, + "step": 18575 + }, + { + "epoch": 4.333022388059701, + "grad_norm": 0.4025410382377788, + "learning_rate": 7.1539391148393474e-06, + "loss": 0.2761, + "step": 18580 + }, + { + "epoch": 4.334188432835821, + "grad_norm": 0.43821229012187163, + "learning_rate": 7.146536438069963e-06, + "loss": 0.2902, + "step": 18585 + }, + { + "epoch": 4.33535447761194, + "grad_norm": 0.42203900554396184, + "learning_rate": 7.139145866760217e-06, + "loss": 0.2863, + "step": 18590 + }, + { + "epoch": 4.33652052238806, + "grad_norm": 0.3928273962646149, + "learning_rate": 7.1317674053057335e-06, + "loss": 0.267, + "step": 18595 + }, + { + "epoch": 4.337686567164179, + "grad_norm": 0.4475315306879053, + "learning_rate": 7.124401058094938e-06, + "loss": 0.2832, + "step": 18600 + }, + { + "epoch": 4.338852611940299, + "grad_norm": 0.3875417608232742, + "learning_rate": 7.117046829509057e-06, + "loss": 0.2714, + "step": 18605 + }, + { + "epoch": 4.340018656716418, + "grad_norm": 0.40792835610705425, + "learning_rate": 7.109704723922094e-06, + "loss": 0.2811, + "step": 18610 + }, + { + "epoch": 4.341184701492537, + "grad_norm": 0.394978446970077, + "learning_rate": 7.102374745700866e-06, + "loss": 0.2823, + "step": 18615 + }, + { + "epoch": 4.342350746268656, + "grad_norm": 0.3873837118235636, + "learning_rate": 7.0950568992049494e-06, + "loss": 0.2708, + "step": 18620 + }, + { + "epoch": 4.3435167910447765, + "grad_norm": 0.39206257530074834, + "learning_rate": 7.087751188786723e-06, + "loss": 0.2777, + "step": 18625 + }, + { + "epoch": 4.344682835820896, + "grad_norm": 0.39565087671877125, + "learning_rate": 7.080457618791344e-06, + "loss": 0.2743, + "step": 18630 + }, + { + "epoch": 4.345848880597015, + "grad_norm": 0.41411369302212386, + "learning_rate": 7.0731761935567495e-06, + "loss": 0.2883, + "step": 18635 + }, + { + "epoch": 4.347014925373134, + "grad_norm": 0.36316642570779184, + "learning_rate": 7.0659069174136544e-06, + "loss": 0.2532, + "step": 18640 + }, + { + "epoch": 4.3481809701492535, + "grad_norm": 0.41132054512797556, + "learning_rate": 7.058649794685537e-06, + "loss": 0.2698, + "step": 18645 + }, + { + "epoch": 4.349347014925373, + "grad_norm": 0.4114841400832778, + "learning_rate": 7.051404829688663e-06, + "loss": 0.2676, + "step": 18650 + }, + { + "epoch": 4.350513059701493, + "grad_norm": 0.4257416427557622, + "learning_rate": 7.044172026732059e-06, + "loss": 0.2844, + "step": 18655 + }, + { + "epoch": 4.351679104477612, + "grad_norm": 0.420027093639048, + "learning_rate": 7.036951390117512e-06, + "loss": 0.273, + "step": 18660 + }, + { + "epoch": 4.3528451492537314, + "grad_norm": 0.3897984050752887, + "learning_rate": 7.029742924139586e-06, + "loss": 0.2707, + "step": 18665 + }, + { + "epoch": 4.354011194029851, + "grad_norm": 0.38179591064784923, + "learning_rate": 7.022546633085604e-06, + "loss": 0.2697, + "step": 18670 + }, + { + "epoch": 4.35517723880597, + "grad_norm": 0.40566034270554724, + "learning_rate": 7.015362521235632e-06, + "loss": 0.2697, + "step": 18675 + }, + { + "epoch": 4.356343283582089, + "grad_norm": 0.4006639140691593, + "learning_rate": 7.008190592862514e-06, + "loss": 0.2841, + "step": 18680 + }, + { + "epoch": 4.357509328358209, + "grad_norm": 0.41275912265736375, + "learning_rate": 7.0010308522318355e-06, + "loss": 0.2776, + "step": 18685 + }, + { + "epoch": 4.358675373134329, + "grad_norm": 0.40050083868302266, + "learning_rate": 6.9938833036019365e-06, + "loss": 0.2687, + "step": 18690 + }, + { + "epoch": 4.359841417910448, + "grad_norm": 0.39616132942510224, + "learning_rate": 6.9867479512239e-06, + "loss": 0.2865, + "step": 18695 + }, + { + "epoch": 4.361007462686567, + "grad_norm": 0.41896509762753836, + "learning_rate": 6.979624799341565e-06, + "loss": 0.2678, + "step": 18700 + }, + { + "epoch": 4.362173507462686, + "grad_norm": 0.39898689634510065, + "learning_rate": 6.972513852191508e-06, + "loss": 0.2838, + "step": 18705 + }, + { + "epoch": 4.363339552238806, + "grad_norm": 0.39019308904651356, + "learning_rate": 6.965415114003046e-06, + "loss": 0.2729, + "step": 18710 + }, + { + "epoch": 4.364505597014926, + "grad_norm": 0.41372980766676376, + "learning_rate": 6.958328588998242e-06, + "loss": 0.2801, + "step": 18715 + }, + { + "epoch": 4.365671641791045, + "grad_norm": 0.38547078697304504, + "learning_rate": 6.951254281391881e-06, + "loss": 0.2638, + "step": 18720 + }, + { + "epoch": 4.366837686567164, + "grad_norm": 0.388440955064004, + "learning_rate": 6.944192195391494e-06, + "loss": 0.2706, + "step": 18725 + }, + { + "epoch": 4.368003731343284, + "grad_norm": 0.4340887735110533, + "learning_rate": 6.937142335197338e-06, + "loss": 0.2835, + "step": 18730 + }, + { + "epoch": 4.369169776119403, + "grad_norm": 0.3925250738284526, + "learning_rate": 6.930104705002403e-06, + "loss": 0.2641, + "step": 18735 + }, + { + "epoch": 4.370335820895522, + "grad_norm": 0.41935211598110145, + "learning_rate": 6.9230793089924005e-06, + "loss": 0.2752, + "step": 18740 + }, + { + "epoch": 4.371501865671641, + "grad_norm": 0.3858755016319207, + "learning_rate": 6.916066151345761e-06, + "loss": 0.2764, + "step": 18745 + }, + { + "epoch": 4.3726679104477615, + "grad_norm": 0.3988249900306538, + "learning_rate": 6.909065236233644e-06, + "loss": 0.2765, + "step": 18750 + }, + { + "epoch": 4.373833955223881, + "grad_norm": 0.39211849279452404, + "learning_rate": 6.90207656781993e-06, + "loss": 0.2706, + "step": 18755 + }, + { + "epoch": 4.375, + "grad_norm": 0.4099475077210503, + "learning_rate": 6.8951001502612065e-06, + "loss": 0.2681, + "step": 18760 + }, + { + "epoch": 4.376166044776119, + "grad_norm": 0.40630297492656764, + "learning_rate": 6.888135987706787e-06, + "loss": 0.2879, + "step": 18765 + }, + { + "epoch": 4.3773320895522385, + "grad_norm": 0.42286882372567736, + "learning_rate": 6.881184084298675e-06, + "loss": 0.2806, + "step": 18770 + }, + { + "epoch": 4.378498134328359, + "grad_norm": 0.3960834884757748, + "learning_rate": 6.874244444171607e-06, + "loss": 0.2688, + "step": 18775 + }, + { + "epoch": 4.379664179104478, + "grad_norm": 0.41468075646088814, + "learning_rate": 6.867317071453007e-06, + "loss": 0.2653, + "step": 18780 + }, + { + "epoch": 4.380830223880597, + "grad_norm": 0.4023368857309421, + "learning_rate": 6.860401970263017e-06, + "loss": 0.2626, + "step": 18785 + }, + { + "epoch": 4.381996268656716, + "grad_norm": 0.3793091077410151, + "learning_rate": 6.8534991447144706e-06, + "loss": 0.2634, + "step": 18790 + }, + { + "epoch": 4.383162313432836, + "grad_norm": 0.37678824298536073, + "learning_rate": 6.8466085989129066e-06, + "loss": 0.2595, + "step": 18795 + }, + { + "epoch": 4.384328358208955, + "grad_norm": 0.40251889586948386, + "learning_rate": 6.839730336956554e-06, + "loss": 0.2709, + "step": 18800 + }, + { + "epoch": 4.385494402985074, + "grad_norm": 0.4056816699893774, + "learning_rate": 6.83286436293634e-06, + "loss": 0.2744, + "step": 18805 + }, + { + "epoch": 4.386660447761194, + "grad_norm": 0.41362262742566164, + "learning_rate": 6.826010680935886e-06, + "loss": 0.2724, + "step": 18810 + }, + { + "epoch": 4.387826492537314, + "grad_norm": 0.3784388793501856, + "learning_rate": 6.819169295031493e-06, + "loss": 0.2678, + "step": 18815 + }, + { + "epoch": 4.388992537313433, + "grad_norm": 0.40012074332417263, + "learning_rate": 6.812340209292164e-06, + "loss": 0.2833, + "step": 18820 + }, + { + "epoch": 4.390158582089552, + "grad_norm": 0.39757265960329474, + "learning_rate": 6.80552342777957e-06, + "loss": 0.275, + "step": 18825 + }, + { + "epoch": 4.391324626865671, + "grad_norm": 0.41927261976118096, + "learning_rate": 6.79871895454807e-06, + "loss": 0.2897, + "step": 18830 + }, + { + "epoch": 4.392490671641791, + "grad_norm": 0.39977536858299106, + "learning_rate": 6.791926793644713e-06, + "loss": 0.2682, + "step": 18835 + }, + { + "epoch": 4.393656716417911, + "grad_norm": 0.41731170175521587, + "learning_rate": 6.785146949109206e-06, + "loss": 0.2745, + "step": 18840 + }, + { + "epoch": 4.39482276119403, + "grad_norm": 0.39821488851666337, + "learning_rate": 6.778379424973943e-06, + "loss": 0.2648, + "step": 18845 + }, + { + "epoch": 4.395988805970149, + "grad_norm": 0.40766819813214755, + "learning_rate": 6.771624225263994e-06, + "loss": 0.2618, + "step": 18850 + }, + { + "epoch": 4.3971548507462686, + "grad_norm": 0.4142756771919827, + "learning_rate": 6.764881353997082e-06, + "loss": 0.2741, + "step": 18855 + }, + { + "epoch": 4.398320895522388, + "grad_norm": 0.42668661830470156, + "learning_rate": 6.758150815183618e-06, + "loss": 0.2875, + "step": 18860 + }, + { + "epoch": 4.399486940298507, + "grad_norm": 0.4113454794124042, + "learning_rate": 6.751432612826664e-06, + "loss": 0.2853, + "step": 18865 + }, + { + "epoch": 4.400652985074627, + "grad_norm": 0.3945741472602751, + "learning_rate": 6.7447267509219494e-06, + "loss": 0.2686, + "step": 18870 + }, + { + "epoch": 4.4018190298507465, + "grad_norm": 0.39115576733852403, + "learning_rate": 6.738033233457863e-06, + "loss": 0.2782, + "step": 18875 + }, + { + "epoch": 4.402985074626866, + "grad_norm": 0.3988196565507617, + "learning_rate": 6.7313520644154555e-06, + "loss": 0.2698, + "step": 18880 + }, + { + "epoch": 4.404151119402985, + "grad_norm": 0.3676469105261775, + "learning_rate": 6.724683247768427e-06, + "loss": 0.264, + "step": 18885 + }, + { + "epoch": 4.405317164179104, + "grad_norm": 0.3970448843150535, + "learning_rate": 6.718026787483131e-06, + "loss": 0.2684, + "step": 18890 + }, + { + "epoch": 4.4064832089552235, + "grad_norm": 0.40734340524720397, + "learning_rate": 6.7113826875185885e-06, + "loss": 0.2901, + "step": 18895 + }, + { + "epoch": 4.407649253731344, + "grad_norm": 0.3879153206101479, + "learning_rate": 6.704750951826438e-06, + "loss": 0.2728, + "step": 18900 + }, + { + "epoch": 4.408815298507463, + "grad_norm": 0.37419427548808687, + "learning_rate": 6.698131584350989e-06, + "loss": 0.2789, + "step": 18905 + }, + { + "epoch": 4.409981343283582, + "grad_norm": 0.41270892431821893, + "learning_rate": 6.691524589029188e-06, + "loss": 0.2801, + "step": 18910 + }, + { + "epoch": 4.411147388059701, + "grad_norm": 0.3982928618966707, + "learning_rate": 6.684929969790622e-06, + "loss": 0.2698, + "step": 18915 + }, + { + "epoch": 4.412313432835821, + "grad_norm": 0.39749924018053034, + "learning_rate": 6.6783477305575215e-06, + "loss": 0.2843, + "step": 18920 + }, + { + "epoch": 4.41347947761194, + "grad_norm": 0.37837567587105775, + "learning_rate": 6.671777875244745e-06, + "loss": 0.2729, + "step": 18925 + }, + { + "epoch": 4.41464552238806, + "grad_norm": 0.39945407037967195, + "learning_rate": 6.665220407759788e-06, + "loss": 0.2871, + "step": 18930 + }, + { + "epoch": 4.415811567164179, + "grad_norm": 0.4057597958414668, + "learning_rate": 6.658675332002787e-06, + "loss": 0.2697, + "step": 18935 + }, + { + "epoch": 4.416977611940299, + "grad_norm": 0.40746892828624903, + "learning_rate": 6.652142651866497e-06, + "loss": 0.2685, + "step": 18940 + }, + { + "epoch": 4.418143656716418, + "grad_norm": 0.3918110842437935, + "learning_rate": 6.645622371236314e-06, + "loss": 0.2756, + "step": 18945 + }, + { + "epoch": 4.419309701492537, + "grad_norm": 0.37102893586161895, + "learning_rate": 6.639114493990238e-06, + "loss": 0.2734, + "step": 18950 + }, + { + "epoch": 4.420475746268656, + "grad_norm": 0.4056425541286255, + "learning_rate": 6.6326190239989135e-06, + "loss": 0.2743, + "step": 18955 + }, + { + "epoch": 4.4216417910447765, + "grad_norm": 0.39256078118021526, + "learning_rate": 6.626135965125597e-06, + "loss": 0.265, + "step": 18960 + }, + { + "epoch": 4.422807835820896, + "grad_norm": 0.3935822845824643, + "learning_rate": 6.61966532122616e-06, + "loss": 0.2838, + "step": 18965 + }, + { + "epoch": 4.423973880597015, + "grad_norm": 0.4158362871831473, + "learning_rate": 6.613207096149099e-06, + "loss": 0.2673, + "step": 18970 + }, + { + "epoch": 4.425139925373134, + "grad_norm": 0.40875456504871766, + "learning_rate": 6.606761293735513e-06, + "loss": 0.2767, + "step": 18975 + }, + { + "epoch": 4.4263059701492535, + "grad_norm": 0.3878233054183334, + "learning_rate": 6.600327917819114e-06, + "loss": 0.2646, + "step": 18980 + }, + { + "epoch": 4.427472014925373, + "grad_norm": 0.37276222693146205, + "learning_rate": 6.593906972226238e-06, + "loss": 0.2608, + "step": 18985 + }, + { + "epoch": 4.428638059701493, + "grad_norm": 0.41202104775478976, + "learning_rate": 6.587498460775811e-06, + "loss": 0.2812, + "step": 18990 + }, + { + "epoch": 4.429804104477612, + "grad_norm": 0.421211060105778, + "learning_rate": 6.581102387279374e-06, + "loss": 0.2852, + "step": 18995 + }, + { + "epoch": 4.4309701492537314, + "grad_norm": 0.4191625377655691, + "learning_rate": 6.574718755541061e-06, + "loss": 0.2805, + "step": 19000 + }, + { + "epoch": 4.432136194029851, + "grad_norm": 0.3941124652552664, + "learning_rate": 6.568347569357611e-06, + "loss": 0.2695, + "step": 19005 + }, + { + "epoch": 4.43330223880597, + "grad_norm": 0.3793229746609596, + "learning_rate": 6.561988832518367e-06, + "loss": 0.2732, + "step": 19010 + }, + { + "epoch": 4.434468283582089, + "grad_norm": 0.43193453612308014, + "learning_rate": 6.555642548805262e-06, + "loss": 0.2944, + "step": 19015 + }, + { + "epoch": 4.435634328358209, + "grad_norm": 0.42671415627695103, + "learning_rate": 6.5493087219928114e-06, + "loss": 0.2714, + "step": 19020 + }, + { + "epoch": 4.436800373134329, + "grad_norm": 0.3906920961525172, + "learning_rate": 6.542987355848144e-06, + "loss": 0.2596, + "step": 19025 + }, + { + "epoch": 4.437966417910448, + "grad_norm": 0.40912308298665784, + "learning_rate": 6.536678454130965e-06, + "loss": 0.2839, + "step": 19030 + }, + { + "epoch": 4.439132462686567, + "grad_norm": 0.39463773036597727, + "learning_rate": 6.530382020593559e-06, + "loss": 0.2741, + "step": 19035 + }, + { + "epoch": 4.440298507462686, + "grad_norm": 0.3967586998553728, + "learning_rate": 6.52409805898081e-06, + "loss": 0.2726, + "step": 19040 + }, + { + "epoch": 4.441464552238806, + "grad_norm": 0.40978112634281993, + "learning_rate": 6.517826573030178e-06, + "loss": 0.2848, + "step": 19045 + }, + { + "epoch": 4.442630597014926, + "grad_norm": 0.4142811019822367, + "learning_rate": 6.511567566471697e-06, + "loss": 0.2741, + "step": 19050 + }, + { + "epoch": 4.443796641791045, + "grad_norm": 0.40419837663053265, + "learning_rate": 6.50532104302799e-06, + "loss": 0.2732, + "step": 19055 + }, + { + "epoch": 4.444962686567164, + "grad_norm": 0.4187348332592611, + "learning_rate": 6.499087006414245e-06, + "loss": 0.2775, + "step": 19060 + }, + { + "epoch": 4.446128731343284, + "grad_norm": 0.4116433904244841, + "learning_rate": 6.492865460338228e-06, + "loss": 0.2611, + "step": 19065 + }, + { + "epoch": 4.447294776119403, + "grad_norm": 0.3952524457641686, + "learning_rate": 6.4866564085002826e-06, + "loss": 0.2787, + "step": 19070 + }, + { + "epoch": 4.448460820895522, + "grad_norm": 0.41531152114803566, + "learning_rate": 6.480459854593305e-06, + "loss": 0.2838, + "step": 19075 + }, + { + "epoch": 4.449626865671641, + "grad_norm": 0.41930044222211527, + "learning_rate": 6.474275802302776e-06, + "loss": 0.2823, + "step": 19080 + }, + { + "epoch": 4.4507929104477615, + "grad_norm": 0.4268489902373017, + "learning_rate": 6.468104255306728e-06, + "loss": 0.2682, + "step": 19085 + }, + { + "epoch": 4.451958955223881, + "grad_norm": 0.41252067564112116, + "learning_rate": 6.461945217275761e-06, + "loss": 0.2816, + "step": 19090 + }, + { + "epoch": 4.453125, + "grad_norm": 0.38449147715064336, + "learning_rate": 6.455798691873042e-06, + "loss": 0.2605, + "step": 19095 + }, + { + "epoch": 4.454291044776119, + "grad_norm": 0.4533579608370104, + "learning_rate": 6.449664682754278e-06, + "loss": 0.2734, + "step": 19100 + }, + { + "epoch": 4.4554570895522385, + "grad_norm": 0.3904560363082716, + "learning_rate": 6.443543193567745e-06, + "loss": 0.2808, + "step": 19105 + }, + { + "epoch": 4.456623134328359, + "grad_norm": 0.40327441751142873, + "learning_rate": 6.4374342279542726e-06, + "loss": 0.2831, + "step": 19110 + }, + { + "epoch": 4.457789179104478, + "grad_norm": 0.41445646555278115, + "learning_rate": 6.431337789547239e-06, + "loss": 0.2714, + "step": 19115 + }, + { + "epoch": 4.458955223880597, + "grad_norm": 0.3737900447135218, + "learning_rate": 6.425253881972573e-06, + "loss": 0.261, + "step": 19120 + }, + { + "epoch": 4.460121268656716, + "grad_norm": 0.4066120230512756, + "learning_rate": 6.419182508848745e-06, + "loss": 0.2722, + "step": 19125 + }, + { + "epoch": 4.461287313432836, + "grad_norm": 0.3979750471063328, + "learning_rate": 6.4131236737867795e-06, + "loss": 0.2707, + "step": 19130 + }, + { + "epoch": 4.462453358208955, + "grad_norm": 0.3897999751940203, + "learning_rate": 6.407077380390236e-06, + "loss": 0.2721, + "step": 19135 + }, + { + "epoch": 4.463619402985074, + "grad_norm": 0.4001411615680101, + "learning_rate": 6.4010436322552204e-06, + "loss": 0.2642, + "step": 19140 + }, + { + "epoch": 4.464785447761194, + "grad_norm": 0.41422149509694084, + "learning_rate": 6.395022432970375e-06, + "loss": 0.2571, + "step": 19145 + }, + { + "epoch": 4.465951492537314, + "grad_norm": 0.4154919561238171, + "learning_rate": 6.389013786116878e-06, + "loss": 0.2808, + "step": 19150 + }, + { + "epoch": 4.467117537313433, + "grad_norm": 0.3978764925884262, + "learning_rate": 6.383017695268441e-06, + "loss": 0.2639, + "step": 19155 + }, + { + "epoch": 4.468283582089552, + "grad_norm": 0.39306752609181533, + "learning_rate": 6.377034163991308e-06, + "loss": 0.2796, + "step": 19160 + }, + { + "epoch": 4.469449626865671, + "grad_norm": 0.384063747009322, + "learning_rate": 6.3710631958442524e-06, + "loss": 0.2622, + "step": 19165 + }, + { + "epoch": 4.470615671641791, + "grad_norm": 0.40126341615131395, + "learning_rate": 6.365104794378582e-06, + "loss": 0.275, + "step": 19170 + }, + { + "epoch": 4.471781716417911, + "grad_norm": 0.42365641017668654, + "learning_rate": 6.3591589631381286e-06, + "loss": 0.2688, + "step": 19175 + }, + { + "epoch": 4.47294776119403, + "grad_norm": 0.40908085859655946, + "learning_rate": 6.353225705659234e-06, + "loss": 0.2782, + "step": 19180 + }, + { + "epoch": 4.474113805970149, + "grad_norm": 0.39825533025059007, + "learning_rate": 6.347305025470776e-06, + "loss": 0.2749, + "step": 19185 + }, + { + "epoch": 4.4752798507462686, + "grad_norm": 0.41525520232326524, + "learning_rate": 6.341396926094155e-06, + "loss": 0.2864, + "step": 19190 + }, + { + "epoch": 4.476445895522388, + "grad_norm": 0.4338386050024731, + "learning_rate": 6.335501411043274e-06, + "loss": 0.2939, + "step": 19195 + }, + { + "epoch": 4.477611940298507, + "grad_norm": 0.39885416238441795, + "learning_rate": 6.329618483824559e-06, + "loss": 0.2777, + "step": 19200 + }, + { + "epoch": 4.478777985074627, + "grad_norm": 0.4197249220668985, + "learning_rate": 6.323748147936959e-06, + "loss": 0.2821, + "step": 19205 + }, + { + "epoch": 4.4799440298507465, + "grad_norm": 0.42926796475044454, + "learning_rate": 6.317890406871914e-06, + "loss": 0.2851, + "step": 19210 + }, + { + "epoch": 4.481110074626866, + "grad_norm": 0.3951438331874199, + "learning_rate": 6.312045264113388e-06, + "loss": 0.2807, + "step": 19215 + }, + { + "epoch": 4.482276119402985, + "grad_norm": 0.4255003447727782, + "learning_rate": 6.306212723137846e-06, + "loss": 0.2747, + "step": 19220 + }, + { + "epoch": 4.483442164179104, + "grad_norm": 0.4074220430752247, + "learning_rate": 6.300392787414265e-06, + "loss": 0.2793, + "step": 19225 + }, + { + "epoch": 4.4846082089552235, + "grad_norm": 0.4025382438197484, + "learning_rate": 6.2945854604041135e-06, + "loss": 0.2792, + "step": 19230 + }, + { + "epoch": 4.485774253731344, + "grad_norm": 0.3921972243318409, + "learning_rate": 6.28879074556137e-06, + "loss": 0.2795, + "step": 19235 + }, + { + "epoch": 4.486940298507463, + "grad_norm": 0.40775686243195514, + "learning_rate": 6.283008646332507e-06, + "loss": 0.274, + "step": 19240 + }, + { + "epoch": 4.488106343283582, + "grad_norm": 0.3870994218374946, + "learning_rate": 6.277239166156497e-06, + "loss": 0.2641, + "step": 19245 + }, + { + "epoch": 4.489272388059701, + "grad_norm": 0.3906998650606952, + "learning_rate": 6.271482308464807e-06, + "loss": 0.2712, + "step": 19250 + }, + { + "epoch": 4.490438432835821, + "grad_norm": 0.3817095400466071, + "learning_rate": 6.265738076681392e-06, + "loss": 0.2665, + "step": 19255 + }, + { + "epoch": 4.49160447761194, + "grad_norm": 0.42633466345186055, + "learning_rate": 6.2600064742227e-06, + "loss": 0.2916, + "step": 19260 + }, + { + "epoch": 4.49277052238806, + "grad_norm": 0.4241079906906127, + "learning_rate": 6.254287504497672e-06, + "loss": 0.2836, + "step": 19265 + }, + { + "epoch": 4.493936567164179, + "grad_norm": 0.4053351899896776, + "learning_rate": 6.248581170907729e-06, + "loss": 0.2696, + "step": 19270 + }, + { + "epoch": 4.495102611940299, + "grad_norm": 0.4092687463740792, + "learning_rate": 6.242887476846785e-06, + "loss": 0.2758, + "step": 19275 + }, + { + "epoch": 4.496268656716418, + "grad_norm": 0.4457329920963302, + "learning_rate": 6.237206425701223e-06, + "loss": 0.2794, + "step": 19280 + }, + { + "epoch": 4.497434701492537, + "grad_norm": 0.422589189665392, + "learning_rate": 6.231538020849919e-06, + "loss": 0.2833, + "step": 19285 + }, + { + "epoch": 4.498600746268656, + "grad_norm": 0.40719545940377416, + "learning_rate": 6.225882265664218e-06, + "loss": 0.2826, + "step": 19290 + }, + { + "epoch": 4.4997667910447765, + "grad_norm": 0.3894236450328694, + "learning_rate": 6.220239163507955e-06, + "loss": 0.2722, + "step": 19295 + }, + { + "epoch": 4.500932835820896, + "grad_norm": 0.4093844279695568, + "learning_rate": 6.214608717737426e-06, + "loss": 0.2894, + "step": 19300 + }, + { + "epoch": 4.502098880597015, + "grad_norm": 0.3999452596114185, + "learning_rate": 6.2089909317014e-06, + "loss": 0.2637, + "step": 19305 + }, + { + "epoch": 4.503264925373134, + "grad_norm": 0.37928564631271194, + "learning_rate": 6.2033858087411275e-06, + "loss": 0.2602, + "step": 19310 + }, + { + "epoch": 4.5044309701492535, + "grad_norm": 0.40258765319208045, + "learning_rate": 6.197793352190316e-06, + "loss": 0.2823, + "step": 19315 + }, + { + "epoch": 4.505597014925373, + "grad_norm": 0.39881501025472776, + "learning_rate": 6.192213565375147e-06, + "loss": 0.2727, + "step": 19320 + }, + { + "epoch": 4.506763059701493, + "grad_norm": 0.38961424574655634, + "learning_rate": 6.186646451614265e-06, + "loss": 0.2708, + "step": 19325 + }, + { + "epoch": 4.507929104477612, + "grad_norm": 0.40239205310246, + "learning_rate": 6.1810920142187726e-06, + "loss": 0.2751, + "step": 19330 + }, + { + "epoch": 4.5090951492537314, + "grad_norm": 0.40769869988114504, + "learning_rate": 6.175550256492235e-06, + "loss": 0.2913, + "step": 19335 + }, + { + "epoch": 4.510261194029851, + "grad_norm": 0.41681622084301867, + "learning_rate": 6.170021181730681e-06, + "loss": 0.2796, + "step": 19340 + }, + { + "epoch": 4.51142723880597, + "grad_norm": 0.40521223352998204, + "learning_rate": 6.164504793222589e-06, + "loss": 0.2763, + "step": 19345 + }, + { + "epoch": 4.512593283582089, + "grad_norm": 0.3794976557676561, + "learning_rate": 6.159001094248904e-06, + "loss": 0.2887, + "step": 19350 + }, + { + "epoch": 4.5137593283582085, + "grad_norm": 0.4259876082606273, + "learning_rate": 6.153510088083e-06, + "loss": 0.2719, + "step": 19355 + }, + { + "epoch": 4.514925373134329, + "grad_norm": 0.39316638200400666, + "learning_rate": 6.1480317779907285e-06, + "loss": 0.2757, + "step": 19360 + }, + { + "epoch": 4.516091417910448, + "grad_norm": 0.39203245337716425, + "learning_rate": 6.1425661672303735e-06, + "loss": 0.2695, + "step": 19365 + }, + { + "epoch": 4.517257462686567, + "grad_norm": 0.4167874420088122, + "learning_rate": 6.1371132590526744e-06, + "loss": 0.2723, + "step": 19370 + }, + { + "epoch": 4.518423507462686, + "grad_norm": 0.40082339237716336, + "learning_rate": 6.1316730567008086e-06, + "loss": 0.2741, + "step": 19375 + }, + { + "epoch": 4.519589552238806, + "grad_norm": 0.39196663617399374, + "learning_rate": 6.126245563410399e-06, + "loss": 0.2826, + "step": 19380 + }, + { + "epoch": 4.520755597014926, + "grad_norm": 0.4154410313830451, + "learning_rate": 6.120830782409515e-06, + "loss": 0.2798, + "step": 19385 + }, + { + "epoch": 4.521921641791045, + "grad_norm": 0.41780321723229025, + "learning_rate": 6.115428716918657e-06, + "loss": 0.276, + "step": 19390 + }, + { + "epoch": 4.523087686567164, + "grad_norm": 0.40963979476987106, + "learning_rate": 6.110039370150765e-06, + "loss": 0.2678, + "step": 19395 + }, + { + "epoch": 4.524253731343284, + "grad_norm": 0.4111879076863036, + "learning_rate": 6.104662745311222e-06, + "loss": 0.2765, + "step": 19400 + }, + { + "epoch": 4.525419776119403, + "grad_norm": 0.41693216869068, + "learning_rate": 6.099298845597832e-06, + "loss": 0.2821, + "step": 19405 + }, + { + "epoch": 4.526585820895522, + "grad_norm": 0.392640636302135, + "learning_rate": 6.093947674200838e-06, + "loss": 0.2815, + "step": 19410 + }, + { + "epoch": 4.527751865671641, + "grad_norm": 0.4132072694079558, + "learning_rate": 6.088609234302912e-06, + "loss": 0.2698, + "step": 19415 + }, + { + "epoch": 4.5289179104477615, + "grad_norm": 0.40173065497838006, + "learning_rate": 6.083283529079157e-06, + "loss": 0.2716, + "step": 19420 + }, + { + "epoch": 4.530083955223881, + "grad_norm": 0.39584872427771123, + "learning_rate": 6.077970561697095e-06, + "loss": 0.2862, + "step": 19425 + }, + { + "epoch": 4.53125, + "grad_norm": 0.4061946706562812, + "learning_rate": 6.072670335316676e-06, + "loss": 0.2716, + "step": 19430 + }, + { + "epoch": 4.532416044776119, + "grad_norm": 0.40805514869112874, + "learning_rate": 6.067382853090269e-06, + "loss": 0.2746, + "step": 19435 + }, + { + "epoch": 4.5335820895522385, + "grad_norm": 0.3963705910534693, + "learning_rate": 6.062108118162669e-06, + "loss": 0.2716, + "step": 19440 + }, + { + "epoch": 4.534748134328359, + "grad_norm": 0.40975563096672996, + "learning_rate": 6.056846133671083e-06, + "loss": 0.2679, + "step": 19445 + }, + { + "epoch": 4.535914179104478, + "grad_norm": 0.39497962693592065, + "learning_rate": 6.051596902745143e-06, + "loss": 0.2769, + "step": 19450 + }, + { + "epoch": 4.537080223880597, + "grad_norm": 0.4096775616097535, + "learning_rate": 6.0463604285068834e-06, + "loss": 0.2666, + "step": 19455 + }, + { + "epoch": 4.538246268656716, + "grad_norm": 0.38747793264249547, + "learning_rate": 6.0411367140707625e-06, + "loss": 0.2743, + "step": 19460 + }, + { + "epoch": 4.539412313432836, + "grad_norm": 0.40292660808703534, + "learning_rate": 6.035925762543644e-06, + "loss": 0.2664, + "step": 19465 + }, + { + "epoch": 4.540578358208955, + "grad_norm": 0.3971369327299724, + "learning_rate": 6.030727577024802e-06, + "loss": 0.2694, + "step": 19470 + }, + { + "epoch": 4.541744402985074, + "grad_norm": 0.376268510836514, + "learning_rate": 6.025542160605923e-06, + "loss": 0.2743, + "step": 19475 + }, + { + "epoch": 4.542910447761194, + "grad_norm": 0.37070145138374705, + "learning_rate": 6.020369516371085e-06, + "loss": 0.2862, + "step": 19480 + }, + { + "epoch": 4.544076492537314, + "grad_norm": 0.3940824099672883, + "learning_rate": 6.015209647396781e-06, + "loss": 0.2903, + "step": 19485 + }, + { + "epoch": 4.545242537313433, + "grad_norm": 0.4135855527891064, + "learning_rate": 6.010062556751906e-06, + "loss": 0.2816, + "step": 19490 + }, + { + "epoch": 4.546408582089552, + "grad_norm": 0.3998933292712433, + "learning_rate": 6.00492824749775e-06, + "loss": 0.2726, + "step": 19495 + }, + { + "epoch": 4.547574626865671, + "grad_norm": 0.40362698833453464, + "learning_rate": 5.999806722688007e-06, + "loss": 0.2706, + "step": 19500 + }, + { + "epoch": 4.5487406716417915, + "grad_norm": 0.3996476049128751, + "learning_rate": 5.994697985368761e-06, + "loss": 0.2759, + "step": 19505 + }, + { + "epoch": 4.549906716417911, + "grad_norm": 0.4358477852289081, + "learning_rate": 5.98960203857849e-06, + "loss": 0.2759, + "step": 19510 + }, + { + "epoch": 4.55107276119403, + "grad_norm": 0.42666702668186296, + "learning_rate": 5.98451888534807e-06, + "loss": 0.2844, + "step": 19515 + }, + { + "epoch": 4.552238805970149, + "grad_norm": 0.38888095487037455, + "learning_rate": 5.9794485287007696e-06, + "loss": 0.2722, + "step": 19520 + }, + { + "epoch": 4.5534048507462686, + "grad_norm": 0.4222347972637875, + "learning_rate": 5.974390971652237e-06, + "loss": 0.2823, + "step": 19525 + }, + { + "epoch": 4.554570895522388, + "grad_norm": 0.4043149810792717, + "learning_rate": 5.9693462172105165e-06, + "loss": 0.2746, + "step": 19530 + }, + { + "epoch": 4.555736940298507, + "grad_norm": 0.384161856818575, + "learning_rate": 5.964314268376031e-06, + "loss": 0.2714, + "step": 19535 + }, + { + "epoch": 4.556902985074627, + "grad_norm": 0.4074095968516756, + "learning_rate": 5.959295128141596e-06, + "loss": 0.274, + "step": 19540 + }, + { + "epoch": 4.5580690298507465, + "grad_norm": 0.37147033638693244, + "learning_rate": 5.9542887994923985e-06, + "loss": 0.2706, + "step": 19545 + }, + { + "epoch": 4.559235074626866, + "grad_norm": 0.42004201414060405, + "learning_rate": 5.949295285406015e-06, + "loss": 0.2894, + "step": 19550 + }, + { + "epoch": 4.560401119402985, + "grad_norm": 0.405002870436818, + "learning_rate": 5.944314588852393e-06, + "loss": 0.286, + "step": 19555 + }, + { + "epoch": 4.561567164179104, + "grad_norm": 0.39430856424292815, + "learning_rate": 5.93934671279386e-06, + "loss": 0.2703, + "step": 19560 + }, + { + "epoch": 4.5627332089552235, + "grad_norm": 0.40082691907537743, + "learning_rate": 5.934391660185121e-06, + "loss": 0.2709, + "step": 19565 + }, + { + "epoch": 4.563899253731344, + "grad_norm": 0.41785338119901133, + "learning_rate": 5.929449433973249e-06, + "loss": 0.2651, + "step": 19570 + }, + { + "epoch": 4.565065298507463, + "grad_norm": 0.4279430163773111, + "learning_rate": 5.924520037097688e-06, + "loss": 0.2818, + "step": 19575 + }, + { + "epoch": 4.566231343283582, + "grad_norm": 0.40038913922060654, + "learning_rate": 5.919603472490263e-06, + "loss": 0.2951, + "step": 19580 + }, + { + "epoch": 4.567397388059701, + "grad_norm": 0.39948244960271895, + "learning_rate": 5.914699743075149e-06, + "loss": 0.2605, + "step": 19585 + }, + { + "epoch": 4.568563432835821, + "grad_norm": 0.43136010733909785, + "learning_rate": 5.909808851768898e-06, + "loss": 0.2746, + "step": 19590 + }, + { + "epoch": 4.56972947761194, + "grad_norm": 0.43280422382694816, + "learning_rate": 5.904930801480427e-06, + "loss": 0.2771, + "step": 19595 + }, + { + "epoch": 4.57089552238806, + "grad_norm": 0.3957451950378775, + "learning_rate": 5.900065595111014e-06, + "loss": 0.2749, + "step": 19600 + }, + { + "epoch": 4.572061567164179, + "grad_norm": 0.44394443324252325, + "learning_rate": 5.895213235554298e-06, + "loss": 0.2892, + "step": 19605 + }, + { + "epoch": 4.573227611940299, + "grad_norm": 0.400056466974145, + "learning_rate": 5.890373725696271e-06, + "loss": 0.2639, + "step": 19610 + }, + { + "epoch": 4.574393656716418, + "grad_norm": 0.41458887736370276, + "learning_rate": 5.885547068415289e-06, + "loss": 0.281, + "step": 19615 + }, + { + "epoch": 4.575559701492537, + "grad_norm": 0.40621557083779697, + "learning_rate": 5.880733266582066e-06, + "loss": 0.2735, + "step": 19620 + }, + { + "epoch": 4.576725746268656, + "grad_norm": 0.4023809100073007, + "learning_rate": 5.875932323059667e-06, + "loss": 0.2731, + "step": 19625 + }, + { + "epoch": 4.5778917910447765, + "grad_norm": 0.38252664921450025, + "learning_rate": 5.871144240703507e-06, + "loss": 0.265, + "step": 19630 + }, + { + "epoch": 4.579057835820896, + "grad_norm": 0.3912285302237702, + "learning_rate": 5.866369022361354e-06, + "loss": 0.2629, + "step": 19635 + }, + { + "epoch": 4.580223880597015, + "grad_norm": 0.41530629012469544, + "learning_rate": 5.8616066708733255e-06, + "loss": 0.2794, + "step": 19640 + }, + { + "epoch": 4.581389925373134, + "grad_norm": 0.427373998670521, + "learning_rate": 5.856857189071884e-06, + "loss": 0.2724, + "step": 19645 + }, + { + "epoch": 4.5825559701492535, + "grad_norm": 0.4235487584855721, + "learning_rate": 5.852120579781838e-06, + "loss": 0.2759, + "step": 19650 + }, + { + "epoch": 4.583722014925373, + "grad_norm": 0.4305721437683783, + "learning_rate": 5.847396845820349e-06, + "loss": 0.2939, + "step": 19655 + }, + { + "epoch": 4.584888059701493, + "grad_norm": 0.41505408391745124, + "learning_rate": 5.8426859899969034e-06, + "loss": 0.2799, + "step": 19660 + }, + { + "epoch": 4.586054104477612, + "grad_norm": 0.3952767658187638, + "learning_rate": 5.83798801511334e-06, + "loss": 0.2816, + "step": 19665 + }, + { + "epoch": 4.5872201492537314, + "grad_norm": 0.40807007047949034, + "learning_rate": 5.833302923963837e-06, + "loss": 0.2826, + "step": 19670 + }, + { + "epoch": 4.588386194029851, + "grad_norm": 0.39343617196658465, + "learning_rate": 5.828630719334905e-06, + "loss": 0.2797, + "step": 19675 + }, + { + "epoch": 4.58955223880597, + "grad_norm": 0.4027098901842826, + "learning_rate": 5.8239714040053936e-06, + "loss": 0.2898, + "step": 19680 + }, + { + "epoch": 4.590718283582089, + "grad_norm": 0.38531328965715933, + "learning_rate": 5.819324980746483e-06, + "loss": 0.2729, + "step": 19685 + }, + { + "epoch": 4.5918843283582085, + "grad_norm": 0.3879302233639575, + "learning_rate": 5.814691452321687e-06, + "loss": 0.2645, + "step": 19690 + }, + { + "epoch": 4.593050373134329, + "grad_norm": 0.39080847205786856, + "learning_rate": 5.810070821486854e-06, + "loss": 0.2803, + "step": 19695 + }, + { + "epoch": 4.594216417910448, + "grad_norm": 0.40894217807761285, + "learning_rate": 5.805463090990154e-06, + "loss": 0.2793, + "step": 19700 + }, + { + "epoch": 4.595382462686567, + "grad_norm": 0.3825694244005513, + "learning_rate": 5.800868263572093e-06, + "loss": 0.2749, + "step": 19705 + }, + { + "epoch": 4.596548507462686, + "grad_norm": 0.37684047859810854, + "learning_rate": 5.796286341965492e-06, + "loss": 0.2676, + "step": 19710 + }, + { + "epoch": 4.597714552238806, + "grad_norm": 0.373842441950314, + "learning_rate": 5.7917173288955105e-06, + "loss": 0.272, + "step": 19715 + }, + { + "epoch": 4.598880597014926, + "grad_norm": 0.42330168939749574, + "learning_rate": 5.787161227079613e-06, + "loss": 0.2884, + "step": 19720 + }, + { + "epoch": 4.600046641791045, + "grad_norm": 0.39687130006718135, + "learning_rate": 5.782618039227603e-06, + "loss": 0.2705, + "step": 19725 + }, + { + "epoch": 4.601212686567164, + "grad_norm": 0.4116001899644282, + "learning_rate": 5.778087768041589e-06, + "loss": 0.2845, + "step": 19730 + }, + { + "epoch": 4.602378731343284, + "grad_norm": 0.40456122964502916, + "learning_rate": 5.7735704162160005e-06, + "loss": 0.27, + "step": 19735 + }, + { + "epoch": 4.603544776119403, + "grad_norm": 0.384095039579496, + "learning_rate": 5.769065986437591e-06, + "loss": 0.2898, + "step": 19740 + }, + { + "epoch": 4.604710820895522, + "grad_norm": 0.3855658442097468, + "learning_rate": 5.764574481385419e-06, + "loss": 0.2631, + "step": 19745 + }, + { + "epoch": 4.605876865671641, + "grad_norm": 0.4333675633039686, + "learning_rate": 5.7600959037308626e-06, + "loss": 0.278, + "step": 19750 + }, + { + "epoch": 4.6070429104477615, + "grad_norm": 0.41376429703298284, + "learning_rate": 5.755630256137605e-06, + "loss": 0.286, + "step": 19755 + }, + { + "epoch": 4.608208955223881, + "grad_norm": 0.41920233257169415, + "learning_rate": 5.7511775412616415e-06, + "loss": 0.2693, + "step": 19760 + }, + { + "epoch": 4.609375, + "grad_norm": 0.40718260204158097, + "learning_rate": 5.74673776175128e-06, + "loss": 0.2706, + "step": 19765 + }, + { + "epoch": 4.610541044776119, + "grad_norm": 0.3864503289913195, + "learning_rate": 5.742310920247127e-06, + "loss": 0.2612, + "step": 19770 + }, + { + "epoch": 4.6117070895522385, + "grad_norm": 0.4162571718892432, + "learning_rate": 5.737897019382098e-06, + "loss": 0.2882, + "step": 19775 + }, + { + "epoch": 4.612873134328359, + "grad_norm": 0.4005442030212411, + "learning_rate": 5.733496061781418e-06, + "loss": 0.2735, + "step": 19780 + }, + { + "epoch": 4.614039179104478, + "grad_norm": 0.4176972373199422, + "learning_rate": 5.729108050062603e-06, + "loss": 0.2752, + "step": 19785 + }, + { + "epoch": 4.615205223880597, + "grad_norm": 0.4254386676294371, + "learning_rate": 5.7247329868354705e-06, + "loss": 0.2674, + "step": 19790 + }, + { + "epoch": 4.616371268656716, + "grad_norm": 0.4198990073695244, + "learning_rate": 5.720370874702148e-06, + "loss": 0.2849, + "step": 19795 + }, + { + "epoch": 4.617537313432836, + "grad_norm": 0.41152217793888024, + "learning_rate": 5.716021716257047e-06, + "loss": 0.2766, + "step": 19800 + }, + { + "epoch": 4.618703358208955, + "grad_norm": 0.392074902485431, + "learning_rate": 5.7116855140868874e-06, + "loss": 0.2658, + "step": 19805 + }, + { + "epoch": 4.619869402985074, + "grad_norm": 0.4130968665526762, + "learning_rate": 5.707362270770665e-06, + "loss": 0.2698, + "step": 19810 + }, + { + "epoch": 4.621035447761194, + "grad_norm": 0.3889066425584659, + "learning_rate": 5.703051988879689e-06, + "loss": 0.2693, + "step": 19815 + }, + { + "epoch": 4.622201492537314, + "grad_norm": 0.40396315931555515, + "learning_rate": 5.698754670977544e-06, + "loss": 0.2681, + "step": 19820 + }, + { + "epoch": 4.623367537313433, + "grad_norm": 0.45162345998255143, + "learning_rate": 5.69447031962011e-06, + "loss": 0.2908, + "step": 19825 + }, + { + "epoch": 4.624533582089552, + "grad_norm": 0.3647318833276829, + "learning_rate": 5.690198937355561e-06, + "loss": 0.27, + "step": 19830 + }, + { + "epoch": 4.625699626865671, + "grad_norm": 0.4030906159068544, + "learning_rate": 5.685940526724344e-06, + "loss": 0.2709, + "step": 19835 + }, + { + "epoch": 4.6268656716417915, + "grad_norm": 0.39945894837975454, + "learning_rate": 5.6816950902592005e-06, + "loss": 0.2707, + "step": 19840 + }, + { + "epoch": 4.628031716417911, + "grad_norm": 0.4005422592919341, + "learning_rate": 5.6774626304851555e-06, + "loss": 0.2677, + "step": 19845 + }, + { + "epoch": 4.62919776119403, + "grad_norm": 0.423893919526871, + "learning_rate": 5.673243149919512e-06, + "loss": 0.2698, + "step": 19850 + }, + { + "epoch": 4.630363805970149, + "grad_norm": 0.3957984543906955, + "learning_rate": 5.669036651071857e-06, + "loss": 0.274, + "step": 19855 + }, + { + "epoch": 4.6315298507462686, + "grad_norm": 0.40995172521649115, + "learning_rate": 5.664843136444054e-06, + "loss": 0.2818, + "step": 19860 + }, + { + "epoch": 4.632695895522388, + "grad_norm": 0.4106170054479251, + "learning_rate": 5.660662608530239e-06, + "loss": 0.2665, + "step": 19865 + }, + { + "epoch": 4.633861940298507, + "grad_norm": 0.41235426028584793, + "learning_rate": 5.6564950698168385e-06, + "loss": 0.2743, + "step": 19870 + }, + { + "epoch": 4.635027985074627, + "grad_norm": 0.39748498292025686, + "learning_rate": 5.652340522782542e-06, + "loss": 0.27, + "step": 19875 + }, + { + "epoch": 4.6361940298507465, + "grad_norm": 0.4066110480697797, + "learning_rate": 5.648198969898311e-06, + "loss": 0.2718, + "step": 19880 + }, + { + "epoch": 4.637360074626866, + "grad_norm": 0.3675449216703294, + "learning_rate": 5.644070413627386e-06, + "loss": 0.259, + "step": 19885 + }, + { + "epoch": 4.638526119402985, + "grad_norm": 0.4288644330626841, + "learning_rate": 5.639954856425273e-06, + "loss": 0.2897, + "step": 19890 + }, + { + "epoch": 4.639692164179104, + "grad_norm": 0.3938760486236866, + "learning_rate": 5.6358523007397485e-06, + "loss": 0.2795, + "step": 19895 + }, + { + "epoch": 4.6408582089552235, + "grad_norm": 0.3837875410548764, + "learning_rate": 5.631762749010855e-06, + "loss": 0.2626, + "step": 19900 + }, + { + "epoch": 4.642024253731344, + "grad_norm": 0.37882241259163735, + "learning_rate": 5.6276862036709e-06, + "loss": 0.2838, + "step": 19905 + }, + { + "epoch": 4.643190298507463, + "grad_norm": 0.4461753407156029, + "learning_rate": 5.6236226671444555e-06, + "loss": 0.2764, + "step": 19910 + }, + { + "epoch": 4.644356343283582, + "grad_norm": 0.38132448693471266, + "learning_rate": 5.619572141848358e-06, + "loss": 0.2685, + "step": 19915 + }, + { + "epoch": 4.645522388059701, + "grad_norm": 0.3946667902969425, + "learning_rate": 5.615534630191708e-06, + "loss": 0.2691, + "step": 19920 + }, + { + "epoch": 4.646688432835821, + "grad_norm": 0.42992782334530544, + "learning_rate": 5.611510134575859e-06, + "loss": 0.2721, + "step": 19925 + }, + { + "epoch": 4.64785447761194, + "grad_norm": 0.41687010629281285, + "learning_rate": 5.607498657394424e-06, + "loss": 0.2834, + "step": 19930 + }, + { + "epoch": 4.64902052238806, + "grad_norm": 0.41904424231712306, + "learning_rate": 5.603500201033285e-06, + "loss": 0.2925, + "step": 19935 + }, + { + "epoch": 4.650186567164179, + "grad_norm": 0.41797449445289314, + "learning_rate": 5.59951476787056e-06, + "loss": 0.2607, + "step": 19940 + }, + { + "epoch": 4.651352611940299, + "grad_norm": 0.4167939162797424, + "learning_rate": 5.595542360276636e-06, + "loss": 0.2692, + "step": 19945 + }, + { + "epoch": 4.652518656716418, + "grad_norm": 0.40344143571319735, + "learning_rate": 5.591582980614151e-06, + "loss": 0.2705, + "step": 19950 + }, + { + "epoch": 4.653684701492537, + "grad_norm": 0.4342451674505531, + "learning_rate": 5.587636631237991e-06, + "loss": 0.2732, + "step": 19955 + }, + { + "epoch": 4.654850746268656, + "grad_norm": 0.3938663686243035, + "learning_rate": 5.583703314495294e-06, + "loss": 0.2678, + "step": 19960 + }, + { + "epoch": 4.6560167910447765, + "grad_norm": 0.391969984782422, + "learning_rate": 5.579783032725441e-06, + "loss": 0.284, + "step": 19965 + }, + { + "epoch": 4.657182835820896, + "grad_norm": 0.4009376587634659, + "learning_rate": 5.5758757882600706e-06, + "loss": 0.2675, + "step": 19970 + }, + { + "epoch": 4.658348880597015, + "grad_norm": 0.40241606810174496, + "learning_rate": 5.57198158342306e-06, + "loss": 0.2852, + "step": 19975 + }, + { + "epoch": 4.659514925373134, + "grad_norm": 0.4247647502726461, + "learning_rate": 5.568100420530533e-06, + "loss": 0.2768, + "step": 19980 + }, + { + "epoch": 4.6606809701492535, + "grad_norm": 0.4084528798804982, + "learning_rate": 5.5642323018908595e-06, + "loss": 0.2786, + "step": 19985 + }, + { + "epoch": 4.661847014925373, + "grad_norm": 0.4074659575509635, + "learning_rate": 5.560377229804644e-06, + "loss": 0.27, + "step": 19990 + }, + { + "epoch": 4.663013059701493, + "grad_norm": 0.3820600401203203, + "learning_rate": 5.556535206564733e-06, + "loss": 0.2834, + "step": 19995 + }, + { + "epoch": 4.664179104477612, + "grad_norm": 0.40351571148753534, + "learning_rate": 5.55270623445622e-06, + "loss": 0.2648, + "step": 20000 + }, + { + "epoch": 4.6653451492537314, + "grad_norm": 0.3964258851523608, + "learning_rate": 5.548890315756433e-06, + "loss": 0.2741, + "step": 20005 + }, + { + "epoch": 4.666511194029851, + "grad_norm": 0.37152087763676445, + "learning_rate": 5.545087452734928e-06, + "loss": 0.2822, + "step": 20010 + }, + { + "epoch": 4.66767723880597, + "grad_norm": 0.4253016328676518, + "learning_rate": 5.541297647653505e-06, + "loss": 0.2714, + "step": 20015 + }, + { + "epoch": 4.668843283582089, + "grad_norm": 0.3998650134106237, + "learning_rate": 5.537520902766193e-06, + "loss": 0.2919, + "step": 20020 + }, + { + "epoch": 4.6700093283582085, + "grad_norm": 0.40601634420089444, + "learning_rate": 5.533757220319257e-06, + "loss": 0.2767, + "step": 20025 + }, + { + "epoch": 4.671175373134329, + "grad_norm": 0.41907168348590956, + "learning_rate": 5.5300066025511885e-06, + "loss": 0.2694, + "step": 20030 + }, + { + "epoch": 4.672341417910448, + "grad_norm": 0.4157035283843186, + "learning_rate": 5.526269051692717e-06, + "loss": 0.286, + "step": 20035 + }, + { + "epoch": 4.673507462686567, + "grad_norm": 0.39126692904238075, + "learning_rate": 5.522544569966786e-06, + "loss": 0.2747, + "step": 20040 + }, + { + "epoch": 4.674673507462686, + "grad_norm": 0.4174392835172584, + "learning_rate": 5.518833159588582e-06, + "loss": 0.2715, + "step": 20045 + }, + { + "epoch": 4.675839552238806, + "grad_norm": 0.37641835072158225, + "learning_rate": 5.515134822765504e-06, + "loss": 0.2629, + "step": 20050 + }, + { + "epoch": 4.677005597014926, + "grad_norm": 0.4158241667768276, + "learning_rate": 5.511449561697183e-06, + "loss": 0.2758, + "step": 20055 + }, + { + "epoch": 4.678171641791045, + "grad_norm": 0.40241732091317955, + "learning_rate": 5.507777378575474e-06, + "loss": 0.2828, + "step": 20060 + }, + { + "epoch": 4.679337686567164, + "grad_norm": 0.40779033540993415, + "learning_rate": 5.504118275584444e-06, + "loss": 0.269, + "step": 20065 + }, + { + "epoch": 4.680503731343284, + "grad_norm": 0.42844495534181765, + "learning_rate": 5.500472254900392e-06, + "loss": 0.2926, + "step": 20070 + }, + { + "epoch": 4.681669776119403, + "grad_norm": 0.38903131772197835, + "learning_rate": 5.49683931869183e-06, + "loss": 0.2604, + "step": 20075 + }, + { + "epoch": 4.682835820895522, + "grad_norm": 0.39269896031724016, + "learning_rate": 5.4932194691194905e-06, + "loss": 0.2666, + "step": 20080 + }, + { + "epoch": 4.684001865671641, + "grad_norm": 0.4428629130930986, + "learning_rate": 5.489612708336324e-06, + "loss": 0.2769, + "step": 20085 + }, + { + "epoch": 4.6851679104477615, + "grad_norm": 0.42070859736541616, + "learning_rate": 5.486019038487483e-06, + "loss": 0.2842, + "step": 20090 + }, + { + "epoch": 4.686333955223881, + "grad_norm": 0.41839978965281005, + "learning_rate": 5.482438461710355e-06, + "loss": 0.2792, + "step": 20095 + }, + { + "epoch": 4.6875, + "grad_norm": 0.41402089350167637, + "learning_rate": 5.4788709801345244e-06, + "loss": 0.2749, + "step": 20100 + }, + { + "epoch": 4.688666044776119, + "grad_norm": 0.41518823880099653, + "learning_rate": 5.475316595881796e-06, + "loss": 0.2755, + "step": 20105 + }, + { + "epoch": 4.6898320895522385, + "grad_norm": 0.4041644682151216, + "learning_rate": 5.471775311066177e-06, + "loss": 0.2878, + "step": 20110 + }, + { + "epoch": 4.690998134328359, + "grad_norm": 0.3913561730786025, + "learning_rate": 5.468247127793893e-06, + "loss": 0.2834, + "step": 20115 + }, + { + "epoch": 4.692164179104478, + "grad_norm": 0.4061629665339246, + "learning_rate": 5.464732048163365e-06, + "loss": 0.2719, + "step": 20120 + }, + { + "epoch": 4.693330223880597, + "grad_norm": 0.3703512330712156, + "learning_rate": 5.461230074265233e-06, + "loss": 0.2889, + "step": 20125 + }, + { + "epoch": 4.694496268656716, + "grad_norm": 0.39130635011992626, + "learning_rate": 5.4577412081823355e-06, + "loss": 0.2715, + "step": 20130 + }, + { + "epoch": 4.695662313432836, + "grad_norm": 0.40949348608109915, + "learning_rate": 5.45426545198972e-06, + "loss": 0.2838, + "step": 20135 + }, + { + "epoch": 4.696828358208955, + "grad_norm": 0.4139020715594625, + "learning_rate": 5.450802807754625e-06, + "loss": 0.2875, + "step": 20140 + }, + { + "epoch": 4.697994402985074, + "grad_norm": 0.4078866618718476, + "learning_rate": 5.4473532775365026e-06, + "loss": 0.2799, + "step": 20145 + }, + { + "epoch": 4.699160447761194, + "grad_norm": 0.4337431895927768, + "learning_rate": 5.443916863387002e-06, + "loss": 0.2739, + "step": 20150 + }, + { + "epoch": 4.700326492537314, + "grad_norm": 0.40120045642838176, + "learning_rate": 5.4404935673499685e-06, + "loss": 0.2797, + "step": 20155 + }, + { + "epoch": 4.701492537313433, + "grad_norm": 0.3969535063563502, + "learning_rate": 5.437083391461452e-06, + "loss": 0.2804, + "step": 20160 + }, + { + "epoch": 4.702658582089552, + "grad_norm": 0.42195915136841367, + "learning_rate": 5.43368633774969e-06, + "loss": 0.2832, + "step": 20165 + }, + { + "epoch": 4.703824626865671, + "grad_norm": 0.4090587649795599, + "learning_rate": 5.43030240823512e-06, + "loss": 0.2707, + "step": 20170 + }, + { + "epoch": 4.7049906716417915, + "grad_norm": 0.39695667224760095, + "learning_rate": 5.426931604930375e-06, + "loss": 0.259, + "step": 20175 + }, + { + "epoch": 4.706156716417911, + "grad_norm": 0.415981260709335, + "learning_rate": 5.423573929840277e-06, + "loss": 0.2788, + "step": 20180 + }, + { + "epoch": 4.70732276119403, + "grad_norm": 0.42426830296680235, + "learning_rate": 5.420229384961847e-06, + "loss": 0.2759, + "step": 20185 + }, + { + "epoch": 4.708488805970149, + "grad_norm": 0.41406649717023797, + "learning_rate": 5.416897972284287e-06, + "loss": 0.2718, + "step": 20190 + }, + { + "epoch": 4.7096548507462686, + "grad_norm": 0.38536876981679996, + "learning_rate": 5.413579693788995e-06, + "loss": 0.269, + "step": 20195 + }, + { + "epoch": 4.710820895522388, + "grad_norm": 0.3833330688114252, + "learning_rate": 5.410274551449559e-06, + "loss": 0.2845, + "step": 20200 + }, + { + "epoch": 4.711986940298507, + "grad_norm": 0.3994575259692321, + "learning_rate": 5.406982547231746e-06, + "loss": 0.2707, + "step": 20205 + }, + { + "epoch": 4.713152985074627, + "grad_norm": 0.37610555057233974, + "learning_rate": 5.403703683093517e-06, + "loss": 0.2724, + "step": 20210 + }, + { + "epoch": 4.7143190298507465, + "grad_norm": 0.39062542516181153, + "learning_rate": 5.400437960985017e-06, + "loss": 0.2829, + "step": 20215 + }, + { + "epoch": 4.715485074626866, + "grad_norm": 0.4061890750292189, + "learning_rate": 5.397185382848568e-06, + "loss": 0.2806, + "step": 20220 + }, + { + "epoch": 4.716651119402985, + "grad_norm": 0.4034494158687929, + "learning_rate": 5.393945950618678e-06, + "loss": 0.281, + "step": 20225 + }, + { + "epoch": 4.717817164179104, + "grad_norm": 0.404306940460495, + "learning_rate": 5.39071966622204e-06, + "loss": 0.2787, + "step": 20230 + }, + { + "epoch": 4.7189832089552235, + "grad_norm": 0.3952795988985536, + "learning_rate": 5.387506531577523e-06, + "loss": 0.2818, + "step": 20235 + }, + { + "epoch": 4.720149253731344, + "grad_norm": 0.4097339875261114, + "learning_rate": 5.384306548596178e-06, + "loss": 0.276, + "step": 20240 + }, + { + "epoch": 4.721315298507463, + "grad_norm": 0.4143551337285383, + "learning_rate": 5.3811197191812296e-06, + "loss": 0.2694, + "step": 20245 + }, + { + "epoch": 4.722481343283582, + "grad_norm": 0.3925335566603688, + "learning_rate": 5.377946045228084e-06, + "loss": 0.2679, + "step": 20250 + }, + { + "epoch": 4.723647388059701, + "grad_norm": 0.3979737513923886, + "learning_rate": 5.374785528624317e-06, + "loss": 0.2969, + "step": 20255 + }, + { + "epoch": 4.724813432835821, + "grad_norm": 0.4122181264677655, + "learning_rate": 5.37163817124969e-06, + "loss": 0.273, + "step": 20260 + }, + { + "epoch": 4.72597947761194, + "grad_norm": 0.4062442845332021, + "learning_rate": 5.368503974976122e-06, + "loss": 0.2729, + "step": 20265 + }, + { + "epoch": 4.72714552238806, + "grad_norm": 0.4388084829839234, + "learning_rate": 5.36538294166772e-06, + "loss": 0.284, + "step": 20270 + }, + { + "epoch": 4.728311567164179, + "grad_norm": 0.4110496964471569, + "learning_rate": 5.362275073180749e-06, + "loss": 0.2791, + "step": 20275 + }, + { + "epoch": 4.729477611940299, + "grad_norm": 0.42563170081055096, + "learning_rate": 5.3591803713636545e-06, + "loss": 0.2747, + "step": 20280 + }, + { + "epoch": 4.730643656716418, + "grad_norm": 0.4011040672858432, + "learning_rate": 5.3560988380570405e-06, + "loss": 0.2828, + "step": 20285 + }, + { + "epoch": 4.731809701492537, + "grad_norm": 0.38093855338310534, + "learning_rate": 5.353030475093694e-06, + "loss": 0.277, + "step": 20290 + }, + { + "epoch": 4.732975746268656, + "grad_norm": 0.3856859545089293, + "learning_rate": 5.349975284298552e-06, + "loss": 0.2767, + "step": 20295 + }, + { + "epoch": 4.7341417910447765, + "grad_norm": 0.4069441410447371, + "learning_rate": 5.346933267488726e-06, + "loss": 0.2815, + "step": 20300 + }, + { + "epoch": 4.735307835820896, + "grad_norm": 0.41705152172515086, + "learning_rate": 5.343904426473493e-06, + "loss": 0.2692, + "step": 20305 + }, + { + "epoch": 4.736473880597015, + "grad_norm": 0.40305160376437654, + "learning_rate": 5.340888763054291e-06, + "loss": 0.2832, + "step": 20310 + }, + { + "epoch": 4.737639925373134, + "grad_norm": 0.3821681930196034, + "learning_rate": 5.337886279024722e-06, + "loss": 0.2747, + "step": 20315 + }, + { + "epoch": 4.7388059701492535, + "grad_norm": 0.3773940462874887, + "learning_rate": 5.3348969761705446e-06, + "loss": 0.2735, + "step": 20320 + }, + { + "epoch": 4.739972014925373, + "grad_norm": 0.41765947000111353, + "learning_rate": 5.331920856269686e-06, + "loss": 0.2666, + "step": 20325 + }, + { + "epoch": 4.741138059701493, + "grad_norm": 0.392884893447926, + "learning_rate": 5.328957921092224e-06, + "loss": 0.277, + "step": 20330 + }, + { + "epoch": 4.742304104477612, + "grad_norm": 0.426386450404409, + "learning_rate": 5.326008172400402e-06, + "loss": 0.2887, + "step": 20335 + }, + { + "epoch": 4.7434701492537314, + "grad_norm": 0.4057923794315817, + "learning_rate": 5.323071611948619e-06, + "loss": 0.2664, + "step": 20340 + }, + { + "epoch": 4.744636194029851, + "grad_norm": 0.399774726871975, + "learning_rate": 5.320148241483422e-06, + "loss": 0.2701, + "step": 20345 + }, + { + "epoch": 4.74580223880597, + "grad_norm": 0.4391061103101001, + "learning_rate": 5.317238062743527e-06, + "loss": 0.2925, + "step": 20350 + }, + { + "epoch": 4.746968283582089, + "grad_norm": 0.3843696557078419, + "learning_rate": 5.31434107745979e-06, + "loss": 0.2648, + "step": 20355 + }, + { + "epoch": 4.7481343283582085, + "grad_norm": 0.38943130902787915, + "learning_rate": 5.311457287355232e-06, + "loss": 0.2735, + "step": 20360 + }, + { + "epoch": 4.749300373134329, + "grad_norm": 0.39352769937236864, + "learning_rate": 5.3085866941450185e-06, + "loss": 0.262, + "step": 20365 + }, + { + "epoch": 4.750466417910448, + "grad_norm": 0.3997169457448534, + "learning_rate": 5.3057292995364695e-06, + "loss": 0.2893, + "step": 20370 + }, + { + "epoch": 4.751632462686567, + "grad_norm": 0.4270993888536849, + "learning_rate": 5.302885105229052e-06, + "loss": 0.2727, + "step": 20375 + }, + { + "epoch": 4.752798507462686, + "grad_norm": 0.41379495154201595, + "learning_rate": 5.300054112914385e-06, + "loss": 0.2768, + "step": 20380 + }, + { + "epoch": 4.753964552238806, + "grad_norm": 0.42526662832515993, + "learning_rate": 5.297236324276231e-06, + "loss": 0.2741, + "step": 20385 + }, + { + "epoch": 4.755130597014926, + "grad_norm": 0.4205437580875176, + "learning_rate": 5.294431740990509e-06, + "loss": 0.2599, + "step": 20390 + }, + { + "epoch": 4.756296641791045, + "grad_norm": 0.42970850645324027, + "learning_rate": 5.291640364725272e-06, + "loss": 0.2776, + "step": 20395 + }, + { + "epoch": 4.757462686567164, + "grad_norm": 0.42266879394528883, + "learning_rate": 5.288862197140726e-06, + "loss": 0.2771, + "step": 20400 + }, + { + "epoch": 4.758628731343284, + "grad_norm": 0.4144657388495723, + "learning_rate": 5.286097239889219e-06, + "loss": 0.2891, + "step": 20405 + }, + { + "epoch": 4.759794776119403, + "grad_norm": 0.40661242470869285, + "learning_rate": 5.283345494615238e-06, + "loss": 0.2723, + "step": 20410 + }, + { + "epoch": 4.760960820895522, + "grad_norm": 0.4191269034167923, + "learning_rate": 5.280606962955423e-06, + "loss": 0.2771, + "step": 20415 + }, + { + "epoch": 4.762126865671641, + "grad_norm": 0.4030049130565329, + "learning_rate": 5.277881646538537e-06, + "loss": 0.281, + "step": 20420 + }, + { + "epoch": 4.7632929104477615, + "grad_norm": 0.3973846618368994, + "learning_rate": 5.275169546985502e-06, + "loss": 0.2748, + "step": 20425 + }, + { + "epoch": 4.764458955223881, + "grad_norm": 0.4012487763683455, + "learning_rate": 5.272470665909368e-06, + "loss": 0.2681, + "step": 20430 + }, + { + "epoch": 4.765625, + "grad_norm": 0.3917302257351195, + "learning_rate": 5.269785004915328e-06, + "loss": 0.2738, + "step": 20435 + }, + { + "epoch": 4.766791044776119, + "grad_norm": 0.4015905245652249, + "learning_rate": 5.267112565600707e-06, + "loss": 0.2827, + "step": 20440 + }, + { + "epoch": 4.7679570895522385, + "grad_norm": 0.41158626768188383, + "learning_rate": 5.26445334955497e-06, + "loss": 0.2666, + "step": 20445 + }, + { + "epoch": 4.769123134328359, + "grad_norm": 0.4153865577892089, + "learning_rate": 5.261807358359719e-06, + "loss": 0.2785, + "step": 20450 + }, + { + "epoch": 4.770289179104478, + "grad_norm": 0.4074937342163048, + "learning_rate": 5.259174593588688e-06, + "loss": 0.2771, + "step": 20455 + }, + { + "epoch": 4.771455223880597, + "grad_norm": 0.42260098375275656, + "learning_rate": 5.25655505680774e-06, + "loss": 0.2706, + "step": 20460 + }, + { + "epoch": 4.772621268656716, + "grad_norm": 0.4386691660849028, + "learning_rate": 5.253948749574879e-06, + "loss": 0.2911, + "step": 20465 + }, + { + "epoch": 4.773787313432836, + "grad_norm": 0.389190161514281, + "learning_rate": 5.2513556734402384e-06, + "loss": 0.2659, + "step": 20470 + }, + { + "epoch": 4.774953358208955, + "grad_norm": 0.3967901724050337, + "learning_rate": 5.248775829946076e-06, + "loss": 0.2783, + "step": 20475 + }, + { + "epoch": 4.776119402985074, + "grad_norm": 0.397550656014172, + "learning_rate": 5.2462092206267864e-06, + "loss": 0.2783, + "step": 20480 + }, + { + "epoch": 4.777285447761194, + "grad_norm": 0.405818189936009, + "learning_rate": 5.243655847008888e-06, + "loss": 0.2764, + "step": 20485 + }, + { + "epoch": 4.778451492537314, + "grad_norm": 0.4079539673137217, + "learning_rate": 5.241115710611033e-06, + "loss": 0.2582, + "step": 20490 + }, + { + "epoch": 4.779617537313433, + "grad_norm": 0.41731085220924946, + "learning_rate": 5.2385888129439934e-06, + "loss": 0.2878, + "step": 20495 + }, + { + "epoch": 4.780783582089552, + "grad_norm": 0.41954206535870575, + "learning_rate": 5.236075155510675e-06, + "loss": 0.2878, + "step": 20500 + }, + { + "epoch": 4.781949626865671, + "grad_norm": 0.3800071249584396, + "learning_rate": 5.2335747398061e-06, + "loss": 0.2631, + "step": 20505 + }, + { + "epoch": 4.7831156716417915, + "grad_norm": 0.405722668641218, + "learning_rate": 5.231087567317425e-06, + "loss": 0.2655, + "step": 20510 + }, + { + "epoch": 4.784281716417911, + "grad_norm": 0.4223186540224272, + "learning_rate": 5.228613639523922e-06, + "loss": 0.2822, + "step": 20515 + }, + { + "epoch": 4.78544776119403, + "grad_norm": 0.3747955830740057, + "learning_rate": 5.2261529578969905e-06, + "loss": 0.2602, + "step": 20520 + }, + { + "epoch": 4.786613805970149, + "grad_norm": 0.4166857677830499, + "learning_rate": 5.223705523900145e-06, + "loss": 0.2692, + "step": 20525 + }, + { + "epoch": 4.7877798507462686, + "grad_norm": 0.39583760675436, + "learning_rate": 5.22127133898903e-06, + "loss": 0.2763, + "step": 20530 + }, + { + "epoch": 4.788945895522388, + "grad_norm": 0.38369609698701834, + "learning_rate": 5.2188504046114005e-06, + "loss": 0.2838, + "step": 20535 + }, + { + "epoch": 4.790111940298507, + "grad_norm": 0.41067503935590915, + "learning_rate": 5.216442722207141e-06, + "loss": 0.2719, + "step": 20540 + }, + { + "epoch": 4.791277985074627, + "grad_norm": 0.4185264107269963, + "learning_rate": 5.214048293208246e-06, + "loss": 0.2776, + "step": 20545 + }, + { + "epoch": 4.7924440298507465, + "grad_norm": 0.40455556927216235, + "learning_rate": 5.211667119038829e-06, + "loss": 0.2892, + "step": 20550 + }, + { + "epoch": 4.793610074626866, + "grad_norm": 0.38896140722711875, + "learning_rate": 5.209299201115125e-06, + "loss": 0.2715, + "step": 20555 + }, + { + "epoch": 4.794776119402985, + "grad_norm": 0.39473402626185594, + "learning_rate": 5.206944540845476e-06, + "loss": 0.2725, + "step": 20560 + }, + { + "epoch": 4.795942164179104, + "grad_norm": 0.4181201762247158, + "learning_rate": 5.204603139630345e-06, + "loss": 0.2802, + "step": 20565 + }, + { + "epoch": 4.7971082089552235, + "grad_norm": 0.40088658611859335, + "learning_rate": 5.202274998862312e-06, + "loss": 0.2852, + "step": 20570 + }, + { + "epoch": 4.798274253731344, + "grad_norm": 0.42271892088067387, + "learning_rate": 5.199960119926059e-06, + "loss": 0.2796, + "step": 20575 + }, + { + "epoch": 4.799440298507463, + "grad_norm": 0.41903168028385773, + "learning_rate": 5.197658504198392e-06, + "loss": 0.2808, + "step": 20580 + }, + { + "epoch": 4.800606343283582, + "grad_norm": 0.38513278786793087, + "learning_rate": 5.1953701530482215e-06, + "loss": 0.2755, + "step": 20585 + }, + { + "epoch": 4.801772388059701, + "grad_norm": 0.4143591996979099, + "learning_rate": 5.1930950678365715e-06, + "loss": 0.2923, + "step": 20590 + }, + { + "epoch": 4.802938432835821, + "grad_norm": 0.4141765979120398, + "learning_rate": 5.190833249916577e-06, + "loss": 0.2837, + "step": 20595 + }, + { + "epoch": 4.80410447761194, + "grad_norm": 0.4111899582436221, + "learning_rate": 5.188584700633478e-06, + "loss": 0.2632, + "step": 20600 + }, + { + "epoch": 4.80527052238806, + "grad_norm": 0.40661213239556154, + "learning_rate": 5.186349421324627e-06, + "loss": 0.273, + "step": 20605 + }, + { + "epoch": 4.806436567164179, + "grad_norm": 0.4241697822125163, + "learning_rate": 5.184127413319482e-06, + "loss": 0.2669, + "step": 20610 + }, + { + "epoch": 4.807602611940299, + "grad_norm": 0.3873464057053273, + "learning_rate": 5.181918677939608e-06, + "loss": 0.2617, + "step": 20615 + }, + { + "epoch": 4.808768656716418, + "grad_norm": 0.39057400514464946, + "learning_rate": 5.179723216498677e-06, + "loss": 0.2688, + "step": 20620 + }, + { + "epoch": 4.809934701492537, + "grad_norm": 0.4436534986256238, + "learning_rate": 5.177541030302462e-06, + "loss": 0.2889, + "step": 20625 + }, + { + "epoch": 4.811100746268656, + "grad_norm": 0.39557667111546235, + "learning_rate": 5.17537212064885e-06, + "loss": 0.2797, + "step": 20630 + }, + { + "epoch": 4.8122667910447765, + "grad_norm": 0.4025533900646421, + "learning_rate": 5.173216488827822e-06, + "loss": 0.2665, + "step": 20635 + }, + { + "epoch": 4.813432835820896, + "grad_norm": 0.4099993726819529, + "learning_rate": 5.171074136121461e-06, + "loss": 0.27, + "step": 20640 + }, + { + "epoch": 4.814598880597015, + "grad_norm": 0.4017661316263101, + "learning_rate": 5.168945063803962e-06, + "loss": 0.2768, + "step": 20645 + }, + { + "epoch": 4.815764925373134, + "grad_norm": 0.44565005631447957, + "learning_rate": 5.166829273141612e-06, + "loss": 0.284, + "step": 20650 + }, + { + "epoch": 4.8169309701492535, + "grad_norm": 0.37438785755230786, + "learning_rate": 5.164726765392805e-06, + "loss": 0.265, + "step": 20655 + }, + { + "epoch": 4.818097014925373, + "grad_norm": 0.4614441214998527, + "learning_rate": 5.162637541808031e-06, + "loss": 0.2807, + "step": 20660 + }, + { + "epoch": 4.819263059701493, + "grad_norm": 0.4175894918737853, + "learning_rate": 5.16056160362988e-06, + "loss": 0.2826, + "step": 20665 + }, + { + "epoch": 4.820429104477612, + "grad_norm": 0.4080039712506842, + "learning_rate": 5.158498952093038e-06, + "loss": 0.2728, + "step": 20670 + }, + { + "epoch": 4.8215951492537314, + "grad_norm": 0.40019707337954497, + "learning_rate": 5.156449588424295e-06, + "loss": 0.28, + "step": 20675 + }, + { + "epoch": 4.822761194029851, + "grad_norm": 0.3877080707890139, + "learning_rate": 5.154413513842533e-06, + "loss": 0.258, + "step": 20680 + }, + { + "epoch": 4.82392723880597, + "grad_norm": 0.4002480845719403, + "learning_rate": 5.152390729558727e-06, + "loss": 0.2751, + "step": 20685 + }, + { + "epoch": 4.825093283582089, + "grad_norm": 0.40572437026470765, + "learning_rate": 5.1503812367759575e-06, + "loss": 0.2758, + "step": 20690 + }, + { + "epoch": 4.8262593283582085, + "grad_norm": 0.4166201389331063, + "learning_rate": 5.148385036689391e-06, + "loss": 0.2856, + "step": 20695 + }, + { + "epoch": 4.827425373134329, + "grad_norm": 0.42408475063675277, + "learning_rate": 5.146402130486288e-06, + "loss": 0.2779, + "step": 20700 + }, + { + "epoch": 4.828591417910448, + "grad_norm": 0.40101898017000503, + "learning_rate": 5.144432519346011e-06, + "loss": 0.2742, + "step": 20705 + }, + { + "epoch": 4.829757462686567, + "grad_norm": 0.3928200240336515, + "learning_rate": 5.142476204440002e-06, + "loss": 0.2715, + "step": 20710 + }, + { + "epoch": 4.830923507462686, + "grad_norm": 0.4181371835163029, + "learning_rate": 5.140533186931809e-06, + "loss": 0.2856, + "step": 20715 + }, + { + "epoch": 4.832089552238806, + "grad_norm": 0.4010293431365239, + "learning_rate": 5.138603467977062e-06, + "loss": 0.2756, + "step": 20720 + }, + { + "epoch": 4.833255597014926, + "grad_norm": 0.411341035638822, + "learning_rate": 5.136687048723483e-06, + "loss": 0.2804, + "step": 20725 + }, + { + "epoch": 4.834421641791045, + "grad_norm": 0.4148270501291078, + "learning_rate": 5.134783930310883e-06, + "loss": 0.2925, + "step": 20730 + }, + { + "epoch": 4.835587686567164, + "grad_norm": 0.4198749943133584, + "learning_rate": 5.132894113871167e-06, + "loss": 0.2797, + "step": 20735 + }, + { + "epoch": 4.836753731343284, + "grad_norm": 0.38647493430190527, + "learning_rate": 5.131017600528324e-06, + "loss": 0.2659, + "step": 20740 + }, + { + "epoch": 4.837919776119403, + "grad_norm": 0.4087886980424793, + "learning_rate": 5.129154391398433e-06, + "loss": 0.2764, + "step": 20745 + }, + { + "epoch": 4.839085820895522, + "grad_norm": 0.42352821566942966, + "learning_rate": 5.127304487589658e-06, + "loss": 0.2836, + "step": 20750 + }, + { + "epoch": 4.840251865671641, + "grad_norm": 0.4064733010483587, + "learning_rate": 5.12546789020225e-06, + "loss": 0.2763, + "step": 20755 + }, + { + "epoch": 4.8414179104477615, + "grad_norm": 0.4057512468643699, + "learning_rate": 5.123644600328549e-06, + "loss": 0.2799, + "step": 20760 + }, + { + "epoch": 4.842583955223881, + "grad_norm": 0.41425201830861147, + "learning_rate": 5.121834619052979e-06, + "loss": 0.2764, + "step": 20765 + }, + { + "epoch": 4.84375, + "grad_norm": 0.40693348888896497, + "learning_rate": 5.120037947452043e-06, + "loss": 0.2793, + "step": 20770 + }, + { + "epoch": 4.844916044776119, + "grad_norm": 0.37555379991166826, + "learning_rate": 5.118254586594335e-06, + "loss": 0.2669, + "step": 20775 + }, + { + "epoch": 4.8460820895522385, + "grad_norm": 0.3770990206275446, + "learning_rate": 5.116484537540532e-06, + "loss": 0.2713, + "step": 20780 + }, + { + "epoch": 4.847248134328359, + "grad_norm": 0.40450907830436417, + "learning_rate": 5.114727801343385e-06, + "loss": 0.2729, + "step": 20785 + }, + { + "epoch": 4.848414179104478, + "grad_norm": 0.4169318326002382, + "learning_rate": 5.11298437904774e-06, + "loss": 0.2838, + "step": 20790 + }, + { + "epoch": 4.849580223880597, + "grad_norm": 0.42606791499071295, + "learning_rate": 5.111254271690516e-06, + "loss": 0.2816, + "step": 20795 + }, + { + "epoch": 4.850746268656716, + "grad_norm": 0.40524229727924804, + "learning_rate": 5.1095374803007115e-06, + "loss": 0.2656, + "step": 20800 + }, + { + "epoch": 4.851912313432836, + "grad_norm": 0.4204196037155859, + "learning_rate": 5.107834005899409e-06, + "loss": 0.284, + "step": 20805 + }, + { + "epoch": 4.853078358208955, + "grad_norm": 0.4074731828219992, + "learning_rate": 5.1061438494997726e-06, + "loss": 0.2719, + "step": 20810 + }, + { + "epoch": 4.854244402985074, + "grad_norm": 0.41820534589092, + "learning_rate": 5.104467012107041e-06, + "loss": 0.2826, + "step": 20815 + }, + { + "epoch": 4.855410447761194, + "grad_norm": 0.4385629601178043, + "learning_rate": 5.102803494718532e-06, + "loss": 0.2795, + "step": 20820 + }, + { + "epoch": 4.856576492537314, + "grad_norm": 0.4117020983487212, + "learning_rate": 5.101153298323643e-06, + "loss": 0.2795, + "step": 20825 + }, + { + "epoch": 4.857742537313433, + "grad_norm": 0.40007321945644764, + "learning_rate": 5.099516423903844e-06, + "loss": 0.2743, + "step": 20830 + }, + { + "epoch": 4.858908582089552, + "grad_norm": 0.37531894798379556, + "learning_rate": 5.097892872432691e-06, + "loss": 0.2582, + "step": 20835 + }, + { + "epoch": 4.860074626865671, + "grad_norm": 0.3967798096406278, + "learning_rate": 5.096282644875807e-06, + "loss": 0.2684, + "step": 20840 + }, + { + "epoch": 4.8612406716417915, + "grad_norm": 0.4359728323670395, + "learning_rate": 5.094685742190896e-06, + "loss": 0.271, + "step": 20845 + }, + { + "epoch": 4.862406716417911, + "grad_norm": 0.40289826215219937, + "learning_rate": 5.093102165327729e-06, + "loss": 0.2769, + "step": 20850 + }, + { + "epoch": 4.86357276119403, + "grad_norm": 0.39137328077031774, + "learning_rate": 5.09153191522816e-06, + "loss": 0.27, + "step": 20855 + }, + { + "epoch": 4.864738805970149, + "grad_norm": 0.40514599405796825, + "learning_rate": 5.089974992826117e-06, + "loss": 0.2681, + "step": 20860 + }, + { + "epoch": 4.8659048507462686, + "grad_norm": 0.4011535322183252, + "learning_rate": 5.08843139904759e-06, + "loss": 0.2762, + "step": 20865 + }, + { + "epoch": 4.867070895522388, + "grad_norm": 0.3857881219293404, + "learning_rate": 5.086901134810658e-06, + "loss": 0.2683, + "step": 20870 + }, + { + "epoch": 4.868236940298507, + "grad_norm": 0.39294086703585596, + "learning_rate": 5.085384201025457e-06, + "loss": 0.275, + "step": 20875 + }, + { + "epoch": 4.869402985074627, + "grad_norm": 0.4397434877985555, + "learning_rate": 5.083880598594204e-06, + "loss": 0.2882, + "step": 20880 + }, + { + "epoch": 4.8705690298507465, + "grad_norm": 0.39389140326985644, + "learning_rate": 5.082390328411184e-06, + "loss": 0.2745, + "step": 20885 + }, + { + "epoch": 4.871735074626866, + "grad_norm": 0.39485809597519445, + "learning_rate": 5.080913391362749e-06, + "loss": 0.2759, + "step": 20890 + }, + { + "epoch": 4.872901119402985, + "grad_norm": 0.40063280239618126, + "learning_rate": 5.079449788327332e-06, + "loss": 0.2851, + "step": 20895 + }, + { + "epoch": 4.874067164179104, + "grad_norm": 0.3827772878754662, + "learning_rate": 5.0779995201754225e-06, + "loss": 0.2723, + "step": 20900 + }, + { + "epoch": 4.8752332089552235, + "grad_norm": 0.43225014747643237, + "learning_rate": 5.076562587769584e-06, + "loss": 0.2862, + "step": 20905 + }, + { + "epoch": 4.876399253731344, + "grad_norm": 0.41449429723546716, + "learning_rate": 5.07513899196445e-06, + "loss": 0.2693, + "step": 20910 + }, + { + "epoch": 4.877565298507463, + "grad_norm": 0.39608998902119325, + "learning_rate": 5.073728733606722e-06, + "loss": 0.2722, + "step": 20915 + }, + { + "epoch": 4.878731343283582, + "grad_norm": 0.4016620610164004, + "learning_rate": 5.072331813535166e-06, + "loss": 0.266, + "step": 20920 + }, + { + "epoch": 4.879897388059701, + "grad_norm": 0.4151739389003867, + "learning_rate": 5.070948232580618e-06, + "loss": 0.2728, + "step": 20925 + }, + { + "epoch": 4.881063432835821, + "grad_norm": 0.39938321055353626, + "learning_rate": 5.069577991565977e-06, + "loss": 0.2626, + "step": 20930 + }, + { + "epoch": 4.88222947761194, + "grad_norm": 0.4259708662374472, + "learning_rate": 5.06822109130621e-06, + "loss": 0.2882, + "step": 20935 + }, + { + "epoch": 4.88339552238806, + "grad_norm": 0.3950929612813323, + "learning_rate": 5.066877532608349e-06, + "loss": 0.2745, + "step": 20940 + }, + { + "epoch": 4.884561567164179, + "grad_norm": 0.44903327030586204, + "learning_rate": 5.065547316271494e-06, + "loss": 0.2878, + "step": 20945 + }, + { + "epoch": 4.885727611940299, + "grad_norm": 0.4123597349979514, + "learning_rate": 5.064230443086805e-06, + "loss": 0.2657, + "step": 20950 + }, + { + "epoch": 4.886893656716418, + "grad_norm": 0.39160944428542344, + "learning_rate": 5.062926913837507e-06, + "loss": 0.2764, + "step": 20955 + }, + { + "epoch": 4.888059701492537, + "grad_norm": 0.44649339030687424, + "learning_rate": 5.06163672929889e-06, + "loss": 0.27, + "step": 20960 + }, + { + "epoch": 4.889225746268656, + "grad_norm": 0.40361681778558994, + "learning_rate": 5.060359890238305e-06, + "loss": 0.2793, + "step": 20965 + }, + { + "epoch": 4.8903917910447765, + "grad_norm": 0.4199921652860125, + "learning_rate": 5.059096397415167e-06, + "loss": 0.281, + "step": 20970 + }, + { + "epoch": 4.891557835820896, + "grad_norm": 0.3907780228757323, + "learning_rate": 5.057846251580957e-06, + "loss": 0.2673, + "step": 20975 + }, + { + "epoch": 4.892723880597015, + "grad_norm": 0.4156772282704941, + "learning_rate": 5.056609453479208e-06, + "loss": 0.2788, + "step": 20980 + }, + { + "epoch": 4.893889925373134, + "grad_norm": 0.39500932422262897, + "learning_rate": 5.055386003845524e-06, + "loss": 0.2743, + "step": 20985 + }, + { + "epoch": 4.8950559701492535, + "grad_norm": 0.40733193078774504, + "learning_rate": 5.0541759034075645e-06, + "loss": 0.2771, + "step": 20990 + }, + { + "epoch": 4.896222014925373, + "grad_norm": 0.39320460492035453, + "learning_rate": 5.0529791528850515e-06, + "loss": 0.2816, + "step": 20995 + }, + { + "epoch": 4.897388059701493, + "grad_norm": 0.3834414057678221, + "learning_rate": 5.051795752989764e-06, + "loss": 0.2659, + "step": 21000 + }, + { + "epoch": 4.898554104477612, + "grad_norm": 0.40682123041297497, + "learning_rate": 5.050625704425547e-06, + "loss": 0.2771, + "step": 21005 + }, + { + "epoch": 4.8997201492537314, + "grad_norm": 0.3873899334787755, + "learning_rate": 5.049469007888298e-06, + "loss": 0.2604, + "step": 21010 + }, + { + "epoch": 4.900886194029851, + "grad_norm": 0.4091484298868379, + "learning_rate": 5.048325664065975e-06, + "loss": 0.2844, + "step": 21015 + }, + { + "epoch": 4.90205223880597, + "grad_norm": 0.42939427693103144, + "learning_rate": 5.047195673638596e-06, + "loss": 0.2894, + "step": 21020 + }, + { + "epoch": 4.903218283582089, + "grad_norm": 0.41019724653897144, + "learning_rate": 5.046079037278237e-06, + "loss": 0.2652, + "step": 21025 + }, + { + "epoch": 4.9043843283582085, + "grad_norm": 0.3975076882210794, + "learning_rate": 5.044975755649028e-06, + "loss": 0.2738, + "step": 21030 + }, + { + "epoch": 4.905550373134329, + "grad_norm": 0.40728381591915686, + "learning_rate": 5.043885829407164e-06, + "loss": 0.2841, + "step": 21035 + }, + { + "epoch": 4.906716417910448, + "grad_norm": 0.41166680066812117, + "learning_rate": 5.042809259200885e-06, + "loss": 0.272, + "step": 21040 + }, + { + "epoch": 4.907882462686567, + "grad_norm": 0.42353064926068584, + "learning_rate": 5.041746045670495e-06, + "loss": 0.2811, + "step": 21045 + }, + { + "epoch": 4.909048507462686, + "grad_norm": 0.4080630065092933, + "learning_rate": 5.040696189448356e-06, + "loss": 0.2732, + "step": 21050 + }, + { + "epoch": 4.910214552238806, + "grad_norm": 0.40443150645082443, + "learning_rate": 5.039659691158878e-06, + "loss": 0.2628, + "step": 21055 + }, + { + "epoch": 4.911380597014926, + "grad_norm": 0.3962804134326119, + "learning_rate": 5.038636551418533e-06, + "loss": 0.271, + "step": 21060 + }, + { + "epoch": 4.912546641791045, + "grad_norm": 0.4361127930648337, + "learning_rate": 5.0376267708358455e-06, + "loss": 0.2826, + "step": 21065 + }, + { + "epoch": 4.913712686567164, + "grad_norm": 0.4042774117247444, + "learning_rate": 5.036630350011395e-06, + "loss": 0.2827, + "step": 21070 + }, + { + "epoch": 4.914878731343284, + "grad_norm": 0.3954326308091401, + "learning_rate": 5.03564728953781e-06, + "loss": 0.282, + "step": 21075 + }, + { + "epoch": 4.916044776119403, + "grad_norm": 0.41373453186212483, + "learning_rate": 5.034677589999783e-06, + "loss": 0.2768, + "step": 21080 + }, + { + "epoch": 4.917210820895522, + "grad_norm": 0.4055191287627898, + "learning_rate": 5.033721251974047e-06, + "loss": 0.266, + "step": 21085 + }, + { + "epoch": 4.918376865671641, + "grad_norm": 0.4484508716939671, + "learning_rate": 5.032778276029403e-06, + "loss": 0.272, + "step": 21090 + }, + { + "epoch": 4.9195429104477615, + "grad_norm": 0.4157892756366168, + "learning_rate": 5.031848662726692e-06, + "loss": 0.2593, + "step": 21095 + }, + { + "epoch": 4.920708955223881, + "grad_norm": 0.4107094443613056, + "learning_rate": 5.030932412618815e-06, + "loss": 0.2769, + "step": 21100 + }, + { + "epoch": 4.921875, + "grad_norm": 0.42663644972197884, + "learning_rate": 5.030029526250719e-06, + "loss": 0.2715, + "step": 21105 + }, + { + "epoch": 4.923041044776119, + "grad_norm": 0.4256142655857328, + "learning_rate": 5.029140004159409e-06, + "loss": 0.2791, + "step": 21110 + }, + { + "epoch": 4.9242070895522385, + "grad_norm": 0.4328807366078749, + "learning_rate": 5.028263846873938e-06, + "loss": 0.2797, + "step": 21115 + }, + { + "epoch": 4.925373134328359, + "grad_norm": 0.42725135225287103, + "learning_rate": 5.02740105491541e-06, + "loss": 0.2702, + "step": 21120 + }, + { + "epoch": 4.926539179104478, + "grad_norm": 0.40976720684938533, + "learning_rate": 5.026551628796982e-06, + "loss": 0.2933, + "step": 21125 + }, + { + "epoch": 4.927705223880597, + "grad_norm": 0.3838294277414686, + "learning_rate": 5.025715569023859e-06, + "loss": 0.2745, + "step": 21130 + }, + { + "epoch": 4.928871268656716, + "grad_norm": 0.40850157188644237, + "learning_rate": 5.024892876093299e-06, + "loss": 0.2697, + "step": 21135 + }, + { + "epoch": 4.930037313432836, + "grad_norm": 0.41855334152521473, + "learning_rate": 5.024083550494606e-06, + "loss": 0.2777, + "step": 21140 + }, + { + "epoch": 4.931203358208955, + "grad_norm": 0.40228325958343863, + "learning_rate": 5.023287592709136e-06, + "loss": 0.2701, + "step": 21145 + }, + { + "epoch": 4.932369402985074, + "grad_norm": 0.38806415061591565, + "learning_rate": 5.0225050032102965e-06, + "loss": 0.2734, + "step": 21150 + }, + { + "epoch": 4.933535447761194, + "grad_norm": 0.4301196796501381, + "learning_rate": 5.021735782463537e-06, + "loss": 0.2765, + "step": 21155 + }, + { + "epoch": 4.934701492537314, + "grad_norm": 0.4125398239917131, + "learning_rate": 5.020979930926365e-06, + "loss": 0.2778, + "step": 21160 + }, + { + "epoch": 4.935867537313433, + "grad_norm": 0.3839116135692184, + "learning_rate": 5.020237449048333e-06, + "loss": 0.2711, + "step": 21165 + }, + { + "epoch": 4.937033582089552, + "grad_norm": 0.4031583385870646, + "learning_rate": 5.0195083372710345e-06, + "loss": 0.2804, + "step": 21170 + }, + { + "epoch": 4.938199626865671, + "grad_norm": 0.3808448155322725, + "learning_rate": 5.018792596028123e-06, + "loss": 0.2688, + "step": 21175 + }, + { + "epoch": 4.9393656716417915, + "grad_norm": 0.42703062283967935, + "learning_rate": 5.018090225745291e-06, + "loss": 0.2714, + "step": 21180 + }, + { + "epoch": 4.940531716417911, + "grad_norm": 0.41642469146543964, + "learning_rate": 5.017401226840284e-06, + "loss": 0.2886, + "step": 21185 + }, + { + "epoch": 4.94169776119403, + "grad_norm": 0.40773956431401676, + "learning_rate": 5.016725599722889e-06, + "loss": 0.2887, + "step": 21190 + }, + { + "epoch": 4.942863805970149, + "grad_norm": 0.3887940281318315, + "learning_rate": 5.016063344794947e-06, + "loss": 0.2744, + "step": 21195 + }, + { + "epoch": 4.9440298507462686, + "grad_norm": 0.4157645935918462, + "learning_rate": 5.0154144624503365e-06, + "loss": 0.2725, + "step": 21200 + }, + { + "epoch": 4.945195895522388, + "grad_norm": 0.4210318173390705, + "learning_rate": 5.014778953074992e-06, + "loss": 0.2859, + "step": 21205 + }, + { + "epoch": 4.946361940298507, + "grad_norm": 0.3981207785886553, + "learning_rate": 5.014156817046891e-06, + "loss": 0.2792, + "step": 21210 + }, + { + "epoch": 4.947527985074627, + "grad_norm": 0.41941067877749943, + "learning_rate": 5.013548054736049e-06, + "loss": 0.2833, + "step": 21215 + }, + { + "epoch": 4.9486940298507465, + "grad_norm": 0.39804302722844526, + "learning_rate": 5.012952666504542e-06, + "loss": 0.2754, + "step": 21220 + }, + { + "epoch": 4.949860074626866, + "grad_norm": 0.39807259496106023, + "learning_rate": 5.012370652706484e-06, + "loss": 0.28, + "step": 21225 + }, + { + "epoch": 4.951026119402985, + "grad_norm": 0.41755613450166845, + "learning_rate": 5.011802013688029e-06, + "loss": 0.2866, + "step": 21230 + }, + { + "epoch": 4.952192164179104, + "grad_norm": 0.42508109516027964, + "learning_rate": 5.011246749787385e-06, + "loss": 0.2857, + "step": 21235 + }, + { + "epoch": 4.9533582089552235, + "grad_norm": 0.4049200674752226, + "learning_rate": 5.010704861334803e-06, + "loss": 0.2645, + "step": 21240 + }, + { + "epoch": 4.954524253731344, + "grad_norm": 0.40554740307454185, + "learning_rate": 5.010176348652576e-06, + "loss": 0.2879, + "step": 21245 + }, + { + "epoch": 4.955690298507463, + "grad_norm": 0.4171370392183362, + "learning_rate": 5.0096612120550436e-06, + "loss": 0.2748, + "step": 21250 + }, + { + "epoch": 4.956856343283582, + "grad_norm": 0.4135318586798704, + "learning_rate": 5.009159451848587e-06, + "loss": 0.276, + "step": 21255 + }, + { + "epoch": 4.958022388059701, + "grad_norm": 0.41017795039602367, + "learning_rate": 5.008671068331634e-06, + "loss": 0.2835, + "step": 21260 + }, + { + "epoch": 4.959188432835821, + "grad_norm": 0.38694873358561843, + "learning_rate": 5.00819606179466e-06, + "loss": 0.2702, + "step": 21265 + }, + { + "epoch": 4.96035447761194, + "grad_norm": 0.41101565067261825, + "learning_rate": 5.007734432520179e-06, + "loss": 0.2918, + "step": 21270 + }, + { + "epoch": 4.96152052238806, + "grad_norm": 0.40074163029173104, + "learning_rate": 5.0072861807827505e-06, + "loss": 0.2789, + "step": 21275 + }, + { + "epoch": 4.962686567164179, + "grad_norm": 0.4234714500967549, + "learning_rate": 5.0068513068489765e-06, + "loss": 0.2775, + "step": 21280 + }, + { + "epoch": 4.963852611940299, + "grad_norm": 0.39278240101882295, + "learning_rate": 5.0064298109775035e-06, + "loss": 0.2665, + "step": 21285 + }, + { + "epoch": 4.965018656716418, + "grad_norm": 0.40837934561077827, + "learning_rate": 5.006021693419021e-06, + "loss": 0.2788, + "step": 21290 + }, + { + "epoch": 4.966184701492537, + "grad_norm": 0.4142150863607586, + "learning_rate": 5.0056269544162635e-06, + "loss": 0.2732, + "step": 21295 + }, + { + "epoch": 4.967350746268656, + "grad_norm": 0.40573497340467674, + "learning_rate": 5.0052455942040045e-06, + "loss": 0.2821, + "step": 21300 + }, + { + "epoch": 4.9685167910447765, + "grad_norm": 0.39800806054666515, + "learning_rate": 5.004877613009064e-06, + "loss": 0.2818, + "step": 21305 + }, + { + "epoch": 4.969682835820896, + "grad_norm": 0.38979452945694254, + "learning_rate": 5.0045230110503e-06, + "loss": 0.267, + "step": 21310 + }, + { + "epoch": 4.970848880597015, + "grad_norm": 0.42055164882379603, + "learning_rate": 5.00418178853862e-06, + "loss": 0.2715, + "step": 21315 + }, + { + "epoch": 4.972014925373134, + "grad_norm": 0.42732687241285094, + "learning_rate": 5.003853945676969e-06, + "loss": 0.2833, + "step": 21320 + }, + { + "epoch": 4.9731809701492535, + "grad_norm": 0.38840733087442564, + "learning_rate": 5.0035394826603345e-06, + "loss": 0.2809, + "step": 21325 + }, + { + "epoch": 4.974347014925373, + "grad_norm": 0.41284386720082655, + "learning_rate": 5.003238399675746e-06, + "loss": 0.2728, + "step": 21330 + }, + { + "epoch": 4.975513059701493, + "grad_norm": 0.413687332011822, + "learning_rate": 5.002950696902278e-06, + "loss": 0.2717, + "step": 21335 + }, + { + "epoch": 4.976679104477612, + "grad_norm": 0.39369282860208743, + "learning_rate": 5.002676374511046e-06, + "loss": 0.2678, + "step": 21340 + }, + { + "epoch": 4.9778451492537314, + "grad_norm": 0.410582306177516, + "learning_rate": 5.0024154326652044e-06, + "loss": 0.2773, + "step": 21345 + }, + { + "epoch": 4.979011194029851, + "grad_norm": 0.40082441046553735, + "learning_rate": 5.002167871519951e-06, + "loss": 0.2735, + "step": 21350 + }, + { + "epoch": 4.98017723880597, + "grad_norm": 0.40048548194936884, + "learning_rate": 5.001933691222527e-06, + "loss": 0.285, + "step": 21355 + }, + { + "epoch": 4.981343283582089, + "grad_norm": 0.40933313203727506, + "learning_rate": 5.001712891912217e-06, + "loss": 0.2828, + "step": 21360 + }, + { + "epoch": 4.9825093283582085, + "grad_norm": 0.37561592164505403, + "learning_rate": 5.001505473720337e-06, + "loss": 0.2697, + "step": 21365 + }, + { + "epoch": 4.983675373134329, + "grad_norm": 0.3940218577106496, + "learning_rate": 5.001311436770255e-06, + "loss": 0.2884, + "step": 21370 + }, + { + "epoch": 4.984841417910448, + "grad_norm": 0.40789697316039814, + "learning_rate": 5.001130781177377e-06, + "loss": 0.2771, + "step": 21375 + }, + { + "epoch": 4.986007462686567, + "grad_norm": 0.38966392808622125, + "learning_rate": 5.000963507049151e-06, + "loss": 0.2777, + "step": 21380 + }, + { + "epoch": 4.987173507462686, + "grad_norm": 0.42082273116940005, + "learning_rate": 5.000809614485062e-06, + "loss": 0.2759, + "step": 21385 + }, + { + "epoch": 4.988339552238806, + "grad_norm": 0.39524031881150823, + "learning_rate": 5.000669103576643e-06, + "loss": 0.2711, + "step": 21390 + }, + { + "epoch": 4.989505597014926, + "grad_norm": 0.4243407252055143, + "learning_rate": 5.000541974407462e-06, + "loss": 0.2781, + "step": 21395 + }, + { + "epoch": 4.990671641791045, + "grad_norm": 0.39442825126110137, + "learning_rate": 5.000428227053131e-06, + "loss": 0.2769, + "step": 21400 + }, + { + "epoch": 4.991837686567164, + "grad_norm": 0.39636388485481344, + "learning_rate": 5.000327861581302e-06, + "loss": 0.2745, + "step": 21405 + }, + { + "epoch": 4.993003731343284, + "grad_norm": 0.4109817745413501, + "learning_rate": 5.000240878051671e-06, + "loss": 0.2763, + "step": 21410 + }, + { + "epoch": 4.994169776119403, + "grad_norm": 0.419159995584321, + "learning_rate": 5.0001672765159696e-06, + "loss": 0.267, + "step": 21415 + }, + { + "epoch": 4.995335820895522, + "grad_norm": 0.40579325116029774, + "learning_rate": 5.000107057017976e-06, + "loss": 0.2907, + "step": 21420 + }, + { + "epoch": 4.996501865671641, + "grad_norm": 0.4013985907244209, + "learning_rate": 5.0000602195935046e-06, + "loss": 0.2784, + "step": 21425 + }, + { + "epoch": 4.9976679104477615, + "grad_norm": 0.40870836269752825, + "learning_rate": 5.000026764270413e-06, + "loss": 0.2677, + "step": 21430 + }, + { + "epoch": 4.998833955223881, + "grad_norm": 0.40234064324932556, + "learning_rate": 5.0000066910686e-06, + "loss": 0.2887, + "step": 21435 + }, + { + "epoch": 5.0, + "grad_norm": 0.4134369961168365, + "learning_rate": 5e-06, + "loss": 0.2589, + "step": 21440 + }, + { + "epoch": 5.0, + "step": 21440, + "total_flos": 2439673306546176.0, + "train_loss": 0.4061120442879289, + "train_runtime": 35572.055, + "train_samples_per_second": 4.821, + "train_steps_per_second": 0.603 + } + ], + "logging_steps": 5, + "max_steps": 21440, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2439673306546176.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}