{ "best_metric": 9.066787356459827e-07, "best_model_checkpoint": "speechVSnoise/checkpoint-9150", "epoch": 10.0, "eval_steps": 500, "global_step": 9150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00546448087431694, "grad_norm": 25.785985946655273, "learning_rate": 2.73224043715847e-07, "loss": 0.6864, "step": 5 }, { "epoch": 0.01092896174863388, "grad_norm": 57.12474822998047, "learning_rate": 5.46448087431694e-07, "loss": 0.6771, "step": 10 }, { "epoch": 0.01639344262295082, "grad_norm": 26.012365341186523, "learning_rate": 8.19672131147541e-07, "loss": 0.6308, "step": 15 }, { "epoch": 0.02185792349726776, "grad_norm": 19.3648624420166, "learning_rate": 1.092896174863388e-06, "loss": 0.4441, "step": 20 }, { "epoch": 0.0273224043715847, "grad_norm": 16.610984802246094, "learning_rate": 1.366120218579235e-06, "loss": 0.4245, "step": 25 }, { "epoch": 0.03278688524590164, "grad_norm": 16.175987243652344, "learning_rate": 1.639344262295082e-06, "loss": 0.3525, "step": 30 }, { "epoch": 0.03825136612021858, "grad_norm": 11.281105041503906, "learning_rate": 1.912568306010929e-06, "loss": 0.2203, "step": 35 }, { "epoch": 0.04371584699453552, "grad_norm": 8.959579467773438, "learning_rate": 2.185792349726776e-06, "loss": 0.1137, "step": 40 }, { "epoch": 0.04918032786885246, "grad_norm": 3.032709836959839, "learning_rate": 2.459016393442623e-06, "loss": 0.0931, "step": 45 }, { "epoch": 0.0546448087431694, "grad_norm": 5.218940258026123, "learning_rate": 2.73224043715847e-06, "loss": 0.0874, "step": 50 }, { "epoch": 0.060109289617486336, "grad_norm": 4.759418487548828, "learning_rate": 3.005464480874317e-06, "loss": 0.0685, "step": 55 }, { "epoch": 0.06557377049180328, "grad_norm": 0.7248989939689636, "learning_rate": 3.278688524590164e-06, "loss": 0.0543, "step": 60 }, { "epoch": 0.07103825136612021, "grad_norm": 20.059240341186523, "learning_rate": 3.551912568306011e-06, "loss": 0.0554, "step": 65 }, { "epoch": 0.07650273224043716, "grad_norm": 1.0316702127456665, "learning_rate": 3.825136612021858e-06, "loss": 0.0126, "step": 70 }, { "epoch": 0.08196721311475409, "grad_norm": 0.11562328785657883, "learning_rate": 4.098360655737704e-06, "loss": 0.0068, "step": 75 }, { "epoch": 0.08743169398907104, "grad_norm": 0.28197842836380005, "learning_rate": 4.371584699453552e-06, "loss": 0.006, "step": 80 }, { "epoch": 0.09289617486338798, "grad_norm": 0.1347256749868393, "learning_rate": 4.6448087431694e-06, "loss": 0.0069, "step": 85 }, { "epoch": 0.09836065573770492, "grad_norm": 0.14458715915679932, "learning_rate": 4.918032786885246e-06, "loss": 0.0026, "step": 90 }, { "epoch": 0.10382513661202186, "grad_norm": 0.4816187918186188, "learning_rate": 5.191256830601094e-06, "loss": 0.0013, "step": 95 }, { "epoch": 0.1092896174863388, "grad_norm": 0.030168676748871803, "learning_rate": 5.46448087431694e-06, "loss": 0.0014, "step": 100 }, { "epoch": 0.11475409836065574, "grad_norm": 0.14276443421840668, "learning_rate": 5.737704918032787e-06, "loss": 0.0058, "step": 105 }, { "epoch": 0.12021857923497267, "grad_norm": 0.053282059729099274, "learning_rate": 6.010928961748634e-06, "loss": 0.0034, "step": 110 }, { "epoch": 0.12568306010928962, "grad_norm": 0.16789610683918, "learning_rate": 6.284153005464481e-06, "loss": 0.0006, "step": 115 }, { "epoch": 0.13114754098360656, "grad_norm": 0.03340218961238861, "learning_rate": 6.557377049180328e-06, "loss": 0.001, "step": 120 }, { "epoch": 0.1366120218579235, "grad_norm": 0.04611071199178696, "learning_rate": 6.830601092896176e-06, "loss": 0.0009, "step": 125 }, { "epoch": 0.14207650273224043, "grad_norm": 0.017522484064102173, "learning_rate": 7.103825136612022e-06, "loss": 0.0003, "step": 130 }, { "epoch": 0.14754098360655737, "grad_norm": 0.011235825717449188, "learning_rate": 7.3770491803278695e-06, "loss": 0.0054, "step": 135 }, { "epoch": 0.15300546448087432, "grad_norm": 0.008810077793896198, "learning_rate": 7.650273224043716e-06, "loss": 0.0882, "step": 140 }, { "epoch": 0.15846994535519127, "grad_norm": 0.4507307708263397, "learning_rate": 7.923497267759564e-06, "loss": 0.001, "step": 145 }, { "epoch": 0.16393442622950818, "grad_norm": 0.01477157510817051, "learning_rate": 8.196721311475409e-06, "loss": 0.001, "step": 150 }, { "epoch": 0.16939890710382513, "grad_norm": 0.01235341839492321, "learning_rate": 8.469945355191257e-06, "loss": 0.0003, "step": 155 }, { "epoch": 0.17486338797814208, "grad_norm": 0.14400136470794678, "learning_rate": 8.743169398907103e-06, "loss": 0.0039, "step": 160 }, { "epoch": 0.18032786885245902, "grad_norm": 0.006246116943657398, "learning_rate": 9.016393442622952e-06, "loss": 0.0006, "step": 165 }, { "epoch": 0.18579234972677597, "grad_norm": 0.019680462777614594, "learning_rate": 9.2896174863388e-06, "loss": 0.0005, "step": 170 }, { "epoch": 0.1912568306010929, "grad_norm": 0.0067535098642110825, "learning_rate": 9.562841530054644e-06, "loss": 0.0001, "step": 175 }, { "epoch": 0.19672131147540983, "grad_norm": 50.647010803222656, "learning_rate": 9.836065573770493e-06, "loss": 0.0782, "step": 180 }, { "epoch": 0.20218579234972678, "grad_norm": 0.01110668946057558, "learning_rate": 1.0109289617486339e-05, "loss": 0.0005, "step": 185 }, { "epoch": 0.20765027322404372, "grad_norm": 0.0494246631860733, "learning_rate": 1.0382513661202187e-05, "loss": 0.0003, "step": 190 }, { "epoch": 0.21311475409836064, "grad_norm": 0.011345715261995792, "learning_rate": 1.0655737704918032e-05, "loss": 0.0003, "step": 195 }, { "epoch": 0.2185792349726776, "grad_norm": 0.004185713827610016, "learning_rate": 1.092896174863388e-05, "loss": 0.0001, "step": 200 }, { "epoch": 0.22404371584699453, "grad_norm": 0.30543509125709534, "learning_rate": 1.1202185792349727e-05, "loss": 0.024, "step": 205 }, { "epoch": 0.22950819672131148, "grad_norm": 0.007137789856642485, "learning_rate": 1.1475409836065575e-05, "loss": 0.0001, "step": 210 }, { "epoch": 0.23497267759562843, "grad_norm": 0.008462714031338692, "learning_rate": 1.1748633879781421e-05, "loss": 0.0001, "step": 215 }, { "epoch": 0.24043715846994534, "grad_norm": 0.015059815719723701, "learning_rate": 1.2021857923497268e-05, "loss": 0.0001, "step": 220 }, { "epoch": 0.2459016393442623, "grad_norm": 0.00993975717574358, "learning_rate": 1.2295081967213116e-05, "loss": 0.0001, "step": 225 }, { "epoch": 0.25136612021857924, "grad_norm": 0.0028505015652626753, "learning_rate": 1.2568306010928962e-05, "loss": 0.0378, "step": 230 }, { "epoch": 0.2568306010928962, "grad_norm": 0.012212187051773071, "learning_rate": 1.284153005464481e-05, "loss": 0.0001, "step": 235 }, { "epoch": 0.26229508196721313, "grad_norm": 0.007482603657990694, "learning_rate": 1.3114754098360657e-05, "loss": 0.0002, "step": 240 }, { "epoch": 0.2677595628415301, "grad_norm": 0.09465236961841583, "learning_rate": 1.3387978142076505e-05, "loss": 0.001, "step": 245 }, { "epoch": 0.273224043715847, "grad_norm": 0.028336774557828903, "learning_rate": 1.3661202185792351e-05, "loss": 0.0001, "step": 250 }, { "epoch": 0.2786885245901639, "grad_norm": 0.001924874261021614, "learning_rate": 1.3934426229508196e-05, "loss": 0.0002, "step": 255 }, { "epoch": 0.28415300546448086, "grad_norm": 0.14595231413841248, "learning_rate": 1.4207650273224044e-05, "loss": 0.0003, "step": 260 }, { "epoch": 0.2896174863387978, "grad_norm": 0.008462517522275448, "learning_rate": 1.448087431693989e-05, "loss": 0.0, "step": 265 }, { "epoch": 0.29508196721311475, "grad_norm": 1.1039444208145142, "learning_rate": 1.4754098360655739e-05, "loss": 0.0004, "step": 270 }, { "epoch": 0.3005464480874317, "grad_norm": 0.0011038613738492131, "learning_rate": 1.5027322404371585e-05, "loss": 0.0003, "step": 275 }, { "epoch": 0.30601092896174864, "grad_norm": 0.002112634014338255, "learning_rate": 1.5300546448087432e-05, "loss": 0.0003, "step": 280 }, { "epoch": 0.3114754098360656, "grad_norm": 0.001424513990059495, "learning_rate": 1.557377049180328e-05, "loss": 0.0001, "step": 285 }, { "epoch": 0.31693989071038253, "grad_norm": 0.002593505661934614, "learning_rate": 1.5846994535519128e-05, "loss": 0.0, "step": 290 }, { "epoch": 0.3224043715846995, "grad_norm": 0.006812725216150284, "learning_rate": 1.6120218579234975e-05, "loss": 0.0, "step": 295 }, { "epoch": 0.32786885245901637, "grad_norm": 0.0028373391833156347, "learning_rate": 1.6393442622950818e-05, "loss": 0.0, "step": 300 }, { "epoch": 0.3333333333333333, "grad_norm": 0.0126915592700243, "learning_rate": 1.6666666666666667e-05, "loss": 0.0001, "step": 305 }, { "epoch": 0.33879781420765026, "grad_norm": 0.041050828993320465, "learning_rate": 1.6939890710382514e-05, "loss": 0.0001, "step": 310 }, { "epoch": 0.3442622950819672, "grad_norm": 58.20376205444336, "learning_rate": 1.721311475409836e-05, "loss": 0.07, "step": 315 }, { "epoch": 0.34972677595628415, "grad_norm": 0.04524129629135132, "learning_rate": 1.7486338797814207e-05, "loss": 0.0001, "step": 320 }, { "epoch": 0.3551912568306011, "grad_norm": 0.0022006204817444086, "learning_rate": 1.7759562841530057e-05, "loss": 0.6515, "step": 325 }, { "epoch": 0.36065573770491804, "grad_norm": 0.00961753074079752, "learning_rate": 1.8032786885245903e-05, "loss": 0.0001, "step": 330 }, { "epoch": 0.366120218579235, "grad_norm": 0.033571019768714905, "learning_rate": 1.830601092896175e-05, "loss": 0.0033, "step": 335 }, { "epoch": 0.37158469945355194, "grad_norm": 0.028768986463546753, "learning_rate": 1.85792349726776e-05, "loss": 0.0385, "step": 340 }, { "epoch": 0.3770491803278688, "grad_norm": 0.01213754341006279, "learning_rate": 1.8852459016393442e-05, "loss": 0.0351, "step": 345 }, { "epoch": 0.3825136612021858, "grad_norm": 0.0052613429725170135, "learning_rate": 1.912568306010929e-05, "loss": 0.0007, "step": 350 }, { "epoch": 0.3879781420765027, "grad_norm": 0.013218444772064686, "learning_rate": 1.9398907103825135e-05, "loss": 0.0432, "step": 355 }, { "epoch": 0.39344262295081966, "grad_norm": 0.0013946377439424396, "learning_rate": 1.9672131147540985e-05, "loss": 0.0006, "step": 360 }, { "epoch": 0.3989071038251366, "grad_norm": 0.004633768927305937, "learning_rate": 1.994535519125683e-05, "loss": 0.0004, "step": 365 }, { "epoch": 0.40437158469945356, "grad_norm": 0.0033809032756835222, "learning_rate": 2.0218579234972678e-05, "loss": 0.0, "step": 370 }, { "epoch": 0.4098360655737705, "grad_norm": 0.003123817965388298, "learning_rate": 2.0491803278688525e-05, "loss": 0.0001, "step": 375 }, { "epoch": 0.41530054644808745, "grad_norm": 0.002844776026904583, "learning_rate": 2.0765027322404374e-05, "loss": 0.0001, "step": 380 }, { "epoch": 0.4207650273224044, "grad_norm": 0.0011928690364584327, "learning_rate": 2.103825136612022e-05, "loss": 0.0004, "step": 385 }, { "epoch": 0.4262295081967213, "grad_norm": 0.002345489338040352, "learning_rate": 2.1311475409836064e-05, "loss": 0.0, "step": 390 }, { "epoch": 0.43169398907103823, "grad_norm": 14.624146461486816, "learning_rate": 2.1584699453551914e-05, "loss": 0.1712, "step": 395 }, { "epoch": 0.4371584699453552, "grad_norm": 0.00196304963901639, "learning_rate": 2.185792349726776e-05, "loss": 0.0001, "step": 400 }, { "epoch": 0.4426229508196721, "grad_norm": 0.0016199051169678569, "learning_rate": 2.2131147540983607e-05, "loss": 0.0001, "step": 405 }, { "epoch": 0.44808743169398907, "grad_norm": 0.0034895865246653557, "learning_rate": 2.2404371584699453e-05, "loss": 0.0001, "step": 410 }, { "epoch": 0.453551912568306, "grad_norm": 0.12550325691699982, "learning_rate": 2.2677595628415303e-05, "loss": 0.0005, "step": 415 }, { "epoch": 0.45901639344262296, "grad_norm": 0.05105838552117348, "learning_rate": 2.295081967213115e-05, "loss": 0.0003, "step": 420 }, { "epoch": 0.4644808743169399, "grad_norm": 0.0021114815026521683, "learning_rate": 2.3224043715846996e-05, "loss": 0.0001, "step": 425 }, { "epoch": 0.46994535519125685, "grad_norm": 0.002628276590257883, "learning_rate": 2.3497267759562842e-05, "loss": 0.0003, "step": 430 }, { "epoch": 0.47540983606557374, "grad_norm": 0.0018892699154093862, "learning_rate": 2.377049180327869e-05, "loss": 0.0, "step": 435 }, { "epoch": 0.4808743169398907, "grad_norm": 0.03093365952372551, "learning_rate": 2.4043715846994535e-05, "loss": 0.0001, "step": 440 }, { "epoch": 0.48633879781420764, "grad_norm": 0.0004180266987532377, "learning_rate": 2.431693989071038e-05, "loss": 0.0, "step": 445 }, { "epoch": 0.4918032786885246, "grad_norm": 0.0009978563757613301, "learning_rate": 2.459016393442623e-05, "loss": 0.0, "step": 450 }, { "epoch": 0.4972677595628415, "grad_norm": 0.0017233375692740083, "learning_rate": 2.4863387978142078e-05, "loss": 0.0, "step": 455 }, { "epoch": 0.5027322404371585, "grad_norm": 43.42852783203125, "learning_rate": 2.5136612021857924e-05, "loss": 0.1477, "step": 460 }, { "epoch": 0.5081967213114754, "grad_norm": 0.001277887960895896, "learning_rate": 2.540983606557377e-05, "loss": 0.0, "step": 465 }, { "epoch": 0.5136612021857924, "grad_norm": 0.0195325780659914, "learning_rate": 2.568306010928962e-05, "loss": 0.0001, "step": 470 }, { "epoch": 0.5191256830601093, "grad_norm": 0.008195769973099232, "learning_rate": 2.5956284153005467e-05, "loss": 0.0113, "step": 475 }, { "epoch": 0.5245901639344263, "grad_norm": 0.001160804065875709, "learning_rate": 2.6229508196721314e-05, "loss": 0.0, "step": 480 }, { "epoch": 0.5300546448087432, "grad_norm": 0.042693402618169785, "learning_rate": 2.650273224043716e-05, "loss": 0.0001, "step": 485 }, { "epoch": 0.5355191256830601, "grad_norm": 0.005922058131545782, "learning_rate": 2.677595628415301e-05, "loss": 0.0, "step": 490 }, { "epoch": 0.5409836065573771, "grad_norm": 0.008849041536450386, "learning_rate": 2.7049180327868856e-05, "loss": 0.0001, "step": 495 }, { "epoch": 0.546448087431694, "grad_norm": 0.12916676700115204, "learning_rate": 2.7322404371584703e-05, "loss": 0.004, "step": 500 }, { "epoch": 0.5519125683060109, "grad_norm": 0.06572812050580978, "learning_rate": 2.7595628415300546e-05, "loss": 0.1066, "step": 505 }, { "epoch": 0.5573770491803278, "grad_norm": 0.0008375145844183862, "learning_rate": 2.7868852459016392e-05, "loss": 0.0, "step": 510 }, { "epoch": 0.5628415300546448, "grad_norm": 0.0012427978217601776, "learning_rate": 2.814207650273224e-05, "loss": 0.0, "step": 515 }, { "epoch": 0.5683060109289617, "grad_norm": 0.00252250162884593, "learning_rate": 2.841530054644809e-05, "loss": 0.0, "step": 520 }, { "epoch": 0.5737704918032787, "grad_norm": 0.057264067232608795, "learning_rate": 2.8688524590163935e-05, "loss": 0.0002, "step": 525 }, { "epoch": 0.5792349726775956, "grad_norm": 47.20601272583008, "learning_rate": 2.896174863387978e-05, "loss": 0.1031, "step": 530 }, { "epoch": 0.5846994535519126, "grad_norm": 0.0003861558507196605, "learning_rate": 2.9234972677595628e-05, "loss": 0.0, "step": 535 }, { "epoch": 0.5901639344262295, "grad_norm": 0.0016129296272993088, "learning_rate": 2.9508196721311478e-05, "loss": 0.0073, "step": 540 }, { "epoch": 0.5956284153005464, "grad_norm": 0.027187852188944817, "learning_rate": 2.9781420765027324e-05, "loss": 0.0, "step": 545 }, { "epoch": 0.6010928961748634, "grad_norm": 1.0486091375350952, "learning_rate": 3.005464480874317e-05, "loss": 0.001, "step": 550 }, { "epoch": 0.6065573770491803, "grad_norm": 0.35983479022979736, "learning_rate": 3.0327868852459017e-05, "loss": 0.006, "step": 555 }, { "epoch": 0.6120218579234973, "grad_norm": 0.0048343949019908905, "learning_rate": 3.0601092896174864e-05, "loss": 0.0, "step": 560 }, { "epoch": 0.6174863387978142, "grad_norm": 0.00185883860103786, "learning_rate": 3.087431693989071e-05, "loss": 0.0, "step": 565 }, { "epoch": 0.6229508196721312, "grad_norm": 0.00794074684381485, "learning_rate": 3.114754098360656e-05, "loss": 0.0, "step": 570 }, { "epoch": 0.6284153005464481, "grad_norm": 0.00952135305851698, "learning_rate": 3.142076502732241e-05, "loss": 0.0, "step": 575 }, { "epoch": 0.6338797814207651, "grad_norm": 0.0006016762927174568, "learning_rate": 3.1693989071038256e-05, "loss": 0.0, "step": 580 }, { "epoch": 0.639344262295082, "grad_norm": 0.0006445659091696143, "learning_rate": 3.19672131147541e-05, "loss": 0.0, "step": 585 }, { "epoch": 0.644808743169399, "grad_norm": 0.0007617264054715633, "learning_rate": 3.224043715846995e-05, "loss": 0.0, "step": 590 }, { "epoch": 0.6502732240437158, "grad_norm": 0.007806734647601843, "learning_rate": 3.251366120218579e-05, "loss": 0.0001, "step": 595 }, { "epoch": 0.6557377049180327, "grad_norm": 0.00035029446007683873, "learning_rate": 3.2786885245901635e-05, "loss": 0.0, "step": 600 }, { "epoch": 0.6612021857923497, "grad_norm": 0.0011228241492062807, "learning_rate": 3.306010928961749e-05, "loss": 0.0, "step": 605 }, { "epoch": 0.6666666666666666, "grad_norm": 0.00020948232850059867, "learning_rate": 3.3333333333333335e-05, "loss": 0.0, "step": 610 }, { "epoch": 0.6721311475409836, "grad_norm": 0.00047444229130633175, "learning_rate": 3.360655737704918e-05, "loss": 0.0, "step": 615 }, { "epoch": 0.6775956284153005, "grad_norm": 0.00022226192231755704, "learning_rate": 3.387978142076503e-05, "loss": 0.0, "step": 620 }, { "epoch": 0.6830601092896175, "grad_norm": 0.964410126209259, "learning_rate": 3.4153005464480874e-05, "loss": 0.0004, "step": 625 }, { "epoch": 0.6885245901639344, "grad_norm": 0.03180733695626259, "learning_rate": 3.442622950819672e-05, "loss": 0.0, "step": 630 }, { "epoch": 0.6939890710382514, "grad_norm": 73.32250213623047, "learning_rate": 3.469945355191257e-05, "loss": 0.1252, "step": 635 }, { "epoch": 0.6994535519125683, "grad_norm": 0.0005270802648738027, "learning_rate": 3.4972677595628414e-05, "loss": 0.0, "step": 640 }, { "epoch": 0.7049180327868853, "grad_norm": 3.74862003326416, "learning_rate": 3.524590163934427e-05, "loss": 0.0471, "step": 645 }, { "epoch": 0.7103825136612022, "grad_norm": 0.0006061477470211685, "learning_rate": 3.551912568306011e-05, "loss": 0.0014, "step": 650 }, { "epoch": 0.7158469945355191, "grad_norm": 0.0004408442764542997, "learning_rate": 3.579234972677596e-05, "loss": 0.0, "step": 655 }, { "epoch": 0.7213114754098361, "grad_norm": 0.000549810822121799, "learning_rate": 3.6065573770491806e-05, "loss": 0.0, "step": 660 }, { "epoch": 0.726775956284153, "grad_norm": 0.0006441313307732344, "learning_rate": 3.633879781420765e-05, "loss": 0.0, "step": 665 }, { "epoch": 0.73224043715847, "grad_norm": 0.001270699198357761, "learning_rate": 3.66120218579235e-05, "loss": 0.0, "step": 670 }, { "epoch": 0.7377049180327869, "grad_norm": 0.00043755839578807354, "learning_rate": 3.6885245901639346e-05, "loss": 0.0, "step": 675 }, { "epoch": 0.7431693989071039, "grad_norm": 0.0004439064650796354, "learning_rate": 3.71584699453552e-05, "loss": 0.0, "step": 680 }, { "epoch": 0.7486338797814208, "grad_norm": 1.3895764350891113, "learning_rate": 3.7431693989071045e-05, "loss": 0.0012, "step": 685 }, { "epoch": 0.7540983606557377, "grad_norm": 0.0003534202405717224, "learning_rate": 3.7704918032786885e-05, "loss": 0.0449, "step": 690 }, { "epoch": 0.7595628415300546, "grad_norm": 0.0076418425887823105, "learning_rate": 3.797814207650273e-05, "loss": 0.0012, "step": 695 }, { "epoch": 0.7650273224043715, "grad_norm": 8.513630018569529e-05, "learning_rate": 3.825136612021858e-05, "loss": 0.0118, "step": 700 }, { "epoch": 0.7704918032786885, "grad_norm": 0.000540974666364491, "learning_rate": 3.8524590163934424e-05, "loss": 0.0, "step": 705 }, { "epoch": 0.7759562841530054, "grad_norm": 0.0001819472381612286, "learning_rate": 3.879781420765027e-05, "loss": 0.2163, "step": 710 }, { "epoch": 0.7814207650273224, "grad_norm": 0.0011445063864812255, "learning_rate": 3.9071038251366124e-05, "loss": 0.0, "step": 715 }, { "epoch": 0.7868852459016393, "grad_norm": 0.0005601670709438622, "learning_rate": 3.934426229508197e-05, "loss": 0.0025, "step": 720 }, { "epoch": 0.7923497267759563, "grad_norm": 0.03481730446219444, "learning_rate": 3.961748633879782e-05, "loss": 0.0, "step": 725 }, { "epoch": 0.7978142076502732, "grad_norm": 10.071272850036621, "learning_rate": 3.989071038251366e-05, "loss": 0.077, "step": 730 }, { "epoch": 0.8032786885245902, "grad_norm": 0.40439388155937195, "learning_rate": 4.016393442622951e-05, "loss": 0.0054, "step": 735 }, { "epoch": 0.8087431693989071, "grad_norm": 0.0003790801565628499, "learning_rate": 4.0437158469945356e-05, "loss": 0.0973, "step": 740 }, { "epoch": 0.8142076502732241, "grad_norm": 0.0005046766018494964, "learning_rate": 4.07103825136612e-05, "loss": 0.0639, "step": 745 }, { "epoch": 0.819672131147541, "grad_norm": 0.005156039260327816, "learning_rate": 4.098360655737705e-05, "loss": 0.0001, "step": 750 }, { "epoch": 0.825136612021858, "grad_norm": 0.819869875907898, "learning_rate": 4.12568306010929e-05, "loss": 0.1133, "step": 755 }, { "epoch": 0.8306010928961749, "grad_norm": 0.0007013155845925212, "learning_rate": 4.153005464480875e-05, "loss": 0.0, "step": 760 }, { "epoch": 0.8360655737704918, "grad_norm": 0.0015922696329653263, "learning_rate": 4.1803278688524595e-05, "loss": 0.0037, "step": 765 }, { "epoch": 0.8415300546448088, "grad_norm": 0.007399330381304026, "learning_rate": 4.207650273224044e-05, "loss": 0.0001, "step": 770 }, { "epoch": 0.8469945355191257, "grad_norm": 0.06001625955104828, "learning_rate": 4.234972677595629e-05, "loss": 0.0003, "step": 775 }, { "epoch": 0.8524590163934426, "grad_norm": 0.00016822715406306088, "learning_rate": 4.262295081967213e-05, "loss": 0.0003, "step": 780 }, { "epoch": 0.8579234972677595, "grad_norm": 0.0044054812751710415, "learning_rate": 4.289617486338798e-05, "loss": 0.3591, "step": 785 }, { "epoch": 0.8633879781420765, "grad_norm": 0.001408412354066968, "learning_rate": 4.316939890710383e-05, "loss": 0.138, "step": 790 }, { "epoch": 0.8688524590163934, "grad_norm": 0.0046225362457334995, "learning_rate": 4.3442622950819674e-05, "loss": 0.0317, "step": 795 }, { "epoch": 0.8743169398907104, "grad_norm": 0.026879915967583656, "learning_rate": 4.371584699453552e-05, "loss": 0.0035, "step": 800 }, { "epoch": 0.8797814207650273, "grad_norm": 0.0007724014576524496, "learning_rate": 4.398907103825137e-05, "loss": 0.0001, "step": 805 }, { "epoch": 0.8852459016393442, "grad_norm": 0.0005259870667941868, "learning_rate": 4.426229508196721e-05, "loss": 0.0017, "step": 810 }, { "epoch": 0.8907103825136612, "grad_norm": 0.20176024734973907, "learning_rate": 4.453551912568306e-05, "loss": 0.0004, "step": 815 }, { "epoch": 0.8961748633879781, "grad_norm": 0.0004278122214600444, "learning_rate": 4.4808743169398906e-05, "loss": 0.2911, "step": 820 }, { "epoch": 0.9016393442622951, "grad_norm": 0.0018693683668971062, "learning_rate": 4.508196721311476e-05, "loss": 0.0, "step": 825 }, { "epoch": 0.907103825136612, "grad_norm": 0.0012298806104809046, "learning_rate": 4.5355191256830606e-05, "loss": 0.0001, "step": 830 }, { "epoch": 0.912568306010929, "grad_norm": 0.007856699638068676, "learning_rate": 4.562841530054645e-05, "loss": 0.0004, "step": 835 }, { "epoch": 0.9180327868852459, "grad_norm": 0.009999307803809643, "learning_rate": 4.59016393442623e-05, "loss": 0.0002, "step": 840 }, { "epoch": 0.9234972677595629, "grad_norm": 0.013572623953223228, "learning_rate": 4.6174863387978145e-05, "loss": 0.0002, "step": 845 }, { "epoch": 0.9289617486338798, "grad_norm": 0.006055152975022793, "learning_rate": 4.644808743169399e-05, "loss": 0.0113, "step": 850 }, { "epoch": 0.9344262295081968, "grad_norm": 0.001018978888168931, "learning_rate": 4.672131147540984e-05, "loss": 0.0001, "step": 855 }, { "epoch": 0.9398907103825137, "grad_norm": 0.0018887248588725924, "learning_rate": 4.6994535519125685e-05, "loss": 0.0001, "step": 860 }, { "epoch": 0.9453551912568307, "grad_norm": 0.0015452090883627534, "learning_rate": 4.726775956284154e-05, "loss": 0.1306, "step": 865 }, { "epoch": 0.9508196721311475, "grad_norm": 0.030793463811278343, "learning_rate": 4.754098360655738e-05, "loss": 0.0, "step": 870 }, { "epoch": 0.9562841530054644, "grad_norm": 0.0030877627432346344, "learning_rate": 4.7814207650273224e-05, "loss": 0.0, "step": 875 }, { "epoch": 0.9617486338797814, "grad_norm": 0.0027639123145490885, "learning_rate": 4.808743169398907e-05, "loss": 0.227, "step": 880 }, { "epoch": 0.9672131147540983, "grad_norm": 0.013874650001525879, "learning_rate": 4.836065573770492e-05, "loss": 0.0005, "step": 885 }, { "epoch": 0.9726775956284153, "grad_norm": 0.010947838425636292, "learning_rate": 4.863387978142076e-05, "loss": 0.005, "step": 890 }, { "epoch": 0.9781420765027322, "grad_norm": 0.004630269482731819, "learning_rate": 4.890710382513661e-05, "loss": 0.0001, "step": 895 }, { "epoch": 0.9836065573770492, "grad_norm": 0.0010024881921708584, "learning_rate": 4.918032786885246e-05, "loss": 0.0002, "step": 900 }, { "epoch": 0.9890710382513661, "grad_norm": 0.0025314155500382185, "learning_rate": 4.945355191256831e-05, "loss": 0.0918, "step": 905 }, { "epoch": 0.994535519125683, "grad_norm": 0.0026452147867530584, "learning_rate": 4.9726775956284156e-05, "loss": 0.0, "step": 910 }, { "epoch": 1.0, "grad_norm": 0.043630439788103104, "learning_rate": 5e-05, "loss": 0.0001, "step": 915 }, { "epoch": 1.0, "eval_loss": 0.053465329110622406, "eval_runtime": 658.7855, "eval_samples_per_second": 11.105, "eval_steps_per_second": 1.389, "step": 915 }, { "epoch": 1.005464480874317, "grad_norm": 0.020471155643463135, "learning_rate": 4.996964177292046e-05, "loss": 0.0243, "step": 920 }, { "epoch": 1.010928961748634, "grad_norm": 0.005614515859633684, "learning_rate": 4.9939283545840925e-05, "loss": 0.0001, "step": 925 }, { "epoch": 1.0163934426229508, "grad_norm": 0.0024823613930493593, "learning_rate": 4.990892531876138e-05, "loss": 0.0, "step": 930 }, { "epoch": 1.0218579234972678, "grad_norm": 0.0023433889728039503, "learning_rate": 4.987856709168185e-05, "loss": 0.0001, "step": 935 }, { "epoch": 1.0273224043715847, "grad_norm": 0.0012808924075216055, "learning_rate": 4.984820886460231e-05, "loss": 0.0, "step": 940 }, { "epoch": 1.0327868852459017, "grad_norm": 0.0008591926307417452, "learning_rate": 4.9817850637522776e-05, "loss": 0.0, "step": 945 }, { "epoch": 1.0382513661202186, "grad_norm": 0.001144225592724979, "learning_rate": 4.9787492410443234e-05, "loss": 0.0, "step": 950 }, { "epoch": 1.0437158469945356, "grad_norm": 0.006365107372403145, "learning_rate": 4.97571341833637e-05, "loss": 0.186, "step": 955 }, { "epoch": 1.0491803278688525, "grad_norm": 0.004594831261783838, "learning_rate": 4.9726775956284156e-05, "loss": 0.0003, "step": 960 }, { "epoch": 1.0546448087431695, "grad_norm": 0.01714457757771015, "learning_rate": 4.969641772920462e-05, "loss": 0.0005, "step": 965 }, { "epoch": 1.0601092896174864, "grad_norm": 0.009641066193580627, "learning_rate": 4.966605950212508e-05, "loss": 0.0026, "step": 970 }, { "epoch": 1.0655737704918034, "grad_norm": 0.002107401378452778, "learning_rate": 4.9635701275045536e-05, "loss": 0.0004, "step": 975 }, { "epoch": 1.0710382513661203, "grad_norm": 0.0008496601949445903, "learning_rate": 4.9605343047966e-05, "loss": 0.0, "step": 980 }, { "epoch": 1.0765027322404372, "grad_norm": 0.00019651043112389743, "learning_rate": 4.957498482088646e-05, "loss": 0.0, "step": 985 }, { "epoch": 1.0819672131147542, "grad_norm": 0.02816096320748329, "learning_rate": 4.954462659380692e-05, "loss": 0.0, "step": 990 }, { "epoch": 1.0874316939890711, "grad_norm": 0.00035151580232195556, "learning_rate": 4.951426836672739e-05, "loss": 0.1947, "step": 995 }, { "epoch": 1.092896174863388, "grad_norm": 0.0009560062899254262, "learning_rate": 4.948391013964785e-05, "loss": 0.0, "step": 1000 }, { "epoch": 1.098360655737705, "grad_norm": 0.0054119001142680645, "learning_rate": 4.945355191256831e-05, "loss": 0.0299, "step": 1005 }, { "epoch": 1.1038251366120218, "grad_norm": 0.04507184773683548, "learning_rate": 4.9423193685488774e-05, "loss": 0.1414, "step": 1010 }, { "epoch": 1.1092896174863387, "grad_norm": 0.005444942507892847, "learning_rate": 4.939283545840923e-05, "loss": 0.0001, "step": 1015 }, { "epoch": 1.1147540983606556, "grad_norm": 0.007937879301607609, "learning_rate": 4.936247723132969e-05, "loss": 0.0001, "step": 1020 }, { "epoch": 1.1202185792349726, "grad_norm": 0.003920862451195717, "learning_rate": 4.9332119004250154e-05, "loss": 0.0001, "step": 1025 }, { "epoch": 1.1256830601092895, "grad_norm": 0.0018985569477081299, "learning_rate": 4.930176077717061e-05, "loss": 0.1051, "step": 1030 }, { "epoch": 1.1311475409836065, "grad_norm": 0.0012923554750159383, "learning_rate": 4.9271402550091076e-05, "loss": 0.0, "step": 1035 }, { "epoch": 1.1366120218579234, "grad_norm": 0.016738856211304665, "learning_rate": 4.9241044323011534e-05, "loss": 0.0001, "step": 1040 }, { "epoch": 1.1420765027322404, "grad_norm": 0.0011123842559754848, "learning_rate": 4.9210686095932e-05, "loss": 0.1292, "step": 1045 }, { "epoch": 1.1475409836065573, "grad_norm": 0.0014614718966186047, "learning_rate": 4.918032786885246e-05, "loss": 0.0002, "step": 1050 }, { "epoch": 1.1530054644808743, "grad_norm": 0.5193114280700684, "learning_rate": 4.914996964177293e-05, "loss": 0.0008, "step": 1055 }, { "epoch": 1.1584699453551912, "grad_norm": 0.039534829556941986, "learning_rate": 4.9119611414693385e-05, "loss": 0.0004, "step": 1060 }, { "epoch": 1.1639344262295082, "grad_norm": 0.0015866112662479281, "learning_rate": 4.908925318761385e-05, "loss": 0.0, "step": 1065 }, { "epoch": 1.169398907103825, "grad_norm": 0.0004405885701999068, "learning_rate": 4.905889496053431e-05, "loss": 0.0001, "step": 1070 }, { "epoch": 1.174863387978142, "grad_norm": 0.0003017389390151948, "learning_rate": 4.9028536733454765e-05, "loss": 0.0001, "step": 1075 }, { "epoch": 1.180327868852459, "grad_norm": 0.002132098888978362, "learning_rate": 4.899817850637523e-05, "loss": 0.0, "step": 1080 }, { "epoch": 1.185792349726776, "grad_norm": 0.0002962806320283562, "learning_rate": 4.896782027929569e-05, "loss": 0.0001, "step": 1085 }, { "epoch": 1.1912568306010929, "grad_norm": 0.0759500116109848, "learning_rate": 4.893746205221615e-05, "loss": 0.2131, "step": 1090 }, { "epoch": 1.1967213114754098, "grad_norm": 0.006093773990869522, "learning_rate": 4.890710382513661e-05, "loss": 0.0001, "step": 1095 }, { "epoch": 1.2021857923497268, "grad_norm": 0.09465143084526062, "learning_rate": 4.8876745598057074e-05, "loss": 0.0022, "step": 1100 }, { "epoch": 1.2076502732240437, "grad_norm": 0.07754716277122498, "learning_rate": 4.884638737097754e-05, "loss": 0.0004, "step": 1105 }, { "epoch": 1.2131147540983607, "grad_norm": 0.0023268985096365213, "learning_rate": 4.8816029143898e-05, "loss": 0.0002, "step": 1110 }, { "epoch": 1.2185792349726776, "grad_norm": 0.0014323684154078364, "learning_rate": 4.878567091681846e-05, "loss": 0.0093, "step": 1115 }, { "epoch": 1.2240437158469946, "grad_norm": 0.14172188937664032, "learning_rate": 4.875531268973892e-05, "loss": 0.0429, "step": 1120 }, { "epoch": 1.2295081967213115, "grad_norm": 0.05727756395936012, "learning_rate": 4.872495446265938e-05, "loss": 0.0002, "step": 1125 }, { "epoch": 1.2349726775956285, "grad_norm": 0.001011449727229774, "learning_rate": 4.869459623557984e-05, "loss": 0.0, "step": 1130 }, { "epoch": 1.2404371584699454, "grad_norm": 0.0003146354283671826, "learning_rate": 4.8664238008500306e-05, "loss": 0.0002, "step": 1135 }, { "epoch": 1.2459016393442623, "grad_norm": 0.0003123127680737525, "learning_rate": 4.863387978142076e-05, "loss": 0.0, "step": 1140 }, { "epoch": 1.2513661202185793, "grad_norm": 0.000679291901178658, "learning_rate": 4.860352155434123e-05, "loss": 0.0, "step": 1145 }, { "epoch": 1.2568306010928962, "grad_norm": 0.00043421483132988214, "learning_rate": 4.857316332726169e-05, "loss": 0.0, "step": 1150 }, { "epoch": 1.2622950819672132, "grad_norm": 0.0009190288255922496, "learning_rate": 4.854280510018216e-05, "loss": 0.0, "step": 1155 }, { "epoch": 1.2677595628415301, "grad_norm": 0.00011454925697762519, "learning_rate": 4.8512446873102615e-05, "loss": 0.0, "step": 1160 }, { "epoch": 1.273224043715847, "grad_norm": 0.00038073299219831824, "learning_rate": 4.848208864602308e-05, "loss": 0.0, "step": 1165 }, { "epoch": 1.278688524590164, "grad_norm": 0.00031519224285148084, "learning_rate": 4.845173041894354e-05, "loss": 0.0, "step": 1170 }, { "epoch": 1.2841530054644807, "grad_norm": 0.00039192906115204096, "learning_rate": 4.8421372191863995e-05, "loss": 0.0001, "step": 1175 }, { "epoch": 1.289617486338798, "grad_norm": 0.00010484485392225906, "learning_rate": 4.839101396478446e-05, "loss": 0.0, "step": 1180 }, { "epoch": 1.2950819672131146, "grad_norm": 0.00030461253481917083, "learning_rate": 4.836065573770492e-05, "loss": 0.0, "step": 1185 }, { "epoch": 1.3005464480874318, "grad_norm": 0.00025512345018796623, "learning_rate": 4.833029751062538e-05, "loss": 0.0, "step": 1190 }, { "epoch": 1.3060109289617485, "grad_norm": 0.00021565568749792874, "learning_rate": 4.829993928354584e-05, "loss": 0.0, "step": 1195 }, { "epoch": 1.3114754098360657, "grad_norm": 0.0004679218982346356, "learning_rate": 4.8269581056466304e-05, "loss": 0.0001, "step": 1200 }, { "epoch": 1.3169398907103824, "grad_norm": 0.00021159886091481894, "learning_rate": 4.823922282938677e-05, "loss": 0.0, "step": 1205 }, { "epoch": 1.3224043715846996, "grad_norm": 0.00015547883231192827, "learning_rate": 4.820886460230723e-05, "loss": 0.0, "step": 1210 }, { "epoch": 1.3278688524590163, "grad_norm": 29.090713500976562, "learning_rate": 4.817850637522769e-05, "loss": 0.0739, "step": 1215 }, { "epoch": 1.3333333333333333, "grad_norm": 0.002653504954650998, "learning_rate": 4.814814814814815e-05, "loss": 0.0, "step": 1220 }, { "epoch": 1.3387978142076502, "grad_norm": 0.0016200868412852287, "learning_rate": 4.811778992106861e-05, "loss": 0.0036, "step": 1225 }, { "epoch": 1.3442622950819672, "grad_norm": 7.368043588940054e-05, "learning_rate": 4.808743169398907e-05, "loss": 0.0001, "step": 1230 }, { "epoch": 1.349726775956284, "grad_norm": 5.11952348460909e-05, "learning_rate": 4.8057073466909535e-05, "loss": 0.0008, "step": 1235 }, { "epoch": 1.355191256830601, "grad_norm": 0.0010039065964519978, "learning_rate": 4.802671523982999e-05, "loss": 0.2929, "step": 1240 }, { "epoch": 1.360655737704918, "grad_norm": 0.00012388975301291794, "learning_rate": 4.799635701275046e-05, "loss": 0.0, "step": 1245 }, { "epoch": 1.366120218579235, "grad_norm": 0.00013184835552237928, "learning_rate": 4.7965998785670915e-05, "loss": 0.0, "step": 1250 }, { "epoch": 1.3715846994535519, "grad_norm": 9.407256584381685e-05, "learning_rate": 4.793564055859138e-05, "loss": 0.0, "step": 1255 }, { "epoch": 1.3770491803278688, "grad_norm": 0.0016062518116086721, "learning_rate": 4.7905282331511844e-05, "loss": 0.0, "step": 1260 }, { "epoch": 1.3825136612021858, "grad_norm": 8.71685188030824e-05, "learning_rate": 4.787492410443231e-05, "loss": 0.0, "step": 1265 }, { "epoch": 1.3879781420765027, "grad_norm": 0.00011282307968940586, "learning_rate": 4.7844565877352766e-05, "loss": 0.0, "step": 1270 }, { "epoch": 1.3934426229508197, "grad_norm": 9.378144022775814e-05, "learning_rate": 4.7814207650273224e-05, "loss": 0.0, "step": 1275 }, { "epoch": 1.3989071038251366, "grad_norm": 0.00011343141522957012, "learning_rate": 4.778384942319369e-05, "loss": 0.0, "step": 1280 }, { "epoch": 1.4043715846994536, "grad_norm": 9.47820590226911e-05, "learning_rate": 4.7753491196114146e-05, "loss": 0.0, "step": 1285 }, { "epoch": 1.4098360655737705, "grad_norm": 8.373497985303402e-05, "learning_rate": 4.772313296903461e-05, "loss": 0.0, "step": 1290 }, { "epoch": 1.4153005464480874, "grad_norm": 0.00018082663882523775, "learning_rate": 4.769277474195507e-05, "loss": 0.1966, "step": 1295 }, { "epoch": 1.4207650273224044, "grad_norm": 0.06691992282867432, "learning_rate": 4.766241651487553e-05, "loss": 0.0006, "step": 1300 }, { "epoch": 1.4262295081967213, "grad_norm": 0.0007573667098768055, "learning_rate": 4.7632058287796e-05, "loss": 0.0001, "step": 1305 }, { "epoch": 1.4316939890710383, "grad_norm": 6.261147975921631, "learning_rate": 4.760170006071646e-05, "loss": 0.0841, "step": 1310 }, { "epoch": 1.4371584699453552, "grad_norm": 0.011942526325583458, "learning_rate": 4.757134183363692e-05, "loss": 0.0001, "step": 1315 }, { "epoch": 1.4426229508196722, "grad_norm": 0.010376262478530407, "learning_rate": 4.754098360655738e-05, "loss": 0.0002, "step": 1320 }, { "epoch": 1.4480874316939891, "grad_norm": 0.002821909496560693, "learning_rate": 4.751062537947784e-05, "loss": 0.0791, "step": 1325 }, { "epoch": 1.453551912568306, "grad_norm": 0.0008530141785740852, "learning_rate": 4.74802671523983e-05, "loss": 0.0161, "step": 1330 }, { "epoch": 1.459016393442623, "grad_norm": 0.0005102080176584423, "learning_rate": 4.7449908925318764e-05, "loss": 0.0002, "step": 1335 }, { "epoch": 1.46448087431694, "grad_norm": 0.011689051054418087, "learning_rate": 4.741955069823922e-05, "loss": 0.0002, "step": 1340 }, { "epoch": 1.469945355191257, "grad_norm": 0.0002597393176984042, "learning_rate": 4.7389192471159687e-05, "loss": 0.0001, "step": 1345 }, { "epoch": 1.4754098360655736, "grad_norm": 0.00023770586994942278, "learning_rate": 4.7358834244080144e-05, "loss": 0.0, "step": 1350 }, { "epoch": 1.4808743169398908, "grad_norm": 0.00020558437972795218, "learning_rate": 4.732847601700061e-05, "loss": 0.0, "step": 1355 }, { "epoch": 1.4863387978142075, "grad_norm": 0.0001555513881612569, "learning_rate": 4.729811778992107e-05, "loss": 0.0, "step": 1360 }, { "epoch": 1.4918032786885247, "grad_norm": 0.00018597490270622075, "learning_rate": 4.726775956284154e-05, "loss": 0.0, "step": 1365 }, { "epoch": 1.4972677595628414, "grad_norm": 0.0011901083635166287, "learning_rate": 4.7237401335761996e-05, "loss": 0.0, "step": 1370 }, { "epoch": 1.5027322404371586, "grad_norm": 0.00025052594719454646, "learning_rate": 4.720704310868245e-05, "loss": 0.0, "step": 1375 }, { "epoch": 1.5081967213114753, "grad_norm": 0.00035109854070469737, "learning_rate": 4.717668488160292e-05, "loss": 0.0, "step": 1380 }, { "epoch": 1.5136612021857925, "grad_norm": 37.61546325683594, "learning_rate": 4.7146326654523376e-05, "loss": 0.0997, "step": 1385 }, { "epoch": 1.5191256830601092, "grad_norm": 38.025840759277344, "learning_rate": 4.711596842744384e-05, "loss": 0.1406, "step": 1390 }, { "epoch": 1.5245901639344264, "grad_norm": 0.00024681788636371493, "learning_rate": 4.70856102003643e-05, "loss": 0.0, "step": 1395 }, { "epoch": 1.530054644808743, "grad_norm": 0.0002825226401910186, "learning_rate": 4.705525197328476e-05, "loss": 0.0003, "step": 1400 }, { "epoch": 1.5355191256830603, "grad_norm": 0.0002672713599167764, "learning_rate": 4.702489374620522e-05, "loss": 0.0003, "step": 1405 }, { "epoch": 1.540983606557377, "grad_norm": 0.013312156312167645, "learning_rate": 4.6994535519125685e-05, "loss": 0.0, "step": 1410 }, { "epoch": 1.5464480874316942, "grad_norm": 0.00021928714704699814, "learning_rate": 4.696417729204615e-05, "loss": 0.0, "step": 1415 }, { "epoch": 1.5519125683060109, "grad_norm": 0.3993873596191406, "learning_rate": 4.6933819064966614e-05, "loss": 0.0005, "step": 1420 }, { "epoch": 1.5573770491803278, "grad_norm": 0.00024002179270610213, "learning_rate": 4.690346083788707e-05, "loss": 0.0025, "step": 1425 }, { "epoch": 1.5628415300546448, "grad_norm": 0.00012442973093129694, "learning_rate": 4.687310261080753e-05, "loss": 0.0, "step": 1430 }, { "epoch": 1.5683060109289617, "grad_norm": 0.00023839778441470116, "learning_rate": 4.6842744383727994e-05, "loss": 0.1631, "step": 1435 }, { "epoch": 1.5737704918032787, "grad_norm": 0.006576848216354847, "learning_rate": 4.681238615664845e-05, "loss": 0.0, "step": 1440 }, { "epoch": 1.5792349726775956, "grad_norm": 0.024206487461924553, "learning_rate": 4.6782027929568916e-05, "loss": 0.0001, "step": 1445 }, { "epoch": 1.5846994535519126, "grad_norm": 0.00031997630139812827, "learning_rate": 4.6751669702489374e-05, "loss": 0.0, "step": 1450 }, { "epoch": 1.5901639344262295, "grad_norm": 0.06144952401518822, "learning_rate": 4.672131147540984e-05, "loss": 0.0003, "step": 1455 }, { "epoch": 1.5956284153005464, "grad_norm": 0.00033534682006575167, "learning_rate": 4.6690953248330296e-05, "loss": 0.0, "step": 1460 }, { "epoch": 1.6010928961748634, "grad_norm": 0.665640652179718, "learning_rate": 4.666059502125076e-05, "loss": 0.0003, "step": 1465 }, { "epoch": 1.6065573770491803, "grad_norm": 0.0002033864293480292, "learning_rate": 4.6630236794171225e-05, "loss": 0.0, "step": 1470 }, { "epoch": 1.6120218579234973, "grad_norm": 0.0018922536401078105, "learning_rate": 4.659987856709168e-05, "loss": 0.0002, "step": 1475 }, { "epoch": 1.6174863387978142, "grad_norm": 0.0003269801090937108, "learning_rate": 4.656952034001215e-05, "loss": 0.0003, "step": 1480 }, { "epoch": 1.6229508196721312, "grad_norm": 0.027018524706363678, "learning_rate": 4.6539162112932605e-05, "loss": 0.0001, "step": 1485 }, { "epoch": 1.6284153005464481, "grad_norm": 0.0002585135807748884, "learning_rate": 4.650880388585307e-05, "loss": 0.0, "step": 1490 }, { "epoch": 1.633879781420765, "grad_norm": 0.0009487815550528467, "learning_rate": 4.647844565877353e-05, "loss": 0.0, "step": 1495 }, { "epoch": 1.639344262295082, "grad_norm": 0.00017302001651842147, "learning_rate": 4.644808743169399e-05, "loss": 0.0006, "step": 1500 }, { "epoch": 1.644808743169399, "grad_norm": 0.0009016783442348242, "learning_rate": 4.641772920461445e-05, "loss": 0.0, "step": 1505 }, { "epoch": 1.650273224043716, "grad_norm": 0.0017405046382918954, "learning_rate": 4.6387370977534914e-05, "loss": 0.0003, "step": 1510 }, { "epoch": 1.6557377049180326, "grad_norm": 0.0001675246749073267, "learning_rate": 4.635701275045538e-05, "loss": 0.0002, "step": 1515 }, { "epoch": 1.6612021857923498, "grad_norm": 0.0001709494536044076, "learning_rate": 4.632665452337584e-05, "loss": 0.0, "step": 1520 }, { "epoch": 1.6666666666666665, "grad_norm": 0.000520666828379035, "learning_rate": 4.62962962962963e-05, "loss": 0.0007, "step": 1525 }, { "epoch": 1.6721311475409837, "grad_norm": 0.00013493937149178237, "learning_rate": 4.626593806921676e-05, "loss": 0.2165, "step": 1530 }, { "epoch": 1.6775956284153004, "grad_norm": 0.03252645209431648, "learning_rate": 4.623557984213722e-05, "loss": 0.0003, "step": 1535 }, { "epoch": 1.6830601092896176, "grad_norm": 0.00021406033192761242, "learning_rate": 4.620522161505768e-05, "loss": 0.0, "step": 1540 }, { "epoch": 1.6885245901639343, "grad_norm": 0.0006847565528005362, "learning_rate": 4.6174863387978145e-05, "loss": 0.0, "step": 1545 }, { "epoch": 1.6939890710382515, "grad_norm": 0.0003050083469133824, "learning_rate": 4.61445051608986e-05, "loss": 0.0, "step": 1550 }, { "epoch": 1.6994535519125682, "grad_norm": 0.017207743600010872, "learning_rate": 4.611414693381907e-05, "loss": 0.0001, "step": 1555 }, { "epoch": 1.7049180327868854, "grad_norm": 0.0002447327133268118, "learning_rate": 4.6083788706739525e-05, "loss": 0.0, "step": 1560 }, { "epoch": 1.710382513661202, "grad_norm": 0.00020444171968847513, "learning_rate": 4.605343047965999e-05, "loss": 0.0, "step": 1565 }, { "epoch": 1.7158469945355193, "grad_norm": 0.02190363220870495, "learning_rate": 4.6023072252580454e-05, "loss": 0.0001, "step": 1570 }, { "epoch": 1.721311475409836, "grad_norm": 0.00021700489742215723, "learning_rate": 4.599271402550091e-05, "loss": 0.0001, "step": 1575 }, { "epoch": 1.7267759562841531, "grad_norm": 0.00015953410184010863, "learning_rate": 4.5962355798421377e-05, "loss": 0.0, "step": 1580 }, { "epoch": 1.7322404371584699, "grad_norm": 0.00021170971740502864, "learning_rate": 4.5931997571341834e-05, "loss": 0.0, "step": 1585 }, { "epoch": 1.737704918032787, "grad_norm": 0.00016968738054856658, "learning_rate": 4.59016393442623e-05, "loss": 0.0639, "step": 1590 }, { "epoch": 1.7431693989071038, "grad_norm": 0.00013816288264933974, "learning_rate": 4.5871281117182757e-05, "loss": 0.1101, "step": 1595 }, { "epoch": 1.748633879781421, "grad_norm": 0.0002588335482869297, "learning_rate": 4.584092289010322e-05, "loss": 0.0, "step": 1600 }, { "epoch": 1.7540983606557377, "grad_norm": 0.0010825609788298607, "learning_rate": 4.581056466302368e-05, "loss": 0.0001, "step": 1605 }, { "epoch": 1.7595628415300546, "grad_norm": 0.00044289807556197047, "learning_rate": 4.578020643594414e-05, "loss": 0.2008, "step": 1610 }, { "epoch": 1.7650273224043715, "grad_norm": 0.0006378990947268903, "learning_rate": 4.57498482088646e-05, "loss": 0.0, "step": 1615 }, { "epoch": 1.7704918032786885, "grad_norm": 0.0013747804332524538, "learning_rate": 4.5719489981785066e-05, "loss": 0.0001, "step": 1620 }, { "epoch": 1.7759562841530054, "grad_norm": 0.0012717167846858501, "learning_rate": 4.568913175470553e-05, "loss": 0.0001, "step": 1625 }, { "epoch": 1.7814207650273224, "grad_norm": 0.006049423012882471, "learning_rate": 4.565877352762599e-05, "loss": 0.1173, "step": 1630 }, { "epoch": 1.7868852459016393, "grad_norm": 0.0012180794728919864, "learning_rate": 4.562841530054645e-05, "loss": 0.0001, "step": 1635 }, { "epoch": 1.7923497267759563, "grad_norm": 0.027454646304249763, "learning_rate": 4.559805707346691e-05, "loss": 0.0001, "step": 1640 }, { "epoch": 1.7978142076502732, "grad_norm": 0.0020080592948943377, "learning_rate": 4.5567698846387375e-05, "loss": 0.161, "step": 1645 }, { "epoch": 1.8032786885245902, "grad_norm": 0.003008060622960329, "learning_rate": 4.553734061930783e-05, "loss": 0.0001, "step": 1650 }, { "epoch": 1.8087431693989071, "grad_norm": 0.004481349140405655, "learning_rate": 4.55069823922283e-05, "loss": 0.0203, "step": 1655 }, { "epoch": 1.814207650273224, "grad_norm": 4.153011322021484, "learning_rate": 4.5476624165148755e-05, "loss": 0.1353, "step": 1660 }, { "epoch": 1.819672131147541, "grad_norm": 0.010290669277310371, "learning_rate": 4.544626593806922e-05, "loss": 0.0005, "step": 1665 }, { "epoch": 1.825136612021858, "grad_norm": 0.007870933972299099, "learning_rate": 4.541590771098968e-05, "loss": 0.0074, "step": 1670 }, { "epoch": 1.830601092896175, "grad_norm": 0.0014877247158437967, "learning_rate": 4.538554948391014e-05, "loss": 0.0012, "step": 1675 }, { "epoch": 1.8360655737704918, "grad_norm": 0.0007460130145773292, "learning_rate": 4.5355191256830606e-05, "loss": 0.0001, "step": 1680 }, { "epoch": 1.8415300546448088, "grad_norm": 0.05613982677459717, "learning_rate": 4.5324833029751064e-05, "loss": 0.0003, "step": 1685 }, { "epoch": 1.8469945355191257, "grad_norm": 0.0004631394112948328, "learning_rate": 4.529447480267153e-05, "loss": 0.0001, "step": 1690 }, { "epoch": 1.8524590163934427, "grad_norm": 0.00038068211870267987, "learning_rate": 4.5264116575591986e-05, "loss": 0.0003, "step": 1695 }, { "epoch": 1.8579234972677594, "grad_norm": 0.00032613801886327565, "learning_rate": 4.523375834851245e-05, "loss": 0.0, "step": 1700 }, { "epoch": 1.8633879781420766, "grad_norm": 0.000517977518029511, "learning_rate": 4.520340012143291e-05, "loss": 0.1396, "step": 1705 }, { "epoch": 1.8688524590163933, "grad_norm": 0.0025108163245022297, "learning_rate": 4.517304189435337e-05, "loss": 0.0003, "step": 1710 }, { "epoch": 1.8743169398907105, "grad_norm": 0.2560445964336395, "learning_rate": 4.514268366727383e-05, "loss": 0.069, "step": 1715 }, { "epoch": 1.8797814207650272, "grad_norm": 0.0028507369570434093, "learning_rate": 4.5112325440194295e-05, "loss": 0.0022, "step": 1720 }, { "epoch": 1.8852459016393444, "grad_norm": 0.001581528689712286, "learning_rate": 4.508196721311476e-05, "loss": 0.0, "step": 1725 }, { "epoch": 1.890710382513661, "grad_norm": 0.0012307813158258796, "learning_rate": 4.505160898603522e-05, "loss": 0.0018, "step": 1730 }, { "epoch": 1.8961748633879782, "grad_norm": 0.0004025879898108542, "learning_rate": 4.502125075895568e-05, "loss": 0.0006, "step": 1735 }, { "epoch": 1.901639344262295, "grad_norm": 0.0005517560639418662, "learning_rate": 4.499089253187614e-05, "loss": 0.0005, "step": 1740 }, { "epoch": 1.9071038251366121, "grad_norm": 0.00021167936210986227, "learning_rate": 4.4960534304796604e-05, "loss": 0.0005, "step": 1745 }, { "epoch": 1.9125683060109289, "grad_norm": 0.0001789630769053474, "learning_rate": 4.493017607771706e-05, "loss": 0.0, "step": 1750 }, { "epoch": 1.918032786885246, "grad_norm": 0.04136138781905174, "learning_rate": 4.4899817850637526e-05, "loss": 0.0002, "step": 1755 }, { "epoch": 1.9234972677595628, "grad_norm": 0.0001342537289019674, "learning_rate": 4.4869459623557984e-05, "loss": 0.0006, "step": 1760 }, { "epoch": 1.92896174863388, "grad_norm": 36.38466262817383, "learning_rate": 4.483910139647845e-05, "loss": 0.3505, "step": 1765 }, { "epoch": 1.9344262295081966, "grad_norm": 0.007546951994299889, "learning_rate": 4.4808743169398906e-05, "loss": 0.0, "step": 1770 }, { "epoch": 1.9398907103825138, "grad_norm": 0.2711917459964752, "learning_rate": 4.477838494231937e-05, "loss": 0.0005, "step": 1775 }, { "epoch": 1.9453551912568305, "grad_norm": 0.022145679220557213, "learning_rate": 4.4748026715239835e-05, "loss": 0.0003, "step": 1780 }, { "epoch": 1.9508196721311475, "grad_norm": 0.0005350765422917902, "learning_rate": 4.471766848816029e-05, "loss": 0.0, "step": 1785 }, { "epoch": 1.9562841530054644, "grad_norm": 0.0005207122885622084, "learning_rate": 4.468731026108076e-05, "loss": 0.0001, "step": 1790 }, { "epoch": 1.9617486338797814, "grad_norm": 0.0017709174426272511, "learning_rate": 4.4656952034001215e-05, "loss": 0.0, "step": 1795 }, { "epoch": 1.9672131147540983, "grad_norm": 0.00032043023384176195, "learning_rate": 4.462659380692168e-05, "loss": 0.0, "step": 1800 }, { "epoch": 1.9726775956284153, "grad_norm": 0.013697931542992592, "learning_rate": 4.459623557984214e-05, "loss": 0.0001, "step": 1805 }, { "epoch": 1.9781420765027322, "grad_norm": 0.00043464158079586923, "learning_rate": 4.45658773527626e-05, "loss": 0.0, "step": 1810 }, { "epoch": 1.9836065573770492, "grad_norm": 0.020648222416639328, "learning_rate": 4.453551912568306e-05, "loss": 0.0726, "step": 1815 }, { "epoch": 1.989071038251366, "grad_norm": 0.0003471345698926598, "learning_rate": 4.4505160898603524e-05, "loss": 0.0, "step": 1820 }, { "epoch": 1.994535519125683, "grad_norm": 0.00031178101198747754, "learning_rate": 4.447480267152398e-05, "loss": 0.0, "step": 1825 }, { "epoch": 2.0, "grad_norm": 0.00018788916349876672, "learning_rate": 4.4444444444444447e-05, "loss": 0.0, "step": 1830 }, { "epoch": 2.0, "eval_loss": 0.050701793283224106, "eval_runtime": 660.1575, "eval_samples_per_second": 11.082, "eval_steps_per_second": 1.386, "step": 1830 }, { "epoch": 2.0054644808743167, "grad_norm": 0.00025049722171388566, "learning_rate": 4.441408621736491e-05, "loss": 0.0, "step": 1835 }, { "epoch": 2.010928961748634, "grad_norm": 8.584993362426758, "learning_rate": 4.438372799028537e-05, "loss": 0.0062, "step": 1840 }, { "epoch": 2.0163934426229506, "grad_norm": 0.0003093411505687982, "learning_rate": 4.435336976320583e-05, "loss": 0.0, "step": 1845 }, { "epoch": 2.021857923497268, "grad_norm": 0.00020034695626236498, "learning_rate": 4.432301153612629e-05, "loss": 0.0, "step": 1850 }, { "epoch": 2.0273224043715845, "grad_norm": 0.0001671861537033692, "learning_rate": 4.4292653309046756e-05, "loss": 0.0008, "step": 1855 }, { "epoch": 2.0327868852459017, "grad_norm": 0.014300575479865074, "learning_rate": 4.426229508196721e-05, "loss": 0.0001, "step": 1860 }, { "epoch": 2.0382513661202184, "grad_norm": 8.975714445114136e-05, "learning_rate": 4.423193685488768e-05, "loss": 0.0, "step": 1865 }, { "epoch": 2.0437158469945356, "grad_norm": 0.00019374603289179504, "learning_rate": 4.4201578627808136e-05, "loss": 0.0, "step": 1870 }, { "epoch": 2.0491803278688523, "grad_norm": 0.00013538934581447393, "learning_rate": 4.41712204007286e-05, "loss": 0.0, "step": 1875 }, { "epoch": 2.0546448087431695, "grad_norm": 0.0002743653894867748, "learning_rate": 4.4140862173649065e-05, "loss": 0.0001, "step": 1880 }, { "epoch": 2.060109289617486, "grad_norm": 7.988169090822339e-05, "learning_rate": 4.411050394656952e-05, "loss": 0.0002, "step": 1885 }, { "epoch": 2.0655737704918034, "grad_norm": 6.720585952280089e-05, "learning_rate": 4.408014571948999e-05, "loss": 0.0, "step": 1890 }, { "epoch": 2.07103825136612, "grad_norm": 8.488036110065877e-05, "learning_rate": 4.4049787492410445e-05, "loss": 0.0, "step": 1895 }, { "epoch": 2.0765027322404372, "grad_norm": 0.00010288408520864323, "learning_rate": 4.401942926533091e-05, "loss": 0.0, "step": 1900 }, { "epoch": 2.081967213114754, "grad_norm": 6.435842078644782e-05, "learning_rate": 4.398907103825137e-05, "loss": 0.0001, "step": 1905 }, { "epoch": 2.087431693989071, "grad_norm": 0.0001191817136714235, "learning_rate": 4.395871281117183e-05, "loss": 0.0, "step": 1910 }, { "epoch": 2.092896174863388, "grad_norm": 7.335934787988663e-05, "learning_rate": 4.392835458409229e-05, "loss": 0.0, "step": 1915 }, { "epoch": 2.098360655737705, "grad_norm": 9.272222087020054e-05, "learning_rate": 4.3897996357012754e-05, "loss": 0.0, "step": 1920 }, { "epoch": 2.1038251366120218, "grad_norm": 5.950667036813684e-05, "learning_rate": 4.386763812993321e-05, "loss": 0.0, "step": 1925 }, { "epoch": 2.109289617486339, "grad_norm": 0.00011658170114969835, "learning_rate": 4.3837279902853676e-05, "loss": 0.0001, "step": 1930 }, { "epoch": 2.1147540983606556, "grad_norm": 7.576384814456105e-05, "learning_rate": 4.380692167577414e-05, "loss": 0.0001, "step": 1935 }, { "epoch": 2.120218579234973, "grad_norm": 0.00017848057905212045, "learning_rate": 4.37765634486946e-05, "loss": 0.0, "step": 1940 }, { "epoch": 2.1256830601092895, "grad_norm": 0.00016036807210184634, "learning_rate": 4.374620522161506e-05, "loss": 0.0, "step": 1945 }, { "epoch": 2.1311475409836067, "grad_norm": 0.0007356128189712763, "learning_rate": 4.371584699453552e-05, "loss": 0.0, "step": 1950 }, { "epoch": 2.1366120218579234, "grad_norm": 6.098092126194388e-05, "learning_rate": 4.3685488767455985e-05, "loss": 0.0, "step": 1955 }, { "epoch": 2.1420765027322406, "grad_norm": 0.00010312991798855364, "learning_rate": 4.365513054037644e-05, "loss": 0.0, "step": 1960 }, { "epoch": 2.1475409836065573, "grad_norm": 8.894432539818808e-05, "learning_rate": 4.362477231329691e-05, "loss": 0.0, "step": 1965 }, { "epoch": 2.1530054644808745, "grad_norm": 8.428292494500056e-05, "learning_rate": 4.3594414086217365e-05, "loss": 0.0, "step": 1970 }, { "epoch": 2.158469945355191, "grad_norm": 8.923211134970188e-05, "learning_rate": 4.356405585913783e-05, "loss": 0.0, "step": 1975 }, { "epoch": 2.1639344262295084, "grad_norm": 0.0001834633876569569, "learning_rate": 4.353369763205829e-05, "loss": 0.0449, "step": 1980 }, { "epoch": 2.169398907103825, "grad_norm": 0.0003559277392923832, "learning_rate": 4.350333940497875e-05, "loss": 0.0, "step": 1985 }, { "epoch": 2.1748633879781423, "grad_norm": 5.8480047300690785e-05, "learning_rate": 4.3472981177899216e-05, "loss": 0.0, "step": 1990 }, { "epoch": 2.180327868852459, "grad_norm": 9.875800606096163e-05, "learning_rate": 4.3442622950819674e-05, "loss": 0.0002, "step": 1995 }, { "epoch": 2.185792349726776, "grad_norm": 0.00011199543223483488, "learning_rate": 4.341226472374014e-05, "loss": 0.0012, "step": 2000 }, { "epoch": 2.191256830601093, "grad_norm": 6.932274845894426e-05, "learning_rate": 4.3381906496660596e-05, "loss": 0.0321, "step": 2005 }, { "epoch": 2.19672131147541, "grad_norm": 0.022693945094943047, "learning_rate": 4.335154826958106e-05, "loss": 0.1198, "step": 2010 }, { "epoch": 2.202185792349727, "grad_norm": 5.4680960602127016e-05, "learning_rate": 4.332119004250152e-05, "loss": 0.0, "step": 2015 }, { "epoch": 2.2076502732240435, "grad_norm": 10.618327140808105, "learning_rate": 4.329083181542198e-05, "loss": 0.0059, "step": 2020 }, { "epoch": 2.2131147540983607, "grad_norm": 7.450117846019566e-05, "learning_rate": 4.326047358834244e-05, "loss": 0.0, "step": 2025 }, { "epoch": 2.2185792349726774, "grad_norm": 0.004406645428389311, "learning_rate": 4.32301153612629e-05, "loss": 0.0, "step": 2030 }, { "epoch": 2.2240437158469946, "grad_norm": 8.840003283694386e-05, "learning_rate": 4.319975713418336e-05, "loss": 0.0, "step": 2035 }, { "epoch": 2.2295081967213113, "grad_norm": 0.005674728192389011, "learning_rate": 4.316939890710383e-05, "loss": 0.0, "step": 2040 }, { "epoch": 2.2349726775956285, "grad_norm": 25.384016036987305, "learning_rate": 4.313904068002429e-05, "loss": 0.0272, "step": 2045 }, { "epoch": 2.240437158469945, "grad_norm": 8.648393850307912e-05, "learning_rate": 4.310868245294475e-05, "loss": 0.2247, "step": 2050 }, { "epoch": 2.2459016393442623, "grad_norm": 36.4394645690918, "learning_rate": 4.3078324225865214e-05, "loss": 0.1866, "step": 2055 }, { "epoch": 2.251366120218579, "grad_norm": 0.00034720319672487676, "learning_rate": 4.304796599878567e-05, "loss": 0.0, "step": 2060 }, { "epoch": 2.2568306010928962, "grad_norm": 0.001058422145433724, "learning_rate": 4.3017607771706137e-05, "loss": 0.0001, "step": 2065 }, { "epoch": 2.262295081967213, "grad_norm": 0.0010224126745015383, "learning_rate": 4.2987249544626594e-05, "loss": 0.0287, "step": 2070 }, { "epoch": 2.26775956284153, "grad_norm": 0.00043366028694435954, "learning_rate": 4.295689131754706e-05, "loss": 0.0112, "step": 2075 }, { "epoch": 2.273224043715847, "grad_norm": 0.00029680755687877536, "learning_rate": 4.2926533090467517e-05, "loss": 0.0, "step": 2080 }, { "epoch": 2.278688524590164, "grad_norm": 0.0005055105430074036, "learning_rate": 4.289617486338798e-05, "loss": 0.0, "step": 2085 }, { "epoch": 2.2841530054644807, "grad_norm": 0.00031777381082065403, "learning_rate": 4.2865816636308446e-05, "loss": 0.0, "step": 2090 }, { "epoch": 2.289617486338798, "grad_norm": 0.00021517739514820278, "learning_rate": 4.28354584092289e-05, "loss": 0.0, "step": 2095 }, { "epoch": 2.2950819672131146, "grad_norm": 0.0009914422407746315, "learning_rate": 4.280510018214937e-05, "loss": 0.0469, "step": 2100 }, { "epoch": 2.300546448087432, "grad_norm": 0.00023720291210338473, "learning_rate": 4.2774741955069826e-05, "loss": 0.0, "step": 2105 }, { "epoch": 2.3060109289617485, "grad_norm": 0.00918621476739645, "learning_rate": 4.274438372799029e-05, "loss": 0.0001, "step": 2110 }, { "epoch": 2.3114754098360657, "grad_norm": 0.002750314772129059, "learning_rate": 4.271402550091075e-05, "loss": 0.0001, "step": 2115 }, { "epoch": 2.3169398907103824, "grad_norm": 0.0003200963546987623, "learning_rate": 4.268366727383121e-05, "loss": 0.0003, "step": 2120 }, { "epoch": 2.3224043715846996, "grad_norm": 0.0004604012647178024, "learning_rate": 4.265330904675167e-05, "loss": 0.0, "step": 2125 }, { "epoch": 2.3278688524590163, "grad_norm": 11.309343338012695, "learning_rate": 4.262295081967213e-05, "loss": 0.0048, "step": 2130 }, { "epoch": 2.3333333333333335, "grad_norm": 0.0001469504932174459, "learning_rate": 4.259259259259259e-05, "loss": 0.0, "step": 2135 }, { "epoch": 2.33879781420765, "grad_norm": 0.00021601478511001915, "learning_rate": 4.256223436551306e-05, "loss": 0.0, "step": 2140 }, { "epoch": 2.3442622950819674, "grad_norm": 0.00023017208150122315, "learning_rate": 4.253187613843352e-05, "loss": 0.0857, "step": 2145 }, { "epoch": 2.349726775956284, "grad_norm": 38.37042999267578, "learning_rate": 4.250151791135398e-05, "loss": 0.2799, "step": 2150 }, { "epoch": 2.3551912568306013, "grad_norm": 0.0014018617803230882, "learning_rate": 4.2471159684274444e-05, "loss": 0.0, "step": 2155 }, { "epoch": 2.360655737704918, "grad_norm": 0.07934489101171494, "learning_rate": 4.24408014571949e-05, "loss": 0.0837, "step": 2160 }, { "epoch": 2.366120218579235, "grad_norm": 0.0009544425411149859, "learning_rate": 4.2410443230115366e-05, "loss": 0.0016, "step": 2165 }, { "epoch": 2.371584699453552, "grad_norm": 0.0019977609626948833, "learning_rate": 4.2380085003035824e-05, "loss": 0.0, "step": 2170 }, { "epoch": 2.3770491803278686, "grad_norm": 0.0008353716693818569, "learning_rate": 4.234972677595629e-05, "loss": 0.1553, "step": 2175 }, { "epoch": 2.3825136612021858, "grad_norm": 0.00047743169125169516, "learning_rate": 4.2319368548876746e-05, "loss": 0.0004, "step": 2180 }, { "epoch": 2.387978142076503, "grad_norm": 1.1335127353668213, "learning_rate": 4.2289010321797204e-05, "loss": 0.0041, "step": 2185 }, { "epoch": 2.3934426229508197, "grad_norm": 0.0003214840835426003, "learning_rate": 4.225865209471767e-05, "loss": 0.0003, "step": 2190 }, { "epoch": 2.3989071038251364, "grad_norm": 0.0005583087913691998, "learning_rate": 4.222829386763813e-05, "loss": 0.0, "step": 2195 }, { "epoch": 2.4043715846994536, "grad_norm": 0.00019280117703601718, "learning_rate": 4.21979356405586e-05, "loss": 0.0, "step": 2200 }, { "epoch": 2.4098360655737707, "grad_norm": 0.0002531936625018716, "learning_rate": 4.2167577413479055e-05, "loss": 0.0, "step": 2205 }, { "epoch": 2.4153005464480874, "grad_norm": 0.0003483460459392518, "learning_rate": 4.213721918639952e-05, "loss": 0.0, "step": 2210 }, { "epoch": 2.420765027322404, "grad_norm": 0.0002563974994700402, "learning_rate": 4.210686095931998e-05, "loss": 0.0001, "step": 2215 }, { "epoch": 2.4262295081967213, "grad_norm": 0.00037459208397194743, "learning_rate": 4.207650273224044e-05, "loss": 0.0, "step": 2220 }, { "epoch": 2.431693989071038, "grad_norm": 0.0010059193009510636, "learning_rate": 4.20461445051609e-05, "loss": 0.1636, "step": 2225 }, { "epoch": 2.4371584699453552, "grad_norm": 0.07868228107690811, "learning_rate": 4.201578627808136e-05, "loss": 0.0002, "step": 2230 }, { "epoch": 2.442622950819672, "grad_norm": 0.003810027614235878, "learning_rate": 4.198542805100182e-05, "loss": 0.0003, "step": 2235 }, { "epoch": 2.448087431693989, "grad_norm": 0.0003104351635556668, "learning_rate": 4.195506982392228e-05, "loss": 0.0005, "step": 2240 }, { "epoch": 2.453551912568306, "grad_norm": 0.000299435923807323, "learning_rate": 4.1924711596842744e-05, "loss": 0.0002, "step": 2245 }, { "epoch": 2.459016393442623, "grad_norm": 0.00018563756020739675, "learning_rate": 4.189435336976321e-05, "loss": 0.0, "step": 2250 }, { "epoch": 2.4644808743169397, "grad_norm": 0.00043477851431816816, "learning_rate": 4.186399514268367e-05, "loss": 0.0, "step": 2255 }, { "epoch": 2.469945355191257, "grad_norm": 0.00019215783686377108, "learning_rate": 4.183363691560413e-05, "loss": 0.0, "step": 2260 }, { "epoch": 2.4754098360655736, "grad_norm": 0.00027334748301655054, "learning_rate": 4.1803278688524595e-05, "loss": 0.0, "step": 2265 }, { "epoch": 2.480874316939891, "grad_norm": 0.0002772251609712839, "learning_rate": 4.177292046144505e-05, "loss": 0.0, "step": 2270 }, { "epoch": 2.4863387978142075, "grad_norm": 0.0001554929040139541, "learning_rate": 4.174256223436552e-05, "loss": 0.0, "step": 2275 }, { "epoch": 2.4918032786885247, "grad_norm": 0.00038592342752963305, "learning_rate": 4.1712204007285975e-05, "loss": 0.0, "step": 2280 }, { "epoch": 2.4972677595628414, "grad_norm": 0.0002766764082480222, "learning_rate": 4.168184578020643e-05, "loss": 0.2958, "step": 2285 }, { "epoch": 2.5027322404371586, "grad_norm": 0.006776416674256325, "learning_rate": 4.16514875531269e-05, "loss": 0.0001, "step": 2290 }, { "epoch": 2.5081967213114753, "grad_norm": 0.037279609590768814, "learning_rate": 4.162112932604736e-05, "loss": 0.0003, "step": 2295 }, { "epoch": 2.5136612021857925, "grad_norm": 0.0005434492486529052, "learning_rate": 4.1590771098967827e-05, "loss": 0.002, "step": 2300 }, { "epoch": 2.519125683060109, "grad_norm": 0.0008327377145178616, "learning_rate": 4.1560412871888284e-05, "loss": 0.0, "step": 2305 }, { "epoch": 2.5245901639344264, "grad_norm": 0.00018225843086838722, "learning_rate": 4.153005464480875e-05, "loss": 0.0, "step": 2310 }, { "epoch": 2.530054644808743, "grad_norm": 0.0016029364196583629, "learning_rate": 4.1499696417729207e-05, "loss": 0.0001, "step": 2315 }, { "epoch": 2.5355191256830603, "grad_norm": 0.00022132557933218777, "learning_rate": 4.146933819064967e-05, "loss": 0.0001, "step": 2320 }, { "epoch": 2.540983606557377, "grad_norm": 0.008287652395665646, "learning_rate": 4.143897996357013e-05, "loss": 0.0001, "step": 2325 }, { "epoch": 2.546448087431694, "grad_norm": 0.0076728700660169125, "learning_rate": 4.1408621736490587e-05, "loss": 0.0, "step": 2330 }, { "epoch": 2.551912568306011, "grad_norm": 0.00015040539437904954, "learning_rate": 4.137826350941105e-05, "loss": 0.0, "step": 2335 }, { "epoch": 2.557377049180328, "grad_norm": 0.0013361191377043724, "learning_rate": 4.134790528233151e-05, "loss": 0.0, "step": 2340 }, { "epoch": 2.5628415300546448, "grad_norm": 0.003388547571375966, "learning_rate": 4.131754705525197e-05, "loss": 0.0, "step": 2345 }, { "epoch": 2.5683060109289615, "grad_norm": 0.000265285256318748, "learning_rate": 4.128718882817244e-05, "loss": 0.0, "step": 2350 }, { "epoch": 2.5737704918032787, "grad_norm": 0.0002575261751189828, "learning_rate": 4.12568306010929e-05, "loss": 0.0, "step": 2355 }, { "epoch": 2.579234972677596, "grad_norm": 0.00025711252237670124, "learning_rate": 4.122647237401336e-05, "loss": 0.0, "step": 2360 }, { "epoch": 2.5846994535519126, "grad_norm": 0.0004015800659544766, "learning_rate": 4.1196114146933825e-05, "loss": 0.0, "step": 2365 }, { "epoch": 2.5901639344262293, "grad_norm": 0.0002642312610987574, "learning_rate": 4.116575591985428e-05, "loss": 0.1755, "step": 2370 }, { "epoch": 2.5956284153005464, "grad_norm": 0.00034015090204775333, "learning_rate": 4.113539769277475e-05, "loss": 0.0, "step": 2375 }, { "epoch": 2.6010928961748636, "grad_norm": 0.00024969771038740873, "learning_rate": 4.1105039465695205e-05, "loss": 0.0041, "step": 2380 }, { "epoch": 2.6065573770491803, "grad_norm": 0.0018195788143202662, "learning_rate": 4.107468123861566e-05, "loss": 0.0002, "step": 2385 }, { "epoch": 2.612021857923497, "grad_norm": 0.0009367292514070868, "learning_rate": 4.104432301153613e-05, "loss": 0.0, "step": 2390 }, { "epoch": 2.6174863387978142, "grad_norm": 0.0002414148475509137, "learning_rate": 4.1013964784456585e-05, "loss": 0.0, "step": 2395 }, { "epoch": 2.6229508196721314, "grad_norm": 0.00028640354867093265, "learning_rate": 4.098360655737705e-05, "loss": 0.0, "step": 2400 }, { "epoch": 2.628415300546448, "grad_norm": 0.00019009907555300742, "learning_rate": 4.0953248330297514e-05, "loss": 0.0, "step": 2405 }, { "epoch": 2.633879781420765, "grad_norm": 0.00015359176904894412, "learning_rate": 4.092289010321798e-05, "loss": 0.029, "step": 2410 }, { "epoch": 2.639344262295082, "grad_norm": 0.0025620998349040747, "learning_rate": 4.0892531876138436e-05, "loss": 0.0, "step": 2415 }, { "epoch": 2.644808743169399, "grad_norm": 0.00017074740026146173, "learning_rate": 4.08621736490589e-05, "loss": 0.0, "step": 2420 }, { "epoch": 2.650273224043716, "grad_norm": 0.00021745880076196045, "learning_rate": 4.083181542197936e-05, "loss": 0.0, "step": 2425 }, { "epoch": 2.6557377049180326, "grad_norm": 0.0006353080389089882, "learning_rate": 4.080145719489982e-05, "loss": 0.0, "step": 2430 }, { "epoch": 2.66120218579235, "grad_norm": 0.000403941870899871, "learning_rate": 4.077109896782028e-05, "loss": 0.0, "step": 2435 }, { "epoch": 2.6666666666666665, "grad_norm": 0.00042110533104278147, "learning_rate": 4.074074074074074e-05, "loss": 0.0, "step": 2440 }, { "epoch": 2.6721311475409837, "grad_norm": 0.0002816633495967835, "learning_rate": 4.07103825136612e-05, "loss": 0.0001, "step": 2445 }, { "epoch": 2.6775956284153004, "grad_norm": 0.036186400800943375, "learning_rate": 4.068002428658167e-05, "loss": 0.0001, "step": 2450 }, { "epoch": 2.6830601092896176, "grad_norm": 0.00018799924873746932, "learning_rate": 4.064966605950213e-05, "loss": 0.0002, "step": 2455 }, { "epoch": 2.6885245901639343, "grad_norm": 0.000202516297576949, "learning_rate": 4.061930783242259e-05, "loss": 0.0, "step": 2460 }, { "epoch": 2.6939890710382515, "grad_norm": 0.01154984999448061, "learning_rate": 4.0588949605343054e-05, "loss": 0.0, "step": 2465 }, { "epoch": 2.699453551912568, "grad_norm": 0.0010380310704931617, "learning_rate": 4.055859137826351e-05, "loss": 0.0, "step": 2470 }, { "epoch": 2.7049180327868854, "grad_norm": 9.886401676340029e-05, "learning_rate": 4.0528233151183976e-05, "loss": 0.0, "step": 2475 }, { "epoch": 2.710382513661202, "grad_norm": 0.003762443084269762, "learning_rate": 4.0497874924104434e-05, "loss": 0.0, "step": 2480 }, { "epoch": 2.7158469945355193, "grad_norm": 0.00016515249444637448, "learning_rate": 4.046751669702489e-05, "loss": 0.0, "step": 2485 }, { "epoch": 2.721311475409836, "grad_norm": 0.007088791113346815, "learning_rate": 4.0437158469945356e-05, "loss": 0.0, "step": 2490 }, { "epoch": 2.726775956284153, "grad_norm": 0.00012914990657009184, "learning_rate": 4.0406800242865814e-05, "loss": 0.0, "step": 2495 }, { "epoch": 2.73224043715847, "grad_norm": 10.831242561340332, "learning_rate": 4.037644201578628e-05, "loss": 0.0252, "step": 2500 }, { "epoch": 2.737704918032787, "grad_norm": 0.000647792941890657, "learning_rate": 4.034608378870674e-05, "loss": 0.0, "step": 2505 }, { "epoch": 2.7431693989071038, "grad_norm": 0.0003103922645095736, "learning_rate": 4.031572556162721e-05, "loss": 0.0, "step": 2510 }, { "epoch": 2.748633879781421, "grad_norm": 0.00011969503248110414, "learning_rate": 4.0285367334547665e-05, "loss": 0.0001, "step": 2515 }, { "epoch": 2.7540983606557377, "grad_norm": 7.822127372492105e-05, "learning_rate": 4.025500910746813e-05, "loss": 0.0232, "step": 2520 }, { "epoch": 2.7595628415300544, "grad_norm": 0.8959982395172119, "learning_rate": 4.022465088038859e-05, "loss": 0.0007, "step": 2525 }, { "epoch": 2.7650273224043715, "grad_norm": 5.191492164158262e-05, "learning_rate": 4.019429265330905e-05, "loss": 0.0507, "step": 2530 }, { "epoch": 2.7704918032786887, "grad_norm": 3.3753425668692216e-05, "learning_rate": 4.016393442622951e-05, "loss": 0.0, "step": 2535 }, { "epoch": 2.7759562841530054, "grad_norm": 0.0015989969251677394, "learning_rate": 4.013357619914997e-05, "loss": 0.2098, "step": 2540 }, { "epoch": 2.781420765027322, "grad_norm": 0.0004384935018606484, "learning_rate": 4.010321797207043e-05, "loss": 0.0, "step": 2545 }, { "epoch": 2.7868852459016393, "grad_norm": 0.0006148093962110579, "learning_rate": 4.007285974499089e-05, "loss": 0.0003, "step": 2550 }, { "epoch": 2.7923497267759565, "grad_norm": 0.0004790659877471626, "learning_rate": 4.0042501517911354e-05, "loss": 0.0001, "step": 2555 }, { "epoch": 2.797814207650273, "grad_norm": 0.00042726221727207303, "learning_rate": 4.001214329083182e-05, "loss": 0.0, "step": 2560 }, { "epoch": 2.80327868852459, "grad_norm": 0.0007618932868354023, "learning_rate": 3.998178506375228e-05, "loss": 0.0001, "step": 2565 }, { "epoch": 2.808743169398907, "grad_norm": 0.0008412741590291262, "learning_rate": 3.995142683667274e-05, "loss": 0.0001, "step": 2570 }, { "epoch": 2.8142076502732243, "grad_norm": 0.0014517188537865877, "learning_rate": 3.9921068609593206e-05, "loss": 0.0004, "step": 2575 }, { "epoch": 2.819672131147541, "grad_norm": 0.00429339986294508, "learning_rate": 3.989071038251366e-05, "loss": 0.0486, "step": 2580 }, { "epoch": 2.8251366120218577, "grad_norm": 0.00370145495980978, "learning_rate": 3.986035215543412e-05, "loss": 0.0003, "step": 2585 }, { "epoch": 2.830601092896175, "grad_norm": 0.03380095958709717, "learning_rate": 3.9829993928354586e-05, "loss": 0.1322, "step": 2590 }, { "epoch": 2.836065573770492, "grad_norm": 0.0033515288960188627, "learning_rate": 3.979963570127504e-05, "loss": 0.0001, "step": 2595 }, { "epoch": 2.841530054644809, "grad_norm": 0.0014321180060505867, "learning_rate": 3.976927747419551e-05, "loss": 0.0001, "step": 2600 }, { "epoch": 2.8469945355191255, "grad_norm": 0.03204883262515068, "learning_rate": 3.9738919247115966e-05, "loss": 0.0003, "step": 2605 }, { "epoch": 2.8524590163934427, "grad_norm": 0.008516996167600155, "learning_rate": 3.970856102003643e-05, "loss": 0.0001, "step": 2610 }, { "epoch": 2.8579234972677594, "grad_norm": 0.001425994443707168, "learning_rate": 3.9678202792956895e-05, "loss": 0.0, "step": 2615 }, { "epoch": 2.8633879781420766, "grad_norm": 0.0012937316205352545, "learning_rate": 3.964784456587736e-05, "loss": 0.0, "step": 2620 }, { "epoch": 2.8688524590163933, "grad_norm": 0.03010261245071888, "learning_rate": 3.961748633879782e-05, "loss": 0.0002, "step": 2625 }, { "epoch": 2.8743169398907105, "grad_norm": 0.0004600689571816474, "learning_rate": 3.958712811171828e-05, "loss": 0.0, "step": 2630 }, { "epoch": 2.879781420765027, "grad_norm": 0.00028389832004904747, "learning_rate": 3.955676988463874e-05, "loss": 0.0, "step": 2635 }, { "epoch": 2.8852459016393444, "grad_norm": 0.0002460504474584013, "learning_rate": 3.95264116575592e-05, "loss": 0.0, "step": 2640 }, { "epoch": 2.890710382513661, "grad_norm": 0.0009908434003591537, "learning_rate": 3.949605343047966e-05, "loss": 0.0, "step": 2645 }, { "epoch": 2.8961748633879782, "grad_norm": 0.00033436762169003487, "learning_rate": 3.946569520340012e-05, "loss": 0.0001, "step": 2650 }, { "epoch": 2.901639344262295, "grad_norm": 0.13410432636737823, "learning_rate": 3.9435336976320584e-05, "loss": 0.0004, "step": 2655 }, { "epoch": 2.907103825136612, "grad_norm": 0.0007000039331614971, "learning_rate": 3.940497874924105e-05, "loss": 0.0, "step": 2660 }, { "epoch": 2.912568306010929, "grad_norm": 0.0011719991452991962, "learning_rate": 3.937462052216151e-05, "loss": 0.0001, "step": 2665 }, { "epoch": 2.918032786885246, "grad_norm": 0.0003357301466166973, "learning_rate": 3.934426229508197e-05, "loss": 0.0, "step": 2670 }, { "epoch": 2.9234972677595628, "grad_norm": 0.00033655870356597006, "learning_rate": 3.9313904068002435e-05, "loss": 0.0, "step": 2675 }, { "epoch": 2.92896174863388, "grad_norm": 0.000492806953843683, "learning_rate": 3.928354584092289e-05, "loss": 0.0, "step": 2680 }, { "epoch": 2.9344262295081966, "grad_norm": 3.1253645420074463, "learning_rate": 3.925318761384335e-05, "loss": 0.003, "step": 2685 }, { "epoch": 2.939890710382514, "grad_norm": 0.0002712997666094452, "learning_rate": 3.9222829386763815e-05, "loss": 0.0001, "step": 2690 }, { "epoch": 2.9453551912568305, "grad_norm": 0.00011406480916775763, "learning_rate": 3.919247115968427e-05, "loss": 0.0, "step": 2695 }, { "epoch": 2.9508196721311473, "grad_norm": 0.00010400738392490894, "learning_rate": 3.916211293260474e-05, "loss": 0.0, "step": 2700 }, { "epoch": 2.9562841530054644, "grad_norm": 0.00011682388139888644, "learning_rate": 3.9131754705525195e-05, "loss": 0.0, "step": 2705 }, { "epoch": 2.9617486338797816, "grad_norm": 0.00019643260748125613, "learning_rate": 3.910139647844566e-05, "loss": 0.3659, "step": 2710 }, { "epoch": 2.9672131147540983, "grad_norm": 0.002085154876112938, "learning_rate": 3.9071038251366124e-05, "loss": 0.0011, "step": 2715 }, { "epoch": 2.972677595628415, "grad_norm": 0.00412380276247859, "learning_rate": 3.904068002428659e-05, "loss": 0.0003, "step": 2720 }, { "epoch": 2.978142076502732, "grad_norm": 0.007716290187090635, "learning_rate": 3.9010321797207046e-05, "loss": 0.0001, "step": 2725 }, { "epoch": 2.9836065573770494, "grad_norm": 0.00448417104780674, "learning_rate": 3.897996357012751e-05, "loss": 0.0002, "step": 2730 }, { "epoch": 2.989071038251366, "grad_norm": 0.008972808718681335, "learning_rate": 3.894960534304797e-05, "loss": 0.0001, "step": 2735 }, { "epoch": 2.994535519125683, "grad_norm": 0.0038951467722654343, "learning_rate": 3.8919247115968426e-05, "loss": 0.0001, "step": 2740 }, { "epoch": 3.0, "grad_norm": 0.00253617693670094, "learning_rate": 3.888888888888889e-05, "loss": 0.0003, "step": 2745 }, { "epoch": 3.0, "eval_loss": 0.005263752304017544, "eval_runtime": 654.9252, "eval_samples_per_second": 11.171, "eval_steps_per_second": 1.397, "step": 2745 }, { "epoch": 3.0054644808743167, "grad_norm": 0.003645621705800295, "learning_rate": 3.885853066180935e-05, "loss": 0.0001, "step": 2750 }, { "epoch": 3.010928961748634, "grad_norm": 0.0024589765816926956, "learning_rate": 3.882817243472981e-05, "loss": 0.0001, "step": 2755 }, { "epoch": 3.0163934426229506, "grad_norm": 0.00158478575758636, "learning_rate": 3.879781420765027e-05, "loss": 0.0, "step": 2760 }, { "epoch": 3.021857923497268, "grad_norm": 0.000341015518642962, "learning_rate": 3.8767455980570735e-05, "loss": 0.0, "step": 2765 }, { "epoch": 3.0273224043715845, "grad_norm": 0.0012479755096137524, "learning_rate": 3.87370977534912e-05, "loss": 0.0, "step": 2770 }, { "epoch": 3.0327868852459017, "grad_norm": 0.0006929456721991301, "learning_rate": 3.8706739526411664e-05, "loss": 0.0, "step": 2775 }, { "epoch": 3.0382513661202184, "grad_norm": 0.0006383946747519076, "learning_rate": 3.867638129933212e-05, "loss": 0.0, "step": 2780 }, { "epoch": 3.0437158469945356, "grad_norm": 0.001444321358576417, "learning_rate": 3.864602307225258e-05, "loss": 0.0, "step": 2785 }, { "epoch": 3.0491803278688523, "grad_norm": 0.00034748337930068374, "learning_rate": 3.8615664845173044e-05, "loss": 0.0, "step": 2790 }, { "epoch": 3.0546448087431695, "grad_norm": 0.0002488761965651065, "learning_rate": 3.85853066180935e-05, "loss": 0.0, "step": 2795 }, { "epoch": 3.060109289617486, "grad_norm": 0.0011106929741799831, "learning_rate": 3.8554948391013967e-05, "loss": 0.0, "step": 2800 }, { "epoch": 3.0655737704918034, "grad_norm": 0.0006728707812726498, "learning_rate": 3.8524590163934424e-05, "loss": 0.0, "step": 2805 }, { "epoch": 3.07103825136612, "grad_norm": 0.0007144726696424186, "learning_rate": 3.849423193685489e-05, "loss": 0.0, "step": 2810 }, { "epoch": 3.0765027322404372, "grad_norm": 0.0004376015567686409, "learning_rate": 3.8463873709775347e-05, "loss": 0.0, "step": 2815 }, { "epoch": 3.081967213114754, "grad_norm": 0.0008594472892582417, "learning_rate": 3.843351548269581e-05, "loss": 0.0, "step": 2820 }, { "epoch": 3.087431693989071, "grad_norm": 0.0019469836261123419, "learning_rate": 3.8403157255616276e-05, "loss": 0.2017, "step": 2825 }, { "epoch": 3.092896174863388, "grad_norm": 0.00610232213512063, "learning_rate": 3.837279902853674e-05, "loss": 0.0001, "step": 2830 }, { "epoch": 3.098360655737705, "grad_norm": 0.0029869854915887117, "learning_rate": 3.83424408014572e-05, "loss": 0.0001, "step": 2835 }, { "epoch": 3.1038251366120218, "grad_norm": 0.0915936678647995, "learning_rate": 3.8312082574377656e-05, "loss": 0.002, "step": 2840 }, { "epoch": 3.109289617486339, "grad_norm": 0.007518662605434656, "learning_rate": 3.828172434729812e-05, "loss": 0.0001, "step": 2845 }, { "epoch": 3.1147540983606556, "grad_norm": 0.003184305736795068, "learning_rate": 3.825136612021858e-05, "loss": 0.0008, "step": 2850 }, { "epoch": 3.120218579234973, "grad_norm": 0.004786128643900156, "learning_rate": 3.822100789313904e-05, "loss": 0.0005, "step": 2855 }, { "epoch": 3.1256830601092895, "grad_norm": 0.003782659536227584, "learning_rate": 3.81906496660595e-05, "loss": 0.0003, "step": 2860 }, { "epoch": 3.1311475409836067, "grad_norm": 0.006366930436342955, "learning_rate": 3.8160291438979965e-05, "loss": 0.0004, "step": 2865 }, { "epoch": 3.1366120218579234, "grad_norm": 0.0005854523042216897, "learning_rate": 3.812993321190043e-05, "loss": 0.0, "step": 2870 }, { "epoch": 3.1420765027322406, "grad_norm": 0.044832780957221985, "learning_rate": 3.8099574984820894e-05, "loss": 0.0002, "step": 2875 }, { "epoch": 3.1475409836065573, "grad_norm": 0.00037537323078140616, "learning_rate": 3.806921675774135e-05, "loss": 0.0, "step": 2880 }, { "epoch": 3.1530054644808745, "grad_norm": 0.00020258377480786294, "learning_rate": 3.8038858530661816e-05, "loss": 0.0, "step": 2885 }, { "epoch": 3.158469945355191, "grad_norm": 0.00040754053043201566, "learning_rate": 3.8008500303582274e-05, "loss": 0.0, "step": 2890 }, { "epoch": 3.1639344262295084, "grad_norm": 0.00027699521160684526, "learning_rate": 3.797814207650273e-05, "loss": 0.0, "step": 2895 }, { "epoch": 3.169398907103825, "grad_norm": 0.00014932159683667123, "learning_rate": 3.7947783849423196e-05, "loss": 0.0, "step": 2900 }, { "epoch": 3.1748633879781423, "grad_norm": 0.00015883053129073232, "learning_rate": 3.7917425622343654e-05, "loss": 0.0, "step": 2905 }, { "epoch": 3.180327868852459, "grad_norm": 0.0006496338173747063, "learning_rate": 3.788706739526412e-05, "loss": 0.0001, "step": 2910 }, { "epoch": 3.185792349726776, "grad_norm": 0.00013984047109261155, "learning_rate": 3.7856709168184576e-05, "loss": 0.0007, "step": 2915 }, { "epoch": 3.191256830601093, "grad_norm": 8.843530667945743e-05, "learning_rate": 3.782635094110504e-05, "loss": 0.0001, "step": 2920 }, { "epoch": 3.19672131147541, "grad_norm": 0.023396478965878487, "learning_rate": 3.7795992714025505e-05, "loss": 0.0001, "step": 2925 }, { "epoch": 3.202185792349727, "grad_norm": 7.777877908665687e-05, "learning_rate": 3.776563448694597e-05, "loss": 0.0, "step": 2930 }, { "epoch": 3.2076502732240435, "grad_norm": 8.528177568223327e-05, "learning_rate": 3.773527625986643e-05, "loss": 0.0, "step": 2935 }, { "epoch": 3.2131147540983607, "grad_norm": 0.012153876014053822, "learning_rate": 3.7704918032786885e-05, "loss": 0.0001, "step": 2940 }, { "epoch": 3.2185792349726774, "grad_norm": 7.584688864881173e-05, "learning_rate": 3.767455980570735e-05, "loss": 0.0, "step": 2945 }, { "epoch": 3.2240437158469946, "grad_norm": 6.136555748526007e-05, "learning_rate": 3.764420157862781e-05, "loss": 0.0, "step": 2950 }, { "epoch": 3.2295081967213113, "grad_norm": 0.001584029640071094, "learning_rate": 3.761384335154827e-05, "loss": 0.0, "step": 2955 }, { "epoch": 3.2349726775956285, "grad_norm": 7.862479105824605e-05, "learning_rate": 3.758348512446873e-05, "loss": 0.0, "step": 2960 }, { "epoch": 3.240437158469945, "grad_norm": 0.0008851040038280189, "learning_rate": 3.7553126897389194e-05, "loss": 0.0, "step": 2965 }, { "epoch": 3.2459016393442623, "grad_norm": 5.676416913047433e-05, "learning_rate": 3.752276867030965e-05, "loss": 0.0001, "step": 2970 }, { "epoch": 3.251366120218579, "grad_norm": 4.914839267730713, "learning_rate": 3.7492410443230116e-05, "loss": 0.0017, "step": 2975 }, { "epoch": 3.2568306010928962, "grad_norm": 0.00020339513139333576, "learning_rate": 3.746205221615058e-05, "loss": 0.0, "step": 2980 }, { "epoch": 3.262295081967213, "grad_norm": 0.0002010238531511277, "learning_rate": 3.7431693989071045e-05, "loss": 0.0, "step": 2985 }, { "epoch": 3.26775956284153, "grad_norm": 7.606866711284965e-05, "learning_rate": 3.74013357619915e-05, "loss": 0.1313, "step": 2990 }, { "epoch": 3.273224043715847, "grad_norm": 0.0005323002696968615, "learning_rate": 3.737097753491196e-05, "loss": 0.0755, "step": 2995 }, { "epoch": 3.278688524590164, "grad_norm": 5.004248669138178e-05, "learning_rate": 3.7340619307832425e-05, "loss": 0.0001, "step": 3000 }, { "epoch": 3.2841530054644807, "grad_norm": 12.265705108642578, "learning_rate": 3.731026108075288e-05, "loss": 0.3303, "step": 3005 }, { "epoch": 3.289617486338798, "grad_norm": 0.09155070781707764, "learning_rate": 3.727990285367335e-05, "loss": 0.0004, "step": 3010 }, { "epoch": 3.2950819672131146, "grad_norm": 0.0008653182885609567, "learning_rate": 3.7249544626593805e-05, "loss": 0.0, "step": 3015 }, { "epoch": 3.300546448087432, "grad_norm": 0.0018131888937205076, "learning_rate": 3.721918639951427e-05, "loss": 0.0001, "step": 3020 }, { "epoch": 3.3060109289617485, "grad_norm": 0.00172705901786685, "learning_rate": 3.7188828172434734e-05, "loss": 0.0003, "step": 3025 }, { "epoch": 3.3114754098360657, "grad_norm": 0.00302744354121387, "learning_rate": 3.71584699453552e-05, "loss": 0.0003, "step": 3030 }, { "epoch": 3.3169398907103824, "grad_norm": 0.0013879016041755676, "learning_rate": 3.7128111718275657e-05, "loss": 0.0005, "step": 3035 }, { "epoch": 3.3224043715846996, "grad_norm": 0.024859091266989708, "learning_rate": 3.7097753491196114e-05, "loss": 0.0002, "step": 3040 }, { "epoch": 3.3278688524590163, "grad_norm": 0.0013450469123199582, "learning_rate": 3.706739526411658e-05, "loss": 0.0, "step": 3045 }, { "epoch": 3.3333333333333335, "grad_norm": 0.0011400197399780154, "learning_rate": 3.7037037037037037e-05, "loss": 0.0, "step": 3050 }, { "epoch": 3.33879781420765, "grad_norm": 0.0009009820641949773, "learning_rate": 3.70066788099575e-05, "loss": 0.0002, "step": 3055 }, { "epoch": 3.3442622950819674, "grad_norm": 0.005573240574449301, "learning_rate": 3.697632058287796e-05, "loss": 0.0001, "step": 3060 }, { "epoch": 3.349726775956284, "grad_norm": 0.0005607991479337215, "learning_rate": 3.694596235579842e-05, "loss": 0.0, "step": 3065 }, { "epoch": 3.3551912568306013, "grad_norm": 0.00025221219402737916, "learning_rate": 3.691560412871888e-05, "loss": 0.0001, "step": 3070 }, { "epoch": 3.360655737704918, "grad_norm": 0.0004546682757791132, "learning_rate": 3.6885245901639346e-05, "loss": 0.0, "step": 3075 }, { "epoch": 3.366120218579235, "grad_norm": 0.00032192622893489897, "learning_rate": 3.685488767455981e-05, "loss": 0.0, "step": 3080 }, { "epoch": 3.371584699453552, "grad_norm": 0.0008003158727660775, "learning_rate": 3.6824529447480275e-05, "loss": 0.0, "step": 3085 }, { "epoch": 3.3770491803278686, "grad_norm": 0.00030273263109847903, "learning_rate": 3.679417122040073e-05, "loss": 0.0, "step": 3090 }, { "epoch": 3.3825136612021858, "grad_norm": 0.0005664460477419198, "learning_rate": 3.676381299332119e-05, "loss": 0.0, "step": 3095 }, { "epoch": 3.387978142076503, "grad_norm": 0.2617197632789612, "learning_rate": 3.6733454766241655e-05, "loss": 0.0003, "step": 3100 }, { "epoch": 3.3934426229508197, "grad_norm": 0.003910338506102562, "learning_rate": 3.670309653916211e-05, "loss": 0.0, "step": 3105 }, { "epoch": 3.3989071038251364, "grad_norm": 0.00028743871371261775, "learning_rate": 3.667273831208258e-05, "loss": 0.0, "step": 3110 }, { "epoch": 3.4043715846994536, "grad_norm": 0.00032127246959134936, "learning_rate": 3.6642380085003035e-05, "loss": 0.0, "step": 3115 }, { "epoch": 3.4098360655737707, "grad_norm": 0.00023537426022812724, "learning_rate": 3.66120218579235e-05, "loss": 0.0, "step": 3120 }, { "epoch": 3.4153005464480874, "grad_norm": 0.00016202160622924566, "learning_rate": 3.658166363084396e-05, "loss": 0.0, "step": 3125 }, { "epoch": 3.420765027322404, "grad_norm": 0.0001964465918717906, "learning_rate": 3.655130540376442e-05, "loss": 0.0, "step": 3130 }, { "epoch": 3.4262295081967213, "grad_norm": 0.0026353199500590563, "learning_rate": 3.6520947176684886e-05, "loss": 0.0, "step": 3135 }, { "epoch": 3.431693989071038, "grad_norm": 0.0001332706306129694, "learning_rate": 3.6490588949605344e-05, "loss": 0.0, "step": 3140 }, { "epoch": 3.4371584699453552, "grad_norm": 0.00010366823698859662, "learning_rate": 3.646023072252581e-05, "loss": 0.0, "step": 3145 }, { "epoch": 3.442622950819672, "grad_norm": 0.0001762459141900763, "learning_rate": 3.6429872495446266e-05, "loss": 0.0, "step": 3150 }, { "epoch": 3.448087431693989, "grad_norm": 0.00022949972481001168, "learning_rate": 3.639951426836673e-05, "loss": 0.0, "step": 3155 }, { "epoch": 3.453551912568306, "grad_norm": 0.014571224339306355, "learning_rate": 3.636915604128719e-05, "loss": 0.0001, "step": 3160 }, { "epoch": 3.459016393442623, "grad_norm": 0.003986168187111616, "learning_rate": 3.633879781420765e-05, "loss": 0.0, "step": 3165 }, { "epoch": 3.4644808743169397, "grad_norm": 0.00015372096095234156, "learning_rate": 3.630843958712811e-05, "loss": 0.0001, "step": 3170 }, { "epoch": 3.469945355191257, "grad_norm": 0.00022497742611449212, "learning_rate": 3.6278081360048575e-05, "loss": 0.0, "step": 3175 }, { "epoch": 3.4754098360655736, "grad_norm": 0.00011189820361323655, "learning_rate": 3.624772313296903e-05, "loss": 0.0002, "step": 3180 }, { "epoch": 3.480874316939891, "grad_norm": 0.00029375962913036346, "learning_rate": 3.62173649058895e-05, "loss": 0.0, "step": 3185 }, { "epoch": 3.4863387978142075, "grad_norm": 0.00013606030552182347, "learning_rate": 3.618700667880996e-05, "loss": 0.0, "step": 3190 }, { "epoch": 3.4918032786885247, "grad_norm": 0.00935420859605074, "learning_rate": 3.615664845173042e-05, "loss": 0.0, "step": 3195 }, { "epoch": 3.4972677595628414, "grad_norm": 8.235384302679449e-05, "learning_rate": 3.6126290224650884e-05, "loss": 0.0, "step": 3200 }, { "epoch": 3.5027322404371586, "grad_norm": 0.00010109972208738327, "learning_rate": 3.609593199757134e-05, "loss": 0.0, "step": 3205 }, { "epoch": 3.5081967213114753, "grad_norm": 7.307323539862409e-05, "learning_rate": 3.6065573770491806e-05, "loss": 0.0, "step": 3210 }, { "epoch": 3.5136612021857925, "grad_norm": 0.0001078033892554231, "learning_rate": 3.6035215543412264e-05, "loss": 0.0, "step": 3215 }, { "epoch": 3.519125683060109, "grad_norm": 0.000121029355796054, "learning_rate": 3.600485731633273e-05, "loss": 0.0, "step": 3220 }, { "epoch": 3.5245901639344264, "grad_norm": 0.0001414872967870906, "learning_rate": 3.5974499089253186e-05, "loss": 0.0001, "step": 3225 }, { "epoch": 3.530054644808743, "grad_norm": 0.00011328158143442124, "learning_rate": 3.594414086217365e-05, "loss": 0.0, "step": 3230 }, { "epoch": 3.5355191256830603, "grad_norm": 0.00020013573521282524, "learning_rate": 3.5913782635094115e-05, "loss": 0.0, "step": 3235 }, { "epoch": 3.540983606557377, "grad_norm": 0.00011696962610585615, "learning_rate": 3.588342440801457e-05, "loss": 0.0, "step": 3240 }, { "epoch": 3.546448087431694, "grad_norm": 7.634115172550082e-05, "learning_rate": 3.585306618093504e-05, "loss": 0.0, "step": 3245 }, { "epoch": 3.551912568306011, "grad_norm": 0.006965294014662504, "learning_rate": 3.5822707953855495e-05, "loss": 0.0, "step": 3250 }, { "epoch": 3.557377049180328, "grad_norm": 0.0001289156498387456, "learning_rate": 3.579234972677596e-05, "loss": 0.0, "step": 3255 }, { "epoch": 3.5628415300546448, "grad_norm": 0.00012060425797244534, "learning_rate": 3.576199149969642e-05, "loss": 0.0, "step": 3260 }, { "epoch": 3.5683060109289615, "grad_norm": 0.0001801663893274963, "learning_rate": 3.573163327261688e-05, "loss": 0.0, "step": 3265 }, { "epoch": 3.5737704918032787, "grad_norm": 9.889312786981463e-05, "learning_rate": 3.570127504553734e-05, "loss": 0.0, "step": 3270 }, { "epoch": 3.579234972677596, "grad_norm": 8.248529047705233e-05, "learning_rate": 3.5670916818457804e-05, "loss": 0.0, "step": 3275 }, { "epoch": 3.5846994535519126, "grad_norm": 0.00016700288688298315, "learning_rate": 3.564055859137826e-05, "loss": 0.0, "step": 3280 }, { "epoch": 3.5901639344262293, "grad_norm": 8.013186015887186e-05, "learning_rate": 3.5610200364298727e-05, "loss": 0.0, "step": 3285 }, { "epoch": 3.5956284153005464, "grad_norm": 9.400265844305977e-05, "learning_rate": 3.557984213721919e-05, "loss": 0.0, "step": 3290 }, { "epoch": 3.6010928961748636, "grad_norm": 0.0011224903864786029, "learning_rate": 3.554948391013965e-05, "loss": 0.0, "step": 3295 }, { "epoch": 3.6065573770491803, "grad_norm": 0.0001463467488065362, "learning_rate": 3.551912568306011e-05, "loss": 0.0, "step": 3300 }, { "epoch": 3.612021857923497, "grad_norm": 0.0007439145119860768, "learning_rate": 3.548876745598057e-05, "loss": 0.0, "step": 3305 }, { "epoch": 3.6174863387978142, "grad_norm": 0.00013679706898983568, "learning_rate": 3.5458409228901036e-05, "loss": 0.0, "step": 3310 }, { "epoch": 3.6229508196721314, "grad_norm": 0.00010268126789014786, "learning_rate": 3.542805100182149e-05, "loss": 0.0, "step": 3315 }, { "epoch": 3.628415300546448, "grad_norm": 0.00028116197790950537, "learning_rate": 3.539769277474196e-05, "loss": 0.0, "step": 3320 }, { "epoch": 3.633879781420765, "grad_norm": 9.449726348975673e-05, "learning_rate": 3.5367334547662416e-05, "loss": 0.0, "step": 3325 }, { "epoch": 3.639344262295082, "grad_norm": 0.00010181035031564534, "learning_rate": 3.533697632058288e-05, "loss": 0.0, "step": 3330 }, { "epoch": 3.644808743169399, "grad_norm": 6.961561302887276e-05, "learning_rate": 3.530661809350334e-05, "loss": 0.0, "step": 3335 }, { "epoch": 3.650273224043716, "grad_norm": 0.00010002183262258768, "learning_rate": 3.52762598664238e-05, "loss": 0.0, "step": 3340 }, { "epoch": 3.6557377049180326, "grad_norm": 9.894008690025657e-05, "learning_rate": 3.524590163934427e-05, "loss": 0.0, "step": 3345 }, { "epoch": 3.66120218579235, "grad_norm": 0.00013567868154495955, "learning_rate": 3.5215543412264725e-05, "loss": 0.0, "step": 3350 }, { "epoch": 3.6666666666666665, "grad_norm": 0.0002798748027998954, "learning_rate": 3.518518518518519e-05, "loss": 0.0106, "step": 3355 }, { "epoch": 3.6721311475409837, "grad_norm": 0.0008901433320716023, "learning_rate": 3.515482695810565e-05, "loss": 0.0, "step": 3360 }, { "epoch": 3.6775956284153004, "grad_norm": 0.008999276906251907, "learning_rate": 3.512446873102611e-05, "loss": 0.0001, "step": 3365 }, { "epoch": 3.6830601092896176, "grad_norm": 0.0022385860793292522, "learning_rate": 3.509411050394657e-05, "loss": 0.0, "step": 3370 }, { "epoch": 3.6885245901639343, "grad_norm": 0.0003485867637209594, "learning_rate": 3.5063752276867034e-05, "loss": 0.0, "step": 3375 }, { "epoch": 3.6939890710382515, "grad_norm": 0.00025722169084474444, "learning_rate": 3.503339404978749e-05, "loss": 0.0, "step": 3380 }, { "epoch": 3.699453551912568, "grad_norm": 7.106779230525717e-05, "learning_rate": 3.5003035822707956e-05, "loss": 0.0, "step": 3385 }, { "epoch": 3.7049180327868854, "grad_norm": 0.0002384383842581883, "learning_rate": 3.4972677595628414e-05, "loss": 0.0, "step": 3390 }, { "epoch": 3.710382513661202, "grad_norm": 0.0002946928725577891, "learning_rate": 3.494231936854888e-05, "loss": 0.1995, "step": 3395 }, { "epoch": 3.7158469945355193, "grad_norm": 0.0007392108091153204, "learning_rate": 3.491196114146934e-05, "loss": 0.0, "step": 3400 }, { "epoch": 3.721311475409836, "grad_norm": 0.001694119768217206, "learning_rate": 3.48816029143898e-05, "loss": 0.0, "step": 3405 }, { "epoch": 3.726775956284153, "grad_norm": 0.0009142764611169696, "learning_rate": 3.4851244687310265e-05, "loss": 0.0, "step": 3410 }, { "epoch": 3.73224043715847, "grad_norm": 0.0035774747375398874, "learning_rate": 3.482088646023072e-05, "loss": 0.0001, "step": 3415 }, { "epoch": 3.737704918032787, "grad_norm": 0.0033966186456382275, "learning_rate": 3.479052823315119e-05, "loss": 0.0009, "step": 3420 }, { "epoch": 3.7431693989071038, "grad_norm": 0.0004668271285481751, "learning_rate": 3.4760170006071645e-05, "loss": 0.0001, "step": 3425 }, { "epoch": 3.748633879781421, "grad_norm": 0.0003218352794647217, "learning_rate": 3.472981177899211e-05, "loss": 0.0001, "step": 3430 }, { "epoch": 3.7540983606557377, "grad_norm": 0.037057679146528244, "learning_rate": 3.469945355191257e-05, "loss": 0.0001, "step": 3435 }, { "epoch": 3.7595628415300544, "grad_norm": 0.031188733875751495, "learning_rate": 3.466909532483303e-05, "loss": 0.001, "step": 3440 }, { "epoch": 3.7650273224043715, "grad_norm": 0.00015825718583073467, "learning_rate": 3.4638737097753496e-05, "loss": 0.0005, "step": 3445 }, { "epoch": 3.7704918032786887, "grad_norm": 0.0037946077063679695, "learning_rate": 3.4608378870673954e-05, "loss": 0.0, "step": 3450 }, { "epoch": 3.7759562841530054, "grad_norm": 0.0002360966900596395, "learning_rate": 3.457802064359442e-05, "loss": 0.0, "step": 3455 }, { "epoch": 3.781420765027322, "grad_norm": 0.024776164442300797, "learning_rate": 3.4547662416514876e-05, "loss": 0.0003, "step": 3460 }, { "epoch": 3.7868852459016393, "grad_norm": 0.0002828339929692447, "learning_rate": 3.451730418943534e-05, "loss": 0.0, "step": 3465 }, { "epoch": 3.7923497267759565, "grad_norm": 0.00010138905781786889, "learning_rate": 3.44869459623558e-05, "loss": 0.0001, "step": 3470 }, { "epoch": 3.797814207650273, "grad_norm": 0.00016995143960230052, "learning_rate": 3.445658773527626e-05, "loss": 0.0, "step": 3475 }, { "epoch": 3.80327868852459, "grad_norm": 0.00012559971946757287, "learning_rate": 3.442622950819672e-05, "loss": 0.0, "step": 3480 }, { "epoch": 3.808743169398907, "grad_norm": 0.0006380723789334297, "learning_rate": 3.4395871281117185e-05, "loss": 0.0, "step": 3485 }, { "epoch": 3.8142076502732243, "grad_norm": 0.00035217360709793866, "learning_rate": 3.436551305403764e-05, "loss": 0.0001, "step": 3490 }, { "epoch": 3.819672131147541, "grad_norm": 0.0002526458411011845, "learning_rate": 3.433515482695811e-05, "loss": 0.0, "step": 3495 }, { "epoch": 3.8251366120218577, "grad_norm": 9.788498573470861e-05, "learning_rate": 3.430479659987857e-05, "loss": 0.0, "step": 3500 }, { "epoch": 3.830601092896175, "grad_norm": 0.00022861824254505336, "learning_rate": 3.427443837279903e-05, "loss": 0.0001, "step": 3505 }, { "epoch": 3.836065573770492, "grad_norm": 0.00013972212036605924, "learning_rate": 3.4244080145719494e-05, "loss": 0.0, "step": 3510 }, { "epoch": 3.841530054644809, "grad_norm": 0.0001966755517059937, "learning_rate": 3.421372191863995e-05, "loss": 0.0, "step": 3515 }, { "epoch": 3.8469945355191255, "grad_norm": 0.0006457020062953234, "learning_rate": 3.4183363691560417e-05, "loss": 0.0, "step": 3520 }, { "epoch": 3.8524590163934427, "grad_norm": 0.00014496638323180377, "learning_rate": 3.4153005464480874e-05, "loss": 0.0, "step": 3525 }, { "epoch": 3.8579234972677594, "grad_norm": 7.40105751901865e-05, "learning_rate": 3.412264723740134e-05, "loss": 0.0, "step": 3530 }, { "epoch": 3.8633879781420766, "grad_norm": 0.041569244116544724, "learning_rate": 3.4092289010321797e-05, "loss": 0.0003, "step": 3535 }, { "epoch": 3.8688524590163933, "grad_norm": 0.0001943446695804596, "learning_rate": 3.406193078324226e-05, "loss": 0.0, "step": 3540 }, { "epoch": 3.8743169398907105, "grad_norm": 0.00015503684699069709, "learning_rate": 3.403157255616272e-05, "loss": 0.0, "step": 3545 }, { "epoch": 3.879781420765027, "grad_norm": 0.000700871052686125, "learning_rate": 3.400121432908318e-05, "loss": 0.0, "step": 3550 }, { "epoch": 3.8852459016393444, "grad_norm": 5.401040834840387e-05, "learning_rate": 3.397085610200365e-05, "loss": 0.0, "step": 3555 }, { "epoch": 3.890710382513661, "grad_norm": 0.000344992644386366, "learning_rate": 3.3940497874924106e-05, "loss": 0.0, "step": 3560 }, { "epoch": 3.8961748633879782, "grad_norm": 6.189793202793226e-05, "learning_rate": 3.391013964784457e-05, "loss": 0.0274, "step": 3565 }, { "epoch": 3.901639344262295, "grad_norm": 6.884944741614163e-05, "learning_rate": 3.387978142076503e-05, "loss": 0.0, "step": 3570 }, { "epoch": 3.907103825136612, "grad_norm": 6.29270653007552e-05, "learning_rate": 3.384942319368549e-05, "loss": 0.0, "step": 3575 }, { "epoch": 3.912568306010929, "grad_norm": 0.00013839226448908448, "learning_rate": 3.381906496660595e-05, "loss": 0.0, "step": 3580 }, { "epoch": 3.918032786885246, "grad_norm": 0.0001924206007970497, "learning_rate": 3.3788706739526415e-05, "loss": 0.0, "step": 3585 }, { "epoch": 3.9234972677595628, "grad_norm": 0.013641326688230038, "learning_rate": 3.375834851244687e-05, "loss": 0.3372, "step": 3590 }, { "epoch": 3.92896174863388, "grad_norm": 0.00014125218149274588, "learning_rate": 3.372799028536734e-05, "loss": 0.0001, "step": 3595 }, { "epoch": 3.9344262295081966, "grad_norm": 0.005081810522824526, "learning_rate": 3.36976320582878e-05, "loss": 0.0, "step": 3600 }, { "epoch": 3.939890710382514, "grad_norm": 0.00014312181156128645, "learning_rate": 3.366727383120826e-05, "loss": 0.0, "step": 3605 }, { "epoch": 3.9453551912568305, "grad_norm": 0.00023116619559004903, "learning_rate": 3.3636915604128724e-05, "loss": 0.0, "step": 3610 }, { "epoch": 3.9508196721311473, "grad_norm": 0.00011138137051602826, "learning_rate": 3.360655737704918e-05, "loss": 0.0, "step": 3615 }, { "epoch": 3.9562841530054644, "grad_norm": 0.00024408698664046824, "learning_rate": 3.3576199149969646e-05, "loss": 0.0, "step": 3620 }, { "epoch": 3.9617486338797816, "grad_norm": 0.012111087329685688, "learning_rate": 3.3545840922890104e-05, "loss": 0.0, "step": 3625 }, { "epoch": 3.9672131147540983, "grad_norm": 0.0005235079443082213, "learning_rate": 3.351548269581057e-05, "loss": 0.0, "step": 3630 }, { "epoch": 3.972677595628415, "grad_norm": 0.013302493840456009, "learning_rate": 3.3485124468731026e-05, "loss": 0.0004, "step": 3635 }, { "epoch": 3.978142076502732, "grad_norm": 0.00025188663857989013, "learning_rate": 3.345476624165149e-05, "loss": 0.0, "step": 3640 }, { "epoch": 3.9836065573770494, "grad_norm": 0.00011241805623285472, "learning_rate": 3.342440801457195e-05, "loss": 0.0, "step": 3645 }, { "epoch": 3.989071038251366, "grad_norm": 0.0006041810265742242, "learning_rate": 3.339404978749241e-05, "loss": 0.0, "step": 3650 }, { "epoch": 3.994535519125683, "grad_norm": 0.00017458312504459172, "learning_rate": 3.336369156041288e-05, "loss": 0.0, "step": 3655 }, { "epoch": 4.0, "grad_norm": 0.00013060122728347778, "learning_rate": 3.3333333333333335e-05, "loss": 0.0, "step": 3660 }, { "epoch": 4.0, "eval_loss": 0.002205207943916321, "eval_runtime": 657.1958, "eval_samples_per_second": 11.132, "eval_steps_per_second": 1.392, "step": 3660 }, { "epoch": 4.005464480874317, "grad_norm": 0.00011811502918135375, "learning_rate": 3.33029751062538e-05, "loss": 0.0, "step": 3665 }, { "epoch": 4.0109289617486334, "grad_norm": 8.069877367233858e-05, "learning_rate": 3.327261687917426e-05, "loss": 0.0, "step": 3670 }, { "epoch": 4.016393442622951, "grad_norm": 0.00011450869351392612, "learning_rate": 3.324225865209472e-05, "loss": 0.0, "step": 3675 }, { "epoch": 4.021857923497268, "grad_norm": 0.0001386746735079214, "learning_rate": 3.321190042501518e-05, "loss": 0.0, "step": 3680 }, { "epoch": 4.027322404371585, "grad_norm": 0.00010339313303120434, "learning_rate": 3.3181542197935644e-05, "loss": 0.0, "step": 3685 }, { "epoch": 4.032786885245901, "grad_norm": 0.0001254361413884908, "learning_rate": 3.31511839708561e-05, "loss": 0.0, "step": 3690 }, { "epoch": 4.038251366120218, "grad_norm": 0.0004021910426672548, "learning_rate": 3.312082574377656e-05, "loss": 0.0, "step": 3695 }, { "epoch": 4.043715846994536, "grad_norm": 0.0001132467805291526, "learning_rate": 3.3090467516697024e-05, "loss": 0.0, "step": 3700 }, { "epoch": 4.049180327868853, "grad_norm": 0.00153514021076262, "learning_rate": 3.306010928961749e-05, "loss": 0.0, "step": 3705 }, { "epoch": 4.054644808743169, "grad_norm": 9.872686496237293e-05, "learning_rate": 3.302975106253795e-05, "loss": 0.0, "step": 3710 }, { "epoch": 4.060109289617486, "grad_norm": 0.00011148265184601769, "learning_rate": 3.299939283545841e-05, "loss": 0.0, "step": 3715 }, { "epoch": 4.065573770491803, "grad_norm": 0.00012307016004342586, "learning_rate": 3.2969034608378875e-05, "loss": 0.0, "step": 3720 }, { "epoch": 4.0710382513661205, "grad_norm": 8.422257087659091e-05, "learning_rate": 3.293867638129933e-05, "loss": 0.0, "step": 3725 }, { "epoch": 4.076502732240437, "grad_norm": 0.0008893508929759264, "learning_rate": 3.29083181542198e-05, "loss": 0.0001, "step": 3730 }, { "epoch": 4.081967213114754, "grad_norm": 8.832754247123376e-05, "learning_rate": 3.2877959927140255e-05, "loss": 0.0, "step": 3735 }, { "epoch": 4.087431693989071, "grad_norm": 9.743066038936377e-05, "learning_rate": 3.284760170006072e-05, "loss": 0.0, "step": 3740 }, { "epoch": 4.092896174863388, "grad_norm": 0.00011171947699040174, "learning_rate": 3.281724347298118e-05, "loss": 0.0, "step": 3745 }, { "epoch": 4.098360655737705, "grad_norm": 0.00016356426931452006, "learning_rate": 3.2786885245901635e-05, "loss": 0.0, "step": 3750 }, { "epoch": 4.103825136612022, "grad_norm": 0.0012780509423464537, "learning_rate": 3.27565270188221e-05, "loss": 0.0, "step": 3755 }, { "epoch": 4.109289617486339, "grad_norm": 0.00011072060442529619, "learning_rate": 3.2726168791742564e-05, "loss": 0.0, "step": 3760 }, { "epoch": 4.114754098360656, "grad_norm": 0.00011896173964487389, "learning_rate": 3.269581056466303e-05, "loss": 0.0, "step": 3765 }, { "epoch": 4.120218579234972, "grad_norm": 0.00013041883357800543, "learning_rate": 3.2665452337583487e-05, "loss": 0.0, "step": 3770 }, { "epoch": 4.1256830601092895, "grad_norm": 0.006071773823350668, "learning_rate": 3.263509411050395e-05, "loss": 0.0, "step": 3775 }, { "epoch": 4.131147540983607, "grad_norm": 8.175130642484874e-05, "learning_rate": 3.260473588342441e-05, "loss": 0.0, "step": 3780 }, { "epoch": 4.136612021857924, "grad_norm": 0.005459274630993605, "learning_rate": 3.257437765634487e-05, "loss": 0.0, "step": 3785 }, { "epoch": 4.14207650273224, "grad_norm": 0.00033711790456436574, "learning_rate": 3.254401942926533e-05, "loss": 0.0, "step": 3790 }, { "epoch": 4.147540983606557, "grad_norm": 0.00011819975770777091, "learning_rate": 3.251366120218579e-05, "loss": 0.0, "step": 3795 }, { "epoch": 4.1530054644808745, "grad_norm": 4.659785554395057e-05, "learning_rate": 3.248330297510625e-05, "loss": 0.0, "step": 3800 }, { "epoch": 4.158469945355192, "grad_norm": 0.0001775735872797668, "learning_rate": 3.245294474802672e-05, "loss": 0.0, "step": 3805 }, { "epoch": 4.163934426229508, "grad_norm": 0.00011735469161067158, "learning_rate": 3.242258652094718e-05, "loss": 0.0, "step": 3810 }, { "epoch": 4.169398907103825, "grad_norm": 0.00012988239177502692, "learning_rate": 3.239222829386764e-05, "loss": 0.0, "step": 3815 }, { "epoch": 4.174863387978142, "grad_norm": 0.004161364398896694, "learning_rate": 3.2361870066788105e-05, "loss": 0.0, "step": 3820 }, { "epoch": 4.180327868852459, "grad_norm": 0.0001383601047564298, "learning_rate": 3.233151183970856e-05, "loss": 0.0, "step": 3825 }, { "epoch": 4.185792349726776, "grad_norm": 8.963741856859997e-05, "learning_rate": 3.230115361262903e-05, "loss": 0.0, "step": 3830 }, { "epoch": 4.191256830601093, "grad_norm": 8.04738374426961e-05, "learning_rate": 3.2270795385549485e-05, "loss": 0.0, "step": 3835 }, { "epoch": 4.19672131147541, "grad_norm": 0.00012023324961774051, "learning_rate": 3.224043715846995e-05, "loss": 0.0, "step": 3840 }, { "epoch": 4.202185792349727, "grad_norm": 8.154928946169093e-05, "learning_rate": 3.221007893139041e-05, "loss": 0.0, "step": 3845 }, { "epoch": 4.2076502732240435, "grad_norm": 9.04480621102266e-05, "learning_rate": 3.2179720704310865e-05, "loss": 0.0, "step": 3850 }, { "epoch": 4.213114754098361, "grad_norm": 0.0001085352196241729, "learning_rate": 3.214936247723133e-05, "loss": 0.0, "step": 3855 }, { "epoch": 4.218579234972678, "grad_norm": 8.086708112386987e-05, "learning_rate": 3.2119004250151794e-05, "loss": 0.0, "step": 3860 }, { "epoch": 4.224043715846994, "grad_norm": 0.00012486812192946672, "learning_rate": 3.208864602307226e-05, "loss": 0.0, "step": 3865 }, { "epoch": 4.229508196721311, "grad_norm": 5.529926420422271e-05, "learning_rate": 3.2058287795992716e-05, "loss": 0.0, "step": 3870 }, { "epoch": 4.2349726775956285, "grad_norm": 0.001104644499719143, "learning_rate": 3.202792956891318e-05, "loss": 0.0, "step": 3875 }, { "epoch": 4.240437158469946, "grad_norm": 8.401143713854253e-05, "learning_rate": 3.199757134183364e-05, "loss": 0.0, "step": 3880 }, { "epoch": 4.245901639344262, "grad_norm": 7.888235995778814e-05, "learning_rate": 3.19672131147541e-05, "loss": 0.0, "step": 3885 }, { "epoch": 4.251366120218579, "grad_norm": 0.0001080270521924831, "learning_rate": 3.193685488767456e-05, "loss": 0.0, "step": 3890 }, { "epoch": 4.256830601092896, "grad_norm": 5.707483796868473e-05, "learning_rate": 3.1906496660595025e-05, "loss": 0.0, "step": 3895 }, { "epoch": 4.262295081967213, "grad_norm": 7.05120837665163e-05, "learning_rate": 3.187613843351548e-05, "loss": 0.0, "step": 3900 }, { "epoch": 4.26775956284153, "grad_norm": 6.852354272268713e-05, "learning_rate": 3.184578020643594e-05, "loss": 0.0, "step": 3905 }, { "epoch": 4.273224043715847, "grad_norm": 0.0026433016173541546, "learning_rate": 3.1815421979356405e-05, "loss": 0.0, "step": 3910 }, { "epoch": 4.278688524590164, "grad_norm": 8.075160440057516e-05, "learning_rate": 3.178506375227687e-05, "loss": 0.0, "step": 3915 }, { "epoch": 4.284153005464481, "grad_norm": 8.570487989345565e-05, "learning_rate": 3.1754705525197334e-05, "loss": 0.0, "step": 3920 }, { "epoch": 4.2896174863387975, "grad_norm": 0.0025646265130490065, "learning_rate": 3.172434729811779e-05, "loss": 0.0, "step": 3925 }, { "epoch": 4.295081967213115, "grad_norm": 6.75572082400322e-05, "learning_rate": 3.1693989071038256e-05, "loss": 0.0, "step": 3930 }, { "epoch": 4.300546448087432, "grad_norm": 7.079667557263747e-05, "learning_rate": 3.1663630843958714e-05, "loss": 0.0, "step": 3935 }, { "epoch": 4.306010928961749, "grad_norm": 4.038075712742284e-05, "learning_rate": 3.163327261687918e-05, "loss": 0.0, "step": 3940 }, { "epoch": 4.311475409836065, "grad_norm": 5.5980359320528805e-05, "learning_rate": 3.1602914389799636e-05, "loss": 0.0, "step": 3945 }, { "epoch": 4.316939890710382, "grad_norm": 0.00010364054469391704, "learning_rate": 3.1572556162720094e-05, "loss": 0.0, "step": 3950 }, { "epoch": 4.3224043715847, "grad_norm": 5.6818211305653676e-05, "learning_rate": 3.154219793564056e-05, "loss": 0.0, "step": 3955 }, { "epoch": 4.327868852459017, "grad_norm": 9.114974091062322e-05, "learning_rate": 3.1511839708561016e-05, "loss": 0.0, "step": 3960 }, { "epoch": 4.333333333333333, "grad_norm": 7.907680992502719e-05, "learning_rate": 3.148148148148148e-05, "loss": 0.0, "step": 3965 }, { "epoch": 4.33879781420765, "grad_norm": 7.477829058188945e-05, "learning_rate": 3.1451123254401945e-05, "loss": 0.0, "step": 3970 }, { "epoch": 4.344262295081967, "grad_norm": 0.0014824882382526994, "learning_rate": 3.142076502732241e-05, "loss": 0.0, "step": 3975 }, { "epoch": 4.3497267759562845, "grad_norm": 9.136074368143454e-05, "learning_rate": 3.139040680024287e-05, "loss": 0.0, "step": 3980 }, { "epoch": 4.355191256830601, "grad_norm": 0.00038130677421577275, "learning_rate": 3.136004857316333e-05, "loss": 0.0, "step": 3985 }, { "epoch": 4.360655737704918, "grad_norm": 7.651744090253487e-05, "learning_rate": 3.132969034608379e-05, "loss": 0.0, "step": 3990 }, { "epoch": 4.366120218579235, "grad_norm": 7.331320375669748e-05, "learning_rate": 3.1299332119004254e-05, "loss": 0.0, "step": 3995 }, { "epoch": 4.371584699453552, "grad_norm": 9.19502999749966e-05, "learning_rate": 3.126897389192471e-05, "loss": 0.0, "step": 4000 }, { "epoch": 4.377049180327869, "grad_norm": 7.570043089799583e-05, "learning_rate": 3.123861566484517e-05, "loss": 0.0, "step": 4005 }, { "epoch": 4.382513661202186, "grad_norm": 7.069893035804853e-05, "learning_rate": 3.1208257437765634e-05, "loss": 0.0, "step": 4010 }, { "epoch": 4.387978142076503, "grad_norm": 5.1304690714459866e-05, "learning_rate": 3.11778992106861e-05, "loss": 0.0, "step": 4015 }, { "epoch": 4.39344262295082, "grad_norm": 8.647384674986824e-05, "learning_rate": 3.114754098360656e-05, "loss": 0.0, "step": 4020 }, { "epoch": 4.398907103825136, "grad_norm": 7.207443559309468e-05, "learning_rate": 3.111718275652702e-05, "loss": 0.0, "step": 4025 }, { "epoch": 4.404371584699454, "grad_norm": 0.00024384768039453775, "learning_rate": 3.1086824529447486e-05, "loss": 0.0, "step": 4030 }, { "epoch": 4.409836065573771, "grad_norm": 6.280928937485442e-05, "learning_rate": 3.105646630236794e-05, "loss": 0.0, "step": 4035 }, { "epoch": 4.415300546448087, "grad_norm": 5.470717223943211e-05, "learning_rate": 3.102610807528841e-05, "loss": 0.0, "step": 4040 }, { "epoch": 4.420765027322404, "grad_norm": 4.7346966312034056e-05, "learning_rate": 3.0995749848208866e-05, "loss": 0.0, "step": 4045 }, { "epoch": 4.426229508196721, "grad_norm": 0.00017950611072592437, "learning_rate": 3.096539162112932e-05, "loss": 0.0, "step": 4050 }, { "epoch": 4.4316939890710385, "grad_norm": 5.147058254806325e-05, "learning_rate": 3.093503339404979e-05, "loss": 0.0, "step": 4055 }, { "epoch": 4.437158469945355, "grad_norm": 7.883601210778579e-05, "learning_rate": 3.0904675166970246e-05, "loss": 0.0, "step": 4060 }, { "epoch": 4.442622950819672, "grad_norm": 6.258031498873606e-05, "learning_rate": 3.087431693989071e-05, "loss": 0.0, "step": 4065 }, { "epoch": 4.448087431693989, "grad_norm": 0.0001280458818655461, "learning_rate": 3.0843958712811175e-05, "loss": 0.0, "step": 4070 }, { "epoch": 4.453551912568306, "grad_norm": 5.015691931475885e-05, "learning_rate": 3.081360048573164e-05, "loss": 0.0, "step": 4075 }, { "epoch": 4.459016393442623, "grad_norm": 5.9115078329341486e-05, "learning_rate": 3.07832422586521e-05, "loss": 0.0, "step": 4080 }, { "epoch": 4.46448087431694, "grad_norm": 6.528561789309606e-05, "learning_rate": 3.075288403157256e-05, "loss": 0.001, "step": 4085 }, { "epoch": 4.469945355191257, "grad_norm": 0.00010480813944013789, "learning_rate": 3.072252580449302e-05, "loss": 0.0, "step": 4090 }, { "epoch": 4.475409836065574, "grad_norm": 3.688339711516164e-05, "learning_rate": 3.0692167577413484e-05, "loss": 0.0, "step": 4095 }, { "epoch": 4.48087431693989, "grad_norm": 0.0001360525202471763, "learning_rate": 3.066180935033394e-05, "loss": 0.0, "step": 4100 }, { "epoch": 4.4863387978142075, "grad_norm": 3.33041389239952e-05, "learning_rate": 3.06314511232544e-05, "loss": 0.0, "step": 4105 }, { "epoch": 4.491803278688525, "grad_norm": 3.9832579204812646e-05, "learning_rate": 3.0601092896174864e-05, "loss": 0.0, "step": 4110 }, { "epoch": 4.497267759562842, "grad_norm": 2.829811637639068e-05, "learning_rate": 3.057073466909532e-05, "loss": 0.0001, "step": 4115 }, { "epoch": 4.502732240437158, "grad_norm": 0.00016292250074911863, "learning_rate": 3.0540376442015786e-05, "loss": 0.0, "step": 4120 }, { "epoch": 4.508196721311475, "grad_norm": 2.8942140488652512e-05, "learning_rate": 3.0510018214936247e-05, "loss": 0.0, "step": 4125 }, { "epoch": 4.5136612021857925, "grad_norm": 5.0521703087724745e-05, "learning_rate": 3.047965998785671e-05, "loss": 0.0, "step": 4130 }, { "epoch": 4.51912568306011, "grad_norm": 2.7378995582694188e-05, "learning_rate": 3.0449301760777173e-05, "loss": 0.0, "step": 4135 }, { "epoch": 4.524590163934426, "grad_norm": 2.603435677883681e-05, "learning_rate": 3.0418943533697637e-05, "loss": 0.0, "step": 4140 }, { "epoch": 4.530054644808743, "grad_norm": 2.8875801945105195e-05, "learning_rate": 3.0388585306618095e-05, "loss": 0.0, "step": 4145 }, { "epoch": 4.53551912568306, "grad_norm": 2.562619192758575e-05, "learning_rate": 3.0358227079538553e-05, "loss": 0.0, "step": 4150 }, { "epoch": 4.540983606557377, "grad_norm": 3.5220946301706135e-05, "learning_rate": 3.0327868852459017e-05, "loss": 0.0003, "step": 4155 }, { "epoch": 4.546448087431694, "grad_norm": 2.3912976757856086e-05, "learning_rate": 3.029751062537948e-05, "loss": 0.0001, "step": 4160 }, { "epoch": 4.551912568306011, "grad_norm": 2.4303217287524603e-05, "learning_rate": 3.0267152398299943e-05, "loss": 0.0, "step": 4165 }, { "epoch": 4.557377049180328, "grad_norm": 2.267379932163749e-05, "learning_rate": 3.02367941712204e-05, "loss": 0.0, "step": 4170 }, { "epoch": 4.562841530054644, "grad_norm": 2.6442226953804493e-05, "learning_rate": 3.0206435944140865e-05, "loss": 0.0, "step": 4175 }, { "epoch": 4.5683060109289615, "grad_norm": 2.2860698663862422e-05, "learning_rate": 3.0176077717061323e-05, "loss": 0.0, "step": 4180 }, { "epoch": 4.573770491803279, "grad_norm": 2.834216684277635e-05, "learning_rate": 3.0145719489981787e-05, "loss": 0.0, "step": 4185 }, { "epoch": 4.579234972677596, "grad_norm": 2.5860501409624703e-05, "learning_rate": 3.011536126290225e-05, "loss": 0.0, "step": 4190 }, { "epoch": 4.584699453551913, "grad_norm": 2.1242882212391123e-05, "learning_rate": 3.0085003035822713e-05, "loss": 0.0, "step": 4195 }, { "epoch": 4.590163934426229, "grad_norm": 2.523986404412426e-05, "learning_rate": 3.005464480874317e-05, "loss": 0.0, "step": 4200 }, { "epoch": 4.595628415300546, "grad_norm": 2.213427251263056e-05, "learning_rate": 3.002428658166363e-05, "loss": 0.0, "step": 4205 }, { "epoch": 4.601092896174864, "grad_norm": 2.5897697923937812e-05, "learning_rate": 2.9993928354584093e-05, "loss": 0.0, "step": 4210 }, { "epoch": 4.60655737704918, "grad_norm": 0.0009925027843564749, "learning_rate": 2.9963570127504554e-05, "loss": 0.0, "step": 4215 }, { "epoch": 4.612021857923497, "grad_norm": 2.2528745830641128e-05, "learning_rate": 2.993321190042502e-05, "loss": 0.0, "step": 4220 }, { "epoch": 4.617486338797814, "grad_norm": 2.1108824512339197e-05, "learning_rate": 2.9902853673345476e-05, "loss": 0.0, "step": 4225 }, { "epoch": 4.622950819672131, "grad_norm": 2.6829531634575687e-05, "learning_rate": 2.987249544626594e-05, "loss": 0.0, "step": 4230 }, { "epoch": 4.628415300546449, "grad_norm": 2.457356640661601e-05, "learning_rate": 2.98421372191864e-05, "loss": 0.0, "step": 4235 }, { "epoch": 4.633879781420765, "grad_norm": 0.0010403507621958852, "learning_rate": 2.9811778992106863e-05, "loss": 0.0, "step": 4240 }, { "epoch": 4.639344262295082, "grad_norm": 2.311925527465064e-05, "learning_rate": 2.9781420765027324e-05, "loss": 0.0, "step": 4245 }, { "epoch": 4.644808743169399, "grad_norm": 0.00011339302727719769, "learning_rate": 2.9751062537947782e-05, "loss": 0.0, "step": 4250 }, { "epoch": 4.6502732240437155, "grad_norm": 2.0847737687290646e-05, "learning_rate": 2.9720704310868247e-05, "loss": 0.0, "step": 4255 }, { "epoch": 4.655737704918033, "grad_norm": 4.812885163119063e-05, "learning_rate": 2.9690346083788704e-05, "loss": 0.0, "step": 4260 }, { "epoch": 4.66120218579235, "grad_norm": 2.5263596398872323e-05, "learning_rate": 2.965998785670917e-05, "loss": 0.0, "step": 4265 }, { "epoch": 4.666666666666667, "grad_norm": 2.3823860828997567e-05, "learning_rate": 2.962962962962963e-05, "loss": 0.0, "step": 4270 }, { "epoch": 4.672131147540983, "grad_norm": 2.5453253329033032e-05, "learning_rate": 2.9599271402550094e-05, "loss": 0.0, "step": 4275 }, { "epoch": 4.6775956284153, "grad_norm": 3.0383727789740078e-05, "learning_rate": 2.9568913175470552e-05, "loss": 0.0, "step": 4280 }, { "epoch": 4.683060109289618, "grad_norm": 2.0236289856256917e-05, "learning_rate": 2.9538554948391017e-05, "loss": 0.0, "step": 4285 }, { "epoch": 4.688524590163935, "grad_norm": 4.196896406938322e-05, "learning_rate": 2.9508196721311478e-05, "loss": 0.0, "step": 4290 }, { "epoch": 4.693989071038251, "grad_norm": 2.054395736195147e-05, "learning_rate": 2.9477838494231942e-05, "loss": 0.0, "step": 4295 }, { "epoch": 4.699453551912568, "grad_norm": 2.2467031158157624e-05, "learning_rate": 2.94474802671524e-05, "loss": 0.0, "step": 4300 }, { "epoch": 4.704918032786885, "grad_norm": 1.898876143968664e-05, "learning_rate": 2.9417122040072858e-05, "loss": 0.0, "step": 4305 }, { "epoch": 4.7103825136612025, "grad_norm": 2.9287553843460046e-05, "learning_rate": 2.9386763812993322e-05, "loss": 0.0, "step": 4310 }, { "epoch": 4.715846994535519, "grad_norm": 2.107855470967479e-05, "learning_rate": 2.9356405585913783e-05, "loss": 0.0, "step": 4315 }, { "epoch": 4.721311475409836, "grad_norm": 7.70809201640077e-05, "learning_rate": 2.9326047358834248e-05, "loss": 0.0, "step": 4320 }, { "epoch": 4.726775956284153, "grad_norm": 1.9799354049609974e-05, "learning_rate": 2.9295689131754706e-05, "loss": 0.0, "step": 4325 }, { "epoch": 4.73224043715847, "grad_norm": 2.4164644855773076e-05, "learning_rate": 2.926533090467517e-05, "loss": 0.0, "step": 4330 }, { "epoch": 4.737704918032787, "grad_norm": 2.102001235471107e-05, "learning_rate": 2.9234972677595628e-05, "loss": 0.0, "step": 4335 }, { "epoch": 4.743169398907104, "grad_norm": 0.00021176210429985076, "learning_rate": 2.9204614450516093e-05, "loss": 0.0, "step": 4340 }, { "epoch": 4.748633879781421, "grad_norm": 0.0021099280565977097, "learning_rate": 2.9174256223436554e-05, "loss": 0.0, "step": 4345 }, { "epoch": 4.754098360655737, "grad_norm": 3.413944796193391e-05, "learning_rate": 2.9143897996357018e-05, "loss": 0.0, "step": 4350 }, { "epoch": 4.759562841530054, "grad_norm": 5.8268306020181626e-05, "learning_rate": 2.9113539769277476e-05, "loss": 0.0, "step": 4355 }, { "epoch": 4.7650273224043715, "grad_norm": 2.4852219212334603e-05, "learning_rate": 2.9083181542197934e-05, "loss": 0.0, "step": 4360 }, { "epoch": 4.770491803278689, "grad_norm": 2.554810453148093e-05, "learning_rate": 2.9052823315118398e-05, "loss": 0.0, "step": 4365 }, { "epoch": 4.775956284153006, "grad_norm": 1.9379966033739038e-05, "learning_rate": 2.902246508803886e-05, "loss": 0.0, "step": 4370 }, { "epoch": 4.781420765027322, "grad_norm": 2.287175811943598e-05, "learning_rate": 2.8992106860959324e-05, "loss": 0.0, "step": 4375 }, { "epoch": 4.786885245901639, "grad_norm": 1.9639024685602635e-05, "learning_rate": 2.896174863387978e-05, "loss": 0.0, "step": 4380 }, { "epoch": 4.7923497267759565, "grad_norm": 2.56021576205967e-05, "learning_rate": 2.8931390406800246e-05, "loss": 0.0, "step": 4385 }, { "epoch": 4.797814207650273, "grad_norm": 3.311937689431943e-05, "learning_rate": 2.8901032179720704e-05, "loss": 0.0, "step": 4390 }, { "epoch": 4.80327868852459, "grad_norm": 0.00010998953075613827, "learning_rate": 2.8870673952641168e-05, "loss": 0.0, "step": 4395 }, { "epoch": 4.808743169398907, "grad_norm": 2.0269997548894025e-05, "learning_rate": 2.884031572556163e-05, "loss": 0.0, "step": 4400 }, { "epoch": 4.814207650273224, "grad_norm": 2.814541949192062e-05, "learning_rate": 2.8809957498482087e-05, "loss": 0.0, "step": 4405 }, { "epoch": 4.8196721311475414, "grad_norm": 2.220185888290871e-05, "learning_rate": 2.877959927140255e-05, "loss": 0.0, "step": 4410 }, { "epoch": 4.825136612021858, "grad_norm": 2.1637810277752578e-05, "learning_rate": 2.874924104432301e-05, "loss": 0.0, "step": 4415 }, { "epoch": 4.830601092896175, "grad_norm": 2.2878619347466156e-05, "learning_rate": 2.8718882817243474e-05, "loss": 0.0, "step": 4420 }, { "epoch": 4.836065573770492, "grad_norm": 2.0245261111995205e-05, "learning_rate": 2.8688524590163935e-05, "loss": 0.0, "step": 4425 }, { "epoch": 4.841530054644808, "grad_norm": 2.429804953862913e-05, "learning_rate": 2.86581663630844e-05, "loss": 0.0, "step": 4430 }, { "epoch": 4.8469945355191255, "grad_norm": 0.007154061924666166, "learning_rate": 2.8627808136004857e-05, "loss": 0.2029, "step": 4435 }, { "epoch": 4.852459016393443, "grad_norm": 4.056124816997908e-05, "learning_rate": 2.8597449908925322e-05, "loss": 0.001, "step": 4440 }, { "epoch": 4.85792349726776, "grad_norm": 3.3877342502819374e-05, "learning_rate": 2.856709168184578e-05, "loss": 0.0001, "step": 4445 }, { "epoch": 4.863387978142076, "grad_norm": 0.1083371564745903, "learning_rate": 2.8536733454766244e-05, "loss": 0.0005, "step": 4450 }, { "epoch": 4.868852459016393, "grad_norm": 4.0910981624620035e-05, "learning_rate": 2.8506375227686705e-05, "loss": 0.0, "step": 4455 }, { "epoch": 4.8743169398907105, "grad_norm": 5.547309774556197e-05, "learning_rate": 2.8476017000607163e-05, "loss": 0.0885, "step": 4460 }, { "epoch": 4.879781420765028, "grad_norm": 0.002973817056044936, "learning_rate": 2.8445658773527627e-05, "loss": 0.0, "step": 4465 }, { "epoch": 4.885245901639344, "grad_norm": 7.371945685008541e-05, "learning_rate": 2.841530054644809e-05, "loss": 0.0, "step": 4470 }, { "epoch": 4.890710382513661, "grad_norm": 0.00017757054592948407, "learning_rate": 2.8384942319368553e-05, "loss": 0.0005, "step": 4475 }, { "epoch": 4.896174863387978, "grad_norm": 3.507310611894354e-05, "learning_rate": 2.835458409228901e-05, "loss": 0.0, "step": 4480 }, { "epoch": 4.901639344262295, "grad_norm": 0.00014538239338435233, "learning_rate": 2.8324225865209475e-05, "loss": 0.1127, "step": 4485 }, { "epoch": 4.907103825136612, "grad_norm": 7.141881360439584e-05, "learning_rate": 2.8293867638129933e-05, "loss": 0.0005, "step": 4490 }, { "epoch": 4.912568306010929, "grad_norm": 5.7522782299201936e-05, "learning_rate": 2.8263509411050398e-05, "loss": 0.0, "step": 4495 }, { "epoch": 4.918032786885246, "grad_norm": 0.0008930726326070726, "learning_rate": 2.823315118397086e-05, "loss": 0.0, "step": 4500 }, { "epoch": 4.923497267759563, "grad_norm": 0.0020042534451931715, "learning_rate": 2.8202792956891317e-05, "loss": 0.0, "step": 4505 }, { "epoch": 4.9289617486338795, "grad_norm": 5.080427217762917e-05, "learning_rate": 2.817243472981178e-05, "loss": 0.0002, "step": 4510 }, { "epoch": 4.934426229508197, "grad_norm": 5.520815830095671e-05, "learning_rate": 2.814207650273224e-05, "loss": 0.0, "step": 4515 }, { "epoch": 4.939890710382514, "grad_norm": 3.327699232613668e-05, "learning_rate": 2.8111718275652703e-05, "loss": 0.0, "step": 4520 }, { "epoch": 4.945355191256831, "grad_norm": 3.853649468510412e-05, "learning_rate": 2.8081360048573164e-05, "loss": 0.1111, "step": 4525 }, { "epoch": 4.950819672131147, "grad_norm": 4.388443994685076e-05, "learning_rate": 2.805100182149363e-05, "loss": 0.1132, "step": 4530 }, { "epoch": 4.956284153005464, "grad_norm": 0.00019437754235696048, "learning_rate": 2.8020643594414087e-05, "loss": 0.0, "step": 4535 }, { "epoch": 4.961748633879782, "grad_norm": 0.00022693238861393183, "learning_rate": 2.799028536733455e-05, "loss": 0.0102, "step": 4540 }, { "epoch": 4.967213114754099, "grad_norm": 0.00043771814671345055, "learning_rate": 2.795992714025501e-05, "loss": 0.0007, "step": 4545 }, { "epoch": 4.972677595628415, "grad_norm": 0.0002889480092562735, "learning_rate": 2.7929568913175473e-05, "loss": 0.0, "step": 4550 }, { "epoch": 4.978142076502732, "grad_norm": 7.144697883632034e-05, "learning_rate": 2.7899210686095935e-05, "loss": 0.0, "step": 4555 }, { "epoch": 4.983606557377049, "grad_norm": 0.0003368402540218085, "learning_rate": 2.7868852459016392e-05, "loss": 0.0, "step": 4560 }, { "epoch": 4.989071038251366, "grad_norm": 0.0009206884424202144, "learning_rate": 2.7838494231936857e-05, "loss": 0.0, "step": 4565 }, { "epoch": 4.994535519125683, "grad_norm": 5.6415323342662305e-05, "learning_rate": 2.7808136004857315e-05, "loss": 0.0116, "step": 4570 }, { "epoch": 5.0, "grad_norm": 7.074388850014657e-05, "learning_rate": 2.777777777777778e-05, "loss": 0.0, "step": 4575 }, { "epoch": 5.0, "eval_loss": 0.0017390144057571888, "eval_runtime": 676.778, "eval_samples_per_second": 10.81, "eval_steps_per_second": 1.352, "step": 4575 }, { "epoch": 5.005464480874317, "grad_norm": 0.00010769537038868293, "learning_rate": 2.774741955069824e-05, "loss": 0.0, "step": 4580 }, { "epoch": 5.0109289617486334, "grad_norm": 0.00015485276526305825, "learning_rate": 2.7717061323618705e-05, "loss": 0.0, "step": 4585 }, { "epoch": 5.016393442622951, "grad_norm": 0.0003547978412825614, "learning_rate": 2.7686703096539162e-05, "loss": 0.0, "step": 4590 }, { "epoch": 5.021857923497268, "grad_norm": 0.0005272823618724942, "learning_rate": 2.7656344869459627e-05, "loss": 0.0, "step": 4595 }, { "epoch": 5.027322404371585, "grad_norm": 0.0015367731684818864, "learning_rate": 2.7625986642380085e-05, "loss": 0.0, "step": 4600 }, { "epoch": 5.032786885245901, "grad_norm": 0.00010357119754189625, "learning_rate": 2.7595628415300546e-05, "loss": 0.0, "step": 4605 }, { "epoch": 5.038251366120218, "grad_norm": 5.5412652727682143e-05, "learning_rate": 2.756527018822101e-05, "loss": 0.0001, "step": 4610 }, { "epoch": 5.043715846994536, "grad_norm": 0.0001249344350071624, "learning_rate": 2.7534911961141468e-05, "loss": 0.0, "step": 4615 }, { "epoch": 5.049180327868853, "grad_norm": 0.0027478511910885572, "learning_rate": 2.7504553734061933e-05, "loss": 0.0, "step": 4620 }, { "epoch": 5.054644808743169, "grad_norm": 0.0025896786246448755, "learning_rate": 2.747419550698239e-05, "loss": 0.073, "step": 4625 }, { "epoch": 5.060109289617486, "grad_norm": 0.00015381591219920665, "learning_rate": 2.7443837279902855e-05, "loss": 0.0, "step": 4630 }, { "epoch": 5.065573770491803, "grad_norm": 0.00017391947039868683, "learning_rate": 2.7413479052823316e-05, "loss": 0.0, "step": 4635 }, { "epoch": 5.0710382513661205, "grad_norm": 0.00014396451297216117, "learning_rate": 2.738312082574378e-05, "loss": 0.0, "step": 4640 }, { "epoch": 5.076502732240437, "grad_norm": 0.00019471778068691492, "learning_rate": 2.7352762598664238e-05, "loss": 0.0, "step": 4645 }, { "epoch": 5.081967213114754, "grad_norm": 0.00017244904302060604, "learning_rate": 2.7322404371584703e-05, "loss": 0.0001, "step": 4650 }, { "epoch": 5.087431693989071, "grad_norm": 0.00013770755322184414, "learning_rate": 2.7292046144505164e-05, "loss": 0.0, "step": 4655 }, { "epoch": 5.092896174863388, "grad_norm": 0.0009572531562298536, "learning_rate": 2.726168791742562e-05, "loss": 0.0, "step": 4660 }, { "epoch": 5.098360655737705, "grad_norm": 0.005346562713384628, "learning_rate": 2.7231329690346086e-05, "loss": 0.0, "step": 4665 }, { "epoch": 5.103825136612022, "grad_norm": 0.00018971337703987956, "learning_rate": 2.7200971463266544e-05, "loss": 0.0, "step": 4670 }, { "epoch": 5.109289617486339, "grad_norm": 0.00018444436136633158, "learning_rate": 2.717061323618701e-05, "loss": 0.0, "step": 4675 }, { "epoch": 5.114754098360656, "grad_norm": 0.00036197478766553104, "learning_rate": 2.714025500910747e-05, "loss": 0.0961, "step": 4680 }, { "epoch": 5.120218579234972, "grad_norm": 0.00030608210363425314, "learning_rate": 2.7109896782027934e-05, "loss": 0.0, "step": 4685 }, { "epoch": 5.1256830601092895, "grad_norm": 0.010481505654752254, "learning_rate": 2.7079538554948392e-05, "loss": 0.0, "step": 4690 }, { "epoch": 5.131147540983607, "grad_norm": 0.0004171307082287967, "learning_rate": 2.7049180327868856e-05, "loss": 0.0001, "step": 4695 }, { "epoch": 5.136612021857924, "grad_norm": 0.00030121684540063143, "learning_rate": 2.7018822100789314e-05, "loss": 0.0, "step": 4700 }, { "epoch": 5.14207650273224, "grad_norm": 8.655495184939355e-05, "learning_rate": 2.6988463873709775e-05, "loss": 0.0, "step": 4705 }, { "epoch": 5.147540983606557, "grad_norm": 0.00010451547859702259, "learning_rate": 2.695810564663024e-05, "loss": 0.0, "step": 4710 }, { "epoch": 5.1530054644808745, "grad_norm": 0.013965661637485027, "learning_rate": 2.6927747419550697e-05, "loss": 0.0001, "step": 4715 }, { "epoch": 5.158469945355192, "grad_norm": 0.00010520996147533879, "learning_rate": 2.6897389192471162e-05, "loss": 0.0, "step": 4720 }, { "epoch": 5.163934426229508, "grad_norm": 9.894570393953472e-05, "learning_rate": 2.686703096539162e-05, "loss": 0.0, "step": 4725 }, { "epoch": 5.169398907103825, "grad_norm": 7.636708323843777e-05, "learning_rate": 2.6836672738312084e-05, "loss": 0.0001, "step": 4730 }, { "epoch": 5.174863387978142, "grad_norm": 6.734576891176403e-05, "learning_rate": 2.6806314511232545e-05, "loss": 0.0, "step": 4735 }, { "epoch": 5.180327868852459, "grad_norm": 8.934761717682704e-05, "learning_rate": 2.677595628415301e-05, "loss": 0.0001, "step": 4740 }, { "epoch": 5.185792349726776, "grad_norm": 0.0001079771900549531, "learning_rate": 2.6745598057073468e-05, "loss": 0.0, "step": 4745 }, { "epoch": 5.191256830601093, "grad_norm": 8.022497786441818e-05, "learning_rate": 2.6715239829993932e-05, "loss": 0.0462, "step": 4750 }, { "epoch": 5.19672131147541, "grad_norm": 0.0001275836257264018, "learning_rate": 2.668488160291439e-05, "loss": 0.0001, "step": 4755 }, { "epoch": 5.202185792349727, "grad_norm": 0.00036970950895920396, "learning_rate": 2.665452337583485e-05, "loss": 0.0, "step": 4760 }, { "epoch": 5.2076502732240435, "grad_norm": 0.0001368650555377826, "learning_rate": 2.6624165148755316e-05, "loss": 0.0, "step": 4765 }, { "epoch": 5.213114754098361, "grad_norm": 9.971411782316864e-05, "learning_rate": 2.6593806921675773e-05, "loss": 0.0, "step": 4770 }, { "epoch": 5.218579234972678, "grad_norm": 7.680917769903317e-05, "learning_rate": 2.6563448694596238e-05, "loss": 0.0, "step": 4775 }, { "epoch": 5.224043715846994, "grad_norm": 9.601003694115207e-05, "learning_rate": 2.6533090467516696e-05, "loss": 0.0, "step": 4780 }, { "epoch": 5.229508196721311, "grad_norm": 9.728920849738643e-05, "learning_rate": 2.650273224043716e-05, "loss": 0.0, "step": 4785 }, { "epoch": 5.2349726775956285, "grad_norm": 8.36860272102058e-05, "learning_rate": 2.647237401335762e-05, "loss": 0.0, "step": 4790 }, { "epoch": 5.240437158469946, "grad_norm": 0.00011909649037988856, "learning_rate": 2.6442015786278086e-05, "loss": 0.0001, "step": 4795 }, { "epoch": 5.245901639344262, "grad_norm": 6.814736843807623e-05, "learning_rate": 2.6411657559198543e-05, "loss": 0.0, "step": 4800 }, { "epoch": 5.251366120218579, "grad_norm": 0.00010630305769154802, "learning_rate": 2.6381299332119008e-05, "loss": 0.0001, "step": 4805 }, { "epoch": 5.256830601092896, "grad_norm": 0.00012657047773245722, "learning_rate": 2.6350941105039466e-05, "loss": 0.0001, "step": 4810 }, { "epoch": 5.262295081967213, "grad_norm": 9.68086751527153e-05, "learning_rate": 2.6320582877959927e-05, "loss": 0.0, "step": 4815 }, { "epoch": 5.26775956284153, "grad_norm": 0.0001090406221919693, "learning_rate": 2.629022465088039e-05, "loss": 0.0, "step": 4820 }, { "epoch": 5.273224043715847, "grad_norm": 0.00010317601845599711, "learning_rate": 2.625986642380085e-05, "loss": 0.0, "step": 4825 }, { "epoch": 5.278688524590164, "grad_norm": 6.058696817490272e-05, "learning_rate": 2.6229508196721314e-05, "loss": 0.0, "step": 4830 }, { "epoch": 5.284153005464481, "grad_norm": 8.128488116199151e-05, "learning_rate": 2.619914996964177e-05, "loss": 0.0, "step": 4835 }, { "epoch": 5.2896174863387975, "grad_norm": 7.313517562579364e-05, "learning_rate": 2.6168791742562236e-05, "loss": 0.0, "step": 4840 }, { "epoch": 5.295081967213115, "grad_norm": 8.592737140133977e-05, "learning_rate": 2.6138433515482697e-05, "loss": 0.0, "step": 4845 }, { "epoch": 5.300546448087432, "grad_norm": 0.00022616136993747205, "learning_rate": 2.610807528840316e-05, "loss": 0.0, "step": 4850 }, { "epoch": 5.306010928961749, "grad_norm": 0.005083122756332159, "learning_rate": 2.607771706132362e-05, "loss": 0.0, "step": 4855 }, { "epoch": 5.311475409836065, "grad_norm": 0.00020489096641540527, "learning_rate": 2.604735883424408e-05, "loss": 0.0, "step": 4860 }, { "epoch": 5.316939890710382, "grad_norm": 0.00010977088822983205, "learning_rate": 2.6017000607164545e-05, "loss": 0.0, "step": 4865 }, { "epoch": 5.3224043715847, "grad_norm": 7.336941052926704e-05, "learning_rate": 2.5986642380085003e-05, "loss": 0.0, "step": 4870 }, { "epoch": 5.327868852459017, "grad_norm": 5.920181138208136e-05, "learning_rate": 2.5956284153005467e-05, "loss": 0.0, "step": 4875 }, { "epoch": 5.333333333333333, "grad_norm": 0.00023859695647843182, "learning_rate": 2.5925925925925925e-05, "loss": 0.0, "step": 4880 }, { "epoch": 5.33879781420765, "grad_norm": 7.648219616385177e-05, "learning_rate": 2.589556769884639e-05, "loss": 0.0, "step": 4885 }, { "epoch": 5.344262295081967, "grad_norm": 5.340289862942882e-05, "learning_rate": 2.586520947176685e-05, "loss": 0.0, "step": 4890 }, { "epoch": 5.3497267759562845, "grad_norm": 0.00011013679613824934, "learning_rate": 2.5834851244687315e-05, "loss": 0.0001, "step": 4895 }, { "epoch": 5.355191256830601, "grad_norm": 8.648641960462555e-05, "learning_rate": 2.5804493017607773e-05, "loss": 0.0, "step": 4900 }, { "epoch": 5.360655737704918, "grad_norm": 0.00014422819367609918, "learning_rate": 2.5774134790528237e-05, "loss": 0.0, "step": 4905 }, { "epoch": 5.366120218579235, "grad_norm": 9.631054126657546e-05, "learning_rate": 2.5743776563448695e-05, "loss": 0.1501, "step": 4910 }, { "epoch": 5.371584699453552, "grad_norm": 0.0006303318659774959, "learning_rate": 2.5713418336369156e-05, "loss": 0.0002, "step": 4915 }, { "epoch": 5.377049180327869, "grad_norm": 0.0002542531001381576, "learning_rate": 2.568306010928962e-05, "loss": 0.0005, "step": 4920 }, { "epoch": 5.382513661202186, "grad_norm": 0.00015373634232673794, "learning_rate": 2.565270188221008e-05, "loss": 0.0006, "step": 4925 }, { "epoch": 5.387978142076503, "grad_norm": 0.0001260093122255057, "learning_rate": 2.5622343655130543e-05, "loss": 0.0, "step": 4930 }, { "epoch": 5.39344262295082, "grad_norm": 0.00012952568067703396, "learning_rate": 2.5591985428051e-05, "loss": 0.0002, "step": 4935 }, { "epoch": 5.398907103825136, "grad_norm": 0.00015524527407251298, "learning_rate": 2.5561627200971465e-05, "loss": 0.0014, "step": 4940 }, { "epoch": 5.404371584699454, "grad_norm": 5.933045395067893e-05, "learning_rate": 2.5531268973891926e-05, "loss": 0.0, "step": 4945 }, { "epoch": 5.409836065573771, "grad_norm": 9.225990652339533e-05, "learning_rate": 2.550091074681239e-05, "loss": 0.0, "step": 4950 }, { "epoch": 5.415300546448087, "grad_norm": 9.452126687392592e-05, "learning_rate": 2.547055251973285e-05, "loss": 0.0, "step": 4955 }, { "epoch": 5.420765027322404, "grad_norm": 9.709106961963698e-05, "learning_rate": 2.5440194292653306e-05, "loss": 0.0001, "step": 4960 }, { "epoch": 5.426229508196721, "grad_norm": 5.982341099297628e-05, "learning_rate": 2.540983606557377e-05, "loss": 0.0, "step": 4965 }, { "epoch": 5.4316939890710385, "grad_norm": 8.735879964660853e-05, "learning_rate": 2.5379477838494232e-05, "loss": 0.0, "step": 4970 }, { "epoch": 5.437158469945355, "grad_norm": 9.017515549203381e-05, "learning_rate": 2.5349119611414697e-05, "loss": 0.0, "step": 4975 }, { "epoch": 5.442622950819672, "grad_norm": 0.00011913449998246506, "learning_rate": 2.5318761384335154e-05, "loss": 0.0, "step": 4980 }, { "epoch": 5.448087431693989, "grad_norm": 7.266786269610748e-05, "learning_rate": 2.528840315725562e-05, "loss": 0.0, "step": 4985 }, { "epoch": 5.453551912568306, "grad_norm": 4.6318815293489024e-05, "learning_rate": 2.5258044930176077e-05, "loss": 0.0, "step": 4990 }, { "epoch": 5.459016393442623, "grad_norm": 0.00011023716069757938, "learning_rate": 2.522768670309654e-05, "loss": 0.0, "step": 4995 }, { "epoch": 5.46448087431694, "grad_norm": 0.019288262352347374, "learning_rate": 2.5197328476017002e-05, "loss": 0.0001, "step": 5000 }, { "epoch": 5.469945355191257, "grad_norm": 8.01812275312841e-05, "learning_rate": 2.5166970248937467e-05, "loss": 0.0, "step": 5005 }, { "epoch": 5.475409836065574, "grad_norm": 5.575180330197327e-05, "learning_rate": 2.5136612021857924e-05, "loss": 0.0, "step": 5010 }, { "epoch": 5.48087431693989, "grad_norm": 6.652524461969733e-05, "learning_rate": 2.5106253794778382e-05, "loss": 0.0, "step": 5015 }, { "epoch": 5.4863387978142075, "grad_norm": 5.448004958452657e-05, "learning_rate": 2.5075895567698847e-05, "loss": 0.0001, "step": 5020 }, { "epoch": 5.491803278688525, "grad_norm": 5.704570867237635e-05, "learning_rate": 2.5045537340619308e-05, "loss": 0.0063, "step": 5025 }, { "epoch": 5.497267759562842, "grad_norm": 4.593836274580099e-05, "learning_rate": 2.5015179113539772e-05, "loss": 0.0002, "step": 5030 }, { "epoch": 5.502732240437158, "grad_norm": 3.784838554565795e-05, "learning_rate": 2.498482088646023e-05, "loss": 0.0, "step": 5035 }, { "epoch": 5.508196721311475, "grad_norm": 0.00014102249406278133, "learning_rate": 2.495446265938069e-05, "loss": 0.0, "step": 5040 }, { "epoch": 5.5136612021857925, "grad_norm": 0.00031027215300127864, "learning_rate": 2.4924104432301156e-05, "loss": 0.0, "step": 5045 }, { "epoch": 5.51912568306011, "grad_norm": 3.274340997450054e-05, "learning_rate": 2.4893746205221617e-05, "loss": 0.0, "step": 5050 }, { "epoch": 5.524590163934426, "grad_norm": 5.2845189202344045e-05, "learning_rate": 2.4863387978142078e-05, "loss": 0.0, "step": 5055 }, { "epoch": 5.530054644808743, "grad_norm": 25.070024490356445, "learning_rate": 2.483302975106254e-05, "loss": 0.1856, "step": 5060 }, { "epoch": 5.53551912568306, "grad_norm": 0.0268250722438097, "learning_rate": 2.4802671523983e-05, "loss": 0.0, "step": 5065 }, { "epoch": 5.540983606557377, "grad_norm": 0.0001650653430260718, "learning_rate": 2.477231329690346e-05, "loss": 0.0041, "step": 5070 }, { "epoch": 5.546448087431694, "grad_norm": 0.0002502961433492601, "learning_rate": 2.4741955069823926e-05, "loss": 0.0, "step": 5075 }, { "epoch": 5.551912568306011, "grad_norm": 0.00017075585492420942, "learning_rate": 2.4711596842744387e-05, "loss": 0.0001, "step": 5080 }, { "epoch": 5.557377049180328, "grad_norm": 7.287297194125131e-05, "learning_rate": 2.4681238615664845e-05, "loss": 0.0, "step": 5085 }, { "epoch": 5.562841530054644, "grad_norm": 0.00028358641429804265, "learning_rate": 2.4650880388585306e-05, "loss": 0.0, "step": 5090 }, { "epoch": 5.5683060109289615, "grad_norm": 0.0002642961626406759, "learning_rate": 2.4620522161505767e-05, "loss": 0.0, "step": 5095 }, { "epoch": 5.573770491803279, "grad_norm": 0.0006491028470918536, "learning_rate": 2.459016393442623e-05, "loss": 0.0, "step": 5100 }, { "epoch": 5.579234972677596, "grad_norm": 0.00018292821187060326, "learning_rate": 2.4559805707346693e-05, "loss": 0.0001, "step": 5105 }, { "epoch": 5.584699453551913, "grad_norm": 0.0009338250965811312, "learning_rate": 2.4529447480267154e-05, "loss": 0.0001, "step": 5110 }, { "epoch": 5.590163934426229, "grad_norm": 0.03507191687822342, "learning_rate": 2.4499089253187615e-05, "loss": 0.0001, "step": 5115 }, { "epoch": 5.595628415300546, "grad_norm": 8.738631731830537e-05, "learning_rate": 2.4468731026108076e-05, "loss": 0.0, "step": 5120 }, { "epoch": 5.601092896174864, "grad_norm": 7.589610322611406e-05, "learning_rate": 2.4438372799028537e-05, "loss": 0.0, "step": 5125 }, { "epoch": 5.60655737704918, "grad_norm": 0.0003264937549829483, "learning_rate": 2.4408014571949e-05, "loss": 0.0, "step": 5130 }, { "epoch": 5.612021857923497, "grad_norm": 0.00025950855342671275, "learning_rate": 2.437765634486946e-05, "loss": 0.0, "step": 5135 }, { "epoch": 5.617486338797814, "grad_norm": 0.00013693803339265287, "learning_rate": 2.434729811778992e-05, "loss": 0.0001, "step": 5140 }, { "epoch": 5.622950819672131, "grad_norm": 0.0001310525694862008, "learning_rate": 2.431693989071038e-05, "loss": 0.0, "step": 5145 }, { "epoch": 5.628415300546449, "grad_norm": 6.411856156773865e-05, "learning_rate": 2.4286581663630846e-05, "loss": 0.0, "step": 5150 }, { "epoch": 5.633879781420765, "grad_norm": 0.00021306371490936726, "learning_rate": 2.4256223436551307e-05, "loss": 0.0146, "step": 5155 }, { "epoch": 5.639344262295082, "grad_norm": 0.00023212283849716187, "learning_rate": 2.422586520947177e-05, "loss": 0.0001, "step": 5160 }, { "epoch": 5.644808743169399, "grad_norm": 0.0002247579104732722, "learning_rate": 2.419550698239223e-05, "loss": 0.0, "step": 5165 }, { "epoch": 5.6502732240437155, "grad_norm": 0.0003208070411346853, "learning_rate": 2.416514875531269e-05, "loss": 0.0, "step": 5170 }, { "epoch": 5.655737704918033, "grad_norm": 0.0003484230546746403, "learning_rate": 2.4134790528233152e-05, "loss": 0.0, "step": 5175 }, { "epoch": 5.66120218579235, "grad_norm": 0.010849053040146828, "learning_rate": 2.4104432301153616e-05, "loss": 0.0001, "step": 5180 }, { "epoch": 5.666666666666667, "grad_norm": 0.000284497975371778, "learning_rate": 2.4074074074074074e-05, "loss": 0.0831, "step": 5185 }, { "epoch": 5.672131147540983, "grad_norm": 0.0001850848348112777, "learning_rate": 2.4043715846994535e-05, "loss": 0.2457, "step": 5190 }, { "epoch": 5.6775956284153, "grad_norm": 0.0009068201761692762, "learning_rate": 2.4013357619914996e-05, "loss": 0.0, "step": 5195 }, { "epoch": 5.683060109289618, "grad_norm": 0.0002595623955130577, "learning_rate": 2.3982999392835457e-05, "loss": 0.0502, "step": 5200 }, { "epoch": 5.688524590163935, "grad_norm": 0.00040304780122824013, "learning_rate": 2.3952641165755922e-05, "loss": 0.003, "step": 5205 }, { "epoch": 5.693989071038251, "grad_norm": 0.0006999442121013999, "learning_rate": 2.3922282938676383e-05, "loss": 0.0, "step": 5210 }, { "epoch": 5.699453551912568, "grad_norm": 0.0008923182031139731, "learning_rate": 2.3891924711596844e-05, "loss": 0.0001, "step": 5215 }, { "epoch": 5.704918032786885, "grad_norm": 0.0028996258042752743, "learning_rate": 2.3861566484517305e-05, "loss": 0.0001, "step": 5220 }, { "epoch": 5.7103825136612025, "grad_norm": 0.011778953485190868, "learning_rate": 2.3831208257437767e-05, "loss": 0.0001, "step": 5225 }, { "epoch": 5.715846994535519, "grad_norm": 0.0009088412043638527, "learning_rate": 2.380085003035823e-05, "loss": 0.0004, "step": 5230 }, { "epoch": 5.721311475409836, "grad_norm": 0.0005468827439472079, "learning_rate": 2.377049180327869e-05, "loss": 0.0, "step": 5235 }, { "epoch": 5.726775956284153, "grad_norm": 0.0005143904127180576, "learning_rate": 2.374013357619915e-05, "loss": 0.0, "step": 5240 }, { "epoch": 5.73224043715847, "grad_norm": 0.012319618836045265, "learning_rate": 2.370977534911961e-05, "loss": 0.0001, "step": 5245 }, { "epoch": 5.737704918032787, "grad_norm": 0.00032935780473053455, "learning_rate": 2.3679417122040072e-05, "loss": 0.0, "step": 5250 }, { "epoch": 5.743169398907104, "grad_norm": 0.00028610375011339784, "learning_rate": 2.3649058894960537e-05, "loss": 0.0001, "step": 5255 }, { "epoch": 5.748633879781421, "grad_norm": 0.0014600688591599464, "learning_rate": 2.3618700667880998e-05, "loss": 0.0, "step": 5260 }, { "epoch": 5.754098360655737, "grad_norm": 0.0003803087747655809, "learning_rate": 2.358834244080146e-05, "loss": 0.0, "step": 5265 }, { "epoch": 5.759562841530054, "grad_norm": 0.0012569596292451024, "learning_rate": 2.355798421372192e-05, "loss": 0.0, "step": 5270 }, { "epoch": 5.7650273224043715, "grad_norm": 0.00016804836923256516, "learning_rate": 2.352762598664238e-05, "loss": 0.0001, "step": 5275 }, { "epoch": 5.770491803278689, "grad_norm": 0.00032320586615242064, "learning_rate": 2.3497267759562842e-05, "loss": 0.0, "step": 5280 }, { "epoch": 5.775956284153006, "grad_norm": 0.00043845665641129017, "learning_rate": 2.3466909532483307e-05, "loss": 0.0, "step": 5285 }, { "epoch": 5.781420765027322, "grad_norm": 0.0004221588606014848, "learning_rate": 2.3436551305403765e-05, "loss": 0.0, "step": 5290 }, { "epoch": 5.786885245901639, "grad_norm": 0.00026679132133722305, "learning_rate": 2.3406193078324226e-05, "loss": 0.0, "step": 5295 }, { "epoch": 5.7923497267759565, "grad_norm": 0.00012715437333099544, "learning_rate": 2.3375834851244687e-05, "loss": 0.0, "step": 5300 }, { "epoch": 5.797814207650273, "grad_norm": 0.00016335255349986255, "learning_rate": 2.3345476624165148e-05, "loss": 0.0, "step": 5305 }, { "epoch": 5.80327868852459, "grad_norm": 0.0005586635088548064, "learning_rate": 2.3315118397085612e-05, "loss": 0.0, "step": 5310 }, { "epoch": 5.808743169398907, "grad_norm": 0.0017712228000164032, "learning_rate": 2.3284760170006074e-05, "loss": 0.0, "step": 5315 }, { "epoch": 5.814207650273224, "grad_norm": 0.000200931157451123, "learning_rate": 2.3254401942926535e-05, "loss": 0.0, "step": 5320 }, { "epoch": 5.8196721311475414, "grad_norm": 0.00016768582281656563, "learning_rate": 2.3224043715846996e-05, "loss": 0.0, "step": 5325 }, { "epoch": 5.825136612021858, "grad_norm": 0.0001333783147856593, "learning_rate": 2.3193685488767457e-05, "loss": 0.0, "step": 5330 }, { "epoch": 5.830601092896175, "grad_norm": 0.00022269372129812837, "learning_rate": 2.316332726168792e-05, "loss": 0.0, "step": 5335 }, { "epoch": 5.836065573770492, "grad_norm": 0.00021091346570756286, "learning_rate": 2.313296903460838e-05, "loss": 0.0001, "step": 5340 }, { "epoch": 5.841530054644808, "grad_norm": 0.0034973840229213238, "learning_rate": 2.310261080752884e-05, "loss": 0.0, "step": 5345 }, { "epoch": 5.8469945355191255, "grad_norm": 0.00016680124099366367, "learning_rate": 2.30722525804493e-05, "loss": 0.0, "step": 5350 }, { "epoch": 5.852459016393443, "grad_norm": 0.00016794119437690824, "learning_rate": 2.3041894353369763e-05, "loss": 0.0, "step": 5355 }, { "epoch": 5.85792349726776, "grad_norm": 0.0001823761558625847, "learning_rate": 2.3011536126290227e-05, "loss": 0.0, "step": 5360 }, { "epoch": 5.863387978142076, "grad_norm": 0.00018940315931104124, "learning_rate": 2.2981177899210688e-05, "loss": 0.0, "step": 5365 }, { "epoch": 5.868852459016393, "grad_norm": 0.0001646766031626612, "learning_rate": 2.295081967213115e-05, "loss": 0.0, "step": 5370 }, { "epoch": 5.8743169398907105, "grad_norm": 0.00026264862390235066, "learning_rate": 2.292046144505161e-05, "loss": 0.0, "step": 5375 }, { "epoch": 5.879781420765028, "grad_norm": 0.00020386996038723737, "learning_rate": 2.289010321797207e-05, "loss": 0.0, "step": 5380 }, { "epoch": 5.885245901639344, "grad_norm": 0.00019918520411010832, "learning_rate": 2.2859744990892533e-05, "loss": 0.0, "step": 5385 }, { "epoch": 5.890710382513661, "grad_norm": 0.0001257824624190107, "learning_rate": 2.2829386763812994e-05, "loss": 0.0, "step": 5390 }, { "epoch": 5.896174863387978, "grad_norm": 0.00011864578118547797, "learning_rate": 2.2799028536733455e-05, "loss": 0.0, "step": 5395 }, { "epoch": 5.901639344262295, "grad_norm": 0.00027177087031304836, "learning_rate": 2.2768670309653916e-05, "loss": 0.0, "step": 5400 }, { "epoch": 5.907103825136612, "grad_norm": 0.014322335831820965, "learning_rate": 2.2738312082574377e-05, "loss": 0.0, "step": 5405 }, { "epoch": 5.912568306010929, "grad_norm": 0.00014552676293533295, "learning_rate": 2.270795385549484e-05, "loss": 0.0, "step": 5410 }, { "epoch": 5.918032786885246, "grad_norm": 0.0001333577383775264, "learning_rate": 2.2677595628415303e-05, "loss": 0.0, "step": 5415 }, { "epoch": 5.923497267759563, "grad_norm": 9.596958989277482e-05, "learning_rate": 2.2647237401335764e-05, "loss": 0.0, "step": 5420 }, { "epoch": 5.9289617486338795, "grad_norm": 0.0062270862981677055, "learning_rate": 2.2616879174256225e-05, "loss": 0.0, "step": 5425 }, { "epoch": 5.934426229508197, "grad_norm": 0.00010670346091501415, "learning_rate": 2.2586520947176686e-05, "loss": 0.0, "step": 5430 }, { "epoch": 5.939890710382514, "grad_norm": 0.0011700303293764591, "learning_rate": 2.2556162720097147e-05, "loss": 0.0, "step": 5435 }, { "epoch": 5.945355191256831, "grad_norm": 0.0001294987159781158, "learning_rate": 2.252580449301761e-05, "loss": 0.0, "step": 5440 }, { "epoch": 5.950819672131147, "grad_norm": 8.402008097618818e-05, "learning_rate": 2.249544626593807e-05, "loss": 0.0, "step": 5445 }, { "epoch": 5.956284153005464, "grad_norm": 6.773810309823602e-05, "learning_rate": 2.246508803885853e-05, "loss": 0.0, "step": 5450 }, { "epoch": 5.961748633879782, "grad_norm": 0.00012332104961387813, "learning_rate": 2.2434729811778992e-05, "loss": 0.0, "step": 5455 }, { "epoch": 5.967213114754099, "grad_norm": 9.364177822135389e-05, "learning_rate": 2.2404371584699453e-05, "loss": 0.0, "step": 5460 }, { "epoch": 5.972677595628415, "grad_norm": 0.0001126359638874419, "learning_rate": 2.2374013357619918e-05, "loss": 0.0, "step": 5465 }, { "epoch": 5.978142076502732, "grad_norm": 0.0001698069245321676, "learning_rate": 2.234365513054038e-05, "loss": 0.0, "step": 5470 }, { "epoch": 5.983606557377049, "grad_norm": 0.00012718628568109125, "learning_rate": 2.231329690346084e-05, "loss": 0.0, "step": 5475 }, { "epoch": 5.989071038251366, "grad_norm": 0.14746598899364471, "learning_rate": 2.22829386763813e-05, "loss": 0.0004, "step": 5480 }, { "epoch": 5.994535519125683, "grad_norm": 0.00011750194244086742, "learning_rate": 2.2252580449301762e-05, "loss": 0.0, "step": 5485 }, { "epoch": 6.0, "grad_norm": 0.00011425597767811269, "learning_rate": 2.2222222222222223e-05, "loss": 0.0047, "step": 5490 }, { "epoch": 6.0, "eval_loss": 0.0026069784071296453, "eval_runtime": 667.3626, "eval_samples_per_second": 10.963, "eval_steps_per_second": 1.371, "step": 5490 }, { "epoch": 6.005464480874317, "grad_norm": 0.011173485778272152, "learning_rate": 2.2191863995142684e-05, "loss": 0.0, "step": 5495 }, { "epoch": 6.0109289617486334, "grad_norm": 8.967368921730667e-05, "learning_rate": 2.2161505768063146e-05, "loss": 0.0, "step": 5500 }, { "epoch": 6.016393442622951, "grad_norm": 4.766190977534279e-05, "learning_rate": 2.2131147540983607e-05, "loss": 0.0, "step": 5505 }, { "epoch": 6.021857923497268, "grad_norm": 0.002877232152968645, "learning_rate": 2.2100789313904068e-05, "loss": 0.0, "step": 5510 }, { "epoch": 6.027322404371585, "grad_norm": 5.845217310707085e-05, "learning_rate": 2.2070431086824532e-05, "loss": 0.0, "step": 5515 }, { "epoch": 6.032786885245901, "grad_norm": 5.649280865327455e-05, "learning_rate": 2.2040072859744993e-05, "loss": 0.0, "step": 5520 }, { "epoch": 6.038251366120218, "grad_norm": 0.00013220343680586666, "learning_rate": 2.2009714632665455e-05, "loss": 0.0, "step": 5525 }, { "epoch": 6.043715846994536, "grad_norm": 5.454764686874114e-05, "learning_rate": 2.1979356405585916e-05, "loss": 0.0, "step": 5530 }, { "epoch": 6.049180327868853, "grad_norm": 7.190610631369054e-05, "learning_rate": 2.1948998178506377e-05, "loss": 0.0, "step": 5535 }, { "epoch": 6.054644808743169, "grad_norm": 0.00018310271843802184, "learning_rate": 2.1918639951426838e-05, "loss": 0.0, "step": 5540 }, { "epoch": 6.060109289617486, "grad_norm": 5.9327132476028055e-05, "learning_rate": 2.18882817243473e-05, "loss": 0.0, "step": 5545 }, { "epoch": 6.065573770491803, "grad_norm": 0.00018289749277755618, "learning_rate": 2.185792349726776e-05, "loss": 0.176, "step": 5550 }, { "epoch": 6.0710382513661205, "grad_norm": 0.000174393251654692, "learning_rate": 2.182756527018822e-05, "loss": 0.0, "step": 5555 }, { "epoch": 6.076502732240437, "grad_norm": 0.015763528645038605, "learning_rate": 2.1797207043108682e-05, "loss": 0.0001, "step": 5560 }, { "epoch": 6.081967213114754, "grad_norm": 0.0009447432239539921, "learning_rate": 2.1766848816029144e-05, "loss": 0.0, "step": 5565 }, { "epoch": 6.087431693989071, "grad_norm": 0.0006745553691871464, "learning_rate": 2.1736490588949608e-05, "loss": 0.0, "step": 5570 }, { "epoch": 6.092896174863388, "grad_norm": 0.0007902304059825838, "learning_rate": 2.170613236187007e-05, "loss": 0.0, "step": 5575 }, { "epoch": 6.098360655737705, "grad_norm": 0.003297103103250265, "learning_rate": 2.167577413479053e-05, "loss": 0.0, "step": 5580 }, { "epoch": 6.103825136612022, "grad_norm": 0.0008036267245188355, "learning_rate": 2.164541590771099e-05, "loss": 0.0002, "step": 5585 }, { "epoch": 6.109289617486339, "grad_norm": 0.0005939893890172243, "learning_rate": 2.161505768063145e-05, "loss": 0.0001, "step": 5590 }, { "epoch": 6.114754098360656, "grad_norm": 9.905810293275863e-05, "learning_rate": 2.1584699453551914e-05, "loss": 0.0, "step": 5595 }, { "epoch": 6.120218579234972, "grad_norm": 0.0003050408558920026, "learning_rate": 2.1554341226472375e-05, "loss": 0.0, "step": 5600 }, { "epoch": 6.1256830601092895, "grad_norm": 0.001176638645119965, "learning_rate": 2.1523982999392836e-05, "loss": 0.0001, "step": 5605 }, { "epoch": 6.131147540983607, "grad_norm": 0.0001844777725636959, "learning_rate": 2.1493624772313297e-05, "loss": 0.0, "step": 5610 }, { "epoch": 6.136612021857924, "grad_norm": 0.0003056370187550783, "learning_rate": 2.1463266545233758e-05, "loss": 0.0, "step": 5615 }, { "epoch": 6.14207650273224, "grad_norm": 0.01378646306693554, "learning_rate": 2.1432908318154223e-05, "loss": 0.0001, "step": 5620 }, { "epoch": 6.147540983606557, "grad_norm": 0.0001626972807571292, "learning_rate": 2.1402550091074684e-05, "loss": 0.0, "step": 5625 }, { "epoch": 6.1530054644808745, "grad_norm": 0.0002374864707235247, "learning_rate": 2.1372191863995145e-05, "loss": 0.0, "step": 5630 }, { "epoch": 6.158469945355192, "grad_norm": 0.0007631028420291841, "learning_rate": 2.1341833636915606e-05, "loss": 0.0, "step": 5635 }, { "epoch": 6.163934426229508, "grad_norm": 0.0007890752167440951, "learning_rate": 2.1311475409836064e-05, "loss": 0.0001, "step": 5640 }, { "epoch": 6.169398907103825, "grad_norm": 0.00014803047815803438, "learning_rate": 2.128111718275653e-05, "loss": 0.0, "step": 5645 }, { "epoch": 6.174863387978142, "grad_norm": 0.0001815713185351342, "learning_rate": 2.125075895567699e-05, "loss": 0.0, "step": 5650 }, { "epoch": 6.180327868852459, "grad_norm": 0.0002662826154846698, "learning_rate": 2.122040072859745e-05, "loss": 0.0, "step": 5655 }, { "epoch": 6.185792349726776, "grad_norm": 0.00023396898177452385, "learning_rate": 2.1190042501517912e-05, "loss": 0.0, "step": 5660 }, { "epoch": 6.191256830601093, "grad_norm": 0.0009009042987599969, "learning_rate": 2.1159684274438373e-05, "loss": 0.0, "step": 5665 }, { "epoch": 6.19672131147541, "grad_norm": 0.00019553759193513542, "learning_rate": 2.1129326047358834e-05, "loss": 0.0, "step": 5670 }, { "epoch": 6.202185792349727, "grad_norm": 0.0001129544252762571, "learning_rate": 2.10989678202793e-05, "loss": 0.0, "step": 5675 }, { "epoch": 6.2076502732240435, "grad_norm": 0.0001647875178605318, "learning_rate": 2.106860959319976e-05, "loss": 0.0, "step": 5680 }, { "epoch": 6.213114754098361, "grad_norm": 7.417155575240031e-05, "learning_rate": 2.103825136612022e-05, "loss": 0.0, "step": 5685 }, { "epoch": 6.218579234972678, "grad_norm": 4.739851283375174e-05, "learning_rate": 2.100789313904068e-05, "loss": 0.0, "step": 5690 }, { "epoch": 6.224043715846994, "grad_norm": 0.31114462018013, "learning_rate": 2.097753491196114e-05, "loss": 0.0014, "step": 5695 }, { "epoch": 6.229508196721311, "grad_norm": 0.00010993971227435395, "learning_rate": 2.0947176684881604e-05, "loss": 0.0, "step": 5700 }, { "epoch": 6.2349726775956285, "grad_norm": 0.00012361927656456828, "learning_rate": 2.0916818457802065e-05, "loss": 0.0, "step": 5705 }, { "epoch": 6.240437158469946, "grad_norm": 3.7131321732886136e-05, "learning_rate": 2.0886460230722527e-05, "loss": 0.0, "step": 5710 }, { "epoch": 6.245901639344262, "grad_norm": 8.455519855488092e-05, "learning_rate": 2.0856102003642988e-05, "loss": 0.0, "step": 5715 }, { "epoch": 6.251366120218579, "grad_norm": 4.8671347030904144e-05, "learning_rate": 2.082574377656345e-05, "loss": 0.0, "step": 5720 }, { "epoch": 6.256830601092896, "grad_norm": 2.6576733944239095e-05, "learning_rate": 2.0795385549483913e-05, "loss": 0.0, "step": 5725 }, { "epoch": 6.262295081967213, "grad_norm": 0.00012619678454939276, "learning_rate": 2.0765027322404374e-05, "loss": 0.0, "step": 5730 }, { "epoch": 6.26775956284153, "grad_norm": 0.00012123715714551508, "learning_rate": 2.0734669095324836e-05, "loss": 0.0, "step": 5735 }, { "epoch": 6.273224043715847, "grad_norm": 6.571730773430318e-05, "learning_rate": 2.0704310868245293e-05, "loss": 0.0, "step": 5740 }, { "epoch": 6.278688524590164, "grad_norm": 0.00016342464368790388, "learning_rate": 2.0673952641165754e-05, "loss": 0.0, "step": 5745 }, { "epoch": 6.284153005464481, "grad_norm": 7.798125443514436e-05, "learning_rate": 2.064359441408622e-05, "loss": 0.0, "step": 5750 }, { "epoch": 6.2896174863387975, "grad_norm": 4.3517509766388685e-05, "learning_rate": 2.061323618700668e-05, "loss": 0.0, "step": 5755 }, { "epoch": 6.295081967213115, "grad_norm": 5.0315840780967847e-05, "learning_rate": 2.058287795992714e-05, "loss": 0.0, "step": 5760 }, { "epoch": 6.300546448087432, "grad_norm": 5.3943444072501734e-05, "learning_rate": 2.0552519732847602e-05, "loss": 0.0, "step": 5765 }, { "epoch": 6.306010928961749, "grad_norm": 0.00030482446891255677, "learning_rate": 2.0522161505768063e-05, "loss": 0.0, "step": 5770 }, { "epoch": 6.311475409836065, "grad_norm": 6.828513869550079e-05, "learning_rate": 2.0491803278688525e-05, "loss": 0.0, "step": 5775 }, { "epoch": 6.316939890710382, "grad_norm": 9.909820801112801e-05, "learning_rate": 2.046144505160899e-05, "loss": 0.0, "step": 5780 }, { "epoch": 6.3224043715847, "grad_norm": 9.688820864539593e-05, "learning_rate": 2.043108682452945e-05, "loss": 0.0, "step": 5785 }, { "epoch": 6.327868852459017, "grad_norm": 3.0218470783438534e-05, "learning_rate": 2.040072859744991e-05, "loss": 0.0, "step": 5790 }, { "epoch": 6.333333333333333, "grad_norm": 0.002815735526382923, "learning_rate": 2.037037037037037e-05, "loss": 0.0, "step": 5795 }, { "epoch": 6.33879781420765, "grad_norm": 5.187254646443762e-05, "learning_rate": 2.0340012143290834e-05, "loss": 0.0, "step": 5800 }, { "epoch": 6.344262295081967, "grad_norm": 3.162359644193202e-05, "learning_rate": 2.0309653916211295e-05, "loss": 0.0, "step": 5805 }, { "epoch": 6.3497267759562845, "grad_norm": 0.0004508834390435368, "learning_rate": 2.0279295689131756e-05, "loss": 0.0, "step": 5810 }, { "epoch": 6.355191256830601, "grad_norm": 5.048835009802133e-05, "learning_rate": 2.0248937462052217e-05, "loss": 0.0, "step": 5815 }, { "epoch": 6.360655737704918, "grad_norm": 4.7119614464463666e-05, "learning_rate": 2.0218579234972678e-05, "loss": 0.0, "step": 5820 }, { "epoch": 6.366120218579235, "grad_norm": 6.0273188864812255e-05, "learning_rate": 2.018822100789314e-05, "loss": 0.0001, "step": 5825 }, { "epoch": 6.371584699453552, "grad_norm": 3.4625056287040934e-05, "learning_rate": 2.0157862780813604e-05, "loss": 0.0, "step": 5830 }, { "epoch": 6.377049180327869, "grad_norm": 0.00014075507351662964, "learning_rate": 2.0127504553734065e-05, "loss": 0.0, "step": 5835 }, { "epoch": 6.382513661202186, "grad_norm": 0.0014786600368097425, "learning_rate": 2.0097146326654526e-05, "loss": 0.0, "step": 5840 }, { "epoch": 6.387978142076503, "grad_norm": 3.1873085390543565e-05, "learning_rate": 2.0066788099574984e-05, "loss": 0.0, "step": 5845 }, { "epoch": 6.39344262295082, "grad_norm": 5.925606092205271e-05, "learning_rate": 2.0036429872495445e-05, "loss": 0.0, "step": 5850 }, { "epoch": 6.398907103825136, "grad_norm": 7.679805275984108e-05, "learning_rate": 2.000607164541591e-05, "loss": 0.0, "step": 5855 }, { "epoch": 6.404371584699454, "grad_norm": 0.00013544733519665897, "learning_rate": 1.997571341833637e-05, "loss": 0.0, "step": 5860 }, { "epoch": 6.409836065573771, "grad_norm": 0.0003026534104719758, "learning_rate": 1.994535519125683e-05, "loss": 0.0, "step": 5865 }, { "epoch": 6.415300546448087, "grad_norm": 0.00010233583452645689, "learning_rate": 1.9914996964177293e-05, "loss": 0.0, "step": 5870 }, { "epoch": 6.420765027322404, "grad_norm": 0.0002885489957407117, "learning_rate": 1.9884638737097754e-05, "loss": 0.0, "step": 5875 }, { "epoch": 6.426229508196721, "grad_norm": 7.025294326012954e-05, "learning_rate": 1.9854280510018215e-05, "loss": 0.0, "step": 5880 }, { "epoch": 6.4316939890710385, "grad_norm": 5.036436778027564e-05, "learning_rate": 1.982392228293868e-05, "loss": 0.0, "step": 5885 }, { "epoch": 6.437158469945355, "grad_norm": 4.766888378071599e-05, "learning_rate": 1.979356405585914e-05, "loss": 0.0, "step": 5890 }, { "epoch": 6.442622950819672, "grad_norm": 2.0245690393494442e-05, "learning_rate": 1.97632058287796e-05, "loss": 0.0, "step": 5895 }, { "epoch": 6.448087431693989, "grad_norm": 0.0019478934118524194, "learning_rate": 1.973284760170006e-05, "loss": 0.0001, "step": 5900 }, { "epoch": 6.453551912568306, "grad_norm": 5.526033783098683e-05, "learning_rate": 1.9702489374620524e-05, "loss": 0.0, "step": 5905 }, { "epoch": 6.459016393442623, "grad_norm": 4.0888528019422665e-05, "learning_rate": 1.9672131147540985e-05, "loss": 0.0, "step": 5910 }, { "epoch": 6.46448087431694, "grad_norm": 4.284735405235551e-05, "learning_rate": 1.9641772920461446e-05, "loss": 0.0, "step": 5915 }, { "epoch": 6.469945355191257, "grad_norm": 3.235202530049719e-05, "learning_rate": 1.9611414693381907e-05, "loss": 0.0, "step": 5920 }, { "epoch": 6.475409836065574, "grad_norm": 8.918684761738405e-05, "learning_rate": 1.958105646630237e-05, "loss": 0.0, "step": 5925 }, { "epoch": 6.48087431693989, "grad_norm": 2.8192551326355897e-05, "learning_rate": 1.955069823922283e-05, "loss": 0.0, "step": 5930 }, { "epoch": 6.4863387978142075, "grad_norm": 6.065002526156604e-05, "learning_rate": 1.9520340012143294e-05, "loss": 0.0, "step": 5935 }, { "epoch": 6.491803278688525, "grad_norm": 8.560925198253244e-05, "learning_rate": 1.9489981785063755e-05, "loss": 0.0, "step": 5940 }, { "epoch": 6.497267759562842, "grad_norm": 3.3522384910611436e-05, "learning_rate": 1.9459623557984213e-05, "loss": 0.0, "step": 5945 }, { "epoch": 6.502732240437158, "grad_norm": 8.88680006028153e-05, "learning_rate": 1.9429265330904674e-05, "loss": 0.0, "step": 5950 }, { "epoch": 6.508196721311475, "grad_norm": 6.100683458498679e-05, "learning_rate": 1.9398907103825135e-05, "loss": 0.0, "step": 5955 }, { "epoch": 6.5136612021857925, "grad_norm": 1.864024488895666e-05, "learning_rate": 1.93685488767456e-05, "loss": 0.0, "step": 5960 }, { "epoch": 6.51912568306011, "grad_norm": 0.00014385611575562507, "learning_rate": 1.933819064966606e-05, "loss": 0.0, "step": 5965 }, { "epoch": 6.524590163934426, "grad_norm": 3.123824717476964e-05, "learning_rate": 1.9307832422586522e-05, "loss": 0.0, "step": 5970 }, { "epoch": 6.530054644808743, "grad_norm": 2.9815590096404776e-05, "learning_rate": 1.9277474195506983e-05, "loss": 0.0, "step": 5975 }, { "epoch": 6.53551912568306, "grad_norm": 2.5311090212198906e-05, "learning_rate": 1.9247115968427444e-05, "loss": 0.0, "step": 5980 }, { "epoch": 6.540983606557377, "grad_norm": 6.320253305602819e-05, "learning_rate": 1.9216757741347906e-05, "loss": 0.0, "step": 5985 }, { "epoch": 6.546448087431694, "grad_norm": 6.124599894974381e-05, "learning_rate": 1.918639951426837e-05, "loss": 0.0, "step": 5990 }, { "epoch": 6.551912568306011, "grad_norm": 0.0008633544784970582, "learning_rate": 1.9156041287188828e-05, "loss": 0.0, "step": 5995 }, { "epoch": 6.557377049180328, "grad_norm": 6.350692274281755e-05, "learning_rate": 1.912568306010929e-05, "loss": 0.0, "step": 6000 }, { "epoch": 6.562841530054644, "grad_norm": 3.356828892719932e-05, "learning_rate": 1.909532483302975e-05, "loss": 0.0, "step": 6005 }, { "epoch": 6.5683060109289615, "grad_norm": 0.00026475227787159383, "learning_rate": 1.9064966605950215e-05, "loss": 0.0, "step": 6010 }, { "epoch": 6.573770491803279, "grad_norm": 2.997915908053983e-05, "learning_rate": 1.9034608378870676e-05, "loss": 0.0, "step": 6015 }, { "epoch": 6.579234972677596, "grad_norm": 4.210296174278483e-05, "learning_rate": 1.9004250151791137e-05, "loss": 0.0, "step": 6020 }, { "epoch": 6.584699453551913, "grad_norm": 4.2359999497421086e-05, "learning_rate": 1.8973891924711598e-05, "loss": 0.0, "step": 6025 }, { "epoch": 6.590163934426229, "grad_norm": 4.0853054088074714e-05, "learning_rate": 1.894353369763206e-05, "loss": 0.0, "step": 6030 }, { "epoch": 6.595628415300546, "grad_norm": 1.567585968587082e-05, "learning_rate": 1.891317547055252e-05, "loss": 0.0, "step": 6035 }, { "epoch": 6.601092896174864, "grad_norm": 3.672630919027142e-05, "learning_rate": 1.8882817243472985e-05, "loss": 0.0, "step": 6040 }, { "epoch": 6.60655737704918, "grad_norm": 2.127391599060502e-05, "learning_rate": 1.8852459016393442e-05, "loss": 0.0, "step": 6045 }, { "epoch": 6.612021857923497, "grad_norm": 4.033459481433965e-05, "learning_rate": 1.8822100789313904e-05, "loss": 0.0, "step": 6050 }, { "epoch": 6.617486338797814, "grad_norm": 3.772424315684475e-05, "learning_rate": 1.8791742562234365e-05, "loss": 0.0, "step": 6055 }, { "epoch": 6.622950819672131, "grad_norm": 4.5684719225391746e-05, "learning_rate": 1.8761384335154826e-05, "loss": 0.0, "step": 6060 }, { "epoch": 6.628415300546449, "grad_norm": 4.037997859995812e-05, "learning_rate": 1.873102610807529e-05, "loss": 0.0, "step": 6065 }, { "epoch": 6.633879781420765, "grad_norm": 2.9085753340041265e-05, "learning_rate": 1.870066788099575e-05, "loss": 0.0, "step": 6070 }, { "epoch": 6.639344262295082, "grad_norm": 4.5837143261451274e-05, "learning_rate": 1.8670309653916213e-05, "loss": 0.0, "step": 6075 }, { "epoch": 6.644808743169399, "grad_norm": 2.149335887224879e-05, "learning_rate": 1.8639951426836674e-05, "loss": 0.0, "step": 6080 }, { "epoch": 6.6502732240437155, "grad_norm": 2.6230225557810627e-05, "learning_rate": 1.8609593199757135e-05, "loss": 0.0, "step": 6085 }, { "epoch": 6.655737704918033, "grad_norm": 5.128745760885067e-05, "learning_rate": 1.85792349726776e-05, "loss": 0.0, "step": 6090 }, { "epoch": 6.66120218579235, "grad_norm": 2.8409345759428106e-05, "learning_rate": 1.8548876745598057e-05, "loss": 0.0, "step": 6095 }, { "epoch": 6.666666666666667, "grad_norm": 1.829441862355452e-05, "learning_rate": 1.8518518518518518e-05, "loss": 0.0, "step": 6100 }, { "epoch": 6.672131147540983, "grad_norm": 2.8088525141356513e-05, "learning_rate": 1.848816029143898e-05, "loss": 0.0, "step": 6105 }, { "epoch": 6.6775956284153, "grad_norm": 0.00019966061518061906, "learning_rate": 1.845780206435944e-05, "loss": 0.0, "step": 6110 }, { "epoch": 6.683060109289618, "grad_norm": 3.2912070309976116e-05, "learning_rate": 1.8427443837279905e-05, "loss": 0.0, "step": 6115 }, { "epoch": 6.688524590163935, "grad_norm": 3.402951915632002e-05, "learning_rate": 1.8397085610200366e-05, "loss": 0.0, "step": 6120 }, { "epoch": 6.693989071038251, "grad_norm": 5.13341037731152e-05, "learning_rate": 1.8366727383120827e-05, "loss": 0.0, "step": 6125 }, { "epoch": 6.699453551912568, "grad_norm": 3.986077354056761e-05, "learning_rate": 1.833636915604129e-05, "loss": 0.0, "step": 6130 }, { "epoch": 6.704918032786885, "grad_norm": 4.487119076657109e-05, "learning_rate": 1.830601092896175e-05, "loss": 0.0, "step": 6135 }, { "epoch": 6.7103825136612025, "grad_norm": 1.591966429259628e-05, "learning_rate": 1.827565270188221e-05, "loss": 0.0, "step": 6140 }, { "epoch": 6.715846994535519, "grad_norm": 2.8471571567934006e-05, "learning_rate": 1.8245294474802672e-05, "loss": 0.0, "step": 6145 }, { "epoch": 6.721311475409836, "grad_norm": 2.545778625062667e-05, "learning_rate": 1.8214936247723133e-05, "loss": 0.0, "step": 6150 }, { "epoch": 6.726775956284153, "grad_norm": 4.5180757297202945e-05, "learning_rate": 1.8184578020643594e-05, "loss": 0.0, "step": 6155 }, { "epoch": 6.73224043715847, "grad_norm": 3.133527570753358e-05, "learning_rate": 1.8154219793564055e-05, "loss": 0.0, "step": 6160 }, { "epoch": 6.737704918032787, "grad_norm": 2.0645300537580624e-05, "learning_rate": 1.8123861566484516e-05, "loss": 0.0, "step": 6165 }, { "epoch": 6.743169398907104, "grad_norm": 3.279056545579806e-05, "learning_rate": 1.809350333940498e-05, "loss": 0.0, "step": 6170 }, { "epoch": 6.748633879781421, "grad_norm": 3.055525303352624e-05, "learning_rate": 1.8063145112325442e-05, "loss": 0.0, "step": 6175 }, { "epoch": 6.754098360655737, "grad_norm": 1.9079752746620215e-05, "learning_rate": 1.8032786885245903e-05, "loss": 0.0, "step": 6180 }, { "epoch": 6.759562841530054, "grad_norm": 0.00016503347433172166, "learning_rate": 1.8002428658166364e-05, "loss": 0.0, "step": 6185 }, { "epoch": 6.7650273224043715, "grad_norm": 2.6253288524458185e-05, "learning_rate": 1.7972070431086825e-05, "loss": 0.0, "step": 6190 }, { "epoch": 6.770491803278689, "grad_norm": 2.5179529984598048e-05, "learning_rate": 1.7941712204007287e-05, "loss": 0.0, "step": 6195 }, { "epoch": 6.775956284153006, "grad_norm": 3.118176755378954e-05, "learning_rate": 1.7911353976927748e-05, "loss": 0.0, "step": 6200 }, { "epoch": 6.781420765027322, "grad_norm": 2.4124465198838152e-05, "learning_rate": 1.788099574984821e-05, "loss": 0.0, "step": 6205 }, { "epoch": 6.786885245901639, "grad_norm": 0.2607771158218384, "learning_rate": 1.785063752276867e-05, "loss": 0.0002, "step": 6210 }, { "epoch": 6.7923497267759565, "grad_norm": 1.626986158953514e-05, "learning_rate": 1.782027929568913e-05, "loss": 0.0, "step": 6215 }, { "epoch": 6.797814207650273, "grad_norm": 0.0010127287823706865, "learning_rate": 1.7789921068609596e-05, "loss": 0.0, "step": 6220 }, { "epoch": 6.80327868852459, "grad_norm": 3.336209192639217e-05, "learning_rate": 1.7759562841530057e-05, "loss": 0.0, "step": 6225 }, { "epoch": 6.808743169398907, "grad_norm": 3.142928471788764e-05, "learning_rate": 1.7729204614450518e-05, "loss": 0.0, "step": 6230 }, { "epoch": 6.814207650273224, "grad_norm": 2.5947849280782975e-05, "learning_rate": 1.769884638737098e-05, "loss": 0.0, "step": 6235 }, { "epoch": 6.8196721311475414, "grad_norm": 0.001310658873990178, "learning_rate": 1.766848816029144e-05, "loss": 0.0, "step": 6240 }, { "epoch": 6.825136612021858, "grad_norm": 7.160162931540981e-05, "learning_rate": 1.76381299332119e-05, "loss": 0.0, "step": 6245 }, { "epoch": 6.830601092896175, "grad_norm": 2.0879228031844832e-05, "learning_rate": 1.7607771706132362e-05, "loss": 0.0, "step": 6250 }, { "epoch": 6.836065573770492, "grad_norm": 9.69652392086573e-05, "learning_rate": 1.7577413479052823e-05, "loss": 0.0, "step": 6255 }, { "epoch": 6.841530054644808, "grad_norm": 1.9431065084063448e-05, "learning_rate": 1.7547055251973285e-05, "loss": 0.0, "step": 6260 }, { "epoch": 6.8469945355191255, "grad_norm": 5.2403873269213364e-05, "learning_rate": 1.7516697024893746e-05, "loss": 0.0, "step": 6265 }, { "epoch": 6.852459016393443, "grad_norm": 8.865645213518292e-05, "learning_rate": 1.7486338797814207e-05, "loss": 0.0, "step": 6270 }, { "epoch": 6.85792349726776, "grad_norm": 2.442375807731878e-05, "learning_rate": 1.745598057073467e-05, "loss": 0.0, "step": 6275 }, { "epoch": 6.863387978142076, "grad_norm": 2.5425528292544186e-05, "learning_rate": 1.7425622343655132e-05, "loss": 0.0, "step": 6280 }, { "epoch": 6.868852459016393, "grad_norm": 5.923685966990888e-05, "learning_rate": 1.7395264116575594e-05, "loss": 0.0, "step": 6285 }, { "epoch": 6.8743169398907105, "grad_norm": 2.9877415727241896e-05, "learning_rate": 1.7364905889496055e-05, "loss": 0.0, "step": 6290 }, { "epoch": 6.879781420765028, "grad_norm": 6.036084596416913e-05, "learning_rate": 1.7334547662416516e-05, "loss": 0.0001, "step": 6295 }, { "epoch": 6.885245901639344, "grad_norm": 3.035015834029764e-05, "learning_rate": 1.7304189435336977e-05, "loss": 0.0, "step": 6300 }, { "epoch": 6.890710382513661, "grad_norm": 1.9475093722576275e-05, "learning_rate": 1.7273831208257438e-05, "loss": 0.0, "step": 6305 }, { "epoch": 6.896174863387978, "grad_norm": 2.220989335910417e-05, "learning_rate": 1.72434729811779e-05, "loss": 0.0, "step": 6310 }, { "epoch": 6.901639344262295, "grad_norm": 2.0596038666553795e-05, "learning_rate": 1.721311475409836e-05, "loss": 0.0, "step": 6315 }, { "epoch": 6.907103825136612, "grad_norm": 0.00045092753134667873, "learning_rate": 1.718275652701882e-05, "loss": 0.0, "step": 6320 }, { "epoch": 6.912568306010929, "grad_norm": 1.8349317542742938e-05, "learning_rate": 1.7152398299939286e-05, "loss": 0.0, "step": 6325 }, { "epoch": 6.918032786885246, "grad_norm": 1.6973086530924775e-05, "learning_rate": 1.7122040072859747e-05, "loss": 0.0, "step": 6330 }, { "epoch": 6.923497267759563, "grad_norm": 3.073631160077639e-05, "learning_rate": 1.7091681845780208e-05, "loss": 0.0, "step": 6335 }, { "epoch": 6.9289617486338795, "grad_norm": 1.8282253222423606e-05, "learning_rate": 1.706132361870067e-05, "loss": 0.0, "step": 6340 }, { "epoch": 6.934426229508197, "grad_norm": 1.3297793884703424e-05, "learning_rate": 1.703096539162113e-05, "loss": 0.0, "step": 6345 }, { "epoch": 6.939890710382514, "grad_norm": 1.68807619047584e-05, "learning_rate": 1.700060716454159e-05, "loss": 0.0, "step": 6350 }, { "epoch": 6.945355191256831, "grad_norm": 1.75243585545104e-05, "learning_rate": 1.6970248937462053e-05, "loss": 0.0, "step": 6355 }, { "epoch": 6.950819672131147, "grad_norm": 1.8891807485488243e-05, "learning_rate": 1.6939890710382514e-05, "loss": 0.0, "step": 6360 }, { "epoch": 6.956284153005464, "grad_norm": 1.1831501069536898e-05, "learning_rate": 1.6909532483302975e-05, "loss": 0.0, "step": 6365 }, { "epoch": 6.961748633879782, "grad_norm": 3.065099008381367e-05, "learning_rate": 1.6879174256223436e-05, "loss": 0.0, "step": 6370 }, { "epoch": 6.967213114754099, "grad_norm": 2.236567161162384e-05, "learning_rate": 1.68488160291439e-05, "loss": 0.0, "step": 6375 }, { "epoch": 6.972677595628415, "grad_norm": 2.7795680580311455e-05, "learning_rate": 1.6818457802064362e-05, "loss": 0.0, "step": 6380 }, { "epoch": 6.978142076502732, "grad_norm": 4.299731517676264e-05, "learning_rate": 1.6788099574984823e-05, "loss": 0.0, "step": 6385 }, { "epoch": 6.983606557377049, "grad_norm": 2.10926154977642e-05, "learning_rate": 1.6757741347905284e-05, "loss": 0.0, "step": 6390 }, { "epoch": 6.989071038251366, "grad_norm": 3.6098430427955464e-05, "learning_rate": 1.6727383120825745e-05, "loss": 0.0, "step": 6395 }, { "epoch": 6.994535519125683, "grad_norm": 1.9995864931843244e-05, "learning_rate": 1.6697024893746206e-05, "loss": 0.0, "step": 6400 }, { "epoch": 7.0, "grad_norm": 1.6834292182466015e-05, "learning_rate": 1.6666666666666667e-05, "loss": 0.0, "step": 6405 }, { "epoch": 7.0, "eval_loss": 0.0011112982174381614, "eval_runtime": 697.2626, "eval_samples_per_second": 10.492, "eval_steps_per_second": 1.312, "step": 6405 }, { "epoch": 7.005464480874317, "grad_norm": 1.7212223610840738e-05, "learning_rate": 1.663630843958713e-05, "loss": 0.0, "step": 6410 }, { "epoch": 7.0109289617486334, "grad_norm": 2.5316898245364428e-05, "learning_rate": 1.660595021250759e-05, "loss": 0.0, "step": 6415 }, { "epoch": 7.016393442622951, "grad_norm": 1.4173186173138674e-05, "learning_rate": 1.657559198542805e-05, "loss": 0.0, "step": 6420 }, { "epoch": 7.021857923497268, "grad_norm": 1.2397636965033598e-05, "learning_rate": 1.6545233758348512e-05, "loss": 0.0, "step": 6425 }, { "epoch": 7.027322404371585, "grad_norm": 8.636420716356952e-06, "learning_rate": 1.6514875531268976e-05, "loss": 0.0, "step": 6430 }, { "epoch": 7.032786885245901, "grad_norm": 1.855434493336361e-05, "learning_rate": 1.6484517304189438e-05, "loss": 0.0, "step": 6435 }, { "epoch": 7.038251366120218, "grad_norm": 2.6530422474024817e-05, "learning_rate": 1.64541590771099e-05, "loss": 0.0, "step": 6440 }, { "epoch": 7.043715846994536, "grad_norm": 1.7378168195136823e-05, "learning_rate": 1.642380085003036e-05, "loss": 0.0, "step": 6445 }, { "epoch": 7.049180327868853, "grad_norm": 2.0945200958522037e-05, "learning_rate": 1.6393442622950818e-05, "loss": 0.0, "step": 6450 }, { "epoch": 7.054644808743169, "grad_norm": 9.741668327478692e-05, "learning_rate": 1.6363084395871282e-05, "loss": 0.0, "step": 6455 }, { "epoch": 7.060109289617486, "grad_norm": 1.0040616871265229e-05, "learning_rate": 1.6332726168791743e-05, "loss": 0.0, "step": 6460 }, { "epoch": 7.065573770491803, "grad_norm": 1.7955384464585222e-05, "learning_rate": 1.6302367941712204e-05, "loss": 0.0, "step": 6465 }, { "epoch": 7.0710382513661205, "grad_norm": 1.3347224921744782e-05, "learning_rate": 1.6272009714632666e-05, "loss": 0.0, "step": 6470 }, { "epoch": 7.076502732240437, "grad_norm": 2.3174603484221734e-05, "learning_rate": 1.6241651487553127e-05, "loss": 0.0, "step": 6475 }, { "epoch": 7.081967213114754, "grad_norm": 0.00023038995277602226, "learning_rate": 1.621129326047359e-05, "loss": 0.0, "step": 6480 }, { "epoch": 7.087431693989071, "grad_norm": 1.3241274245956447e-05, "learning_rate": 1.6180935033394052e-05, "loss": 0.0, "step": 6485 }, { "epoch": 7.092896174863388, "grad_norm": 2.079802652588114e-05, "learning_rate": 1.6150576806314513e-05, "loss": 0.0, "step": 6490 }, { "epoch": 7.098360655737705, "grad_norm": 2.717071765800938e-05, "learning_rate": 1.6120218579234975e-05, "loss": 0.0, "step": 6495 }, { "epoch": 7.103825136612022, "grad_norm": 0.0020049286540597677, "learning_rate": 1.6089860352155432e-05, "loss": 0.0, "step": 6500 }, { "epoch": 7.109289617486339, "grad_norm": 2.1362251573009416e-05, "learning_rate": 1.6059502125075897e-05, "loss": 0.0, "step": 6505 }, { "epoch": 7.114754098360656, "grad_norm": 1.807320222724229e-05, "learning_rate": 1.6029143897996358e-05, "loss": 0.0, "step": 6510 }, { "epoch": 7.120218579234972, "grad_norm": 1.5250131582433823e-05, "learning_rate": 1.599878567091682e-05, "loss": 0.0, "step": 6515 }, { "epoch": 7.1256830601092895, "grad_norm": 2.2128344426164404e-05, "learning_rate": 1.596842744383728e-05, "loss": 0.0, "step": 6520 }, { "epoch": 7.131147540983607, "grad_norm": 2.122944533766713e-05, "learning_rate": 1.593806921675774e-05, "loss": 0.0, "step": 6525 }, { "epoch": 7.136612021857924, "grad_norm": 1.889590930659324e-05, "learning_rate": 1.5907710989678202e-05, "loss": 0.0, "step": 6530 }, { "epoch": 7.14207650273224, "grad_norm": 1.700487337075174e-05, "learning_rate": 1.5877352762598667e-05, "loss": 0.0, "step": 6535 }, { "epoch": 7.147540983606557, "grad_norm": 1.6051513739512302e-05, "learning_rate": 1.5846994535519128e-05, "loss": 0.0, "step": 6540 }, { "epoch": 7.1530054644808745, "grad_norm": 5.704201976186596e-05, "learning_rate": 1.581663630843959e-05, "loss": 0.0, "step": 6545 }, { "epoch": 7.158469945355192, "grad_norm": 1.7098071111831814e-05, "learning_rate": 1.5786278081360047e-05, "loss": 0.0, "step": 6550 }, { "epoch": 7.163934426229508, "grad_norm": 0.00030831946060061455, "learning_rate": 1.5755919854280508e-05, "loss": 0.0, "step": 6555 }, { "epoch": 7.169398907103825, "grad_norm": 8.064251596806571e-05, "learning_rate": 1.5725561627200973e-05, "loss": 0.0, "step": 6560 }, { "epoch": 7.174863387978142, "grad_norm": 2.111588401021436e-05, "learning_rate": 1.5695203400121434e-05, "loss": 0.0, "step": 6565 }, { "epoch": 7.180327868852459, "grad_norm": 2.722311364777852e-05, "learning_rate": 1.5664845173041895e-05, "loss": 0.0, "step": 6570 }, { "epoch": 7.185792349726776, "grad_norm": 2.8191749152028933e-05, "learning_rate": 1.5634486945962356e-05, "loss": 0.0, "step": 6575 }, { "epoch": 7.191256830601093, "grad_norm": 5.410460289567709e-05, "learning_rate": 1.5604128718882817e-05, "loss": 0.0, "step": 6580 }, { "epoch": 7.19672131147541, "grad_norm": 0.0008463920094072819, "learning_rate": 1.557377049180328e-05, "loss": 0.0, "step": 6585 }, { "epoch": 7.202185792349727, "grad_norm": 1.6411495380452834e-05, "learning_rate": 1.5543412264723743e-05, "loss": 0.0, "step": 6590 }, { "epoch": 7.2076502732240435, "grad_norm": 4.104662366444245e-05, "learning_rate": 1.5513054037644204e-05, "loss": 0.0, "step": 6595 }, { "epoch": 7.213114754098361, "grad_norm": 2.1741001546615735e-05, "learning_rate": 1.548269581056466e-05, "loss": 0.0, "step": 6600 }, { "epoch": 7.218579234972678, "grad_norm": 2.477094494679477e-05, "learning_rate": 1.5452337583485123e-05, "loss": 0.0, "step": 6605 }, { "epoch": 7.224043715846994, "grad_norm": 1.5612909919582307e-05, "learning_rate": 1.5421979356405587e-05, "loss": 0.0, "step": 6610 }, { "epoch": 7.229508196721311, "grad_norm": 1.5115789210540242e-05, "learning_rate": 1.539162112932605e-05, "loss": 0.0, "step": 6615 }, { "epoch": 7.2349726775956285, "grad_norm": 2.655894604686182e-05, "learning_rate": 1.536126290224651e-05, "loss": 0.0, "step": 6620 }, { "epoch": 7.240437158469946, "grad_norm": 2.9735285352217034e-05, "learning_rate": 1.533090467516697e-05, "loss": 0.0, "step": 6625 }, { "epoch": 7.245901639344262, "grad_norm": 0.0004776114656124264, "learning_rate": 1.5300546448087432e-05, "loss": 0.0, "step": 6630 }, { "epoch": 7.251366120218579, "grad_norm": 1.4129350347502623e-05, "learning_rate": 1.5270188221007893e-05, "loss": 0.0, "step": 6635 }, { "epoch": 7.256830601092896, "grad_norm": 4.216094748699106e-05, "learning_rate": 1.5239829993928356e-05, "loss": 0.0, "step": 6640 }, { "epoch": 7.262295081967213, "grad_norm": 2.8377304261084646e-05, "learning_rate": 1.5209471766848819e-05, "loss": 0.0, "step": 6645 }, { "epoch": 7.26775956284153, "grad_norm": 1.1897628610313404e-05, "learning_rate": 1.5179113539769276e-05, "loss": 0.0, "step": 6650 }, { "epoch": 7.273224043715847, "grad_norm": 3.0133123800624162e-05, "learning_rate": 1.514875531268974e-05, "loss": 0.0, "step": 6655 }, { "epoch": 7.278688524590164, "grad_norm": 1.8536607967689633e-05, "learning_rate": 1.51183970856102e-05, "loss": 0.0, "step": 6660 }, { "epoch": 7.284153005464481, "grad_norm": 4.778361471835524e-05, "learning_rate": 1.5088038858530661e-05, "loss": 0.0, "step": 6665 }, { "epoch": 7.2896174863387975, "grad_norm": 1.5681132936151698e-05, "learning_rate": 1.5057680631451124e-05, "loss": 0.0, "step": 6670 }, { "epoch": 7.295081967213115, "grad_norm": 2.1964269762975164e-05, "learning_rate": 1.5027322404371585e-05, "loss": 0.0, "step": 6675 }, { "epoch": 7.300546448087432, "grad_norm": 0.0007649378385394812, "learning_rate": 1.4996964177292046e-05, "loss": 0.0, "step": 6680 }, { "epoch": 7.306010928961749, "grad_norm": 1.5521494788117707e-05, "learning_rate": 1.496660595021251e-05, "loss": 0.0, "step": 6685 }, { "epoch": 7.311475409836065, "grad_norm": 1.1767200703616254e-05, "learning_rate": 1.493624772313297e-05, "loss": 0.0, "step": 6690 }, { "epoch": 7.316939890710382, "grad_norm": 2.8079559342586435e-05, "learning_rate": 1.4905889496053432e-05, "loss": 0.0, "step": 6695 }, { "epoch": 7.3224043715847, "grad_norm": 4.105495463591069e-05, "learning_rate": 1.4875531268973891e-05, "loss": 0.0, "step": 6700 }, { "epoch": 7.327868852459017, "grad_norm": 2.0951249098288827e-05, "learning_rate": 1.4845173041894352e-05, "loss": 0.0, "step": 6705 }, { "epoch": 7.333333333333333, "grad_norm": 1.3757561646343675e-05, "learning_rate": 1.4814814814814815e-05, "loss": 0.0, "step": 6710 }, { "epoch": 7.33879781420765, "grad_norm": 1.4121252206678037e-05, "learning_rate": 1.4784456587735276e-05, "loss": 0.0, "step": 6715 }, { "epoch": 7.344262295081967, "grad_norm": 1.3257107639219612e-05, "learning_rate": 1.4754098360655739e-05, "loss": 0.0, "step": 6720 }, { "epoch": 7.3497267759562845, "grad_norm": 2.558304367994424e-05, "learning_rate": 1.47237401335762e-05, "loss": 0.0, "step": 6725 }, { "epoch": 7.355191256830601, "grad_norm": 1.6294507076963782e-05, "learning_rate": 1.4693381906496661e-05, "loss": 0.0, "step": 6730 }, { "epoch": 7.360655737704918, "grad_norm": 0.0001479545171605423, "learning_rate": 1.4663023679417124e-05, "loss": 0.0, "step": 6735 }, { "epoch": 7.366120218579235, "grad_norm": 1.7503914932603948e-05, "learning_rate": 1.4632665452337585e-05, "loss": 0.0, "step": 6740 }, { "epoch": 7.371584699453552, "grad_norm": 1.6494923329446465e-05, "learning_rate": 1.4602307225258046e-05, "loss": 0.0, "step": 6745 }, { "epoch": 7.377049180327869, "grad_norm": 0.000659099780023098, "learning_rate": 1.4571948998178509e-05, "loss": 0.0, "step": 6750 }, { "epoch": 7.382513661202186, "grad_norm": 1.7199408830492757e-05, "learning_rate": 1.4541590771098967e-05, "loss": 0.0, "step": 6755 }, { "epoch": 7.387978142076503, "grad_norm": 3.6840476241195574e-05, "learning_rate": 1.451123254401943e-05, "loss": 0.0, "step": 6760 }, { "epoch": 7.39344262295082, "grad_norm": 2.0265113562345505e-05, "learning_rate": 1.448087431693989e-05, "loss": 0.0, "step": 6765 }, { "epoch": 7.398907103825136, "grad_norm": 0.00412601325660944, "learning_rate": 1.4450516089860352e-05, "loss": 0.0, "step": 6770 }, { "epoch": 7.404371584699454, "grad_norm": 7.587561412947252e-05, "learning_rate": 1.4420157862780815e-05, "loss": 0.0, "step": 6775 }, { "epoch": 7.409836065573771, "grad_norm": 1.973733196791727e-05, "learning_rate": 1.4389799635701276e-05, "loss": 0.0, "step": 6780 }, { "epoch": 7.415300546448087, "grad_norm": 3.874331741826609e-05, "learning_rate": 1.4359441408621737e-05, "loss": 0.0, "step": 6785 }, { "epoch": 7.420765027322404, "grad_norm": 8.414830517722294e-05, "learning_rate": 1.43290831815422e-05, "loss": 0.0, "step": 6790 }, { "epoch": 7.426229508196721, "grad_norm": 1.557487848913297e-05, "learning_rate": 1.4298724954462661e-05, "loss": 0.0, "step": 6795 }, { "epoch": 7.4316939890710385, "grad_norm": 1.9917782992706634e-05, "learning_rate": 1.4268366727383122e-05, "loss": 0.0, "step": 6800 }, { "epoch": 7.437158469945355, "grad_norm": 8.511933992849663e-05, "learning_rate": 1.4238008500303581e-05, "loss": 0.0, "step": 6805 }, { "epoch": 7.442622950819672, "grad_norm": 1.9273149518994614e-05, "learning_rate": 1.4207650273224044e-05, "loss": 0.0, "step": 6810 }, { "epoch": 7.448087431693989, "grad_norm": 0.0007372593972831964, "learning_rate": 1.4177292046144505e-05, "loss": 0.0, "step": 6815 }, { "epoch": 7.453551912568306, "grad_norm": 1.7892272808239795e-05, "learning_rate": 1.4146933819064967e-05, "loss": 0.0, "step": 6820 }, { "epoch": 7.459016393442623, "grad_norm": 1.3487725482264068e-05, "learning_rate": 1.411657559198543e-05, "loss": 0.0, "step": 6825 }, { "epoch": 7.46448087431694, "grad_norm": 1.5921907106530853e-05, "learning_rate": 1.408621736490589e-05, "loss": 0.0, "step": 6830 }, { "epoch": 7.469945355191257, "grad_norm": 1.7433028915547766e-05, "learning_rate": 1.4055859137826352e-05, "loss": 0.0, "step": 6835 }, { "epoch": 7.475409836065574, "grad_norm": 2.843215588654857e-05, "learning_rate": 1.4025500910746814e-05, "loss": 0.0, "step": 6840 }, { "epoch": 7.48087431693989, "grad_norm": 5.431610770756379e-05, "learning_rate": 1.3995142683667276e-05, "loss": 0.0, "step": 6845 }, { "epoch": 7.4863387978142075, "grad_norm": 1.080147376342211e-05, "learning_rate": 1.3964784456587737e-05, "loss": 0.0, "step": 6850 }, { "epoch": 7.491803278688525, "grad_norm": 1.6685771697666496e-05, "learning_rate": 1.3934426229508196e-05, "loss": 0.0, "step": 6855 }, { "epoch": 7.497267759562842, "grad_norm": 2.097435026371386e-05, "learning_rate": 1.3904068002428657e-05, "loss": 0.0, "step": 6860 }, { "epoch": 7.502732240437158, "grad_norm": 1.7133286746684462e-05, "learning_rate": 1.387370977534912e-05, "loss": 0.0, "step": 6865 }, { "epoch": 7.508196721311475, "grad_norm": 0.0006649263086728752, "learning_rate": 1.3843351548269581e-05, "loss": 0.0, "step": 6870 }, { "epoch": 7.5136612021857925, "grad_norm": 2.0749612303916365e-05, "learning_rate": 1.3812993321190042e-05, "loss": 0.0, "step": 6875 }, { "epoch": 7.51912568306011, "grad_norm": 2.099968696711585e-05, "learning_rate": 1.3782635094110505e-05, "loss": 0.0, "step": 6880 }, { "epoch": 7.524590163934426, "grad_norm": 1.3263029359222855e-05, "learning_rate": 1.3752276867030966e-05, "loss": 0.0, "step": 6885 }, { "epoch": 7.530054644808743, "grad_norm": 1.159540352091426e-05, "learning_rate": 1.3721918639951427e-05, "loss": 0.0, "step": 6890 }, { "epoch": 7.53551912568306, "grad_norm": 1.8703914975048974e-05, "learning_rate": 1.369156041287189e-05, "loss": 0.0, "step": 6895 }, { "epoch": 7.540983606557377, "grad_norm": 1.2056356354150921e-05, "learning_rate": 1.3661202185792351e-05, "loss": 0.0, "step": 6900 }, { "epoch": 7.546448087431694, "grad_norm": 6.520111492136493e-05, "learning_rate": 1.363084395871281e-05, "loss": 0.0, "step": 6905 }, { "epoch": 7.551912568306011, "grad_norm": 1.4154918972053565e-05, "learning_rate": 1.3600485731633272e-05, "loss": 0.0, "step": 6910 }, { "epoch": 7.557377049180328, "grad_norm": 7.266816828632727e-05, "learning_rate": 1.3570127504553735e-05, "loss": 0.0, "step": 6915 }, { "epoch": 7.562841530054644, "grad_norm": 1.5945193808875047e-05, "learning_rate": 1.3539769277474196e-05, "loss": 0.0, "step": 6920 }, { "epoch": 7.5683060109289615, "grad_norm": 9.222345397574827e-06, "learning_rate": 1.3509411050394657e-05, "loss": 0.0, "step": 6925 }, { "epoch": 7.573770491803279, "grad_norm": 1.5671421351726167e-05, "learning_rate": 1.347905282331512e-05, "loss": 0.0, "step": 6930 }, { "epoch": 7.579234972677596, "grad_norm": 2.936273995146621e-05, "learning_rate": 1.3448694596235581e-05, "loss": 0.0, "step": 6935 }, { "epoch": 7.584699453551913, "grad_norm": 0.0006553413695655763, "learning_rate": 1.3418336369156042e-05, "loss": 0.0, "step": 6940 }, { "epoch": 7.590163934426229, "grad_norm": 1.565740785736125e-05, "learning_rate": 1.3387978142076505e-05, "loss": 0.0, "step": 6945 }, { "epoch": 7.595628415300546, "grad_norm": 9.471644261793699e-06, "learning_rate": 1.3357619914996966e-05, "loss": 0.0, "step": 6950 }, { "epoch": 7.601092896174864, "grad_norm": 1.3549584764405154e-05, "learning_rate": 1.3327261687917426e-05, "loss": 0.0, "step": 6955 }, { "epoch": 7.60655737704918, "grad_norm": 1.6184301784960553e-05, "learning_rate": 1.3296903460837887e-05, "loss": 0.0, "step": 6960 }, { "epoch": 7.612021857923497, "grad_norm": 8.797870577836875e-06, "learning_rate": 1.3266545233758348e-05, "loss": 0.0, "step": 6965 }, { "epoch": 7.617486338797814, "grad_norm": 1.9648776287795044e-05, "learning_rate": 1.323618700667881e-05, "loss": 0.0, "step": 6970 }, { "epoch": 7.622950819672131, "grad_norm": 0.0006378691759891808, "learning_rate": 1.3205828779599272e-05, "loss": 0.0, "step": 6975 }, { "epoch": 7.628415300546449, "grad_norm": 2.147053055523429e-05, "learning_rate": 1.3175470552519733e-05, "loss": 0.0, "step": 6980 }, { "epoch": 7.633879781420765, "grad_norm": 1.374123621644685e-05, "learning_rate": 1.3145112325440196e-05, "loss": 0.0, "step": 6985 }, { "epoch": 7.639344262295082, "grad_norm": 1.2073891412001103e-05, "learning_rate": 1.3114754098360657e-05, "loss": 0.0, "step": 6990 }, { "epoch": 7.644808743169399, "grad_norm": 2.7077034246758558e-05, "learning_rate": 1.3084395871281118e-05, "loss": 0.0, "step": 6995 }, { "epoch": 7.6502732240437155, "grad_norm": 1.192458876175806e-05, "learning_rate": 1.305403764420158e-05, "loss": 0.0, "step": 7000 }, { "epoch": 7.655737704918033, "grad_norm": 1.2989978131372482e-05, "learning_rate": 1.302367941712204e-05, "loss": 0.0, "step": 7005 }, { "epoch": 7.66120218579235, "grad_norm": 7.656402158318087e-05, "learning_rate": 1.2993321190042501e-05, "loss": 0.0, "step": 7010 }, { "epoch": 7.666666666666667, "grad_norm": 1.8988159354194067e-05, "learning_rate": 1.2962962962962962e-05, "loss": 0.0, "step": 7015 }, { "epoch": 7.672131147540983, "grad_norm": 1.1558569894987158e-05, "learning_rate": 1.2932604735883425e-05, "loss": 0.0, "step": 7020 }, { "epoch": 7.6775956284153, "grad_norm": 1.1080450349254534e-05, "learning_rate": 1.2902246508803886e-05, "loss": 0.0, "step": 7025 }, { "epoch": 7.683060109289618, "grad_norm": 9.0610474217101e-06, "learning_rate": 1.2871888281724348e-05, "loss": 0.0, "step": 7030 }, { "epoch": 7.688524590163935, "grad_norm": 2.1570349417743273e-05, "learning_rate": 1.284153005464481e-05, "loss": 0.0, "step": 7035 }, { "epoch": 7.693989071038251, "grad_norm": 8.527462341589853e-06, "learning_rate": 1.2811171827565271e-05, "loss": 0.0, "step": 7040 }, { "epoch": 7.699453551912568, "grad_norm": 3.9674927393207327e-05, "learning_rate": 1.2780813600485733e-05, "loss": 0.191, "step": 7045 }, { "epoch": 7.704918032786885, "grad_norm": 1.570533640915528e-05, "learning_rate": 1.2750455373406195e-05, "loss": 0.0, "step": 7050 }, { "epoch": 7.7103825136612025, "grad_norm": 9.140562178799883e-05, "learning_rate": 1.2720097146326653e-05, "loss": 0.0, "step": 7055 }, { "epoch": 7.715846994535519, "grad_norm": 4.640642509912141e-05, "learning_rate": 1.2689738919247116e-05, "loss": 0.0, "step": 7060 }, { "epoch": 7.721311475409836, "grad_norm": 2.3676704586250708e-05, "learning_rate": 1.2659380692167577e-05, "loss": 0.0, "step": 7065 }, { "epoch": 7.726775956284153, "grad_norm": 0.049377378076314926, "learning_rate": 1.2629022465088038e-05, "loss": 0.0001, "step": 7070 }, { "epoch": 7.73224043715847, "grad_norm": 1.8265061953570694e-05, "learning_rate": 1.2598664238008501e-05, "loss": 0.0, "step": 7075 }, { "epoch": 7.737704918032787, "grad_norm": 1.7306145309703425e-05, "learning_rate": 1.2568306010928962e-05, "loss": 0.0, "step": 7080 }, { "epoch": 7.743169398907104, "grad_norm": 0.0007131117163226008, "learning_rate": 1.2537947783849423e-05, "loss": 0.0, "step": 7085 }, { "epoch": 7.748633879781421, "grad_norm": 4.315848491387442e-05, "learning_rate": 1.2507589556769886e-05, "loss": 0.0, "step": 7090 }, { "epoch": 7.754098360655737, "grad_norm": 0.0017947383457794785, "learning_rate": 1.2477231329690346e-05, "loss": 0.0, "step": 7095 }, { "epoch": 7.759562841530054, "grad_norm": 0.0001714023237582296, "learning_rate": 1.2446873102610808e-05, "loss": 0.0002, "step": 7100 }, { "epoch": 7.7650273224043715, "grad_norm": 3.838367047137581e-05, "learning_rate": 1.241651487553127e-05, "loss": 0.0, "step": 7105 }, { "epoch": 7.770491803278689, "grad_norm": 4.043810986331664e-05, "learning_rate": 1.238615664845173e-05, "loss": 0.0, "step": 7110 }, { "epoch": 7.775956284153006, "grad_norm": 1.842474921431858e-05, "learning_rate": 1.2355798421372194e-05, "loss": 0.0, "step": 7115 }, { "epoch": 7.781420765027322, "grad_norm": 1.2997180419915821e-05, "learning_rate": 1.2325440194292653e-05, "loss": 0.0, "step": 7120 }, { "epoch": 7.786885245901639, "grad_norm": 1.204277214128524e-05, "learning_rate": 1.2295081967213116e-05, "loss": 0.0, "step": 7125 }, { "epoch": 7.7923497267759565, "grad_norm": 1.7841040971688926e-05, "learning_rate": 1.2264723740133577e-05, "loss": 0.0, "step": 7130 }, { "epoch": 7.797814207650273, "grad_norm": 8.681177132530138e-05, "learning_rate": 1.2234365513054038e-05, "loss": 0.0, "step": 7135 }, { "epoch": 7.80327868852459, "grad_norm": 1.538177821203135e-05, "learning_rate": 1.22040072859745e-05, "loss": 0.0, "step": 7140 }, { "epoch": 7.808743169398907, "grad_norm": 1.1661175449262373e-05, "learning_rate": 1.217364905889496e-05, "loss": 0.0, "step": 7145 }, { "epoch": 7.814207650273224, "grad_norm": 1.3177917026041541e-05, "learning_rate": 1.2143290831815423e-05, "loss": 0.0001, "step": 7150 }, { "epoch": 7.8196721311475414, "grad_norm": 1.9387889551580884e-05, "learning_rate": 1.2112932604735884e-05, "loss": 0.0, "step": 7155 }, { "epoch": 7.825136612021858, "grad_norm": 1.555710332468152e-05, "learning_rate": 1.2082574377656345e-05, "loss": 0.0, "step": 7160 }, { "epoch": 7.830601092896175, "grad_norm": 1.2738814803014975e-05, "learning_rate": 1.2052216150576808e-05, "loss": 0.0, "step": 7165 }, { "epoch": 7.836065573770492, "grad_norm": 0.00013192101323511451, "learning_rate": 1.2021857923497268e-05, "loss": 0.0, "step": 7170 }, { "epoch": 7.841530054644808, "grad_norm": 1.0073224075313192e-05, "learning_rate": 1.1991499696417729e-05, "loss": 0.0, "step": 7175 }, { "epoch": 7.8469945355191255, "grad_norm": 1.0778838259284385e-05, "learning_rate": 1.1961141469338192e-05, "loss": 0.0, "step": 7180 }, { "epoch": 7.852459016393443, "grad_norm": 2.35039769904688e-05, "learning_rate": 1.1930783242258653e-05, "loss": 0.0, "step": 7185 }, { "epoch": 7.85792349726776, "grad_norm": 1.8142010958399624e-05, "learning_rate": 1.1900425015179116e-05, "loss": 0.0, "step": 7190 }, { "epoch": 7.863387978142076, "grad_norm": 1.2536640497273766e-05, "learning_rate": 1.1870066788099575e-05, "loss": 0.0, "step": 7195 }, { "epoch": 7.868852459016393, "grad_norm": 1.3795261111226864e-05, "learning_rate": 1.1839708561020036e-05, "loss": 0.0, "step": 7200 }, { "epoch": 7.8743169398907105, "grad_norm": 1.054769836628111e-05, "learning_rate": 1.1809350333940499e-05, "loss": 0.0, "step": 7205 }, { "epoch": 7.879781420765028, "grad_norm": 3.637406916823238e-05, "learning_rate": 1.177899210686096e-05, "loss": 0.0, "step": 7210 }, { "epoch": 7.885245901639344, "grad_norm": 1.304059424001025e-05, "learning_rate": 1.1748633879781421e-05, "loss": 0.0, "step": 7215 }, { "epoch": 7.890710382513661, "grad_norm": 1.0693567674024962e-05, "learning_rate": 1.1718275652701882e-05, "loss": 0.0, "step": 7220 }, { "epoch": 7.896174863387978, "grad_norm": 1.3089750609651674e-05, "learning_rate": 1.1687917425622343e-05, "loss": 0.0, "step": 7225 }, { "epoch": 7.901639344262295, "grad_norm": 1.2414632692525629e-05, "learning_rate": 1.1657559198542806e-05, "loss": 0.0, "step": 7230 }, { "epoch": 7.907103825136612, "grad_norm": 0.0006229742430150509, "learning_rate": 1.1627200971463267e-05, "loss": 0.0, "step": 7235 }, { "epoch": 7.912568306010929, "grad_norm": 1.361664089927217e-05, "learning_rate": 1.1596842744383728e-05, "loss": 0.0, "step": 7240 }, { "epoch": 7.918032786885246, "grad_norm": 1.4006955098011531e-05, "learning_rate": 1.156648451730419e-05, "loss": 0.0, "step": 7245 }, { "epoch": 7.923497267759563, "grad_norm": 1.6826483260956593e-05, "learning_rate": 1.153612629022465e-05, "loss": 0.0, "step": 7250 }, { "epoch": 7.9289617486338795, "grad_norm": 1.3367481187742669e-05, "learning_rate": 1.1505768063145114e-05, "loss": 0.0, "step": 7255 }, { "epoch": 7.934426229508197, "grad_norm": 2.5410168746020645e-05, "learning_rate": 1.1475409836065575e-05, "loss": 0.0, "step": 7260 }, { "epoch": 7.939890710382514, "grad_norm": 1.2079419320798479e-05, "learning_rate": 1.1445051608986036e-05, "loss": 0.0001, "step": 7265 }, { "epoch": 7.945355191256831, "grad_norm": 8.898069609131198e-06, "learning_rate": 1.1414693381906497e-05, "loss": 0.0, "step": 7270 }, { "epoch": 7.950819672131147, "grad_norm": 1.2275562767172232e-05, "learning_rate": 1.1384335154826958e-05, "loss": 0.0, "step": 7275 }, { "epoch": 7.956284153005464, "grad_norm": 1.0254681910737418e-05, "learning_rate": 1.135397692774742e-05, "loss": 0.0, "step": 7280 }, { "epoch": 7.961748633879782, "grad_norm": 1.1392396118026227e-05, "learning_rate": 1.1323618700667882e-05, "loss": 0.0, "step": 7285 }, { "epoch": 7.967213114754099, "grad_norm": 3.0942966986913234e-05, "learning_rate": 1.1293260473588343e-05, "loss": 0.0, "step": 7290 }, { "epoch": 7.972677595628415, "grad_norm": 8.617805178801063e-06, "learning_rate": 1.1262902246508804e-05, "loss": 0.0, "step": 7295 }, { "epoch": 7.978142076502732, "grad_norm": 1.213542327604955e-05, "learning_rate": 1.1232544019429265e-05, "loss": 0.0, "step": 7300 }, { "epoch": 7.983606557377049, "grad_norm": 9.413888619747013e-06, "learning_rate": 1.1202185792349727e-05, "loss": 0.0, "step": 7305 }, { "epoch": 7.989071038251366, "grad_norm": 1.631492887099739e-05, "learning_rate": 1.117182756527019e-05, "loss": 0.0, "step": 7310 }, { "epoch": 7.994535519125683, "grad_norm": 2.603645043564029e-05, "learning_rate": 1.114146933819065e-05, "loss": 0.0, "step": 7315 }, { "epoch": 8.0, "grad_norm": 1.8122764231520705e-05, "learning_rate": 1.1111111111111112e-05, "loss": 0.0, "step": 7320 }, { "epoch": 8.0, "eval_loss": 1.6960844959612587e-06, "eval_runtime": 658.0485, "eval_samples_per_second": 11.118, "eval_steps_per_second": 1.39, "step": 7320 }, { "epoch": 8.005464480874316, "grad_norm": 1.7527658201288432e-05, "learning_rate": 1.1080752884031573e-05, "loss": 0.0, "step": 7325 }, { "epoch": 8.010928961748634, "grad_norm": 1.5412620996357873e-05, "learning_rate": 1.1050394656952034e-05, "loss": 0.0, "step": 7330 }, { "epoch": 8.01639344262295, "grad_norm": 1.1587193512241356e-05, "learning_rate": 1.1020036429872497e-05, "loss": 0.0, "step": 7335 }, { "epoch": 8.021857923497267, "grad_norm": 2.150508043996524e-05, "learning_rate": 1.0989678202792958e-05, "loss": 0.0, "step": 7340 }, { "epoch": 8.027322404371585, "grad_norm": 9.768027666723356e-06, "learning_rate": 1.0959319975713419e-05, "loss": 0.0, "step": 7345 }, { "epoch": 8.032786885245901, "grad_norm": 1.269577751372708e-05, "learning_rate": 1.092896174863388e-05, "loss": 0.0, "step": 7350 }, { "epoch": 8.03825136612022, "grad_norm": 1.0717486475186888e-05, "learning_rate": 1.0898603521554341e-05, "loss": 0.0, "step": 7355 }, { "epoch": 8.043715846994536, "grad_norm": 1.7090585970436223e-05, "learning_rate": 1.0868245294474804e-05, "loss": 0.0, "step": 7360 }, { "epoch": 8.049180327868852, "grad_norm": 1.6057352695497684e-05, "learning_rate": 1.0837887067395265e-05, "loss": 0.0, "step": 7365 }, { "epoch": 8.05464480874317, "grad_norm": 1.3813902114634402e-05, "learning_rate": 1.0807528840315725e-05, "loss": 0.0, "step": 7370 }, { "epoch": 8.060109289617486, "grad_norm": 1.1777274266933091e-05, "learning_rate": 1.0777170613236187e-05, "loss": 0.0, "step": 7375 }, { "epoch": 8.065573770491802, "grad_norm": 1.597378468431998e-05, "learning_rate": 1.0746812386156649e-05, "loss": 0.0, "step": 7380 }, { "epoch": 8.07103825136612, "grad_norm": 5.7681496400618926e-05, "learning_rate": 1.0716454159077111e-05, "loss": 0.0, "step": 7385 }, { "epoch": 8.076502732240437, "grad_norm": 2.5443077902309597e-05, "learning_rate": 1.0686095931997573e-05, "loss": 0.0, "step": 7390 }, { "epoch": 8.081967213114755, "grad_norm": 1.2299603440624196e-05, "learning_rate": 1.0655737704918032e-05, "loss": 0.0, "step": 7395 }, { "epoch": 8.087431693989071, "grad_norm": 1.8994456695509143e-05, "learning_rate": 1.0625379477838495e-05, "loss": 0.0, "step": 7400 }, { "epoch": 8.092896174863387, "grad_norm": 1.2942989997100085e-05, "learning_rate": 1.0595021250758956e-05, "loss": 0.0, "step": 7405 }, { "epoch": 8.098360655737705, "grad_norm": 1.2755004718201235e-05, "learning_rate": 1.0564663023679417e-05, "loss": 0.0, "step": 7410 }, { "epoch": 8.103825136612022, "grad_norm": 1.5904519386822358e-05, "learning_rate": 1.053430479659988e-05, "loss": 0.0, "step": 7415 }, { "epoch": 8.109289617486338, "grad_norm": 8.2708374975482e-06, "learning_rate": 1.050394656952034e-05, "loss": 0.0, "step": 7420 }, { "epoch": 8.114754098360656, "grad_norm": 0.0006852184887975454, "learning_rate": 1.0473588342440802e-05, "loss": 0.0, "step": 7425 }, { "epoch": 8.120218579234972, "grad_norm": 0.00019752304069697857, "learning_rate": 1.0443230115361263e-05, "loss": 0.0, "step": 7430 }, { "epoch": 8.12568306010929, "grad_norm": 5.792970841866918e-05, "learning_rate": 1.0412871888281724e-05, "loss": 0.0, "step": 7435 }, { "epoch": 8.131147540983607, "grad_norm": 8.440183592028916e-06, "learning_rate": 1.0382513661202187e-05, "loss": 0.0, "step": 7440 }, { "epoch": 8.136612021857923, "grad_norm": 1.4333023500512354e-05, "learning_rate": 1.0352155434122647e-05, "loss": 0.0, "step": 7445 }, { "epoch": 8.142076502732241, "grad_norm": 2.101344034599606e-05, "learning_rate": 1.032179720704311e-05, "loss": 0.0, "step": 7450 }, { "epoch": 8.147540983606557, "grad_norm": 1.0169248525926378e-05, "learning_rate": 1.029143897996357e-05, "loss": 0.0, "step": 7455 }, { "epoch": 8.153005464480874, "grad_norm": 1.284426252823323e-05, "learning_rate": 1.0261080752884032e-05, "loss": 0.0, "step": 7460 }, { "epoch": 8.158469945355192, "grad_norm": 2.392159149167128e-05, "learning_rate": 1.0230722525804495e-05, "loss": 0.0, "step": 7465 }, { "epoch": 8.163934426229508, "grad_norm": 2.8029156965203583e-05, "learning_rate": 1.0200364298724956e-05, "loss": 0.0, "step": 7470 }, { "epoch": 8.169398907103826, "grad_norm": 0.0004401684273034334, "learning_rate": 1.0170006071645417e-05, "loss": 0.0, "step": 7475 }, { "epoch": 8.174863387978142, "grad_norm": 8.851263373799156e-06, "learning_rate": 1.0139647844565878e-05, "loss": 0.0, "step": 7480 }, { "epoch": 8.180327868852459, "grad_norm": 0.00010571895109023899, "learning_rate": 1.0109289617486339e-05, "loss": 0.0, "step": 7485 }, { "epoch": 8.185792349726777, "grad_norm": 1.1960997653659433e-05, "learning_rate": 1.0078931390406802e-05, "loss": 0.0, "step": 7490 }, { "epoch": 8.191256830601093, "grad_norm": 2.6386380341136828e-05, "learning_rate": 1.0048573163327263e-05, "loss": 0.0, "step": 7495 }, { "epoch": 8.19672131147541, "grad_norm": 2.395056617388036e-05, "learning_rate": 1.0018214936247722e-05, "loss": 0.0, "step": 7500 }, { "epoch": 8.202185792349727, "grad_norm": 7.888736035965849e-06, "learning_rate": 9.987856709168185e-06, "loss": 0.0, "step": 7505 }, { "epoch": 8.207650273224044, "grad_norm": 1.250679724762449e-05, "learning_rate": 9.957498482088646e-06, "loss": 0.0, "step": 7510 }, { "epoch": 8.21311475409836, "grad_norm": 6.540030153701082e-05, "learning_rate": 9.927140255009108e-06, "loss": 0.0, "step": 7515 }, { "epoch": 8.218579234972678, "grad_norm": 2.1273477614158764e-05, "learning_rate": 9.89678202792957e-06, "loss": 0.0, "step": 7520 }, { "epoch": 8.224043715846994, "grad_norm": 1.247999352926854e-05, "learning_rate": 9.86642380085003e-06, "loss": 0.0, "step": 7525 }, { "epoch": 8.229508196721312, "grad_norm": 1.9308432456455193e-05, "learning_rate": 9.836065573770493e-06, "loss": 0.0, "step": 7530 }, { "epoch": 8.234972677595628, "grad_norm": 1.2912430975120515e-05, "learning_rate": 9.805707346690954e-06, "loss": 0.0, "step": 7535 }, { "epoch": 8.240437158469945, "grad_norm": 1.1097822607553098e-05, "learning_rate": 9.775349119611415e-06, "loss": 0.0, "step": 7540 }, { "epoch": 8.245901639344263, "grad_norm": 3.416691470192745e-05, "learning_rate": 9.744990892531878e-06, "loss": 0.0, "step": 7545 }, { "epoch": 8.251366120218579, "grad_norm": 1.2960584172105882e-05, "learning_rate": 9.714632665452337e-06, "loss": 0.0, "step": 7550 }, { "epoch": 8.256830601092895, "grad_norm": 5.391754166339524e-05, "learning_rate": 9.6842744383728e-06, "loss": 0.0, "step": 7555 }, { "epoch": 8.262295081967213, "grad_norm": 1.9058212274103425e-05, "learning_rate": 9.653916211293261e-06, "loss": 0.0, "step": 7560 }, { "epoch": 8.26775956284153, "grad_norm": 8.722054190002382e-06, "learning_rate": 9.623557984213722e-06, "loss": 0.0, "step": 7565 }, { "epoch": 8.273224043715848, "grad_norm": 1.2237047485541552e-05, "learning_rate": 9.593199757134185e-06, "loss": 0.0, "step": 7570 }, { "epoch": 8.278688524590164, "grad_norm": 5.237998266238719e-05, "learning_rate": 9.562841530054644e-06, "loss": 0.0, "step": 7575 }, { "epoch": 8.28415300546448, "grad_norm": 9.937410140992142e-06, "learning_rate": 9.532483302975107e-06, "loss": 0.0, "step": 7580 }, { "epoch": 8.289617486338798, "grad_norm": 1.3053542716079392e-05, "learning_rate": 9.502125075895568e-06, "loss": 0.0, "step": 7585 }, { "epoch": 8.295081967213115, "grad_norm": 2.3322838387684897e-05, "learning_rate": 9.47176684881603e-06, "loss": 0.0, "step": 7590 }, { "epoch": 8.300546448087431, "grad_norm": 0.000983793055638671, "learning_rate": 9.441408621736492e-06, "loss": 0.0, "step": 7595 }, { "epoch": 8.306010928961749, "grad_norm": 0.00012811202032025903, "learning_rate": 9.411050394656952e-06, "loss": 0.0, "step": 7600 }, { "epoch": 8.311475409836065, "grad_norm": 1.6763411622378044e-05, "learning_rate": 9.380692167577413e-06, "loss": 0.0, "step": 7605 }, { "epoch": 8.316939890710383, "grad_norm": 3.3423602872062474e-05, "learning_rate": 9.350333940497876e-06, "loss": 0.0, "step": 7610 }, { "epoch": 8.3224043715847, "grad_norm": 1.1610113688220736e-05, "learning_rate": 9.319975713418337e-06, "loss": 0.0, "step": 7615 }, { "epoch": 8.327868852459016, "grad_norm": 2.3615129975951277e-05, "learning_rate": 9.2896174863388e-06, "loss": 0.0, "step": 7620 }, { "epoch": 8.333333333333334, "grad_norm": 7.937882401165552e-06, "learning_rate": 9.259259259259259e-06, "loss": 0.0, "step": 7625 }, { "epoch": 8.33879781420765, "grad_norm": 1.095014886232093e-05, "learning_rate": 9.22890103217972e-06, "loss": 0.0, "step": 7630 }, { "epoch": 8.344262295081966, "grad_norm": 7.690862730669323e-06, "learning_rate": 9.198542805100183e-06, "loss": 0.0, "step": 7635 }, { "epoch": 8.349726775956285, "grad_norm": 1.6766489352448843e-05, "learning_rate": 9.168184578020644e-06, "loss": 0.0, "step": 7640 }, { "epoch": 8.3551912568306, "grad_norm": 0.0001764387561706826, "learning_rate": 9.137826350941105e-06, "loss": 0.0001, "step": 7645 }, { "epoch": 8.360655737704919, "grad_norm": 7.887566607678309e-05, "learning_rate": 9.107468123861566e-06, "loss": 0.0, "step": 7650 }, { "epoch": 8.366120218579235, "grad_norm": 1.2673816854658071e-05, "learning_rate": 9.077109896782028e-06, "loss": 0.0, "step": 7655 }, { "epoch": 8.371584699453551, "grad_norm": 4.3066185753559694e-05, "learning_rate": 9.04675166970249e-06, "loss": 0.0, "step": 7660 }, { "epoch": 8.37704918032787, "grad_norm": 1.3214439604780637e-05, "learning_rate": 9.016393442622952e-06, "loss": 0.0, "step": 7665 }, { "epoch": 8.382513661202186, "grad_norm": 1.4271675354393665e-05, "learning_rate": 8.986035215543413e-06, "loss": 0.0, "step": 7670 }, { "epoch": 8.387978142076502, "grad_norm": 0.00020471119205467403, "learning_rate": 8.955676988463874e-06, "loss": 0.0, "step": 7675 }, { "epoch": 8.39344262295082, "grad_norm": 5.713020800612867e-05, "learning_rate": 8.925318761384335e-06, "loss": 0.0, "step": 7680 }, { "epoch": 8.398907103825136, "grad_norm": 0.0001489708956796676, "learning_rate": 8.894960534304798e-06, "loss": 0.0, "step": 7685 }, { "epoch": 8.404371584699454, "grad_norm": 2.921839040936902e-05, "learning_rate": 8.864602307225259e-06, "loss": 0.0, "step": 7690 }, { "epoch": 8.40983606557377, "grad_norm": 9.445561772736255e-06, "learning_rate": 8.83424408014572e-06, "loss": 0.0, "step": 7695 }, { "epoch": 8.415300546448087, "grad_norm": 1.582982440595515e-05, "learning_rate": 8.803885853066181e-06, "loss": 0.0, "step": 7700 }, { "epoch": 8.420765027322405, "grad_norm": 1.92226125363959e-05, "learning_rate": 8.773527625986642e-06, "loss": 0.0, "step": 7705 }, { "epoch": 8.426229508196721, "grad_norm": 5.412556856754236e-05, "learning_rate": 8.743169398907103e-06, "loss": 0.0, "step": 7710 }, { "epoch": 8.431693989071038, "grad_norm": 2.1962119717500173e-05, "learning_rate": 8.712811171827566e-06, "loss": 0.0, "step": 7715 }, { "epoch": 8.437158469945356, "grad_norm": 0.0006890874356031418, "learning_rate": 8.682452944748027e-06, "loss": 0.0, "step": 7720 }, { "epoch": 8.442622950819672, "grad_norm": 2.5473029381828383e-05, "learning_rate": 8.652094717668488e-06, "loss": 0.0, "step": 7725 }, { "epoch": 8.448087431693988, "grad_norm": 8.04679439170286e-06, "learning_rate": 8.62173649058895e-06, "loss": 0.0, "step": 7730 }, { "epoch": 8.453551912568306, "grad_norm": 0.0006084745400585234, "learning_rate": 8.59137826350941e-06, "loss": 0.0, "step": 7735 }, { "epoch": 8.459016393442623, "grad_norm": 1.0556545021245256e-05, "learning_rate": 8.561020036429874e-06, "loss": 0.0, "step": 7740 }, { "epoch": 8.46448087431694, "grad_norm": 2.4723283786443062e-05, "learning_rate": 8.530661809350335e-06, "loss": 0.0, "step": 7745 }, { "epoch": 8.469945355191257, "grad_norm": 1.2455606338335201e-05, "learning_rate": 8.500303582270796e-06, "loss": 0.0, "step": 7750 }, { "epoch": 8.475409836065573, "grad_norm": 4.926376277580857e-05, "learning_rate": 8.469945355191257e-06, "loss": 0.0, "step": 7755 }, { "epoch": 8.480874316939891, "grad_norm": 1.2116083780711051e-05, "learning_rate": 8.439587128111718e-06, "loss": 0.0, "step": 7760 }, { "epoch": 8.486338797814208, "grad_norm": 0.0004952167510055006, "learning_rate": 8.409228901032181e-06, "loss": 0.0, "step": 7765 }, { "epoch": 8.491803278688524, "grad_norm": 1.5964784324751236e-05, "learning_rate": 8.378870673952642e-06, "loss": 0.0, "step": 7770 }, { "epoch": 8.497267759562842, "grad_norm": 2.0253926777513698e-05, "learning_rate": 8.348512446873103e-06, "loss": 0.0, "step": 7775 }, { "epoch": 8.502732240437158, "grad_norm": 4.6292418119264767e-05, "learning_rate": 8.318154219793564e-06, "loss": 0.0, "step": 7780 }, { "epoch": 8.508196721311476, "grad_norm": 1.7785914678825065e-05, "learning_rate": 8.287795992714025e-06, "loss": 0.0, "step": 7785 }, { "epoch": 8.513661202185792, "grad_norm": 0.0006781655829399824, "learning_rate": 8.257437765634488e-06, "loss": 0.0, "step": 7790 }, { "epoch": 8.519125683060109, "grad_norm": 9.413172847416718e-06, "learning_rate": 8.22707953855495e-06, "loss": 0.0, "step": 7795 }, { "epoch": 8.524590163934427, "grad_norm": 0.00011244224879192188, "learning_rate": 8.196721311475409e-06, "loss": 0.0, "step": 7800 }, { "epoch": 8.530054644808743, "grad_norm": 2.30734749493422e-05, "learning_rate": 8.166363084395872e-06, "loss": 0.0, "step": 7805 }, { "epoch": 8.53551912568306, "grad_norm": 9.177093488688115e-06, "learning_rate": 8.136004857316333e-06, "loss": 0.0, "step": 7810 }, { "epoch": 8.540983606557377, "grad_norm": 0.00042046752059832215, "learning_rate": 8.105646630236796e-06, "loss": 0.0, "step": 7815 }, { "epoch": 8.546448087431694, "grad_norm": 0.0004966892302036285, "learning_rate": 8.075288403157257e-06, "loss": 0.0, "step": 7820 }, { "epoch": 8.551912568306012, "grad_norm": 3.082709008594975e-05, "learning_rate": 8.044930176077716e-06, "loss": 0.0, "step": 7825 }, { "epoch": 8.557377049180328, "grad_norm": 2.252473495900631e-05, "learning_rate": 8.014571948998179e-06, "loss": 0.0, "step": 7830 }, { "epoch": 8.562841530054644, "grad_norm": 8.535890628991183e-06, "learning_rate": 7.98421372191864e-06, "loss": 0.0, "step": 7835 }, { "epoch": 8.568306010928962, "grad_norm": 9.065177437150851e-06, "learning_rate": 7.953855494839101e-06, "loss": 0.0, "step": 7840 }, { "epoch": 8.573770491803279, "grad_norm": 9.072325156012084e-06, "learning_rate": 7.923497267759564e-06, "loss": 0.0, "step": 7845 }, { "epoch": 8.579234972677595, "grad_norm": 3.252259557484649e-05, "learning_rate": 7.893139040680023e-06, "loss": 0.0, "step": 7850 }, { "epoch": 8.584699453551913, "grad_norm": 4.2116338590858504e-05, "learning_rate": 7.862780813600486e-06, "loss": 0.0, "step": 7855 }, { "epoch": 8.59016393442623, "grad_norm": 9.784484973351937e-06, "learning_rate": 7.832422586520947e-06, "loss": 0.0, "step": 7860 }, { "epoch": 8.595628415300546, "grad_norm": 1.4877758985676337e-05, "learning_rate": 7.802064359441409e-06, "loss": 0.0, "step": 7865 }, { "epoch": 8.601092896174864, "grad_norm": 2.199526716140099e-05, "learning_rate": 7.771706132361871e-06, "loss": 0.0, "step": 7870 }, { "epoch": 8.60655737704918, "grad_norm": 1.5495361367356963e-05, "learning_rate": 7.74134790528233e-06, "loss": 0.0, "step": 7875 }, { "epoch": 8.612021857923498, "grad_norm": 3.3597727451706305e-05, "learning_rate": 7.710989678202794e-06, "loss": 0.0, "step": 7880 }, { "epoch": 8.617486338797814, "grad_norm": 0.00010077483602799475, "learning_rate": 7.680631451123255e-06, "loss": 0.0, "step": 7885 }, { "epoch": 8.62295081967213, "grad_norm": 1.6386205970775336e-05, "learning_rate": 7.650273224043716e-06, "loss": 0.0, "step": 7890 }, { "epoch": 8.628415300546449, "grad_norm": 0.0001092545353458263, "learning_rate": 7.619914996964178e-06, "loss": 0.0, "step": 7895 }, { "epoch": 8.633879781420765, "grad_norm": 1.4560269846697338e-05, "learning_rate": 7.589556769884638e-06, "loss": 0.0, "step": 7900 }, { "epoch": 8.639344262295083, "grad_norm": 1.3944583770353347e-05, "learning_rate": 7.5591985428051e-06, "loss": 0.0, "step": 7905 }, { "epoch": 8.6448087431694, "grad_norm": 9.102309559239075e-05, "learning_rate": 7.528840315725562e-06, "loss": 0.0, "step": 7910 }, { "epoch": 8.650273224043715, "grad_norm": 0.0014770968118682504, "learning_rate": 7.498482088646023e-06, "loss": 0.0, "step": 7915 }, { "epoch": 8.655737704918034, "grad_norm": 1.5495656043640338e-05, "learning_rate": 7.468123861566485e-06, "loss": 0.0, "step": 7920 }, { "epoch": 8.66120218579235, "grad_norm": 1.086993779608747e-05, "learning_rate": 7.4377656344869455e-06, "loss": 0.0, "step": 7925 }, { "epoch": 8.666666666666666, "grad_norm": 2.422315264993813e-05, "learning_rate": 7.4074074074074075e-06, "loss": 0.0, "step": 7930 }, { "epoch": 8.672131147540984, "grad_norm": 1.2280898772587534e-05, "learning_rate": 7.3770491803278695e-06, "loss": 0.0, "step": 7935 }, { "epoch": 8.6775956284153, "grad_norm": 7.415174422931159e-06, "learning_rate": 7.346690953248331e-06, "loss": 0.0, "step": 7940 }, { "epoch": 8.683060109289617, "grad_norm": 1.318469821853796e-05, "learning_rate": 7.3163327261687926e-06, "loss": 0.0, "step": 7945 }, { "epoch": 8.688524590163935, "grad_norm": 7.0088044594740495e-06, "learning_rate": 7.2859744990892545e-06, "loss": 0.0, "step": 7950 }, { "epoch": 8.693989071038251, "grad_norm": 1.3353911526792217e-05, "learning_rate": 7.255616272009715e-06, "loss": 0.0, "step": 7955 }, { "epoch": 8.699453551912569, "grad_norm": 5.0659851694945246e-05, "learning_rate": 7.225258044930176e-06, "loss": 0.0, "step": 7960 }, { "epoch": 8.704918032786885, "grad_norm": 0.00012321716349106282, "learning_rate": 7.194899817850638e-06, "loss": 0.0, "step": 7965 }, { "epoch": 8.710382513661202, "grad_norm": 7.799344530212693e-06, "learning_rate": 7.1645415907711e-06, "loss": 0.0, "step": 7970 }, { "epoch": 8.71584699453552, "grad_norm": 1.204593718284741e-05, "learning_rate": 7.134183363691561e-06, "loss": 0.0, "step": 7975 }, { "epoch": 8.721311475409836, "grad_norm": 7.936175279610325e-06, "learning_rate": 7.103825136612022e-06, "loss": 0.0, "step": 7980 }, { "epoch": 8.726775956284152, "grad_norm": 1.0668262802937534e-05, "learning_rate": 7.073466909532483e-06, "loss": 0.0, "step": 7985 }, { "epoch": 8.73224043715847, "grad_norm": 1.0944058885797858e-05, "learning_rate": 7.043108682452945e-06, "loss": 0.0, "step": 7990 }, { "epoch": 8.737704918032787, "grad_norm": 3.984873183071613e-05, "learning_rate": 7.012750455373407e-06, "loss": 0.0, "step": 7995 }, { "epoch": 8.743169398907105, "grad_norm": 2.668611705303192e-05, "learning_rate": 6.982392228293868e-06, "loss": 0.0, "step": 8000 }, { "epoch": 8.748633879781421, "grad_norm": 3.1083716748980805e-05, "learning_rate": 6.952034001214329e-06, "loss": 0.0, "step": 8005 }, { "epoch": 8.754098360655737, "grad_norm": 5.123380105942488e-05, "learning_rate": 6.921675774134791e-06, "loss": 0.0, "step": 8010 }, { "epoch": 8.759562841530055, "grad_norm": 3.057342837564647e-05, "learning_rate": 6.891317547055253e-06, "loss": 0.0, "step": 8015 }, { "epoch": 8.765027322404372, "grad_norm": 1.0824885066540446e-05, "learning_rate": 6.860959319975714e-06, "loss": 0.0, "step": 8020 }, { "epoch": 8.770491803278688, "grad_norm": 1.0632065823301673e-05, "learning_rate": 6.830601092896176e-06, "loss": 0.0, "step": 8025 }, { "epoch": 8.775956284153006, "grad_norm": 0.00019011649419553578, "learning_rate": 6.800242865816636e-06, "loss": 0.0, "step": 8030 }, { "epoch": 8.781420765027322, "grad_norm": 2.6195617465418763e-05, "learning_rate": 6.769884638737098e-06, "loss": 0.0, "step": 8035 }, { "epoch": 8.78688524590164, "grad_norm": 1.6233725546044298e-05, "learning_rate": 6.73952641165756e-06, "loss": 0.0, "step": 8040 }, { "epoch": 8.792349726775956, "grad_norm": 1.057436293194769e-05, "learning_rate": 6.709168184578021e-06, "loss": 0.0, "step": 8045 }, { "epoch": 8.797814207650273, "grad_norm": 4.19470998167526e-05, "learning_rate": 6.678809957498483e-06, "loss": 0.0, "step": 8050 }, { "epoch": 8.80327868852459, "grad_norm": 1.0058052794192918e-05, "learning_rate": 6.648451730418943e-06, "loss": 0.0, "step": 8055 }, { "epoch": 8.808743169398907, "grad_norm": 7.835977885406464e-05, "learning_rate": 6.618093503339405e-06, "loss": 0.0, "step": 8060 }, { "epoch": 8.814207650273223, "grad_norm": 0.00041921433876268566, "learning_rate": 6.5877352762598664e-06, "loss": 0.0, "step": 8065 }, { "epoch": 8.819672131147541, "grad_norm": 8.555156455258839e-06, "learning_rate": 6.557377049180328e-06, "loss": 0.0, "step": 8070 }, { "epoch": 8.825136612021858, "grad_norm": 0.00044895359314978123, "learning_rate": 6.52701882210079e-06, "loss": 0.0, "step": 8075 }, { "epoch": 8.830601092896174, "grad_norm": 2.7534799301065505e-05, "learning_rate": 6.496660595021251e-06, "loss": 0.0, "step": 8080 }, { "epoch": 8.836065573770492, "grad_norm": 2.2113057639217004e-05, "learning_rate": 6.466302367941713e-06, "loss": 0.0, "step": 8085 }, { "epoch": 8.841530054644808, "grad_norm": 7.0613623393001035e-06, "learning_rate": 6.435944140862174e-06, "loss": 0.0, "step": 8090 }, { "epoch": 8.846994535519126, "grad_norm": 4.0296203223988414e-05, "learning_rate": 6.405585913782636e-06, "loss": 0.0, "step": 8095 }, { "epoch": 8.852459016393443, "grad_norm": 3.491979805403389e-05, "learning_rate": 6.375227686703098e-06, "loss": 0.0, "step": 8100 }, { "epoch": 8.857923497267759, "grad_norm": 1.561005956318695e-05, "learning_rate": 6.344869459623558e-06, "loss": 0.0, "step": 8105 }, { "epoch": 8.863387978142077, "grad_norm": 2.3691472961218096e-05, "learning_rate": 6.314511232544019e-06, "loss": 0.0, "step": 8110 }, { "epoch": 8.868852459016393, "grad_norm": 1.4078505955694709e-05, "learning_rate": 6.284153005464481e-06, "loss": 0.0, "step": 8115 }, { "epoch": 8.87431693989071, "grad_norm": 8.696079748915508e-05, "learning_rate": 6.253794778384943e-06, "loss": 0.0, "step": 8120 }, { "epoch": 8.879781420765028, "grad_norm": 1.038772188621806e-05, "learning_rate": 6.223436551305404e-06, "loss": 0.0, "step": 8125 }, { "epoch": 8.885245901639344, "grad_norm": 1.992170473386068e-05, "learning_rate": 6.193078324225865e-06, "loss": 0.0, "step": 8130 }, { "epoch": 8.890710382513662, "grad_norm": 2.0900015442748554e-05, "learning_rate": 6.1627200971463265e-06, "loss": 0.0, "step": 8135 }, { "epoch": 8.896174863387978, "grad_norm": 1.2317065738898236e-05, "learning_rate": 6.1323618700667884e-06, "loss": 0.0, "step": 8140 }, { "epoch": 8.901639344262295, "grad_norm": 4.956311022397131e-05, "learning_rate": 6.10200364298725e-06, "loss": 0.0, "step": 8145 }, { "epoch": 8.907103825136613, "grad_norm": 8.253177838923875e-06, "learning_rate": 6.0716454159077115e-06, "loss": 0.0, "step": 8150 }, { "epoch": 8.912568306010929, "grad_norm": 6.0874495829921216e-05, "learning_rate": 6.041287188828173e-06, "loss": 0.0, "step": 8155 }, { "epoch": 8.918032786885245, "grad_norm": 1.0368218681833241e-05, "learning_rate": 6.010928961748634e-06, "loss": 0.0, "step": 8160 }, { "epoch": 8.923497267759563, "grad_norm": 1.1900097888428718e-05, "learning_rate": 5.980570734669096e-06, "loss": 0.0, "step": 8165 }, { "epoch": 8.92896174863388, "grad_norm": 5.567036350839771e-05, "learning_rate": 5.950212507589558e-06, "loss": 0.0, "step": 8170 }, { "epoch": 8.934426229508198, "grad_norm": 1.0399870916444343e-05, "learning_rate": 5.919854280510018e-06, "loss": 0.0, "step": 8175 }, { "epoch": 8.939890710382514, "grad_norm": 1.9010061805602163e-05, "learning_rate": 5.88949605343048e-06, "loss": 0.0, "step": 8180 }, { "epoch": 8.94535519125683, "grad_norm": 9.974577551474795e-06, "learning_rate": 5.859137826350941e-06, "loss": 0.0, "step": 8185 }, { "epoch": 8.950819672131148, "grad_norm": 2.6583495127852075e-05, "learning_rate": 5.828779599271403e-06, "loss": 0.0, "step": 8190 }, { "epoch": 8.956284153005464, "grad_norm": 0.001321590505540371, "learning_rate": 5.798421372191864e-06, "loss": 0.0, "step": 8195 }, { "epoch": 8.96174863387978, "grad_norm": 1.875944872153923e-05, "learning_rate": 5.768063145112325e-06, "loss": 0.0, "step": 8200 }, { "epoch": 8.967213114754099, "grad_norm": 8.05357103672577e-06, "learning_rate": 5.737704918032787e-06, "loss": 0.0, "step": 8205 }, { "epoch": 8.972677595628415, "grad_norm": 0.0005711750127375126, "learning_rate": 5.7073466909532485e-06, "loss": 0.0, "step": 8210 }, { "epoch": 8.978142076502731, "grad_norm": 2.2052852727938443e-05, "learning_rate": 5.67698846387371e-06, "loss": 0.0, "step": 8215 }, { "epoch": 8.98360655737705, "grad_norm": 4.620003528543748e-05, "learning_rate": 5.646630236794172e-06, "loss": 0.0, "step": 8220 }, { "epoch": 8.989071038251366, "grad_norm": 1.6385482012992725e-05, "learning_rate": 5.616272009714633e-06, "loss": 0.0, "step": 8225 }, { "epoch": 8.994535519125684, "grad_norm": 9.964457603928167e-06, "learning_rate": 5.585913782635095e-06, "loss": 0.0, "step": 8230 }, { "epoch": 9.0, "grad_norm": 7.681346687604673e-06, "learning_rate": 5.555555555555556e-06, "loss": 0.0, "step": 8235 }, { "epoch": 9.0, "eval_loss": 1.2326364640102838e-06, "eval_runtime": 661.3599, "eval_samples_per_second": 11.062, "eval_steps_per_second": 1.384, "step": 8235 }, { "epoch": 9.005464480874316, "grad_norm": 1.251523553946754e-05, "learning_rate": 5.525197328476017e-06, "loss": 0.0, "step": 8240 }, { "epoch": 9.010928961748634, "grad_norm": 1.9697485186043195e-05, "learning_rate": 5.494839101396479e-06, "loss": 0.0, "step": 8245 }, { "epoch": 9.01639344262295, "grad_norm": 8.097184036159888e-05, "learning_rate": 5.46448087431694e-06, "loss": 0.0, "step": 8250 }, { "epoch": 9.021857923497267, "grad_norm": 8.922267625166569e-06, "learning_rate": 5.434122647237402e-06, "loss": 0.0, "step": 8255 }, { "epoch": 9.027322404371585, "grad_norm": 8.49717889650492e-06, "learning_rate": 5.403764420157862e-06, "loss": 0.0, "step": 8260 }, { "epoch": 9.032786885245901, "grad_norm": 9.96219114313135e-06, "learning_rate": 5.373406193078324e-06, "loss": 0.0, "step": 8265 }, { "epoch": 9.03825136612022, "grad_norm": 7.731101504759863e-06, "learning_rate": 5.343047965998786e-06, "loss": 0.0, "step": 8270 }, { "epoch": 9.043715846994536, "grad_norm": 2.623209365992807e-05, "learning_rate": 5.312689738919247e-06, "loss": 0.0, "step": 8275 }, { "epoch": 9.049180327868852, "grad_norm": 3.5027180274482816e-05, "learning_rate": 5.2823315118397085e-06, "loss": 0.0, "step": 8280 }, { "epoch": 9.05464480874317, "grad_norm": 5.4955471568973735e-05, "learning_rate": 5.25197328476017e-06, "loss": 0.0, "step": 8285 }, { "epoch": 9.060109289617486, "grad_norm": 5.6031560234259814e-05, "learning_rate": 5.221615057680632e-06, "loss": 0.0, "step": 8290 }, { "epoch": 9.065573770491802, "grad_norm": 2.9362845452851616e-05, "learning_rate": 5.191256830601094e-06, "loss": 0.0, "step": 8295 }, { "epoch": 9.07103825136612, "grad_norm": 1.5086310668266378e-05, "learning_rate": 5.160898603521555e-06, "loss": 0.0, "step": 8300 }, { "epoch": 9.076502732240437, "grad_norm": 1.0832651241798885e-05, "learning_rate": 5.130540376442016e-06, "loss": 0.0, "step": 8305 }, { "epoch": 9.081967213114755, "grad_norm": 8.822114068607334e-06, "learning_rate": 5.100182149362478e-06, "loss": 0.0, "step": 8310 }, { "epoch": 9.087431693989071, "grad_norm": 1.0006731827161275e-05, "learning_rate": 5.069823922282939e-06, "loss": 0.0, "step": 8315 }, { "epoch": 9.092896174863387, "grad_norm": 8.003929906408302e-06, "learning_rate": 5.039465695203401e-06, "loss": 0.0, "step": 8320 }, { "epoch": 9.098360655737705, "grad_norm": 9.480178050580435e-06, "learning_rate": 5.009107468123861e-06, "loss": 0.0, "step": 8325 }, { "epoch": 9.103825136612022, "grad_norm": 9.101410796574783e-06, "learning_rate": 4.978749241044323e-06, "loss": 0.0, "step": 8330 }, { "epoch": 9.109289617486338, "grad_norm": 1.1568869922484737e-05, "learning_rate": 4.948391013964785e-06, "loss": 0.0, "step": 8335 }, { "epoch": 9.114754098360656, "grad_norm": 9.305016646976583e-06, "learning_rate": 4.918032786885246e-06, "loss": 0.0, "step": 8340 }, { "epoch": 9.120218579234972, "grad_norm": 1.3673022294824477e-05, "learning_rate": 4.8876745598057074e-06, "loss": 0.0, "step": 8345 }, { "epoch": 9.12568306010929, "grad_norm": 0.0009493128163740039, "learning_rate": 4.8573163327261686e-06, "loss": 0.0, "step": 8350 }, { "epoch": 9.131147540983607, "grad_norm": 1.3566643247031607e-05, "learning_rate": 4.8269581056466305e-06, "loss": 0.0, "step": 8355 }, { "epoch": 9.136612021857923, "grad_norm": 9.785872862266842e-06, "learning_rate": 4.7965998785670925e-06, "loss": 0.0, "step": 8360 }, { "epoch": 9.142076502732241, "grad_norm": 3.198983904439956e-05, "learning_rate": 4.766241651487554e-06, "loss": 0.0, "step": 8365 }, { "epoch": 9.147540983606557, "grad_norm": 1.2671367585426196e-05, "learning_rate": 4.735883424408015e-06, "loss": 0.0, "step": 8370 }, { "epoch": 9.153005464480874, "grad_norm": 1.4876402019581292e-05, "learning_rate": 4.705525197328476e-06, "loss": 0.0, "step": 8375 }, { "epoch": 9.158469945355192, "grad_norm": 1.182084724860033e-05, "learning_rate": 4.675166970248938e-06, "loss": 0.0, "step": 8380 }, { "epoch": 9.163934426229508, "grad_norm": 2.6076608264702372e-05, "learning_rate": 4.6448087431694e-06, "loss": 0.0, "step": 8385 }, { "epoch": 9.169398907103826, "grad_norm": 1.273778070753906e-05, "learning_rate": 4.61445051608986e-06, "loss": 0.0, "step": 8390 }, { "epoch": 9.174863387978142, "grad_norm": 0.0007501414511352777, "learning_rate": 4.584092289010322e-06, "loss": 0.0, "step": 8395 }, { "epoch": 9.180327868852459, "grad_norm": 7.561423444713e-06, "learning_rate": 4.553734061930783e-06, "loss": 0.0, "step": 8400 }, { "epoch": 9.185792349726777, "grad_norm": 7.892997018643655e-06, "learning_rate": 4.523375834851245e-06, "loss": 0.0, "step": 8405 }, { "epoch": 9.191256830601093, "grad_norm": 1.2118057384213898e-05, "learning_rate": 4.493017607771706e-06, "loss": 0.0, "step": 8410 }, { "epoch": 9.19672131147541, "grad_norm": 8.411708222411107e-06, "learning_rate": 4.4626593806921675e-06, "loss": 0.0, "step": 8415 }, { "epoch": 9.202185792349727, "grad_norm": 8.92472053237725e-06, "learning_rate": 4.4323011536126294e-06, "loss": 0.0, "step": 8420 }, { "epoch": 9.207650273224044, "grad_norm": 8.844925105222501e-06, "learning_rate": 4.401942926533091e-06, "loss": 0.0, "step": 8425 }, { "epoch": 9.21311475409836, "grad_norm": 9.090890671359375e-06, "learning_rate": 4.371584699453552e-06, "loss": 0.0, "step": 8430 }, { "epoch": 9.218579234972678, "grad_norm": 0.00030929245986044407, "learning_rate": 4.341226472374014e-06, "loss": 0.0, "step": 8435 }, { "epoch": 9.224043715846994, "grad_norm": 9.536963261780329e-06, "learning_rate": 4.310868245294475e-06, "loss": 0.0, "step": 8440 }, { "epoch": 9.229508196721312, "grad_norm": 1.0459447366883978e-05, "learning_rate": 4.280510018214937e-06, "loss": 0.0, "step": 8445 }, { "epoch": 9.234972677595628, "grad_norm": 1.430536576663144e-05, "learning_rate": 4.250151791135398e-06, "loss": 0.0, "step": 8450 }, { "epoch": 9.240437158469945, "grad_norm": 8.668756890983786e-06, "learning_rate": 4.219793564055859e-06, "loss": 0.0, "step": 8455 }, { "epoch": 9.245901639344263, "grad_norm": 4.2804833356058225e-05, "learning_rate": 4.189435336976321e-06, "loss": 0.0, "step": 8460 }, { "epoch": 9.251366120218579, "grad_norm": 1.0608948286972009e-05, "learning_rate": 4.159077109896782e-06, "loss": 0.0, "step": 8465 }, { "epoch": 9.256830601092895, "grad_norm": 0.0008651612442918122, "learning_rate": 4.128718882817244e-06, "loss": 0.0, "step": 8470 }, { "epoch": 9.262295081967213, "grad_norm": 1.1031417670892552e-05, "learning_rate": 4.098360655737704e-06, "loss": 0.0, "step": 8475 }, { "epoch": 9.26775956284153, "grad_norm": 1.4733498574059922e-05, "learning_rate": 4.068002428658166e-06, "loss": 0.0, "step": 8480 }, { "epoch": 9.273224043715848, "grad_norm": 1.1113019354525022e-05, "learning_rate": 4.037644201578628e-06, "loss": 0.0, "step": 8485 }, { "epoch": 9.278688524590164, "grad_norm": 1.671107384026982e-05, "learning_rate": 4.0072859744990895e-06, "loss": 0.0, "step": 8490 }, { "epoch": 9.28415300546448, "grad_norm": 1.4919727618689649e-05, "learning_rate": 3.976927747419551e-06, "loss": 0.0, "step": 8495 }, { "epoch": 9.289617486338798, "grad_norm": 1.0741515325207729e-05, "learning_rate": 3.946569520340012e-06, "loss": 0.0, "step": 8500 }, { "epoch": 9.295081967213115, "grad_norm": 1.4579722119378857e-05, "learning_rate": 3.916211293260474e-06, "loss": 0.0, "step": 8505 }, { "epoch": 9.300546448087431, "grad_norm": 0.001307551865465939, "learning_rate": 3.885853066180936e-06, "loss": 0.0, "step": 8510 }, { "epoch": 9.306010928961749, "grad_norm": 1.936557237058878e-05, "learning_rate": 3.855494839101397e-06, "loss": 0.0, "step": 8515 }, { "epoch": 9.311475409836065, "grad_norm": 2.149451756849885e-05, "learning_rate": 3.825136612021858e-06, "loss": 0.0, "step": 8520 }, { "epoch": 9.316939890710383, "grad_norm": 8.021363464649767e-05, "learning_rate": 3.794778384942319e-06, "loss": 0.0, "step": 8525 }, { "epoch": 9.3224043715847, "grad_norm": 3.3864893339341506e-05, "learning_rate": 3.764420157862781e-06, "loss": 0.0, "step": 8530 }, { "epoch": 9.327868852459016, "grad_norm": 3.2648382330080494e-05, "learning_rate": 3.7340619307832426e-06, "loss": 0.0, "step": 8535 }, { "epoch": 9.333333333333334, "grad_norm": 1.936520311573986e-05, "learning_rate": 3.7037037037037037e-06, "loss": 0.0, "step": 8540 }, { "epoch": 9.33879781420765, "grad_norm": 7.971724699018523e-06, "learning_rate": 3.6733454766241653e-06, "loss": 0.0, "step": 8545 }, { "epoch": 9.344262295081966, "grad_norm": 1.110977063945029e-05, "learning_rate": 3.6429872495446273e-06, "loss": 0.0, "step": 8550 }, { "epoch": 9.349726775956285, "grad_norm": 1.2776695257343818e-05, "learning_rate": 3.612629022465088e-06, "loss": 0.0, "step": 8555 }, { "epoch": 9.3551912568306, "grad_norm": 4.267817348591052e-05, "learning_rate": 3.58227079538555e-06, "loss": 0.0, "step": 8560 }, { "epoch": 9.360655737704919, "grad_norm": 9.000210411613807e-06, "learning_rate": 3.551912568306011e-06, "loss": 0.0, "step": 8565 }, { "epoch": 9.366120218579235, "grad_norm": 1.3303446394274943e-05, "learning_rate": 3.5215543412264726e-06, "loss": 0.0, "step": 8570 }, { "epoch": 9.371584699453551, "grad_norm": 5.926217636442743e-05, "learning_rate": 3.491196114146934e-06, "loss": 0.0, "step": 8575 }, { "epoch": 9.37704918032787, "grad_norm": 9.163100003206637e-06, "learning_rate": 3.4608378870673953e-06, "loss": 0.0, "step": 8580 }, { "epoch": 9.382513661202186, "grad_norm": 8.968859219748992e-06, "learning_rate": 3.430479659987857e-06, "loss": 0.0, "step": 8585 }, { "epoch": 9.387978142076502, "grad_norm": 1.1057231859012973e-05, "learning_rate": 3.400121432908318e-06, "loss": 0.0, "step": 8590 }, { "epoch": 9.39344262295082, "grad_norm": 9.770934411790222e-06, "learning_rate": 3.36976320582878e-06, "loss": 0.0, "step": 8595 }, { "epoch": 9.398907103825136, "grad_norm": 0.0007320807198993862, "learning_rate": 3.3394049787492415e-06, "loss": 0.0, "step": 8600 }, { "epoch": 9.404371584699454, "grad_norm": 0.0010416394798085093, "learning_rate": 3.3090467516697027e-06, "loss": 0.0, "step": 8605 }, { "epoch": 9.40983606557377, "grad_norm": 1.6940070054261014e-05, "learning_rate": 3.278688524590164e-06, "loss": 0.0, "step": 8610 }, { "epoch": 9.415300546448087, "grad_norm": 1.8109252778231166e-05, "learning_rate": 3.2483302975106253e-06, "loss": 0.0, "step": 8615 }, { "epoch": 9.420765027322405, "grad_norm": 1.22038818517467e-05, "learning_rate": 3.217972070431087e-06, "loss": 0.0, "step": 8620 }, { "epoch": 9.426229508196721, "grad_norm": 9.562543709762394e-06, "learning_rate": 3.187613843351549e-06, "loss": 0.0, "step": 8625 }, { "epoch": 9.431693989071038, "grad_norm": 1.2685290130320936e-05, "learning_rate": 3.1572556162720096e-06, "loss": 0.0, "step": 8630 }, { "epoch": 9.437158469945356, "grad_norm": 9.530190254736226e-06, "learning_rate": 3.1268973891924715e-06, "loss": 0.0, "step": 8635 }, { "epoch": 9.442622950819672, "grad_norm": 3.304435813333839e-05, "learning_rate": 3.0965391621129327e-06, "loss": 0.0, "step": 8640 }, { "epoch": 9.448087431693988, "grad_norm": 1.0142703104065731e-05, "learning_rate": 3.0661809350333942e-06, "loss": 0.0, "step": 8645 }, { "epoch": 9.453551912568306, "grad_norm": 3.2079547963803634e-05, "learning_rate": 3.0358227079538558e-06, "loss": 0.0, "step": 8650 }, { "epoch": 9.459016393442623, "grad_norm": 7.287067546712933e-06, "learning_rate": 3.005464480874317e-06, "loss": 0.0, "step": 8655 }, { "epoch": 9.46448087431694, "grad_norm": 0.00036683311918750405, "learning_rate": 2.975106253794779e-06, "loss": 0.0, "step": 8660 }, { "epoch": 9.469945355191257, "grad_norm": 1.1772449397540186e-05, "learning_rate": 2.94474802671524e-06, "loss": 0.0, "step": 8665 }, { "epoch": 9.475409836065573, "grad_norm": 0.0004519826325122267, "learning_rate": 2.9143897996357016e-06, "loss": 0.0, "step": 8670 }, { "epoch": 9.480874316939891, "grad_norm": 1.3867033885617275e-05, "learning_rate": 2.8840315725561627e-06, "loss": 0.0, "step": 8675 }, { "epoch": 9.486338797814208, "grad_norm": 1.1242904292885214e-05, "learning_rate": 2.8536733454766242e-06, "loss": 0.0, "step": 8680 }, { "epoch": 9.491803278688524, "grad_norm": 1.9089002307737246e-05, "learning_rate": 2.823315118397086e-06, "loss": 0.0, "step": 8685 }, { "epoch": 9.497267759562842, "grad_norm": 6.980729267525021e-06, "learning_rate": 2.7929568913175473e-06, "loss": 0.0, "step": 8690 }, { "epoch": 9.502732240437158, "grad_norm": 2.4231898350990377e-05, "learning_rate": 2.7625986642380085e-06, "loss": 0.0, "step": 8695 }, { "epoch": 9.508196721311476, "grad_norm": 4.8574838729109615e-05, "learning_rate": 2.73224043715847e-06, "loss": 0.0, "step": 8700 }, { "epoch": 9.513661202185792, "grad_norm": 4.2161831515841186e-05, "learning_rate": 2.701882210078931e-06, "loss": 0.0, "step": 8705 }, { "epoch": 9.519125683060109, "grad_norm": 9.971013241738547e-06, "learning_rate": 2.671523982999393e-06, "loss": 0.0, "step": 8710 }, { "epoch": 9.524590163934427, "grad_norm": 2.4025157472351566e-05, "learning_rate": 2.6411657559198543e-06, "loss": 0.0, "step": 8715 }, { "epoch": 9.530054644808743, "grad_norm": 2.1537007341976278e-05, "learning_rate": 2.610807528840316e-06, "loss": 0.0, "step": 8720 }, { "epoch": 9.53551912568306, "grad_norm": 0.0013983087847009301, "learning_rate": 2.5804493017607774e-06, "loss": 0.0, "step": 8725 }, { "epoch": 9.540983606557377, "grad_norm": 8.980196980701294e-06, "learning_rate": 2.550091074681239e-06, "loss": 0.0, "step": 8730 }, { "epoch": 9.546448087431694, "grad_norm": 8.051301847444847e-06, "learning_rate": 2.5197328476017005e-06, "loss": 0.0, "step": 8735 }, { "epoch": 9.551912568306012, "grad_norm": 1.0314599421690218e-05, "learning_rate": 2.4893746205221616e-06, "loss": 0.0, "step": 8740 }, { "epoch": 9.557377049180328, "grad_norm": 9.725902600621339e-06, "learning_rate": 2.459016393442623e-06, "loss": 0.0, "step": 8745 }, { "epoch": 9.562841530054644, "grad_norm": 2.8515923986560665e-05, "learning_rate": 2.4286581663630843e-06, "loss": 0.0, "step": 8750 }, { "epoch": 9.568306010928962, "grad_norm": 0.0003265489067416638, "learning_rate": 2.3982999392835463e-06, "loss": 0.0, "step": 8755 }, { "epoch": 9.573770491803279, "grad_norm": 8.852043720253278e-06, "learning_rate": 2.3679417122040074e-06, "loss": 0.0, "step": 8760 }, { "epoch": 9.579234972677595, "grad_norm": 7.82777806307422e-06, "learning_rate": 2.337583485124469e-06, "loss": 0.0, "step": 8765 }, { "epoch": 9.584699453551913, "grad_norm": 1.2773216440109536e-05, "learning_rate": 2.30722525804493e-06, "loss": 0.0, "step": 8770 }, { "epoch": 9.59016393442623, "grad_norm": 7.689292942814063e-06, "learning_rate": 2.2768670309653916e-06, "loss": 0.0, "step": 8775 }, { "epoch": 9.595628415300546, "grad_norm": 1.9598150174715556e-05, "learning_rate": 2.246508803885853e-06, "loss": 0.0, "step": 8780 }, { "epoch": 9.601092896174864, "grad_norm": 1.991266617551446e-05, "learning_rate": 2.2161505768063147e-06, "loss": 0.0, "step": 8785 }, { "epoch": 9.60655737704918, "grad_norm": 8.718829121789895e-06, "learning_rate": 2.185792349726776e-06, "loss": 0.0, "step": 8790 }, { "epoch": 9.612021857923498, "grad_norm": 1.2843075637647416e-05, "learning_rate": 2.1554341226472374e-06, "loss": 0.0, "step": 8795 }, { "epoch": 9.617486338797814, "grad_norm": 8.296131454699207e-06, "learning_rate": 2.125075895567699e-06, "loss": 0.0, "step": 8800 }, { "epoch": 9.62295081967213, "grad_norm": 9.450495781493373e-06, "learning_rate": 2.0947176684881605e-06, "loss": 0.0, "step": 8805 }, { "epoch": 9.628415300546449, "grad_norm": 8.672905096318573e-06, "learning_rate": 2.064359441408622e-06, "loss": 0.0, "step": 8810 }, { "epoch": 9.633879781420765, "grad_norm": 1.4991738680691924e-05, "learning_rate": 2.034001214329083e-06, "loss": 0.0, "step": 8815 }, { "epoch": 9.639344262295083, "grad_norm": 1.4407401977223344e-05, "learning_rate": 2.0036429872495447e-06, "loss": 0.0, "step": 8820 }, { "epoch": 9.6448087431694, "grad_norm": 2.0339235561550595e-05, "learning_rate": 1.973284760170006e-06, "loss": 0.0, "step": 8825 }, { "epoch": 9.650273224043715, "grad_norm": 1.8044936950900592e-05, "learning_rate": 1.942926533090468e-06, "loss": 0.0, "step": 8830 }, { "epoch": 9.655737704918034, "grad_norm": 8.918646926758811e-06, "learning_rate": 1.912568306010929e-06, "loss": 0.0, "step": 8835 }, { "epoch": 9.66120218579235, "grad_norm": 1.3933055015513673e-05, "learning_rate": 1.8822100789313905e-06, "loss": 0.0, "step": 8840 }, { "epoch": 9.666666666666666, "grad_norm": 1.8471546354703605e-05, "learning_rate": 1.8518518518518519e-06, "loss": 0.0, "step": 8845 }, { "epoch": 9.672131147540984, "grad_norm": 1.658586916164495e-05, "learning_rate": 1.8214936247723136e-06, "loss": 0.0, "step": 8850 }, { "epoch": 9.6775956284153, "grad_norm": 8.949955372372642e-06, "learning_rate": 1.791135397692775e-06, "loss": 0.0, "step": 8855 }, { "epoch": 9.683060109289617, "grad_norm": 8.244574019045103e-06, "learning_rate": 1.7607771706132363e-06, "loss": 0.0, "step": 8860 }, { "epoch": 9.688524590163935, "grad_norm": 1.4154088603390846e-05, "learning_rate": 1.7304189435336977e-06, "loss": 0.0, "step": 8865 }, { "epoch": 9.693989071038251, "grad_norm": 7.654744877072517e-06, "learning_rate": 1.700060716454159e-06, "loss": 0.0, "step": 8870 }, { "epoch": 9.699453551912569, "grad_norm": 3.458944411249831e-05, "learning_rate": 1.6697024893746208e-06, "loss": 0.0, "step": 8875 }, { "epoch": 9.704918032786885, "grad_norm": 3.436910265008919e-05, "learning_rate": 1.639344262295082e-06, "loss": 0.0, "step": 8880 }, { "epoch": 9.710382513661202, "grad_norm": 1.1345953680574894e-05, "learning_rate": 1.6089860352155434e-06, "loss": 0.0, "step": 8885 }, { "epoch": 9.71584699453552, "grad_norm": 9.515051715425216e-06, "learning_rate": 1.5786278081360048e-06, "loss": 0.0, "step": 8890 }, { "epoch": 9.721311475409836, "grad_norm": 1.4450808521360159e-05, "learning_rate": 1.5482695810564663e-06, "loss": 0.0, "step": 8895 }, { "epoch": 9.726775956284152, "grad_norm": 0.0002483540738467127, "learning_rate": 1.5179113539769279e-06, "loss": 0.0, "step": 8900 }, { "epoch": 9.73224043715847, "grad_norm": 7.709054443694185e-06, "learning_rate": 1.4875531268973894e-06, "loss": 0.0, "step": 8905 }, { "epoch": 9.737704918032787, "grad_norm": 0.0009973180713132024, "learning_rate": 1.4571948998178508e-06, "loss": 0.0, "step": 8910 }, { "epoch": 9.743169398907105, "grad_norm": 6.907128408784047e-05, "learning_rate": 1.4268366727383121e-06, "loss": 0.0, "step": 8915 }, { "epoch": 9.748633879781421, "grad_norm": 9.242547093890607e-06, "learning_rate": 1.3964784456587737e-06, "loss": 0.0, "step": 8920 }, { "epoch": 9.754098360655737, "grad_norm": 1.0322552952857222e-05, "learning_rate": 1.366120218579235e-06, "loss": 0.0, "step": 8925 }, { "epoch": 9.759562841530055, "grad_norm": 7.239196747832466e-06, "learning_rate": 1.3357619914996966e-06, "loss": 0.0, "step": 8930 }, { "epoch": 9.765027322404372, "grad_norm": 1.5557798178633675e-05, "learning_rate": 1.305403764420158e-06, "loss": 0.0, "step": 8935 }, { "epoch": 9.770491803278688, "grad_norm": 2.291934288223274e-05, "learning_rate": 1.2750455373406195e-06, "loss": 0.0, "step": 8940 }, { "epoch": 9.775956284153006, "grad_norm": 7.496446869481588e-06, "learning_rate": 1.2446873102610808e-06, "loss": 0.0, "step": 8945 }, { "epoch": 9.781420765027322, "grad_norm": 4.650903429137543e-05, "learning_rate": 1.2143290831815421e-06, "loss": 0.0, "step": 8950 }, { "epoch": 9.78688524590164, "grad_norm": 9.247813977708574e-06, "learning_rate": 1.1839708561020037e-06, "loss": 0.0, "step": 8955 }, { "epoch": 9.792349726775956, "grad_norm": 0.00309645663946867, "learning_rate": 1.153612629022465e-06, "loss": 0.0, "step": 8960 }, { "epoch": 9.797814207650273, "grad_norm": 9.541035979054868e-06, "learning_rate": 1.1232544019429266e-06, "loss": 0.0, "step": 8965 }, { "epoch": 9.80327868852459, "grad_norm": 6.343203949654708e-06, "learning_rate": 1.092896174863388e-06, "loss": 0.0, "step": 8970 }, { "epoch": 9.808743169398907, "grad_norm": 2.5990406356868334e-05, "learning_rate": 1.0625379477838495e-06, "loss": 0.0, "step": 8975 }, { "epoch": 9.814207650273223, "grad_norm": 0.0003062534669879824, "learning_rate": 1.032179720704311e-06, "loss": 0.0, "step": 8980 }, { "epoch": 9.819672131147541, "grad_norm": 1.3689349543710705e-05, "learning_rate": 1.0018214936247724e-06, "loss": 0.0, "step": 8985 }, { "epoch": 9.825136612021858, "grad_norm": 8.631909622636158e-06, "learning_rate": 9.71463266545234e-07, "loss": 0.0, "step": 8990 }, { "epoch": 9.830601092896174, "grad_norm": 1.931633050844539e-05, "learning_rate": 9.411050394656953e-07, "loss": 0.0, "step": 8995 }, { "epoch": 9.836065573770492, "grad_norm": 9.17322267923737e-06, "learning_rate": 9.107468123861568e-07, "loss": 0.0, "step": 9000 }, { "epoch": 9.841530054644808, "grad_norm": 3.237798227928579e-05, "learning_rate": 8.803885853066182e-07, "loss": 0.0, "step": 9005 }, { "epoch": 9.846994535519126, "grad_norm": 9.100303941522725e-06, "learning_rate": 8.500303582270795e-07, "loss": 0.0, "step": 9010 }, { "epoch": 9.852459016393443, "grad_norm": 9.24307187233353e-06, "learning_rate": 8.19672131147541e-07, "loss": 0.0, "step": 9015 }, { "epoch": 9.857923497267759, "grad_norm": 0.00014155333337839693, "learning_rate": 7.893139040680024e-07, "loss": 0.0, "step": 9020 }, { "epoch": 9.863387978142077, "grad_norm": 1.3452459825202823e-05, "learning_rate": 7.589556769884639e-07, "loss": 0.0, "step": 9025 }, { "epoch": 9.868852459016393, "grad_norm": 0.00011776310566347092, "learning_rate": 7.285974499089254e-07, "loss": 0.0, "step": 9030 }, { "epoch": 9.87431693989071, "grad_norm": 1.1997640285699163e-05, "learning_rate": 6.982392228293868e-07, "loss": 0.0, "step": 9035 }, { "epoch": 9.879781420765028, "grad_norm": 2.0175972167635337e-05, "learning_rate": 6.678809957498483e-07, "loss": 0.0, "step": 9040 }, { "epoch": 9.885245901639344, "grad_norm": 0.0003090601530857384, "learning_rate": 6.375227686703097e-07, "loss": 0.0, "step": 9045 }, { "epoch": 9.890710382513662, "grad_norm": 7.763220310152974e-06, "learning_rate": 6.071645415907711e-07, "loss": 0.0, "step": 9050 }, { "epoch": 9.896174863387978, "grad_norm": 8.539267582818866e-06, "learning_rate": 5.768063145112325e-07, "loss": 0.0, "step": 9055 }, { "epoch": 9.901639344262295, "grad_norm": 4.812333645531908e-05, "learning_rate": 5.46448087431694e-07, "loss": 0.0, "step": 9060 }, { "epoch": 9.907103825136613, "grad_norm": 7.236944838950876e-06, "learning_rate": 5.160898603521555e-07, "loss": 0.0, "step": 9065 }, { "epoch": 9.912568306010929, "grad_norm": 7.316658411582466e-06, "learning_rate": 4.85731633272617e-07, "loss": 0.0, "step": 9070 }, { "epoch": 9.918032786885245, "grad_norm": 8.218483344535343e-06, "learning_rate": 4.553734061930784e-07, "loss": 0.0, "step": 9075 }, { "epoch": 9.923497267759563, "grad_norm": 1.1103961696790066e-05, "learning_rate": 4.2501517911353975e-07, "loss": 0.0, "step": 9080 }, { "epoch": 9.92896174863388, "grad_norm": 9.260171282221563e-06, "learning_rate": 3.946569520340012e-07, "loss": 0.0, "step": 9085 }, { "epoch": 9.934426229508198, "grad_norm": 1.0400766768725589e-05, "learning_rate": 3.642987249544627e-07, "loss": 0.0, "step": 9090 }, { "epoch": 9.939890710382514, "grad_norm": 9.862003025773447e-06, "learning_rate": 3.3394049787492414e-07, "loss": 0.0, "step": 9095 }, { "epoch": 9.94535519125683, "grad_norm": 7.058705705276225e-06, "learning_rate": 3.0358227079538554e-07, "loss": 0.0, "step": 9100 }, { "epoch": 9.950819672131148, "grad_norm": 1.4703833585372195e-05, "learning_rate": 2.73224043715847e-07, "loss": 0.0, "step": 9105 }, { "epoch": 9.956284153005464, "grad_norm": 7.783631190250162e-06, "learning_rate": 2.428658166363085e-07, "loss": 0.0, "step": 9110 }, { "epoch": 9.96174863387978, "grad_norm": 1.1165495379827917e-05, "learning_rate": 2.1250758955676987e-07, "loss": 0.0, "step": 9115 }, { "epoch": 9.967213114754099, "grad_norm": 0.0003044393961317837, "learning_rate": 1.8214936247723135e-07, "loss": 0.0, "step": 9120 }, { "epoch": 9.972677595628415, "grad_norm": 1.3321034202817827e-05, "learning_rate": 1.5179113539769277e-07, "loss": 0.0, "step": 9125 }, { "epoch": 9.978142076502731, "grad_norm": 2.1599676983896643e-05, "learning_rate": 1.2143290831815424e-07, "loss": 0.0, "step": 9130 }, { "epoch": 9.98360655737705, "grad_norm": 1.0389105227659456e-05, "learning_rate": 9.107468123861567e-08, "loss": 0.0, "step": 9135 }, { "epoch": 9.989071038251366, "grad_norm": 6.75558248985908e-06, "learning_rate": 6.071645415907712e-08, "loss": 0.0, "step": 9140 }, { "epoch": 9.994535519125684, "grad_norm": 0.00030103125027380884, "learning_rate": 3.035822707953856e-08, "loss": 0.0, "step": 9145 }, { "epoch": 10.0, "grad_norm": 1.5515264749410562e-05, "learning_rate": 0.0, "loss": 0.0, "step": 9150 }, { "epoch": 10.0, "eval_loss": 9.066787356459827e-07, "eval_runtime": 670.1167, "eval_samples_per_second": 10.918, "eval_steps_per_second": 1.365, "step": 9150 } ], "logging_steps": 5, "max_steps": 9150, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 2.5121727459832627e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }