diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12911 @@ +{ + "best_metric": 9.066787356459827e-07, + "best_model_checkpoint": "speechVSnoise/checkpoint-9150", + "epoch": 10.0, + "eval_steps": 500, + "global_step": 9150, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00546448087431694, + "grad_norm": 25.785985946655273, + "learning_rate": 2.73224043715847e-07, + "loss": 0.6864, + "step": 5 + }, + { + "epoch": 0.01092896174863388, + "grad_norm": 57.12474822998047, + "learning_rate": 5.46448087431694e-07, + "loss": 0.6771, + "step": 10 + }, + { + "epoch": 0.01639344262295082, + "grad_norm": 26.012365341186523, + "learning_rate": 8.19672131147541e-07, + "loss": 0.6308, + "step": 15 + }, + { + "epoch": 0.02185792349726776, + "grad_norm": 19.3648624420166, + "learning_rate": 1.092896174863388e-06, + "loss": 0.4441, + "step": 20 + }, + { + "epoch": 0.0273224043715847, + "grad_norm": 16.610984802246094, + "learning_rate": 1.366120218579235e-06, + "loss": 0.4245, + "step": 25 + }, + { + "epoch": 0.03278688524590164, + "grad_norm": 16.175987243652344, + "learning_rate": 1.639344262295082e-06, + "loss": 0.3525, + "step": 30 + }, + { + "epoch": 0.03825136612021858, + "grad_norm": 11.281105041503906, + "learning_rate": 1.912568306010929e-06, + "loss": 0.2203, + "step": 35 + }, + { + "epoch": 0.04371584699453552, + "grad_norm": 8.959579467773438, + "learning_rate": 2.185792349726776e-06, + "loss": 0.1137, + "step": 40 + }, + { + "epoch": 0.04918032786885246, + "grad_norm": 3.032709836959839, + "learning_rate": 2.459016393442623e-06, + "loss": 0.0931, + "step": 45 + }, + { + "epoch": 0.0546448087431694, + "grad_norm": 5.218940258026123, + "learning_rate": 2.73224043715847e-06, + "loss": 0.0874, + "step": 50 + }, + { + "epoch": 0.060109289617486336, + "grad_norm": 4.759418487548828, + "learning_rate": 3.005464480874317e-06, + "loss": 0.0685, + "step": 55 + }, + { + "epoch": 0.06557377049180328, + "grad_norm": 0.7248989939689636, + "learning_rate": 3.278688524590164e-06, + "loss": 0.0543, + "step": 60 + }, + { + "epoch": 0.07103825136612021, + "grad_norm": 20.059240341186523, + "learning_rate": 3.551912568306011e-06, + "loss": 0.0554, + "step": 65 + }, + { + "epoch": 0.07650273224043716, + "grad_norm": 1.0316702127456665, + "learning_rate": 3.825136612021858e-06, + "loss": 0.0126, + "step": 70 + }, + { + "epoch": 0.08196721311475409, + "grad_norm": 0.11562328785657883, + "learning_rate": 4.098360655737704e-06, + "loss": 0.0068, + "step": 75 + }, + { + "epoch": 0.08743169398907104, + "grad_norm": 0.28197842836380005, + "learning_rate": 4.371584699453552e-06, + "loss": 0.006, + "step": 80 + }, + { + "epoch": 0.09289617486338798, + "grad_norm": 0.1347256749868393, + "learning_rate": 4.6448087431694e-06, + "loss": 0.0069, + "step": 85 + }, + { + "epoch": 0.09836065573770492, + "grad_norm": 0.14458715915679932, + "learning_rate": 4.918032786885246e-06, + "loss": 0.0026, + "step": 90 + }, + { + "epoch": 0.10382513661202186, + "grad_norm": 0.4816187918186188, + "learning_rate": 5.191256830601094e-06, + "loss": 0.0013, + "step": 95 + }, + { + "epoch": 0.1092896174863388, + "grad_norm": 0.030168676748871803, + "learning_rate": 5.46448087431694e-06, + "loss": 0.0014, + "step": 100 + }, + { + "epoch": 0.11475409836065574, + "grad_norm": 0.14276443421840668, + "learning_rate": 5.737704918032787e-06, + "loss": 0.0058, + "step": 105 + }, + { + "epoch": 0.12021857923497267, + "grad_norm": 0.053282059729099274, + "learning_rate": 6.010928961748634e-06, + "loss": 0.0034, + "step": 110 + }, + { + "epoch": 0.12568306010928962, + "grad_norm": 0.16789610683918, + "learning_rate": 6.284153005464481e-06, + "loss": 0.0006, + "step": 115 + }, + { + "epoch": 0.13114754098360656, + "grad_norm": 0.03340218961238861, + "learning_rate": 6.557377049180328e-06, + "loss": 0.001, + "step": 120 + }, + { + "epoch": 0.1366120218579235, + "grad_norm": 0.04611071199178696, + "learning_rate": 6.830601092896176e-06, + "loss": 0.0009, + "step": 125 + }, + { + "epoch": 0.14207650273224043, + "grad_norm": 0.017522484064102173, + "learning_rate": 7.103825136612022e-06, + "loss": 0.0003, + "step": 130 + }, + { + "epoch": 0.14754098360655737, + "grad_norm": 0.011235825717449188, + "learning_rate": 7.3770491803278695e-06, + "loss": 0.0054, + "step": 135 + }, + { + "epoch": 0.15300546448087432, + "grad_norm": 0.008810077793896198, + "learning_rate": 7.650273224043716e-06, + "loss": 0.0882, + "step": 140 + }, + { + "epoch": 0.15846994535519127, + "grad_norm": 0.4507307708263397, + "learning_rate": 7.923497267759564e-06, + "loss": 0.001, + "step": 145 + }, + { + "epoch": 0.16393442622950818, + "grad_norm": 0.01477157510817051, + "learning_rate": 8.196721311475409e-06, + "loss": 0.001, + "step": 150 + }, + { + "epoch": 0.16939890710382513, + "grad_norm": 0.01235341839492321, + "learning_rate": 8.469945355191257e-06, + "loss": 0.0003, + "step": 155 + }, + { + "epoch": 0.17486338797814208, + "grad_norm": 0.14400136470794678, + "learning_rate": 8.743169398907103e-06, + "loss": 0.0039, + "step": 160 + }, + { + "epoch": 0.18032786885245902, + "grad_norm": 0.006246116943657398, + "learning_rate": 9.016393442622952e-06, + "loss": 0.0006, + "step": 165 + }, + { + "epoch": 0.18579234972677597, + "grad_norm": 0.019680462777614594, + "learning_rate": 9.2896174863388e-06, + "loss": 0.0005, + "step": 170 + }, + { + "epoch": 0.1912568306010929, + "grad_norm": 0.0067535098642110825, + "learning_rate": 9.562841530054644e-06, + "loss": 0.0001, + "step": 175 + }, + { + "epoch": 0.19672131147540983, + "grad_norm": 50.647010803222656, + "learning_rate": 9.836065573770493e-06, + "loss": 0.0782, + "step": 180 + }, + { + "epoch": 0.20218579234972678, + "grad_norm": 0.01110668946057558, + "learning_rate": 1.0109289617486339e-05, + "loss": 0.0005, + "step": 185 + }, + { + "epoch": 0.20765027322404372, + "grad_norm": 0.0494246631860733, + "learning_rate": 1.0382513661202187e-05, + "loss": 0.0003, + "step": 190 + }, + { + "epoch": 0.21311475409836064, + "grad_norm": 0.011345715261995792, + "learning_rate": 1.0655737704918032e-05, + "loss": 0.0003, + "step": 195 + }, + { + "epoch": 0.2185792349726776, + "grad_norm": 0.004185713827610016, + "learning_rate": 1.092896174863388e-05, + "loss": 0.0001, + "step": 200 + }, + { + "epoch": 0.22404371584699453, + "grad_norm": 0.30543509125709534, + "learning_rate": 1.1202185792349727e-05, + "loss": 0.024, + "step": 205 + }, + { + "epoch": 0.22950819672131148, + "grad_norm": 0.007137789856642485, + "learning_rate": 1.1475409836065575e-05, + "loss": 0.0001, + "step": 210 + }, + { + "epoch": 0.23497267759562843, + "grad_norm": 0.008462714031338692, + "learning_rate": 1.1748633879781421e-05, + "loss": 0.0001, + "step": 215 + }, + { + "epoch": 0.24043715846994534, + "grad_norm": 0.015059815719723701, + "learning_rate": 1.2021857923497268e-05, + "loss": 0.0001, + "step": 220 + }, + { + "epoch": 0.2459016393442623, + "grad_norm": 0.00993975717574358, + "learning_rate": 1.2295081967213116e-05, + "loss": 0.0001, + "step": 225 + }, + { + "epoch": 0.25136612021857924, + "grad_norm": 0.0028505015652626753, + "learning_rate": 1.2568306010928962e-05, + "loss": 0.0378, + "step": 230 + }, + { + "epoch": 0.2568306010928962, + "grad_norm": 0.012212187051773071, + "learning_rate": 1.284153005464481e-05, + "loss": 0.0001, + "step": 235 + }, + { + "epoch": 0.26229508196721313, + "grad_norm": 0.007482603657990694, + "learning_rate": 1.3114754098360657e-05, + "loss": 0.0002, + "step": 240 + }, + { + "epoch": 0.2677595628415301, + "grad_norm": 0.09465236961841583, + "learning_rate": 1.3387978142076505e-05, + "loss": 0.001, + "step": 245 + }, + { + "epoch": 0.273224043715847, + "grad_norm": 0.028336774557828903, + "learning_rate": 1.3661202185792351e-05, + "loss": 0.0001, + "step": 250 + }, + { + "epoch": 0.2786885245901639, + "grad_norm": 0.001924874261021614, + "learning_rate": 1.3934426229508196e-05, + "loss": 0.0002, + "step": 255 + }, + { + "epoch": 0.28415300546448086, + "grad_norm": 0.14595231413841248, + "learning_rate": 1.4207650273224044e-05, + "loss": 0.0003, + "step": 260 + }, + { + "epoch": 0.2896174863387978, + "grad_norm": 0.008462517522275448, + "learning_rate": 1.448087431693989e-05, + "loss": 0.0, + "step": 265 + }, + { + "epoch": 0.29508196721311475, + "grad_norm": 1.1039444208145142, + "learning_rate": 1.4754098360655739e-05, + "loss": 0.0004, + "step": 270 + }, + { + "epoch": 0.3005464480874317, + "grad_norm": 0.0011038613738492131, + "learning_rate": 1.5027322404371585e-05, + "loss": 0.0003, + "step": 275 + }, + { + "epoch": 0.30601092896174864, + "grad_norm": 0.002112634014338255, + "learning_rate": 1.5300546448087432e-05, + "loss": 0.0003, + "step": 280 + }, + { + "epoch": 0.3114754098360656, + "grad_norm": 0.001424513990059495, + "learning_rate": 1.557377049180328e-05, + "loss": 0.0001, + "step": 285 + }, + { + "epoch": 0.31693989071038253, + "grad_norm": 0.002593505661934614, + "learning_rate": 1.5846994535519128e-05, + "loss": 0.0, + "step": 290 + }, + { + "epoch": 0.3224043715846995, + "grad_norm": 0.006812725216150284, + "learning_rate": 1.6120218579234975e-05, + "loss": 0.0, + "step": 295 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 0.0028373391833156347, + "learning_rate": 1.6393442622950818e-05, + "loss": 0.0, + "step": 300 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.0126915592700243, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.0001, + "step": 305 + }, + { + "epoch": 0.33879781420765026, + "grad_norm": 0.041050828993320465, + "learning_rate": 1.6939890710382514e-05, + "loss": 0.0001, + "step": 310 + }, + { + "epoch": 0.3442622950819672, + "grad_norm": 58.20376205444336, + "learning_rate": 1.721311475409836e-05, + "loss": 0.07, + "step": 315 + }, + { + "epoch": 0.34972677595628415, + "grad_norm": 0.04524129629135132, + "learning_rate": 1.7486338797814207e-05, + "loss": 0.0001, + "step": 320 + }, + { + "epoch": 0.3551912568306011, + "grad_norm": 0.0022006204817444086, + "learning_rate": 1.7759562841530057e-05, + "loss": 0.6515, + "step": 325 + }, + { + "epoch": 0.36065573770491804, + "grad_norm": 0.00961753074079752, + "learning_rate": 1.8032786885245903e-05, + "loss": 0.0001, + "step": 330 + }, + { + "epoch": 0.366120218579235, + "grad_norm": 0.033571019768714905, + "learning_rate": 1.830601092896175e-05, + "loss": 0.0033, + "step": 335 + }, + { + "epoch": 0.37158469945355194, + "grad_norm": 0.028768986463546753, + "learning_rate": 1.85792349726776e-05, + "loss": 0.0385, + "step": 340 + }, + { + "epoch": 0.3770491803278688, + "grad_norm": 0.01213754341006279, + "learning_rate": 1.8852459016393442e-05, + "loss": 0.0351, + "step": 345 + }, + { + "epoch": 0.3825136612021858, + "grad_norm": 0.0052613429725170135, + "learning_rate": 1.912568306010929e-05, + "loss": 0.0007, + "step": 350 + }, + { + "epoch": 0.3879781420765027, + "grad_norm": 0.013218444772064686, + "learning_rate": 1.9398907103825135e-05, + "loss": 0.0432, + "step": 355 + }, + { + "epoch": 0.39344262295081966, + "grad_norm": 0.0013946377439424396, + "learning_rate": 1.9672131147540985e-05, + "loss": 0.0006, + "step": 360 + }, + { + "epoch": 0.3989071038251366, + "grad_norm": 0.004633768927305937, + "learning_rate": 1.994535519125683e-05, + "loss": 0.0004, + "step": 365 + }, + { + "epoch": 0.40437158469945356, + "grad_norm": 0.0033809032756835222, + "learning_rate": 2.0218579234972678e-05, + "loss": 0.0, + "step": 370 + }, + { + "epoch": 0.4098360655737705, + "grad_norm": 0.003123817965388298, + "learning_rate": 2.0491803278688525e-05, + "loss": 0.0001, + "step": 375 + }, + { + "epoch": 0.41530054644808745, + "grad_norm": 0.002844776026904583, + "learning_rate": 2.0765027322404374e-05, + "loss": 0.0001, + "step": 380 + }, + { + "epoch": 0.4207650273224044, + "grad_norm": 0.0011928690364584327, + "learning_rate": 2.103825136612022e-05, + "loss": 0.0004, + "step": 385 + }, + { + "epoch": 0.4262295081967213, + "grad_norm": 0.002345489338040352, + "learning_rate": 2.1311475409836064e-05, + "loss": 0.0, + "step": 390 + }, + { + "epoch": 0.43169398907103823, + "grad_norm": 14.624146461486816, + "learning_rate": 2.1584699453551914e-05, + "loss": 0.1712, + "step": 395 + }, + { + "epoch": 0.4371584699453552, + "grad_norm": 0.00196304963901639, + "learning_rate": 2.185792349726776e-05, + "loss": 0.0001, + "step": 400 + }, + { + "epoch": 0.4426229508196721, + "grad_norm": 0.0016199051169678569, + "learning_rate": 2.2131147540983607e-05, + "loss": 0.0001, + "step": 405 + }, + { + "epoch": 0.44808743169398907, + "grad_norm": 0.0034895865246653557, + "learning_rate": 2.2404371584699453e-05, + "loss": 0.0001, + "step": 410 + }, + { + "epoch": 0.453551912568306, + "grad_norm": 0.12550325691699982, + "learning_rate": 2.2677595628415303e-05, + "loss": 0.0005, + "step": 415 + }, + { + "epoch": 0.45901639344262296, + "grad_norm": 0.05105838552117348, + "learning_rate": 2.295081967213115e-05, + "loss": 0.0003, + "step": 420 + }, + { + "epoch": 0.4644808743169399, + "grad_norm": 0.0021114815026521683, + "learning_rate": 2.3224043715846996e-05, + "loss": 0.0001, + "step": 425 + }, + { + "epoch": 0.46994535519125685, + "grad_norm": 0.002628276590257883, + "learning_rate": 2.3497267759562842e-05, + "loss": 0.0003, + "step": 430 + }, + { + "epoch": 0.47540983606557374, + "grad_norm": 0.0018892699154093862, + "learning_rate": 2.377049180327869e-05, + "loss": 0.0, + "step": 435 + }, + { + "epoch": 0.4808743169398907, + "grad_norm": 0.03093365952372551, + "learning_rate": 2.4043715846994535e-05, + "loss": 0.0001, + "step": 440 + }, + { + "epoch": 0.48633879781420764, + "grad_norm": 0.0004180266987532377, + "learning_rate": 2.431693989071038e-05, + "loss": 0.0, + "step": 445 + }, + { + "epoch": 0.4918032786885246, + "grad_norm": 0.0009978563757613301, + "learning_rate": 2.459016393442623e-05, + "loss": 0.0, + "step": 450 + }, + { + "epoch": 0.4972677595628415, + "grad_norm": 0.0017233375692740083, + "learning_rate": 2.4863387978142078e-05, + "loss": 0.0, + "step": 455 + }, + { + "epoch": 0.5027322404371585, + "grad_norm": 43.42852783203125, + "learning_rate": 2.5136612021857924e-05, + "loss": 0.1477, + "step": 460 + }, + { + "epoch": 0.5081967213114754, + "grad_norm": 0.001277887960895896, + "learning_rate": 2.540983606557377e-05, + "loss": 0.0, + "step": 465 + }, + { + "epoch": 0.5136612021857924, + "grad_norm": 0.0195325780659914, + "learning_rate": 2.568306010928962e-05, + "loss": 0.0001, + "step": 470 + }, + { + "epoch": 0.5191256830601093, + "grad_norm": 0.008195769973099232, + "learning_rate": 2.5956284153005467e-05, + "loss": 0.0113, + "step": 475 + }, + { + "epoch": 0.5245901639344263, + "grad_norm": 0.001160804065875709, + "learning_rate": 2.6229508196721314e-05, + "loss": 0.0, + "step": 480 + }, + { + "epoch": 0.5300546448087432, + "grad_norm": 0.042693402618169785, + "learning_rate": 2.650273224043716e-05, + "loss": 0.0001, + "step": 485 + }, + { + "epoch": 0.5355191256830601, + "grad_norm": 0.005922058131545782, + "learning_rate": 2.677595628415301e-05, + "loss": 0.0, + "step": 490 + }, + { + "epoch": 0.5409836065573771, + "grad_norm": 0.008849041536450386, + "learning_rate": 2.7049180327868856e-05, + "loss": 0.0001, + "step": 495 + }, + { + "epoch": 0.546448087431694, + "grad_norm": 0.12916676700115204, + "learning_rate": 2.7322404371584703e-05, + "loss": 0.004, + "step": 500 + }, + { + "epoch": 0.5519125683060109, + "grad_norm": 0.06572812050580978, + "learning_rate": 2.7595628415300546e-05, + "loss": 0.1066, + "step": 505 + }, + { + "epoch": 0.5573770491803278, + "grad_norm": 0.0008375145844183862, + "learning_rate": 2.7868852459016392e-05, + "loss": 0.0, + "step": 510 + }, + { + "epoch": 0.5628415300546448, + "grad_norm": 0.0012427978217601776, + "learning_rate": 2.814207650273224e-05, + "loss": 0.0, + "step": 515 + }, + { + "epoch": 0.5683060109289617, + "grad_norm": 0.00252250162884593, + "learning_rate": 2.841530054644809e-05, + "loss": 0.0, + "step": 520 + }, + { + "epoch": 0.5737704918032787, + "grad_norm": 0.057264067232608795, + "learning_rate": 2.8688524590163935e-05, + "loss": 0.0002, + "step": 525 + }, + { + "epoch": 0.5792349726775956, + "grad_norm": 47.20601272583008, + "learning_rate": 2.896174863387978e-05, + "loss": 0.1031, + "step": 530 + }, + { + "epoch": 0.5846994535519126, + "grad_norm": 0.0003861558507196605, + "learning_rate": 2.9234972677595628e-05, + "loss": 0.0, + "step": 535 + }, + { + "epoch": 0.5901639344262295, + "grad_norm": 0.0016129296272993088, + "learning_rate": 2.9508196721311478e-05, + "loss": 0.0073, + "step": 540 + }, + { + "epoch": 0.5956284153005464, + "grad_norm": 0.027187852188944817, + "learning_rate": 2.9781420765027324e-05, + "loss": 0.0, + "step": 545 + }, + { + "epoch": 0.6010928961748634, + "grad_norm": 1.0486091375350952, + "learning_rate": 3.005464480874317e-05, + "loss": 0.001, + "step": 550 + }, + { + "epoch": 0.6065573770491803, + "grad_norm": 0.35983479022979736, + "learning_rate": 3.0327868852459017e-05, + "loss": 0.006, + "step": 555 + }, + { + "epoch": 0.6120218579234973, + "grad_norm": 0.0048343949019908905, + "learning_rate": 3.0601092896174864e-05, + "loss": 0.0, + "step": 560 + }, + { + "epoch": 0.6174863387978142, + "grad_norm": 0.00185883860103786, + "learning_rate": 3.087431693989071e-05, + "loss": 0.0, + "step": 565 + }, + { + "epoch": 0.6229508196721312, + "grad_norm": 0.00794074684381485, + "learning_rate": 3.114754098360656e-05, + "loss": 0.0, + "step": 570 + }, + { + "epoch": 0.6284153005464481, + "grad_norm": 0.00952135305851698, + "learning_rate": 3.142076502732241e-05, + "loss": 0.0, + "step": 575 + }, + { + "epoch": 0.6338797814207651, + "grad_norm": 0.0006016762927174568, + "learning_rate": 3.1693989071038256e-05, + "loss": 0.0, + "step": 580 + }, + { + "epoch": 0.639344262295082, + "grad_norm": 0.0006445659091696143, + "learning_rate": 3.19672131147541e-05, + "loss": 0.0, + "step": 585 + }, + { + "epoch": 0.644808743169399, + "grad_norm": 0.0007617264054715633, + "learning_rate": 3.224043715846995e-05, + "loss": 0.0, + "step": 590 + }, + { + "epoch": 0.6502732240437158, + "grad_norm": 0.007806734647601843, + "learning_rate": 3.251366120218579e-05, + "loss": 0.0001, + "step": 595 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 0.00035029446007683873, + "learning_rate": 3.2786885245901635e-05, + "loss": 0.0, + "step": 600 + }, + { + "epoch": 0.6612021857923497, + "grad_norm": 0.0011228241492062807, + "learning_rate": 3.306010928961749e-05, + "loss": 0.0, + "step": 605 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.00020948232850059867, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.0, + "step": 610 + }, + { + "epoch": 0.6721311475409836, + "grad_norm": 0.00047444229130633175, + "learning_rate": 3.360655737704918e-05, + "loss": 0.0, + "step": 615 + }, + { + "epoch": 0.6775956284153005, + "grad_norm": 0.00022226192231755704, + "learning_rate": 3.387978142076503e-05, + "loss": 0.0, + "step": 620 + }, + { + "epoch": 0.6830601092896175, + "grad_norm": 0.964410126209259, + "learning_rate": 3.4153005464480874e-05, + "loss": 0.0004, + "step": 625 + }, + { + "epoch": 0.6885245901639344, + "grad_norm": 0.03180733695626259, + "learning_rate": 3.442622950819672e-05, + "loss": 0.0, + "step": 630 + }, + { + "epoch": 0.6939890710382514, + "grad_norm": 73.32250213623047, + "learning_rate": 3.469945355191257e-05, + "loss": 0.1252, + "step": 635 + }, + { + "epoch": 0.6994535519125683, + "grad_norm": 0.0005270802648738027, + "learning_rate": 3.4972677595628414e-05, + "loss": 0.0, + "step": 640 + }, + { + "epoch": 0.7049180327868853, + "grad_norm": 3.74862003326416, + "learning_rate": 3.524590163934427e-05, + "loss": 0.0471, + "step": 645 + }, + { + "epoch": 0.7103825136612022, + "grad_norm": 0.0006061477470211685, + "learning_rate": 3.551912568306011e-05, + "loss": 0.0014, + "step": 650 + }, + { + "epoch": 0.7158469945355191, + "grad_norm": 0.0004408442764542997, + "learning_rate": 3.579234972677596e-05, + "loss": 0.0, + "step": 655 + }, + { + "epoch": 0.7213114754098361, + "grad_norm": 0.000549810822121799, + "learning_rate": 3.6065573770491806e-05, + "loss": 0.0, + "step": 660 + }, + { + "epoch": 0.726775956284153, + "grad_norm": 0.0006441313307732344, + "learning_rate": 3.633879781420765e-05, + "loss": 0.0, + "step": 665 + }, + { + "epoch": 0.73224043715847, + "grad_norm": 0.001270699198357761, + "learning_rate": 3.66120218579235e-05, + "loss": 0.0, + "step": 670 + }, + { + "epoch": 0.7377049180327869, + "grad_norm": 0.00043755839578807354, + "learning_rate": 3.6885245901639346e-05, + "loss": 0.0, + "step": 675 + }, + { + "epoch": 0.7431693989071039, + "grad_norm": 0.0004439064650796354, + "learning_rate": 3.71584699453552e-05, + "loss": 0.0, + "step": 680 + }, + { + "epoch": 0.7486338797814208, + "grad_norm": 1.3895764350891113, + "learning_rate": 3.7431693989071045e-05, + "loss": 0.0012, + "step": 685 + }, + { + "epoch": 0.7540983606557377, + "grad_norm": 0.0003534202405717224, + "learning_rate": 3.7704918032786885e-05, + "loss": 0.0449, + "step": 690 + }, + { + "epoch": 0.7595628415300546, + "grad_norm": 0.0076418425887823105, + "learning_rate": 3.797814207650273e-05, + "loss": 0.0012, + "step": 695 + }, + { + "epoch": 0.7650273224043715, + "grad_norm": 8.513630018569529e-05, + "learning_rate": 3.825136612021858e-05, + "loss": 0.0118, + "step": 700 + }, + { + "epoch": 0.7704918032786885, + "grad_norm": 0.000540974666364491, + "learning_rate": 3.8524590163934424e-05, + "loss": 0.0, + "step": 705 + }, + { + "epoch": 0.7759562841530054, + "grad_norm": 0.0001819472381612286, + "learning_rate": 3.879781420765027e-05, + "loss": 0.2163, + "step": 710 + }, + { + "epoch": 0.7814207650273224, + "grad_norm": 0.0011445063864812255, + "learning_rate": 3.9071038251366124e-05, + "loss": 0.0, + "step": 715 + }, + { + "epoch": 0.7868852459016393, + "grad_norm": 0.0005601670709438622, + "learning_rate": 3.934426229508197e-05, + "loss": 0.0025, + "step": 720 + }, + { + "epoch": 0.7923497267759563, + "grad_norm": 0.03481730446219444, + "learning_rate": 3.961748633879782e-05, + "loss": 0.0, + "step": 725 + }, + { + "epoch": 0.7978142076502732, + "grad_norm": 10.071272850036621, + "learning_rate": 3.989071038251366e-05, + "loss": 0.077, + "step": 730 + }, + { + "epoch": 0.8032786885245902, + "grad_norm": 0.40439388155937195, + "learning_rate": 4.016393442622951e-05, + "loss": 0.0054, + "step": 735 + }, + { + "epoch": 0.8087431693989071, + "grad_norm": 0.0003790801565628499, + "learning_rate": 4.0437158469945356e-05, + "loss": 0.0973, + "step": 740 + }, + { + "epoch": 0.8142076502732241, + "grad_norm": 0.0005046766018494964, + "learning_rate": 4.07103825136612e-05, + "loss": 0.0639, + "step": 745 + }, + { + "epoch": 0.819672131147541, + "grad_norm": 0.005156039260327816, + "learning_rate": 4.098360655737705e-05, + "loss": 0.0001, + "step": 750 + }, + { + "epoch": 0.825136612021858, + "grad_norm": 0.819869875907898, + "learning_rate": 4.12568306010929e-05, + "loss": 0.1133, + "step": 755 + }, + { + "epoch": 0.8306010928961749, + "grad_norm": 0.0007013155845925212, + "learning_rate": 4.153005464480875e-05, + "loss": 0.0, + "step": 760 + }, + { + "epoch": 0.8360655737704918, + "grad_norm": 0.0015922696329653263, + "learning_rate": 4.1803278688524595e-05, + "loss": 0.0037, + "step": 765 + }, + { + "epoch": 0.8415300546448088, + "grad_norm": 0.007399330381304026, + "learning_rate": 4.207650273224044e-05, + "loss": 0.0001, + "step": 770 + }, + { + "epoch": 0.8469945355191257, + "grad_norm": 0.06001625955104828, + "learning_rate": 4.234972677595629e-05, + "loss": 0.0003, + "step": 775 + }, + { + "epoch": 0.8524590163934426, + "grad_norm": 0.00016822715406306088, + "learning_rate": 4.262295081967213e-05, + "loss": 0.0003, + "step": 780 + }, + { + "epoch": 0.8579234972677595, + "grad_norm": 0.0044054812751710415, + "learning_rate": 4.289617486338798e-05, + "loss": 0.3591, + "step": 785 + }, + { + "epoch": 0.8633879781420765, + "grad_norm": 0.001408412354066968, + "learning_rate": 4.316939890710383e-05, + "loss": 0.138, + "step": 790 + }, + { + "epoch": 0.8688524590163934, + "grad_norm": 0.0046225362457334995, + "learning_rate": 4.3442622950819674e-05, + "loss": 0.0317, + "step": 795 + }, + { + "epoch": 0.8743169398907104, + "grad_norm": 0.026879915967583656, + "learning_rate": 4.371584699453552e-05, + "loss": 0.0035, + "step": 800 + }, + { + "epoch": 0.8797814207650273, + "grad_norm": 0.0007724014576524496, + "learning_rate": 4.398907103825137e-05, + "loss": 0.0001, + "step": 805 + }, + { + "epoch": 0.8852459016393442, + "grad_norm": 0.0005259870667941868, + "learning_rate": 4.426229508196721e-05, + "loss": 0.0017, + "step": 810 + }, + { + "epoch": 0.8907103825136612, + "grad_norm": 0.20176024734973907, + "learning_rate": 4.453551912568306e-05, + "loss": 0.0004, + "step": 815 + }, + { + "epoch": 0.8961748633879781, + "grad_norm": 0.0004278122214600444, + "learning_rate": 4.4808743169398906e-05, + "loss": 0.2911, + "step": 820 + }, + { + "epoch": 0.9016393442622951, + "grad_norm": 0.0018693683668971062, + "learning_rate": 4.508196721311476e-05, + "loss": 0.0, + "step": 825 + }, + { + "epoch": 0.907103825136612, + "grad_norm": 0.0012298806104809046, + "learning_rate": 4.5355191256830606e-05, + "loss": 0.0001, + "step": 830 + }, + { + "epoch": 0.912568306010929, + "grad_norm": 0.007856699638068676, + "learning_rate": 4.562841530054645e-05, + "loss": 0.0004, + "step": 835 + }, + { + "epoch": 0.9180327868852459, + "grad_norm": 0.009999307803809643, + "learning_rate": 4.59016393442623e-05, + "loss": 0.0002, + "step": 840 + }, + { + "epoch": 0.9234972677595629, + "grad_norm": 0.013572623953223228, + "learning_rate": 4.6174863387978145e-05, + "loss": 0.0002, + "step": 845 + }, + { + "epoch": 0.9289617486338798, + "grad_norm": 0.006055152975022793, + "learning_rate": 4.644808743169399e-05, + "loss": 0.0113, + "step": 850 + }, + { + "epoch": 0.9344262295081968, + "grad_norm": 0.001018978888168931, + "learning_rate": 4.672131147540984e-05, + "loss": 0.0001, + "step": 855 + }, + { + "epoch": 0.9398907103825137, + "grad_norm": 0.0018887248588725924, + "learning_rate": 4.6994535519125685e-05, + "loss": 0.0001, + "step": 860 + }, + { + "epoch": 0.9453551912568307, + "grad_norm": 0.0015452090883627534, + "learning_rate": 4.726775956284154e-05, + "loss": 0.1306, + "step": 865 + }, + { + "epoch": 0.9508196721311475, + "grad_norm": 0.030793463811278343, + "learning_rate": 4.754098360655738e-05, + "loss": 0.0, + "step": 870 + }, + { + "epoch": 0.9562841530054644, + "grad_norm": 0.0030877627432346344, + "learning_rate": 4.7814207650273224e-05, + "loss": 0.0, + "step": 875 + }, + { + "epoch": 0.9617486338797814, + "grad_norm": 0.0027639123145490885, + "learning_rate": 4.808743169398907e-05, + "loss": 0.227, + "step": 880 + }, + { + "epoch": 0.9672131147540983, + "grad_norm": 0.013874650001525879, + "learning_rate": 4.836065573770492e-05, + "loss": 0.0005, + "step": 885 + }, + { + "epoch": 0.9726775956284153, + "grad_norm": 0.010947838425636292, + "learning_rate": 4.863387978142076e-05, + "loss": 0.005, + "step": 890 + }, + { + "epoch": 0.9781420765027322, + "grad_norm": 0.004630269482731819, + "learning_rate": 4.890710382513661e-05, + "loss": 0.0001, + "step": 895 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 0.0010024881921708584, + "learning_rate": 4.918032786885246e-05, + "loss": 0.0002, + "step": 900 + }, + { + "epoch": 0.9890710382513661, + "grad_norm": 0.0025314155500382185, + "learning_rate": 4.945355191256831e-05, + "loss": 0.0918, + "step": 905 + }, + { + "epoch": 0.994535519125683, + "grad_norm": 0.0026452147867530584, + "learning_rate": 4.9726775956284156e-05, + "loss": 0.0, + "step": 910 + }, + { + "epoch": 1.0, + "grad_norm": 0.043630439788103104, + "learning_rate": 5e-05, + "loss": 0.0001, + "step": 915 + }, + { + "epoch": 1.0, + "eval_loss": 0.053465329110622406, + "eval_runtime": 658.7855, + "eval_samples_per_second": 11.105, + "eval_steps_per_second": 1.389, + "step": 915 + }, + { + "epoch": 1.005464480874317, + "grad_norm": 0.020471155643463135, + "learning_rate": 4.996964177292046e-05, + "loss": 0.0243, + "step": 920 + }, + { + "epoch": 1.010928961748634, + "grad_norm": 0.005614515859633684, + "learning_rate": 4.9939283545840925e-05, + "loss": 0.0001, + "step": 925 + }, + { + "epoch": 1.0163934426229508, + "grad_norm": 0.0024823613930493593, + "learning_rate": 4.990892531876138e-05, + "loss": 0.0, + "step": 930 + }, + { + "epoch": 1.0218579234972678, + "grad_norm": 0.0023433889728039503, + "learning_rate": 4.987856709168185e-05, + "loss": 0.0001, + "step": 935 + }, + { + "epoch": 1.0273224043715847, + "grad_norm": 0.0012808924075216055, + "learning_rate": 4.984820886460231e-05, + "loss": 0.0, + "step": 940 + }, + { + "epoch": 1.0327868852459017, + "grad_norm": 0.0008591926307417452, + "learning_rate": 4.9817850637522776e-05, + "loss": 0.0, + "step": 945 + }, + { + "epoch": 1.0382513661202186, + "grad_norm": 0.001144225592724979, + "learning_rate": 4.9787492410443234e-05, + "loss": 0.0, + "step": 950 + }, + { + "epoch": 1.0437158469945356, + "grad_norm": 0.006365107372403145, + "learning_rate": 4.97571341833637e-05, + "loss": 0.186, + "step": 955 + }, + { + "epoch": 1.0491803278688525, + "grad_norm": 0.004594831261783838, + "learning_rate": 4.9726775956284156e-05, + "loss": 0.0003, + "step": 960 + }, + { + "epoch": 1.0546448087431695, + "grad_norm": 0.01714457757771015, + "learning_rate": 4.969641772920462e-05, + "loss": 0.0005, + "step": 965 + }, + { + "epoch": 1.0601092896174864, + "grad_norm": 0.009641066193580627, + "learning_rate": 4.966605950212508e-05, + "loss": 0.0026, + "step": 970 + }, + { + "epoch": 1.0655737704918034, + "grad_norm": 0.002107401378452778, + "learning_rate": 4.9635701275045536e-05, + "loss": 0.0004, + "step": 975 + }, + { + "epoch": 1.0710382513661203, + "grad_norm": 0.0008496601949445903, + "learning_rate": 4.9605343047966e-05, + "loss": 0.0, + "step": 980 + }, + { + "epoch": 1.0765027322404372, + "grad_norm": 0.00019651043112389743, + "learning_rate": 4.957498482088646e-05, + "loss": 0.0, + "step": 985 + }, + { + "epoch": 1.0819672131147542, + "grad_norm": 0.02816096320748329, + "learning_rate": 4.954462659380692e-05, + "loss": 0.0, + "step": 990 + }, + { + "epoch": 1.0874316939890711, + "grad_norm": 0.00035151580232195556, + "learning_rate": 4.951426836672739e-05, + "loss": 0.1947, + "step": 995 + }, + { + "epoch": 1.092896174863388, + "grad_norm": 0.0009560062899254262, + "learning_rate": 4.948391013964785e-05, + "loss": 0.0, + "step": 1000 + }, + { + "epoch": 1.098360655737705, + "grad_norm": 0.0054119001142680645, + "learning_rate": 4.945355191256831e-05, + "loss": 0.0299, + "step": 1005 + }, + { + "epoch": 1.1038251366120218, + "grad_norm": 0.04507184773683548, + "learning_rate": 4.9423193685488774e-05, + "loss": 0.1414, + "step": 1010 + }, + { + "epoch": 1.1092896174863387, + "grad_norm": 0.005444942507892847, + "learning_rate": 4.939283545840923e-05, + "loss": 0.0001, + "step": 1015 + }, + { + "epoch": 1.1147540983606556, + "grad_norm": 0.007937879301607609, + "learning_rate": 4.936247723132969e-05, + "loss": 0.0001, + "step": 1020 + }, + { + "epoch": 1.1202185792349726, + "grad_norm": 0.003920862451195717, + "learning_rate": 4.9332119004250154e-05, + "loss": 0.0001, + "step": 1025 + }, + { + "epoch": 1.1256830601092895, + "grad_norm": 0.0018985569477081299, + "learning_rate": 4.930176077717061e-05, + "loss": 0.1051, + "step": 1030 + }, + { + "epoch": 1.1311475409836065, + "grad_norm": 0.0012923554750159383, + "learning_rate": 4.9271402550091076e-05, + "loss": 0.0, + "step": 1035 + }, + { + "epoch": 1.1366120218579234, + "grad_norm": 0.016738856211304665, + "learning_rate": 4.9241044323011534e-05, + "loss": 0.0001, + "step": 1040 + }, + { + "epoch": 1.1420765027322404, + "grad_norm": 0.0011123842559754848, + "learning_rate": 4.9210686095932e-05, + "loss": 0.1292, + "step": 1045 + }, + { + "epoch": 1.1475409836065573, + "grad_norm": 0.0014614718966186047, + "learning_rate": 4.918032786885246e-05, + "loss": 0.0002, + "step": 1050 + }, + { + "epoch": 1.1530054644808743, + "grad_norm": 0.5193114280700684, + "learning_rate": 4.914996964177293e-05, + "loss": 0.0008, + "step": 1055 + }, + { + "epoch": 1.1584699453551912, + "grad_norm": 0.039534829556941986, + "learning_rate": 4.9119611414693385e-05, + "loss": 0.0004, + "step": 1060 + }, + { + "epoch": 1.1639344262295082, + "grad_norm": 0.0015866112662479281, + "learning_rate": 4.908925318761385e-05, + "loss": 0.0, + "step": 1065 + }, + { + "epoch": 1.169398907103825, + "grad_norm": 0.0004405885701999068, + "learning_rate": 4.905889496053431e-05, + "loss": 0.0001, + "step": 1070 + }, + { + "epoch": 1.174863387978142, + "grad_norm": 0.0003017389390151948, + "learning_rate": 4.9028536733454765e-05, + "loss": 0.0001, + "step": 1075 + }, + { + "epoch": 1.180327868852459, + "grad_norm": 0.002132098888978362, + "learning_rate": 4.899817850637523e-05, + "loss": 0.0, + "step": 1080 + }, + { + "epoch": 1.185792349726776, + "grad_norm": 0.0002962806320283562, + "learning_rate": 4.896782027929569e-05, + "loss": 0.0001, + "step": 1085 + }, + { + "epoch": 1.1912568306010929, + "grad_norm": 0.0759500116109848, + "learning_rate": 4.893746205221615e-05, + "loss": 0.2131, + "step": 1090 + }, + { + "epoch": 1.1967213114754098, + "grad_norm": 0.006093773990869522, + "learning_rate": 4.890710382513661e-05, + "loss": 0.0001, + "step": 1095 + }, + { + "epoch": 1.2021857923497268, + "grad_norm": 0.09465143084526062, + "learning_rate": 4.8876745598057074e-05, + "loss": 0.0022, + "step": 1100 + }, + { + "epoch": 1.2076502732240437, + "grad_norm": 0.07754716277122498, + "learning_rate": 4.884638737097754e-05, + "loss": 0.0004, + "step": 1105 + }, + { + "epoch": 1.2131147540983607, + "grad_norm": 0.0023268985096365213, + "learning_rate": 4.8816029143898e-05, + "loss": 0.0002, + "step": 1110 + }, + { + "epoch": 1.2185792349726776, + "grad_norm": 0.0014323684154078364, + "learning_rate": 4.878567091681846e-05, + "loss": 0.0093, + "step": 1115 + }, + { + "epoch": 1.2240437158469946, + "grad_norm": 0.14172188937664032, + "learning_rate": 4.875531268973892e-05, + "loss": 0.0429, + "step": 1120 + }, + { + "epoch": 1.2295081967213115, + "grad_norm": 0.05727756395936012, + "learning_rate": 4.872495446265938e-05, + "loss": 0.0002, + "step": 1125 + }, + { + "epoch": 1.2349726775956285, + "grad_norm": 0.001011449727229774, + "learning_rate": 4.869459623557984e-05, + "loss": 0.0, + "step": 1130 + }, + { + "epoch": 1.2404371584699454, + "grad_norm": 0.0003146354283671826, + "learning_rate": 4.8664238008500306e-05, + "loss": 0.0002, + "step": 1135 + }, + { + "epoch": 1.2459016393442623, + "grad_norm": 0.0003123127680737525, + "learning_rate": 4.863387978142076e-05, + "loss": 0.0, + "step": 1140 + }, + { + "epoch": 1.2513661202185793, + "grad_norm": 0.000679291901178658, + "learning_rate": 4.860352155434123e-05, + "loss": 0.0, + "step": 1145 + }, + { + "epoch": 1.2568306010928962, + "grad_norm": 0.00043421483132988214, + "learning_rate": 4.857316332726169e-05, + "loss": 0.0, + "step": 1150 + }, + { + "epoch": 1.2622950819672132, + "grad_norm": 0.0009190288255922496, + "learning_rate": 4.854280510018216e-05, + "loss": 0.0, + "step": 1155 + }, + { + "epoch": 1.2677595628415301, + "grad_norm": 0.00011454925697762519, + "learning_rate": 4.8512446873102615e-05, + "loss": 0.0, + "step": 1160 + }, + { + "epoch": 1.273224043715847, + "grad_norm": 0.00038073299219831824, + "learning_rate": 4.848208864602308e-05, + "loss": 0.0, + "step": 1165 + }, + { + "epoch": 1.278688524590164, + "grad_norm": 0.00031519224285148084, + "learning_rate": 4.845173041894354e-05, + "loss": 0.0, + "step": 1170 + }, + { + "epoch": 1.2841530054644807, + "grad_norm": 0.00039192906115204096, + "learning_rate": 4.8421372191863995e-05, + "loss": 0.0001, + "step": 1175 + }, + { + "epoch": 1.289617486338798, + "grad_norm": 0.00010484485392225906, + "learning_rate": 4.839101396478446e-05, + "loss": 0.0, + "step": 1180 + }, + { + "epoch": 1.2950819672131146, + "grad_norm": 0.00030461253481917083, + "learning_rate": 4.836065573770492e-05, + "loss": 0.0, + "step": 1185 + }, + { + "epoch": 1.3005464480874318, + "grad_norm": 0.00025512345018796623, + "learning_rate": 4.833029751062538e-05, + "loss": 0.0, + "step": 1190 + }, + { + "epoch": 1.3060109289617485, + "grad_norm": 0.00021565568749792874, + "learning_rate": 4.829993928354584e-05, + "loss": 0.0, + "step": 1195 + }, + { + "epoch": 1.3114754098360657, + "grad_norm": 0.0004679218982346356, + "learning_rate": 4.8269581056466304e-05, + "loss": 0.0001, + "step": 1200 + }, + { + "epoch": 1.3169398907103824, + "grad_norm": 0.00021159886091481894, + "learning_rate": 4.823922282938677e-05, + "loss": 0.0, + "step": 1205 + }, + { + "epoch": 1.3224043715846996, + "grad_norm": 0.00015547883231192827, + "learning_rate": 4.820886460230723e-05, + "loss": 0.0, + "step": 1210 + }, + { + "epoch": 1.3278688524590163, + "grad_norm": 29.090713500976562, + "learning_rate": 4.817850637522769e-05, + "loss": 0.0739, + "step": 1215 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.002653504954650998, + "learning_rate": 4.814814814814815e-05, + "loss": 0.0, + "step": 1220 + }, + { + "epoch": 1.3387978142076502, + "grad_norm": 0.0016200868412852287, + "learning_rate": 4.811778992106861e-05, + "loss": 0.0036, + "step": 1225 + }, + { + "epoch": 1.3442622950819672, + "grad_norm": 7.368043588940054e-05, + "learning_rate": 4.808743169398907e-05, + "loss": 0.0001, + "step": 1230 + }, + { + "epoch": 1.349726775956284, + "grad_norm": 5.11952348460909e-05, + "learning_rate": 4.8057073466909535e-05, + "loss": 0.0008, + "step": 1235 + }, + { + "epoch": 1.355191256830601, + "grad_norm": 0.0010039065964519978, + "learning_rate": 4.802671523982999e-05, + "loss": 0.2929, + "step": 1240 + }, + { + "epoch": 1.360655737704918, + "grad_norm": 0.00012388975301291794, + "learning_rate": 4.799635701275046e-05, + "loss": 0.0, + "step": 1245 + }, + { + "epoch": 1.366120218579235, + "grad_norm": 0.00013184835552237928, + "learning_rate": 4.7965998785670915e-05, + "loss": 0.0, + "step": 1250 + }, + { + "epoch": 1.3715846994535519, + "grad_norm": 9.407256584381685e-05, + "learning_rate": 4.793564055859138e-05, + "loss": 0.0, + "step": 1255 + }, + { + "epoch": 1.3770491803278688, + "grad_norm": 0.0016062518116086721, + "learning_rate": 4.7905282331511844e-05, + "loss": 0.0, + "step": 1260 + }, + { + "epoch": 1.3825136612021858, + "grad_norm": 8.71685188030824e-05, + "learning_rate": 4.787492410443231e-05, + "loss": 0.0, + "step": 1265 + }, + { + "epoch": 1.3879781420765027, + "grad_norm": 0.00011282307968940586, + "learning_rate": 4.7844565877352766e-05, + "loss": 0.0, + "step": 1270 + }, + { + "epoch": 1.3934426229508197, + "grad_norm": 9.378144022775814e-05, + "learning_rate": 4.7814207650273224e-05, + "loss": 0.0, + "step": 1275 + }, + { + "epoch": 1.3989071038251366, + "grad_norm": 0.00011343141522957012, + "learning_rate": 4.778384942319369e-05, + "loss": 0.0, + "step": 1280 + }, + { + "epoch": 1.4043715846994536, + "grad_norm": 9.47820590226911e-05, + "learning_rate": 4.7753491196114146e-05, + "loss": 0.0, + "step": 1285 + }, + { + "epoch": 1.4098360655737705, + "grad_norm": 8.373497985303402e-05, + "learning_rate": 4.772313296903461e-05, + "loss": 0.0, + "step": 1290 + }, + { + "epoch": 1.4153005464480874, + "grad_norm": 0.00018082663882523775, + "learning_rate": 4.769277474195507e-05, + "loss": 0.1966, + "step": 1295 + }, + { + "epoch": 1.4207650273224044, + "grad_norm": 0.06691992282867432, + "learning_rate": 4.766241651487553e-05, + "loss": 0.0006, + "step": 1300 + }, + { + "epoch": 1.4262295081967213, + "grad_norm": 0.0007573667098768055, + "learning_rate": 4.7632058287796e-05, + "loss": 0.0001, + "step": 1305 + }, + { + "epoch": 1.4316939890710383, + "grad_norm": 6.261147975921631, + "learning_rate": 4.760170006071646e-05, + "loss": 0.0841, + "step": 1310 + }, + { + "epoch": 1.4371584699453552, + "grad_norm": 0.011942526325583458, + "learning_rate": 4.757134183363692e-05, + "loss": 0.0001, + "step": 1315 + }, + { + "epoch": 1.4426229508196722, + "grad_norm": 0.010376262478530407, + "learning_rate": 4.754098360655738e-05, + "loss": 0.0002, + "step": 1320 + }, + { + "epoch": 1.4480874316939891, + "grad_norm": 0.002821909496560693, + "learning_rate": 4.751062537947784e-05, + "loss": 0.0791, + "step": 1325 + }, + { + "epoch": 1.453551912568306, + "grad_norm": 0.0008530141785740852, + "learning_rate": 4.74802671523983e-05, + "loss": 0.0161, + "step": 1330 + }, + { + "epoch": 1.459016393442623, + "grad_norm": 0.0005102080176584423, + "learning_rate": 4.7449908925318764e-05, + "loss": 0.0002, + "step": 1335 + }, + { + "epoch": 1.46448087431694, + "grad_norm": 0.011689051054418087, + "learning_rate": 4.741955069823922e-05, + "loss": 0.0002, + "step": 1340 + }, + { + "epoch": 1.469945355191257, + "grad_norm": 0.0002597393176984042, + "learning_rate": 4.7389192471159687e-05, + "loss": 0.0001, + "step": 1345 + }, + { + "epoch": 1.4754098360655736, + "grad_norm": 0.00023770586994942278, + "learning_rate": 4.7358834244080144e-05, + "loss": 0.0, + "step": 1350 + }, + { + "epoch": 1.4808743169398908, + "grad_norm": 0.00020558437972795218, + "learning_rate": 4.732847601700061e-05, + "loss": 0.0, + "step": 1355 + }, + { + "epoch": 1.4863387978142075, + "grad_norm": 0.0001555513881612569, + "learning_rate": 4.729811778992107e-05, + "loss": 0.0, + "step": 1360 + }, + { + "epoch": 1.4918032786885247, + "grad_norm": 0.00018597490270622075, + "learning_rate": 4.726775956284154e-05, + "loss": 0.0, + "step": 1365 + }, + { + "epoch": 1.4972677595628414, + "grad_norm": 0.0011901083635166287, + "learning_rate": 4.7237401335761996e-05, + "loss": 0.0, + "step": 1370 + }, + { + "epoch": 1.5027322404371586, + "grad_norm": 0.00025052594719454646, + "learning_rate": 4.720704310868245e-05, + "loss": 0.0, + "step": 1375 + }, + { + "epoch": 1.5081967213114753, + "grad_norm": 0.00035109854070469737, + "learning_rate": 4.717668488160292e-05, + "loss": 0.0, + "step": 1380 + }, + { + "epoch": 1.5136612021857925, + "grad_norm": 37.61546325683594, + "learning_rate": 4.7146326654523376e-05, + "loss": 0.0997, + "step": 1385 + }, + { + "epoch": 1.5191256830601092, + "grad_norm": 38.025840759277344, + "learning_rate": 4.711596842744384e-05, + "loss": 0.1406, + "step": 1390 + }, + { + "epoch": 1.5245901639344264, + "grad_norm": 0.00024681788636371493, + "learning_rate": 4.70856102003643e-05, + "loss": 0.0, + "step": 1395 + }, + { + "epoch": 1.530054644808743, + "grad_norm": 0.0002825226401910186, + "learning_rate": 4.705525197328476e-05, + "loss": 0.0003, + "step": 1400 + }, + { + "epoch": 1.5355191256830603, + "grad_norm": 0.0002672713599167764, + "learning_rate": 4.702489374620522e-05, + "loss": 0.0003, + "step": 1405 + }, + { + "epoch": 1.540983606557377, + "grad_norm": 0.013312156312167645, + "learning_rate": 4.6994535519125685e-05, + "loss": 0.0, + "step": 1410 + }, + { + "epoch": 1.5464480874316942, + "grad_norm": 0.00021928714704699814, + "learning_rate": 4.696417729204615e-05, + "loss": 0.0, + "step": 1415 + }, + { + "epoch": 1.5519125683060109, + "grad_norm": 0.3993873596191406, + "learning_rate": 4.6933819064966614e-05, + "loss": 0.0005, + "step": 1420 + }, + { + "epoch": 1.5573770491803278, + "grad_norm": 0.00024002179270610213, + "learning_rate": 4.690346083788707e-05, + "loss": 0.0025, + "step": 1425 + }, + { + "epoch": 1.5628415300546448, + "grad_norm": 0.00012442973093129694, + "learning_rate": 4.687310261080753e-05, + "loss": 0.0, + "step": 1430 + }, + { + "epoch": 1.5683060109289617, + "grad_norm": 0.00023839778441470116, + "learning_rate": 4.6842744383727994e-05, + "loss": 0.1631, + "step": 1435 + }, + { + "epoch": 1.5737704918032787, + "grad_norm": 0.006576848216354847, + "learning_rate": 4.681238615664845e-05, + "loss": 0.0, + "step": 1440 + }, + { + "epoch": 1.5792349726775956, + "grad_norm": 0.024206487461924553, + "learning_rate": 4.6782027929568916e-05, + "loss": 0.0001, + "step": 1445 + }, + { + "epoch": 1.5846994535519126, + "grad_norm": 0.00031997630139812827, + "learning_rate": 4.6751669702489374e-05, + "loss": 0.0, + "step": 1450 + }, + { + "epoch": 1.5901639344262295, + "grad_norm": 0.06144952401518822, + "learning_rate": 4.672131147540984e-05, + "loss": 0.0003, + "step": 1455 + }, + { + "epoch": 1.5956284153005464, + "grad_norm": 0.00033534682006575167, + "learning_rate": 4.6690953248330296e-05, + "loss": 0.0, + "step": 1460 + }, + { + "epoch": 1.6010928961748634, + "grad_norm": 0.665640652179718, + "learning_rate": 4.666059502125076e-05, + "loss": 0.0003, + "step": 1465 + }, + { + "epoch": 1.6065573770491803, + "grad_norm": 0.0002033864293480292, + "learning_rate": 4.6630236794171225e-05, + "loss": 0.0, + "step": 1470 + }, + { + "epoch": 1.6120218579234973, + "grad_norm": 0.0018922536401078105, + "learning_rate": 4.659987856709168e-05, + "loss": 0.0002, + "step": 1475 + }, + { + "epoch": 1.6174863387978142, + "grad_norm": 0.0003269801090937108, + "learning_rate": 4.656952034001215e-05, + "loss": 0.0003, + "step": 1480 + }, + { + "epoch": 1.6229508196721312, + "grad_norm": 0.027018524706363678, + "learning_rate": 4.6539162112932605e-05, + "loss": 0.0001, + "step": 1485 + }, + { + "epoch": 1.6284153005464481, + "grad_norm": 0.0002585135807748884, + "learning_rate": 4.650880388585307e-05, + "loss": 0.0, + "step": 1490 + }, + { + "epoch": 1.633879781420765, + "grad_norm": 0.0009487815550528467, + "learning_rate": 4.647844565877353e-05, + "loss": 0.0, + "step": 1495 + }, + { + "epoch": 1.639344262295082, + "grad_norm": 0.00017302001651842147, + "learning_rate": 4.644808743169399e-05, + "loss": 0.0006, + "step": 1500 + }, + { + "epoch": 1.644808743169399, + "grad_norm": 0.0009016783442348242, + "learning_rate": 4.641772920461445e-05, + "loss": 0.0, + "step": 1505 + }, + { + "epoch": 1.650273224043716, + "grad_norm": 0.0017405046382918954, + "learning_rate": 4.6387370977534914e-05, + "loss": 0.0003, + "step": 1510 + }, + { + "epoch": 1.6557377049180326, + "grad_norm": 0.0001675246749073267, + "learning_rate": 4.635701275045538e-05, + "loss": 0.0002, + "step": 1515 + }, + { + "epoch": 1.6612021857923498, + "grad_norm": 0.0001709494536044076, + "learning_rate": 4.632665452337584e-05, + "loss": 0.0, + "step": 1520 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.000520666828379035, + "learning_rate": 4.62962962962963e-05, + "loss": 0.0007, + "step": 1525 + }, + { + "epoch": 1.6721311475409837, + "grad_norm": 0.00013493937149178237, + "learning_rate": 4.626593806921676e-05, + "loss": 0.2165, + "step": 1530 + }, + { + "epoch": 1.6775956284153004, + "grad_norm": 0.03252645209431648, + "learning_rate": 4.623557984213722e-05, + "loss": 0.0003, + "step": 1535 + }, + { + "epoch": 1.6830601092896176, + "grad_norm": 0.00021406033192761242, + "learning_rate": 4.620522161505768e-05, + "loss": 0.0, + "step": 1540 + }, + { + "epoch": 1.6885245901639343, + "grad_norm": 0.0006847565528005362, + "learning_rate": 4.6174863387978145e-05, + "loss": 0.0, + "step": 1545 + }, + { + "epoch": 1.6939890710382515, + "grad_norm": 0.0003050083469133824, + "learning_rate": 4.61445051608986e-05, + "loss": 0.0, + "step": 1550 + }, + { + "epoch": 1.6994535519125682, + "grad_norm": 0.017207743600010872, + "learning_rate": 4.611414693381907e-05, + "loss": 0.0001, + "step": 1555 + }, + { + "epoch": 1.7049180327868854, + "grad_norm": 0.0002447327133268118, + "learning_rate": 4.6083788706739525e-05, + "loss": 0.0, + "step": 1560 + }, + { + "epoch": 1.710382513661202, + "grad_norm": 0.00020444171968847513, + "learning_rate": 4.605343047965999e-05, + "loss": 0.0, + "step": 1565 + }, + { + "epoch": 1.7158469945355193, + "grad_norm": 0.02190363220870495, + "learning_rate": 4.6023072252580454e-05, + "loss": 0.0001, + "step": 1570 + }, + { + "epoch": 1.721311475409836, + "grad_norm": 0.00021700489742215723, + "learning_rate": 4.599271402550091e-05, + "loss": 0.0001, + "step": 1575 + }, + { + "epoch": 1.7267759562841531, + "grad_norm": 0.00015953410184010863, + "learning_rate": 4.5962355798421377e-05, + "loss": 0.0, + "step": 1580 + }, + { + "epoch": 1.7322404371584699, + "grad_norm": 0.00021170971740502864, + "learning_rate": 4.5931997571341834e-05, + "loss": 0.0, + "step": 1585 + }, + { + "epoch": 1.737704918032787, + "grad_norm": 0.00016968738054856658, + "learning_rate": 4.59016393442623e-05, + "loss": 0.0639, + "step": 1590 + }, + { + "epoch": 1.7431693989071038, + "grad_norm": 0.00013816288264933974, + "learning_rate": 4.5871281117182757e-05, + "loss": 0.1101, + "step": 1595 + }, + { + "epoch": 1.748633879781421, + "grad_norm": 0.0002588335482869297, + "learning_rate": 4.584092289010322e-05, + "loss": 0.0, + "step": 1600 + }, + { + "epoch": 1.7540983606557377, + "grad_norm": 0.0010825609788298607, + "learning_rate": 4.581056466302368e-05, + "loss": 0.0001, + "step": 1605 + }, + { + "epoch": 1.7595628415300546, + "grad_norm": 0.00044289807556197047, + "learning_rate": 4.578020643594414e-05, + "loss": 0.2008, + "step": 1610 + }, + { + "epoch": 1.7650273224043715, + "grad_norm": 0.0006378990947268903, + "learning_rate": 4.57498482088646e-05, + "loss": 0.0, + "step": 1615 + }, + { + "epoch": 1.7704918032786885, + "grad_norm": 0.0013747804332524538, + "learning_rate": 4.5719489981785066e-05, + "loss": 0.0001, + "step": 1620 + }, + { + "epoch": 1.7759562841530054, + "grad_norm": 0.0012717167846858501, + "learning_rate": 4.568913175470553e-05, + "loss": 0.0001, + "step": 1625 + }, + { + "epoch": 1.7814207650273224, + "grad_norm": 0.006049423012882471, + "learning_rate": 4.565877352762599e-05, + "loss": 0.1173, + "step": 1630 + }, + { + "epoch": 1.7868852459016393, + "grad_norm": 0.0012180794728919864, + "learning_rate": 4.562841530054645e-05, + "loss": 0.0001, + "step": 1635 + }, + { + "epoch": 1.7923497267759563, + "grad_norm": 0.027454646304249763, + "learning_rate": 4.559805707346691e-05, + "loss": 0.0001, + "step": 1640 + }, + { + "epoch": 1.7978142076502732, + "grad_norm": 0.0020080592948943377, + "learning_rate": 4.5567698846387375e-05, + "loss": 0.161, + "step": 1645 + }, + { + "epoch": 1.8032786885245902, + "grad_norm": 0.003008060622960329, + "learning_rate": 4.553734061930783e-05, + "loss": 0.0001, + "step": 1650 + }, + { + "epoch": 1.8087431693989071, + "grad_norm": 0.004481349140405655, + "learning_rate": 4.55069823922283e-05, + "loss": 0.0203, + "step": 1655 + }, + { + "epoch": 1.814207650273224, + "grad_norm": 4.153011322021484, + "learning_rate": 4.5476624165148755e-05, + "loss": 0.1353, + "step": 1660 + }, + { + "epoch": 1.819672131147541, + "grad_norm": 0.010290669277310371, + "learning_rate": 4.544626593806922e-05, + "loss": 0.0005, + "step": 1665 + }, + { + "epoch": 1.825136612021858, + "grad_norm": 0.007870933972299099, + "learning_rate": 4.541590771098968e-05, + "loss": 0.0074, + "step": 1670 + }, + { + "epoch": 1.830601092896175, + "grad_norm": 0.0014877247158437967, + "learning_rate": 4.538554948391014e-05, + "loss": 0.0012, + "step": 1675 + }, + { + "epoch": 1.8360655737704918, + "grad_norm": 0.0007460130145773292, + "learning_rate": 4.5355191256830606e-05, + "loss": 0.0001, + "step": 1680 + }, + { + "epoch": 1.8415300546448088, + "grad_norm": 0.05613982677459717, + "learning_rate": 4.5324833029751064e-05, + "loss": 0.0003, + "step": 1685 + }, + { + "epoch": 1.8469945355191257, + "grad_norm": 0.0004631394112948328, + "learning_rate": 4.529447480267153e-05, + "loss": 0.0001, + "step": 1690 + }, + { + "epoch": 1.8524590163934427, + "grad_norm": 0.00038068211870267987, + "learning_rate": 4.5264116575591986e-05, + "loss": 0.0003, + "step": 1695 + }, + { + "epoch": 1.8579234972677594, + "grad_norm": 0.00032613801886327565, + "learning_rate": 4.523375834851245e-05, + "loss": 0.0, + "step": 1700 + }, + { + "epoch": 1.8633879781420766, + "grad_norm": 0.000517977518029511, + "learning_rate": 4.520340012143291e-05, + "loss": 0.1396, + "step": 1705 + }, + { + "epoch": 1.8688524590163933, + "grad_norm": 0.0025108163245022297, + "learning_rate": 4.517304189435337e-05, + "loss": 0.0003, + "step": 1710 + }, + { + "epoch": 1.8743169398907105, + "grad_norm": 0.2560445964336395, + "learning_rate": 4.514268366727383e-05, + "loss": 0.069, + "step": 1715 + }, + { + "epoch": 1.8797814207650272, + "grad_norm": 0.0028507369570434093, + "learning_rate": 4.5112325440194295e-05, + "loss": 0.0022, + "step": 1720 + }, + { + "epoch": 1.8852459016393444, + "grad_norm": 0.001581528689712286, + "learning_rate": 4.508196721311476e-05, + "loss": 0.0, + "step": 1725 + }, + { + "epoch": 1.890710382513661, + "grad_norm": 0.0012307813158258796, + "learning_rate": 4.505160898603522e-05, + "loss": 0.0018, + "step": 1730 + }, + { + "epoch": 1.8961748633879782, + "grad_norm": 0.0004025879898108542, + "learning_rate": 4.502125075895568e-05, + "loss": 0.0006, + "step": 1735 + }, + { + "epoch": 1.901639344262295, + "grad_norm": 0.0005517560639418662, + "learning_rate": 4.499089253187614e-05, + "loss": 0.0005, + "step": 1740 + }, + { + "epoch": 1.9071038251366121, + "grad_norm": 0.00021167936210986227, + "learning_rate": 4.4960534304796604e-05, + "loss": 0.0005, + "step": 1745 + }, + { + "epoch": 1.9125683060109289, + "grad_norm": 0.0001789630769053474, + "learning_rate": 4.493017607771706e-05, + "loss": 0.0, + "step": 1750 + }, + { + "epoch": 1.918032786885246, + "grad_norm": 0.04136138781905174, + "learning_rate": 4.4899817850637526e-05, + "loss": 0.0002, + "step": 1755 + }, + { + "epoch": 1.9234972677595628, + "grad_norm": 0.0001342537289019674, + "learning_rate": 4.4869459623557984e-05, + "loss": 0.0006, + "step": 1760 + }, + { + "epoch": 1.92896174863388, + "grad_norm": 36.38466262817383, + "learning_rate": 4.483910139647845e-05, + "loss": 0.3505, + "step": 1765 + }, + { + "epoch": 1.9344262295081966, + "grad_norm": 0.007546951994299889, + "learning_rate": 4.4808743169398906e-05, + "loss": 0.0, + "step": 1770 + }, + { + "epoch": 1.9398907103825138, + "grad_norm": 0.2711917459964752, + "learning_rate": 4.477838494231937e-05, + "loss": 0.0005, + "step": 1775 + }, + { + "epoch": 1.9453551912568305, + "grad_norm": 0.022145679220557213, + "learning_rate": 4.4748026715239835e-05, + "loss": 0.0003, + "step": 1780 + }, + { + "epoch": 1.9508196721311475, + "grad_norm": 0.0005350765422917902, + "learning_rate": 4.471766848816029e-05, + "loss": 0.0, + "step": 1785 + }, + { + "epoch": 1.9562841530054644, + "grad_norm": 0.0005207122885622084, + "learning_rate": 4.468731026108076e-05, + "loss": 0.0001, + "step": 1790 + }, + { + "epoch": 1.9617486338797814, + "grad_norm": 0.0017709174426272511, + "learning_rate": 4.4656952034001215e-05, + "loss": 0.0, + "step": 1795 + }, + { + "epoch": 1.9672131147540983, + "grad_norm": 0.00032043023384176195, + "learning_rate": 4.462659380692168e-05, + "loss": 0.0, + "step": 1800 + }, + { + "epoch": 1.9726775956284153, + "grad_norm": 0.013697931542992592, + "learning_rate": 4.459623557984214e-05, + "loss": 0.0001, + "step": 1805 + }, + { + "epoch": 1.9781420765027322, + "grad_norm": 0.00043464158079586923, + "learning_rate": 4.45658773527626e-05, + "loss": 0.0, + "step": 1810 + }, + { + "epoch": 1.9836065573770492, + "grad_norm": 0.020648222416639328, + "learning_rate": 4.453551912568306e-05, + "loss": 0.0726, + "step": 1815 + }, + { + "epoch": 1.989071038251366, + "grad_norm": 0.0003471345698926598, + "learning_rate": 4.4505160898603524e-05, + "loss": 0.0, + "step": 1820 + }, + { + "epoch": 1.994535519125683, + "grad_norm": 0.00031178101198747754, + "learning_rate": 4.447480267152398e-05, + "loss": 0.0, + "step": 1825 + }, + { + "epoch": 2.0, + "grad_norm": 0.00018788916349876672, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.0, + "step": 1830 + }, + { + "epoch": 2.0, + "eval_loss": 0.050701793283224106, + "eval_runtime": 660.1575, + "eval_samples_per_second": 11.082, + "eval_steps_per_second": 1.386, + "step": 1830 + }, + { + "epoch": 2.0054644808743167, + "grad_norm": 0.00025049722171388566, + "learning_rate": 4.441408621736491e-05, + "loss": 0.0, + "step": 1835 + }, + { + "epoch": 2.010928961748634, + "grad_norm": 8.584993362426758, + "learning_rate": 4.438372799028537e-05, + "loss": 0.0062, + "step": 1840 + }, + { + "epoch": 2.0163934426229506, + "grad_norm": 0.0003093411505687982, + "learning_rate": 4.435336976320583e-05, + "loss": 0.0, + "step": 1845 + }, + { + "epoch": 2.021857923497268, + "grad_norm": 0.00020034695626236498, + "learning_rate": 4.432301153612629e-05, + "loss": 0.0, + "step": 1850 + }, + { + "epoch": 2.0273224043715845, + "grad_norm": 0.0001671861537033692, + "learning_rate": 4.4292653309046756e-05, + "loss": 0.0008, + "step": 1855 + }, + { + "epoch": 2.0327868852459017, + "grad_norm": 0.014300575479865074, + "learning_rate": 4.426229508196721e-05, + "loss": 0.0001, + "step": 1860 + }, + { + "epoch": 2.0382513661202184, + "grad_norm": 8.975714445114136e-05, + "learning_rate": 4.423193685488768e-05, + "loss": 0.0, + "step": 1865 + }, + { + "epoch": 2.0437158469945356, + "grad_norm": 0.00019374603289179504, + "learning_rate": 4.4201578627808136e-05, + "loss": 0.0, + "step": 1870 + }, + { + "epoch": 2.0491803278688523, + "grad_norm": 0.00013538934581447393, + "learning_rate": 4.41712204007286e-05, + "loss": 0.0, + "step": 1875 + }, + { + "epoch": 2.0546448087431695, + "grad_norm": 0.0002743653894867748, + "learning_rate": 4.4140862173649065e-05, + "loss": 0.0001, + "step": 1880 + }, + { + "epoch": 2.060109289617486, + "grad_norm": 7.988169090822339e-05, + "learning_rate": 4.411050394656952e-05, + "loss": 0.0002, + "step": 1885 + }, + { + "epoch": 2.0655737704918034, + "grad_norm": 6.720585952280089e-05, + "learning_rate": 4.408014571948999e-05, + "loss": 0.0, + "step": 1890 + }, + { + "epoch": 2.07103825136612, + "grad_norm": 8.488036110065877e-05, + "learning_rate": 4.4049787492410445e-05, + "loss": 0.0, + "step": 1895 + }, + { + "epoch": 2.0765027322404372, + "grad_norm": 0.00010288408520864323, + "learning_rate": 4.401942926533091e-05, + "loss": 0.0, + "step": 1900 + }, + { + "epoch": 2.081967213114754, + "grad_norm": 6.435842078644782e-05, + "learning_rate": 4.398907103825137e-05, + "loss": 0.0001, + "step": 1905 + }, + { + "epoch": 2.087431693989071, + "grad_norm": 0.0001191817136714235, + "learning_rate": 4.395871281117183e-05, + "loss": 0.0, + "step": 1910 + }, + { + "epoch": 2.092896174863388, + "grad_norm": 7.335934787988663e-05, + "learning_rate": 4.392835458409229e-05, + "loss": 0.0, + "step": 1915 + }, + { + "epoch": 2.098360655737705, + "grad_norm": 9.272222087020054e-05, + "learning_rate": 4.3897996357012754e-05, + "loss": 0.0, + "step": 1920 + }, + { + "epoch": 2.1038251366120218, + "grad_norm": 5.950667036813684e-05, + "learning_rate": 4.386763812993321e-05, + "loss": 0.0, + "step": 1925 + }, + { + "epoch": 2.109289617486339, + "grad_norm": 0.00011658170114969835, + "learning_rate": 4.3837279902853676e-05, + "loss": 0.0001, + "step": 1930 + }, + { + "epoch": 2.1147540983606556, + "grad_norm": 7.576384814456105e-05, + "learning_rate": 4.380692167577414e-05, + "loss": 0.0001, + "step": 1935 + }, + { + "epoch": 2.120218579234973, + "grad_norm": 0.00017848057905212045, + "learning_rate": 4.37765634486946e-05, + "loss": 0.0, + "step": 1940 + }, + { + "epoch": 2.1256830601092895, + "grad_norm": 0.00016036807210184634, + "learning_rate": 4.374620522161506e-05, + "loss": 0.0, + "step": 1945 + }, + { + "epoch": 2.1311475409836067, + "grad_norm": 0.0007356128189712763, + "learning_rate": 4.371584699453552e-05, + "loss": 0.0, + "step": 1950 + }, + { + "epoch": 2.1366120218579234, + "grad_norm": 6.098092126194388e-05, + "learning_rate": 4.3685488767455985e-05, + "loss": 0.0, + "step": 1955 + }, + { + "epoch": 2.1420765027322406, + "grad_norm": 0.00010312991798855364, + "learning_rate": 4.365513054037644e-05, + "loss": 0.0, + "step": 1960 + }, + { + "epoch": 2.1475409836065573, + "grad_norm": 8.894432539818808e-05, + "learning_rate": 4.362477231329691e-05, + "loss": 0.0, + "step": 1965 + }, + { + "epoch": 2.1530054644808745, + "grad_norm": 8.428292494500056e-05, + "learning_rate": 4.3594414086217365e-05, + "loss": 0.0, + "step": 1970 + }, + { + "epoch": 2.158469945355191, + "grad_norm": 8.923211134970188e-05, + "learning_rate": 4.356405585913783e-05, + "loss": 0.0, + "step": 1975 + }, + { + "epoch": 2.1639344262295084, + "grad_norm": 0.0001834633876569569, + "learning_rate": 4.353369763205829e-05, + "loss": 0.0449, + "step": 1980 + }, + { + "epoch": 2.169398907103825, + "grad_norm": 0.0003559277392923832, + "learning_rate": 4.350333940497875e-05, + "loss": 0.0, + "step": 1985 + }, + { + "epoch": 2.1748633879781423, + "grad_norm": 5.8480047300690785e-05, + "learning_rate": 4.3472981177899216e-05, + "loss": 0.0, + "step": 1990 + }, + { + "epoch": 2.180327868852459, + "grad_norm": 9.875800606096163e-05, + "learning_rate": 4.3442622950819674e-05, + "loss": 0.0002, + "step": 1995 + }, + { + "epoch": 2.185792349726776, + "grad_norm": 0.00011199543223483488, + "learning_rate": 4.341226472374014e-05, + "loss": 0.0012, + "step": 2000 + }, + { + "epoch": 2.191256830601093, + "grad_norm": 6.932274845894426e-05, + "learning_rate": 4.3381906496660596e-05, + "loss": 0.0321, + "step": 2005 + }, + { + "epoch": 2.19672131147541, + "grad_norm": 0.022693945094943047, + "learning_rate": 4.335154826958106e-05, + "loss": 0.1198, + "step": 2010 + }, + { + "epoch": 2.202185792349727, + "grad_norm": 5.4680960602127016e-05, + "learning_rate": 4.332119004250152e-05, + "loss": 0.0, + "step": 2015 + }, + { + "epoch": 2.2076502732240435, + "grad_norm": 10.618327140808105, + "learning_rate": 4.329083181542198e-05, + "loss": 0.0059, + "step": 2020 + }, + { + "epoch": 2.2131147540983607, + "grad_norm": 7.450117846019566e-05, + "learning_rate": 4.326047358834244e-05, + "loss": 0.0, + "step": 2025 + }, + { + "epoch": 2.2185792349726774, + "grad_norm": 0.004406645428389311, + "learning_rate": 4.32301153612629e-05, + "loss": 0.0, + "step": 2030 + }, + { + "epoch": 2.2240437158469946, + "grad_norm": 8.840003283694386e-05, + "learning_rate": 4.319975713418336e-05, + "loss": 0.0, + "step": 2035 + }, + { + "epoch": 2.2295081967213113, + "grad_norm": 0.005674728192389011, + "learning_rate": 4.316939890710383e-05, + "loss": 0.0, + "step": 2040 + }, + { + "epoch": 2.2349726775956285, + "grad_norm": 25.384016036987305, + "learning_rate": 4.313904068002429e-05, + "loss": 0.0272, + "step": 2045 + }, + { + "epoch": 2.240437158469945, + "grad_norm": 8.648393850307912e-05, + "learning_rate": 4.310868245294475e-05, + "loss": 0.2247, + "step": 2050 + }, + { + "epoch": 2.2459016393442623, + "grad_norm": 36.4394645690918, + "learning_rate": 4.3078324225865214e-05, + "loss": 0.1866, + "step": 2055 + }, + { + "epoch": 2.251366120218579, + "grad_norm": 0.00034720319672487676, + "learning_rate": 4.304796599878567e-05, + "loss": 0.0, + "step": 2060 + }, + { + "epoch": 2.2568306010928962, + "grad_norm": 0.001058422145433724, + "learning_rate": 4.3017607771706137e-05, + "loss": 0.0001, + "step": 2065 + }, + { + "epoch": 2.262295081967213, + "grad_norm": 0.0010224126745015383, + "learning_rate": 4.2987249544626594e-05, + "loss": 0.0287, + "step": 2070 + }, + { + "epoch": 2.26775956284153, + "grad_norm": 0.00043366028694435954, + "learning_rate": 4.295689131754706e-05, + "loss": 0.0112, + "step": 2075 + }, + { + "epoch": 2.273224043715847, + "grad_norm": 0.00029680755687877536, + "learning_rate": 4.2926533090467517e-05, + "loss": 0.0, + "step": 2080 + }, + { + "epoch": 2.278688524590164, + "grad_norm": 0.0005055105430074036, + "learning_rate": 4.289617486338798e-05, + "loss": 0.0, + "step": 2085 + }, + { + "epoch": 2.2841530054644807, + "grad_norm": 0.00031777381082065403, + "learning_rate": 4.2865816636308446e-05, + "loss": 0.0, + "step": 2090 + }, + { + "epoch": 2.289617486338798, + "grad_norm": 0.00021517739514820278, + "learning_rate": 4.28354584092289e-05, + "loss": 0.0, + "step": 2095 + }, + { + "epoch": 2.2950819672131146, + "grad_norm": 0.0009914422407746315, + "learning_rate": 4.280510018214937e-05, + "loss": 0.0469, + "step": 2100 + }, + { + "epoch": 2.300546448087432, + "grad_norm": 0.00023720291210338473, + "learning_rate": 4.2774741955069826e-05, + "loss": 0.0, + "step": 2105 + }, + { + "epoch": 2.3060109289617485, + "grad_norm": 0.00918621476739645, + "learning_rate": 4.274438372799029e-05, + "loss": 0.0001, + "step": 2110 + }, + { + "epoch": 2.3114754098360657, + "grad_norm": 0.002750314772129059, + "learning_rate": 4.271402550091075e-05, + "loss": 0.0001, + "step": 2115 + }, + { + "epoch": 2.3169398907103824, + "grad_norm": 0.0003200963546987623, + "learning_rate": 4.268366727383121e-05, + "loss": 0.0003, + "step": 2120 + }, + { + "epoch": 2.3224043715846996, + "grad_norm": 0.0004604012647178024, + "learning_rate": 4.265330904675167e-05, + "loss": 0.0, + "step": 2125 + }, + { + "epoch": 2.3278688524590163, + "grad_norm": 11.309343338012695, + "learning_rate": 4.262295081967213e-05, + "loss": 0.0048, + "step": 2130 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.0001469504932174459, + "learning_rate": 4.259259259259259e-05, + "loss": 0.0, + "step": 2135 + }, + { + "epoch": 2.33879781420765, + "grad_norm": 0.00021601478511001915, + "learning_rate": 4.256223436551306e-05, + "loss": 0.0, + "step": 2140 + }, + { + "epoch": 2.3442622950819674, + "grad_norm": 0.00023017208150122315, + "learning_rate": 4.253187613843352e-05, + "loss": 0.0857, + "step": 2145 + }, + { + "epoch": 2.349726775956284, + "grad_norm": 38.37042999267578, + "learning_rate": 4.250151791135398e-05, + "loss": 0.2799, + "step": 2150 + }, + { + "epoch": 2.3551912568306013, + "grad_norm": 0.0014018617803230882, + "learning_rate": 4.2471159684274444e-05, + "loss": 0.0, + "step": 2155 + }, + { + "epoch": 2.360655737704918, + "grad_norm": 0.07934489101171494, + "learning_rate": 4.24408014571949e-05, + "loss": 0.0837, + "step": 2160 + }, + { + "epoch": 2.366120218579235, + "grad_norm": 0.0009544425411149859, + "learning_rate": 4.2410443230115366e-05, + "loss": 0.0016, + "step": 2165 + }, + { + "epoch": 2.371584699453552, + "grad_norm": 0.0019977609626948833, + "learning_rate": 4.2380085003035824e-05, + "loss": 0.0, + "step": 2170 + }, + { + "epoch": 2.3770491803278686, + "grad_norm": 0.0008353716693818569, + "learning_rate": 4.234972677595629e-05, + "loss": 0.1553, + "step": 2175 + }, + { + "epoch": 2.3825136612021858, + "grad_norm": 0.00047743169125169516, + "learning_rate": 4.2319368548876746e-05, + "loss": 0.0004, + "step": 2180 + }, + { + "epoch": 2.387978142076503, + "grad_norm": 1.1335127353668213, + "learning_rate": 4.2289010321797204e-05, + "loss": 0.0041, + "step": 2185 + }, + { + "epoch": 2.3934426229508197, + "grad_norm": 0.0003214840835426003, + "learning_rate": 4.225865209471767e-05, + "loss": 0.0003, + "step": 2190 + }, + { + "epoch": 2.3989071038251364, + "grad_norm": 0.0005583087913691998, + "learning_rate": 4.222829386763813e-05, + "loss": 0.0, + "step": 2195 + }, + { + "epoch": 2.4043715846994536, + "grad_norm": 0.00019280117703601718, + "learning_rate": 4.21979356405586e-05, + "loss": 0.0, + "step": 2200 + }, + { + "epoch": 2.4098360655737707, + "grad_norm": 0.0002531936625018716, + "learning_rate": 4.2167577413479055e-05, + "loss": 0.0, + "step": 2205 + }, + { + "epoch": 2.4153005464480874, + "grad_norm": 0.0003483460459392518, + "learning_rate": 4.213721918639952e-05, + "loss": 0.0, + "step": 2210 + }, + { + "epoch": 2.420765027322404, + "grad_norm": 0.0002563974994700402, + "learning_rate": 4.210686095931998e-05, + "loss": 0.0001, + "step": 2215 + }, + { + "epoch": 2.4262295081967213, + "grad_norm": 0.00037459208397194743, + "learning_rate": 4.207650273224044e-05, + "loss": 0.0, + "step": 2220 + }, + { + "epoch": 2.431693989071038, + "grad_norm": 0.0010059193009510636, + "learning_rate": 4.20461445051609e-05, + "loss": 0.1636, + "step": 2225 + }, + { + "epoch": 2.4371584699453552, + "grad_norm": 0.07868228107690811, + "learning_rate": 4.201578627808136e-05, + "loss": 0.0002, + "step": 2230 + }, + { + "epoch": 2.442622950819672, + "grad_norm": 0.003810027614235878, + "learning_rate": 4.198542805100182e-05, + "loss": 0.0003, + "step": 2235 + }, + { + "epoch": 2.448087431693989, + "grad_norm": 0.0003104351635556668, + "learning_rate": 4.195506982392228e-05, + "loss": 0.0005, + "step": 2240 + }, + { + "epoch": 2.453551912568306, + "grad_norm": 0.000299435923807323, + "learning_rate": 4.1924711596842744e-05, + "loss": 0.0002, + "step": 2245 + }, + { + "epoch": 2.459016393442623, + "grad_norm": 0.00018563756020739675, + "learning_rate": 4.189435336976321e-05, + "loss": 0.0, + "step": 2250 + }, + { + "epoch": 2.4644808743169397, + "grad_norm": 0.00043477851431816816, + "learning_rate": 4.186399514268367e-05, + "loss": 0.0, + "step": 2255 + }, + { + "epoch": 2.469945355191257, + "grad_norm": 0.00019215783686377108, + "learning_rate": 4.183363691560413e-05, + "loss": 0.0, + "step": 2260 + }, + { + "epoch": 2.4754098360655736, + "grad_norm": 0.00027334748301655054, + "learning_rate": 4.1803278688524595e-05, + "loss": 0.0, + "step": 2265 + }, + { + "epoch": 2.480874316939891, + "grad_norm": 0.0002772251609712839, + "learning_rate": 4.177292046144505e-05, + "loss": 0.0, + "step": 2270 + }, + { + "epoch": 2.4863387978142075, + "grad_norm": 0.0001554929040139541, + "learning_rate": 4.174256223436552e-05, + "loss": 0.0, + "step": 2275 + }, + { + "epoch": 2.4918032786885247, + "grad_norm": 0.00038592342752963305, + "learning_rate": 4.1712204007285975e-05, + "loss": 0.0, + "step": 2280 + }, + { + "epoch": 2.4972677595628414, + "grad_norm": 0.0002766764082480222, + "learning_rate": 4.168184578020643e-05, + "loss": 0.2958, + "step": 2285 + }, + { + "epoch": 2.5027322404371586, + "grad_norm": 0.006776416674256325, + "learning_rate": 4.16514875531269e-05, + "loss": 0.0001, + "step": 2290 + }, + { + "epoch": 2.5081967213114753, + "grad_norm": 0.037279609590768814, + "learning_rate": 4.162112932604736e-05, + "loss": 0.0003, + "step": 2295 + }, + { + "epoch": 2.5136612021857925, + "grad_norm": 0.0005434492486529052, + "learning_rate": 4.1590771098967827e-05, + "loss": 0.002, + "step": 2300 + }, + { + "epoch": 2.519125683060109, + "grad_norm": 0.0008327377145178616, + "learning_rate": 4.1560412871888284e-05, + "loss": 0.0, + "step": 2305 + }, + { + "epoch": 2.5245901639344264, + "grad_norm": 0.00018225843086838722, + "learning_rate": 4.153005464480875e-05, + "loss": 0.0, + "step": 2310 + }, + { + "epoch": 2.530054644808743, + "grad_norm": 0.0016029364196583629, + "learning_rate": 4.1499696417729207e-05, + "loss": 0.0001, + "step": 2315 + }, + { + "epoch": 2.5355191256830603, + "grad_norm": 0.00022132557933218777, + "learning_rate": 4.146933819064967e-05, + "loss": 0.0001, + "step": 2320 + }, + { + "epoch": 2.540983606557377, + "grad_norm": 0.008287652395665646, + "learning_rate": 4.143897996357013e-05, + "loss": 0.0001, + "step": 2325 + }, + { + "epoch": 2.546448087431694, + "grad_norm": 0.0076728700660169125, + "learning_rate": 4.1408621736490587e-05, + "loss": 0.0, + "step": 2330 + }, + { + "epoch": 2.551912568306011, + "grad_norm": 0.00015040539437904954, + "learning_rate": 4.137826350941105e-05, + "loss": 0.0, + "step": 2335 + }, + { + "epoch": 2.557377049180328, + "grad_norm": 0.0013361191377043724, + "learning_rate": 4.134790528233151e-05, + "loss": 0.0, + "step": 2340 + }, + { + "epoch": 2.5628415300546448, + "grad_norm": 0.003388547571375966, + "learning_rate": 4.131754705525197e-05, + "loss": 0.0, + "step": 2345 + }, + { + "epoch": 2.5683060109289615, + "grad_norm": 0.000265285256318748, + "learning_rate": 4.128718882817244e-05, + "loss": 0.0, + "step": 2350 + }, + { + "epoch": 2.5737704918032787, + "grad_norm": 0.0002575261751189828, + "learning_rate": 4.12568306010929e-05, + "loss": 0.0, + "step": 2355 + }, + { + "epoch": 2.579234972677596, + "grad_norm": 0.00025711252237670124, + "learning_rate": 4.122647237401336e-05, + "loss": 0.0, + "step": 2360 + }, + { + "epoch": 2.5846994535519126, + "grad_norm": 0.0004015800659544766, + "learning_rate": 4.1196114146933825e-05, + "loss": 0.0, + "step": 2365 + }, + { + "epoch": 2.5901639344262293, + "grad_norm": 0.0002642312610987574, + "learning_rate": 4.116575591985428e-05, + "loss": 0.1755, + "step": 2370 + }, + { + "epoch": 2.5956284153005464, + "grad_norm": 0.00034015090204775333, + "learning_rate": 4.113539769277475e-05, + "loss": 0.0, + "step": 2375 + }, + { + "epoch": 2.6010928961748636, + "grad_norm": 0.00024969771038740873, + "learning_rate": 4.1105039465695205e-05, + "loss": 0.0041, + "step": 2380 + }, + { + "epoch": 2.6065573770491803, + "grad_norm": 0.0018195788143202662, + "learning_rate": 4.107468123861566e-05, + "loss": 0.0002, + "step": 2385 + }, + { + "epoch": 2.612021857923497, + "grad_norm": 0.0009367292514070868, + "learning_rate": 4.104432301153613e-05, + "loss": 0.0, + "step": 2390 + }, + { + "epoch": 2.6174863387978142, + "grad_norm": 0.0002414148475509137, + "learning_rate": 4.1013964784456585e-05, + "loss": 0.0, + "step": 2395 + }, + { + "epoch": 2.6229508196721314, + "grad_norm": 0.00028640354867093265, + "learning_rate": 4.098360655737705e-05, + "loss": 0.0, + "step": 2400 + }, + { + "epoch": 2.628415300546448, + "grad_norm": 0.00019009907555300742, + "learning_rate": 4.0953248330297514e-05, + "loss": 0.0, + "step": 2405 + }, + { + "epoch": 2.633879781420765, + "grad_norm": 0.00015359176904894412, + "learning_rate": 4.092289010321798e-05, + "loss": 0.029, + "step": 2410 + }, + { + "epoch": 2.639344262295082, + "grad_norm": 0.0025620998349040747, + "learning_rate": 4.0892531876138436e-05, + "loss": 0.0, + "step": 2415 + }, + { + "epoch": 2.644808743169399, + "grad_norm": 0.00017074740026146173, + "learning_rate": 4.08621736490589e-05, + "loss": 0.0, + "step": 2420 + }, + { + "epoch": 2.650273224043716, + "grad_norm": 0.00021745880076196045, + "learning_rate": 4.083181542197936e-05, + "loss": 0.0, + "step": 2425 + }, + { + "epoch": 2.6557377049180326, + "grad_norm": 0.0006353080389089882, + "learning_rate": 4.080145719489982e-05, + "loss": 0.0, + "step": 2430 + }, + { + "epoch": 2.66120218579235, + "grad_norm": 0.000403941870899871, + "learning_rate": 4.077109896782028e-05, + "loss": 0.0, + "step": 2435 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.00042110533104278147, + "learning_rate": 4.074074074074074e-05, + "loss": 0.0, + "step": 2440 + }, + { + "epoch": 2.6721311475409837, + "grad_norm": 0.0002816633495967835, + "learning_rate": 4.07103825136612e-05, + "loss": 0.0001, + "step": 2445 + }, + { + "epoch": 2.6775956284153004, + "grad_norm": 0.036186400800943375, + "learning_rate": 4.068002428658167e-05, + "loss": 0.0001, + "step": 2450 + }, + { + "epoch": 2.6830601092896176, + "grad_norm": 0.00018799924873746932, + "learning_rate": 4.064966605950213e-05, + "loss": 0.0002, + "step": 2455 + }, + { + "epoch": 2.6885245901639343, + "grad_norm": 0.000202516297576949, + "learning_rate": 4.061930783242259e-05, + "loss": 0.0, + "step": 2460 + }, + { + "epoch": 2.6939890710382515, + "grad_norm": 0.01154984999448061, + "learning_rate": 4.0588949605343054e-05, + "loss": 0.0, + "step": 2465 + }, + { + "epoch": 2.699453551912568, + "grad_norm": 0.0010380310704931617, + "learning_rate": 4.055859137826351e-05, + "loss": 0.0, + "step": 2470 + }, + { + "epoch": 2.7049180327868854, + "grad_norm": 9.886401676340029e-05, + "learning_rate": 4.0528233151183976e-05, + "loss": 0.0, + "step": 2475 + }, + { + "epoch": 2.710382513661202, + "grad_norm": 0.003762443084269762, + "learning_rate": 4.0497874924104434e-05, + "loss": 0.0, + "step": 2480 + }, + { + "epoch": 2.7158469945355193, + "grad_norm": 0.00016515249444637448, + "learning_rate": 4.046751669702489e-05, + "loss": 0.0, + "step": 2485 + }, + { + "epoch": 2.721311475409836, + "grad_norm": 0.007088791113346815, + "learning_rate": 4.0437158469945356e-05, + "loss": 0.0, + "step": 2490 + }, + { + "epoch": 2.726775956284153, + "grad_norm": 0.00012914990657009184, + "learning_rate": 4.0406800242865814e-05, + "loss": 0.0, + "step": 2495 + }, + { + "epoch": 2.73224043715847, + "grad_norm": 10.831242561340332, + "learning_rate": 4.037644201578628e-05, + "loss": 0.0252, + "step": 2500 + }, + { + "epoch": 2.737704918032787, + "grad_norm": 0.000647792941890657, + "learning_rate": 4.034608378870674e-05, + "loss": 0.0, + "step": 2505 + }, + { + "epoch": 2.7431693989071038, + "grad_norm": 0.0003103922645095736, + "learning_rate": 4.031572556162721e-05, + "loss": 0.0, + "step": 2510 + }, + { + "epoch": 2.748633879781421, + "grad_norm": 0.00011969503248110414, + "learning_rate": 4.0285367334547665e-05, + "loss": 0.0001, + "step": 2515 + }, + { + "epoch": 2.7540983606557377, + "grad_norm": 7.822127372492105e-05, + "learning_rate": 4.025500910746813e-05, + "loss": 0.0232, + "step": 2520 + }, + { + "epoch": 2.7595628415300544, + "grad_norm": 0.8959982395172119, + "learning_rate": 4.022465088038859e-05, + "loss": 0.0007, + "step": 2525 + }, + { + "epoch": 2.7650273224043715, + "grad_norm": 5.191492164158262e-05, + "learning_rate": 4.019429265330905e-05, + "loss": 0.0507, + "step": 2530 + }, + { + "epoch": 2.7704918032786887, + "grad_norm": 3.3753425668692216e-05, + "learning_rate": 4.016393442622951e-05, + "loss": 0.0, + "step": 2535 + }, + { + "epoch": 2.7759562841530054, + "grad_norm": 0.0015989969251677394, + "learning_rate": 4.013357619914997e-05, + "loss": 0.2098, + "step": 2540 + }, + { + "epoch": 2.781420765027322, + "grad_norm": 0.0004384935018606484, + "learning_rate": 4.010321797207043e-05, + "loss": 0.0, + "step": 2545 + }, + { + "epoch": 2.7868852459016393, + "grad_norm": 0.0006148093962110579, + "learning_rate": 4.007285974499089e-05, + "loss": 0.0003, + "step": 2550 + }, + { + "epoch": 2.7923497267759565, + "grad_norm": 0.0004790659877471626, + "learning_rate": 4.0042501517911354e-05, + "loss": 0.0001, + "step": 2555 + }, + { + "epoch": 2.797814207650273, + "grad_norm": 0.00042726221727207303, + "learning_rate": 4.001214329083182e-05, + "loss": 0.0, + "step": 2560 + }, + { + "epoch": 2.80327868852459, + "grad_norm": 0.0007618932868354023, + "learning_rate": 3.998178506375228e-05, + "loss": 0.0001, + "step": 2565 + }, + { + "epoch": 2.808743169398907, + "grad_norm": 0.0008412741590291262, + "learning_rate": 3.995142683667274e-05, + "loss": 0.0001, + "step": 2570 + }, + { + "epoch": 2.8142076502732243, + "grad_norm": 0.0014517188537865877, + "learning_rate": 3.9921068609593206e-05, + "loss": 0.0004, + "step": 2575 + }, + { + "epoch": 2.819672131147541, + "grad_norm": 0.00429339986294508, + "learning_rate": 3.989071038251366e-05, + "loss": 0.0486, + "step": 2580 + }, + { + "epoch": 2.8251366120218577, + "grad_norm": 0.00370145495980978, + "learning_rate": 3.986035215543412e-05, + "loss": 0.0003, + "step": 2585 + }, + { + "epoch": 2.830601092896175, + "grad_norm": 0.03380095958709717, + "learning_rate": 3.9829993928354586e-05, + "loss": 0.1322, + "step": 2590 + }, + { + "epoch": 2.836065573770492, + "grad_norm": 0.0033515288960188627, + "learning_rate": 3.979963570127504e-05, + "loss": 0.0001, + "step": 2595 + }, + { + "epoch": 2.841530054644809, + "grad_norm": 0.0014321180060505867, + "learning_rate": 3.976927747419551e-05, + "loss": 0.0001, + "step": 2600 + }, + { + "epoch": 2.8469945355191255, + "grad_norm": 0.03204883262515068, + "learning_rate": 3.9738919247115966e-05, + "loss": 0.0003, + "step": 2605 + }, + { + "epoch": 2.8524590163934427, + "grad_norm": 0.008516996167600155, + "learning_rate": 3.970856102003643e-05, + "loss": 0.0001, + "step": 2610 + }, + { + "epoch": 2.8579234972677594, + "grad_norm": 0.001425994443707168, + "learning_rate": 3.9678202792956895e-05, + "loss": 0.0, + "step": 2615 + }, + { + "epoch": 2.8633879781420766, + "grad_norm": 0.0012937316205352545, + "learning_rate": 3.964784456587736e-05, + "loss": 0.0, + "step": 2620 + }, + { + "epoch": 2.8688524590163933, + "grad_norm": 0.03010261245071888, + "learning_rate": 3.961748633879782e-05, + "loss": 0.0002, + "step": 2625 + }, + { + "epoch": 2.8743169398907105, + "grad_norm": 0.0004600689571816474, + "learning_rate": 3.958712811171828e-05, + "loss": 0.0, + "step": 2630 + }, + { + "epoch": 2.879781420765027, + "grad_norm": 0.00028389832004904747, + "learning_rate": 3.955676988463874e-05, + "loss": 0.0, + "step": 2635 + }, + { + "epoch": 2.8852459016393444, + "grad_norm": 0.0002460504474584013, + "learning_rate": 3.95264116575592e-05, + "loss": 0.0, + "step": 2640 + }, + { + "epoch": 2.890710382513661, + "grad_norm": 0.0009908434003591537, + "learning_rate": 3.949605343047966e-05, + "loss": 0.0, + "step": 2645 + }, + { + "epoch": 2.8961748633879782, + "grad_norm": 0.00033436762169003487, + "learning_rate": 3.946569520340012e-05, + "loss": 0.0001, + "step": 2650 + }, + { + "epoch": 2.901639344262295, + "grad_norm": 0.13410432636737823, + "learning_rate": 3.9435336976320584e-05, + "loss": 0.0004, + "step": 2655 + }, + { + "epoch": 2.907103825136612, + "grad_norm": 0.0007000039331614971, + "learning_rate": 3.940497874924105e-05, + "loss": 0.0, + "step": 2660 + }, + { + "epoch": 2.912568306010929, + "grad_norm": 0.0011719991452991962, + "learning_rate": 3.937462052216151e-05, + "loss": 0.0001, + "step": 2665 + }, + { + "epoch": 2.918032786885246, + "grad_norm": 0.0003357301466166973, + "learning_rate": 3.934426229508197e-05, + "loss": 0.0, + "step": 2670 + }, + { + "epoch": 2.9234972677595628, + "grad_norm": 0.00033655870356597006, + "learning_rate": 3.9313904068002435e-05, + "loss": 0.0, + "step": 2675 + }, + { + "epoch": 2.92896174863388, + "grad_norm": 0.000492806953843683, + "learning_rate": 3.928354584092289e-05, + "loss": 0.0, + "step": 2680 + }, + { + "epoch": 2.9344262295081966, + "grad_norm": 3.1253645420074463, + "learning_rate": 3.925318761384335e-05, + "loss": 0.003, + "step": 2685 + }, + { + "epoch": 2.939890710382514, + "grad_norm": 0.0002712997666094452, + "learning_rate": 3.9222829386763815e-05, + "loss": 0.0001, + "step": 2690 + }, + { + "epoch": 2.9453551912568305, + "grad_norm": 0.00011406480916775763, + "learning_rate": 3.919247115968427e-05, + "loss": 0.0, + "step": 2695 + }, + { + "epoch": 2.9508196721311473, + "grad_norm": 0.00010400738392490894, + "learning_rate": 3.916211293260474e-05, + "loss": 0.0, + "step": 2700 + }, + { + "epoch": 2.9562841530054644, + "grad_norm": 0.00011682388139888644, + "learning_rate": 3.9131754705525195e-05, + "loss": 0.0, + "step": 2705 + }, + { + "epoch": 2.9617486338797816, + "grad_norm": 0.00019643260748125613, + "learning_rate": 3.910139647844566e-05, + "loss": 0.3659, + "step": 2710 + }, + { + "epoch": 2.9672131147540983, + "grad_norm": 0.002085154876112938, + "learning_rate": 3.9071038251366124e-05, + "loss": 0.0011, + "step": 2715 + }, + { + "epoch": 2.972677595628415, + "grad_norm": 0.00412380276247859, + "learning_rate": 3.904068002428659e-05, + "loss": 0.0003, + "step": 2720 + }, + { + "epoch": 2.978142076502732, + "grad_norm": 0.007716290187090635, + "learning_rate": 3.9010321797207046e-05, + "loss": 0.0001, + "step": 2725 + }, + { + "epoch": 2.9836065573770494, + "grad_norm": 0.00448417104780674, + "learning_rate": 3.897996357012751e-05, + "loss": 0.0002, + "step": 2730 + }, + { + "epoch": 2.989071038251366, + "grad_norm": 0.008972808718681335, + "learning_rate": 3.894960534304797e-05, + "loss": 0.0001, + "step": 2735 + }, + { + "epoch": 2.994535519125683, + "grad_norm": 0.0038951467722654343, + "learning_rate": 3.8919247115968426e-05, + "loss": 0.0001, + "step": 2740 + }, + { + "epoch": 3.0, + "grad_norm": 0.00253617693670094, + "learning_rate": 3.888888888888889e-05, + "loss": 0.0003, + "step": 2745 + }, + { + "epoch": 3.0, + "eval_loss": 0.005263752304017544, + "eval_runtime": 654.9252, + "eval_samples_per_second": 11.171, + "eval_steps_per_second": 1.397, + "step": 2745 + }, + { + "epoch": 3.0054644808743167, + "grad_norm": 0.003645621705800295, + "learning_rate": 3.885853066180935e-05, + "loss": 0.0001, + "step": 2750 + }, + { + "epoch": 3.010928961748634, + "grad_norm": 0.0024589765816926956, + "learning_rate": 3.882817243472981e-05, + "loss": 0.0001, + "step": 2755 + }, + { + "epoch": 3.0163934426229506, + "grad_norm": 0.00158478575758636, + "learning_rate": 3.879781420765027e-05, + "loss": 0.0, + "step": 2760 + }, + { + "epoch": 3.021857923497268, + "grad_norm": 0.000341015518642962, + "learning_rate": 3.8767455980570735e-05, + "loss": 0.0, + "step": 2765 + }, + { + "epoch": 3.0273224043715845, + "grad_norm": 0.0012479755096137524, + "learning_rate": 3.87370977534912e-05, + "loss": 0.0, + "step": 2770 + }, + { + "epoch": 3.0327868852459017, + "grad_norm": 0.0006929456721991301, + "learning_rate": 3.8706739526411664e-05, + "loss": 0.0, + "step": 2775 + }, + { + "epoch": 3.0382513661202184, + "grad_norm": 0.0006383946747519076, + "learning_rate": 3.867638129933212e-05, + "loss": 0.0, + "step": 2780 + }, + { + "epoch": 3.0437158469945356, + "grad_norm": 0.001444321358576417, + "learning_rate": 3.864602307225258e-05, + "loss": 0.0, + "step": 2785 + }, + { + "epoch": 3.0491803278688523, + "grad_norm": 0.00034748337930068374, + "learning_rate": 3.8615664845173044e-05, + "loss": 0.0, + "step": 2790 + }, + { + "epoch": 3.0546448087431695, + "grad_norm": 0.0002488761965651065, + "learning_rate": 3.85853066180935e-05, + "loss": 0.0, + "step": 2795 + }, + { + "epoch": 3.060109289617486, + "grad_norm": 0.0011106929741799831, + "learning_rate": 3.8554948391013967e-05, + "loss": 0.0, + "step": 2800 + }, + { + "epoch": 3.0655737704918034, + "grad_norm": 0.0006728707812726498, + "learning_rate": 3.8524590163934424e-05, + "loss": 0.0, + "step": 2805 + }, + { + "epoch": 3.07103825136612, + "grad_norm": 0.0007144726696424186, + "learning_rate": 3.849423193685489e-05, + "loss": 0.0, + "step": 2810 + }, + { + "epoch": 3.0765027322404372, + "grad_norm": 0.0004376015567686409, + "learning_rate": 3.8463873709775347e-05, + "loss": 0.0, + "step": 2815 + }, + { + "epoch": 3.081967213114754, + "grad_norm": 0.0008594472892582417, + "learning_rate": 3.843351548269581e-05, + "loss": 0.0, + "step": 2820 + }, + { + "epoch": 3.087431693989071, + "grad_norm": 0.0019469836261123419, + "learning_rate": 3.8403157255616276e-05, + "loss": 0.2017, + "step": 2825 + }, + { + "epoch": 3.092896174863388, + "grad_norm": 0.00610232213512063, + "learning_rate": 3.837279902853674e-05, + "loss": 0.0001, + "step": 2830 + }, + { + "epoch": 3.098360655737705, + "grad_norm": 0.0029869854915887117, + "learning_rate": 3.83424408014572e-05, + "loss": 0.0001, + "step": 2835 + }, + { + "epoch": 3.1038251366120218, + "grad_norm": 0.0915936678647995, + "learning_rate": 3.8312082574377656e-05, + "loss": 0.002, + "step": 2840 + }, + { + "epoch": 3.109289617486339, + "grad_norm": 0.007518662605434656, + "learning_rate": 3.828172434729812e-05, + "loss": 0.0001, + "step": 2845 + }, + { + "epoch": 3.1147540983606556, + "grad_norm": 0.003184305736795068, + "learning_rate": 3.825136612021858e-05, + "loss": 0.0008, + "step": 2850 + }, + { + "epoch": 3.120218579234973, + "grad_norm": 0.004786128643900156, + "learning_rate": 3.822100789313904e-05, + "loss": 0.0005, + "step": 2855 + }, + { + "epoch": 3.1256830601092895, + "grad_norm": 0.003782659536227584, + "learning_rate": 3.81906496660595e-05, + "loss": 0.0003, + "step": 2860 + }, + { + "epoch": 3.1311475409836067, + "grad_norm": 0.006366930436342955, + "learning_rate": 3.8160291438979965e-05, + "loss": 0.0004, + "step": 2865 + }, + { + "epoch": 3.1366120218579234, + "grad_norm": 0.0005854523042216897, + "learning_rate": 3.812993321190043e-05, + "loss": 0.0, + "step": 2870 + }, + { + "epoch": 3.1420765027322406, + "grad_norm": 0.044832780957221985, + "learning_rate": 3.8099574984820894e-05, + "loss": 0.0002, + "step": 2875 + }, + { + "epoch": 3.1475409836065573, + "grad_norm": 0.00037537323078140616, + "learning_rate": 3.806921675774135e-05, + "loss": 0.0, + "step": 2880 + }, + { + "epoch": 3.1530054644808745, + "grad_norm": 0.00020258377480786294, + "learning_rate": 3.8038858530661816e-05, + "loss": 0.0, + "step": 2885 + }, + { + "epoch": 3.158469945355191, + "grad_norm": 0.00040754053043201566, + "learning_rate": 3.8008500303582274e-05, + "loss": 0.0, + "step": 2890 + }, + { + "epoch": 3.1639344262295084, + "grad_norm": 0.00027699521160684526, + "learning_rate": 3.797814207650273e-05, + "loss": 0.0, + "step": 2895 + }, + { + "epoch": 3.169398907103825, + "grad_norm": 0.00014932159683667123, + "learning_rate": 3.7947783849423196e-05, + "loss": 0.0, + "step": 2900 + }, + { + "epoch": 3.1748633879781423, + "grad_norm": 0.00015883053129073232, + "learning_rate": 3.7917425622343654e-05, + "loss": 0.0, + "step": 2905 + }, + { + "epoch": 3.180327868852459, + "grad_norm": 0.0006496338173747063, + "learning_rate": 3.788706739526412e-05, + "loss": 0.0001, + "step": 2910 + }, + { + "epoch": 3.185792349726776, + "grad_norm": 0.00013984047109261155, + "learning_rate": 3.7856709168184576e-05, + "loss": 0.0007, + "step": 2915 + }, + { + "epoch": 3.191256830601093, + "grad_norm": 8.843530667945743e-05, + "learning_rate": 3.782635094110504e-05, + "loss": 0.0001, + "step": 2920 + }, + { + "epoch": 3.19672131147541, + "grad_norm": 0.023396478965878487, + "learning_rate": 3.7795992714025505e-05, + "loss": 0.0001, + "step": 2925 + }, + { + "epoch": 3.202185792349727, + "grad_norm": 7.777877908665687e-05, + "learning_rate": 3.776563448694597e-05, + "loss": 0.0, + "step": 2930 + }, + { + "epoch": 3.2076502732240435, + "grad_norm": 8.528177568223327e-05, + "learning_rate": 3.773527625986643e-05, + "loss": 0.0, + "step": 2935 + }, + { + "epoch": 3.2131147540983607, + "grad_norm": 0.012153876014053822, + "learning_rate": 3.7704918032786885e-05, + "loss": 0.0001, + "step": 2940 + }, + { + "epoch": 3.2185792349726774, + "grad_norm": 7.584688864881173e-05, + "learning_rate": 3.767455980570735e-05, + "loss": 0.0, + "step": 2945 + }, + { + "epoch": 3.2240437158469946, + "grad_norm": 6.136555748526007e-05, + "learning_rate": 3.764420157862781e-05, + "loss": 0.0, + "step": 2950 + }, + { + "epoch": 3.2295081967213113, + "grad_norm": 0.001584029640071094, + "learning_rate": 3.761384335154827e-05, + "loss": 0.0, + "step": 2955 + }, + { + "epoch": 3.2349726775956285, + "grad_norm": 7.862479105824605e-05, + "learning_rate": 3.758348512446873e-05, + "loss": 0.0, + "step": 2960 + }, + { + "epoch": 3.240437158469945, + "grad_norm": 0.0008851040038280189, + "learning_rate": 3.7553126897389194e-05, + "loss": 0.0, + "step": 2965 + }, + { + "epoch": 3.2459016393442623, + "grad_norm": 5.676416913047433e-05, + "learning_rate": 3.752276867030965e-05, + "loss": 0.0001, + "step": 2970 + }, + { + "epoch": 3.251366120218579, + "grad_norm": 4.914839267730713, + "learning_rate": 3.7492410443230116e-05, + "loss": 0.0017, + "step": 2975 + }, + { + "epoch": 3.2568306010928962, + "grad_norm": 0.00020339513139333576, + "learning_rate": 3.746205221615058e-05, + "loss": 0.0, + "step": 2980 + }, + { + "epoch": 3.262295081967213, + "grad_norm": 0.0002010238531511277, + "learning_rate": 3.7431693989071045e-05, + "loss": 0.0, + "step": 2985 + }, + { + "epoch": 3.26775956284153, + "grad_norm": 7.606866711284965e-05, + "learning_rate": 3.74013357619915e-05, + "loss": 0.1313, + "step": 2990 + }, + { + "epoch": 3.273224043715847, + "grad_norm": 0.0005323002696968615, + "learning_rate": 3.737097753491196e-05, + "loss": 0.0755, + "step": 2995 + }, + { + "epoch": 3.278688524590164, + "grad_norm": 5.004248669138178e-05, + "learning_rate": 3.7340619307832425e-05, + "loss": 0.0001, + "step": 3000 + }, + { + "epoch": 3.2841530054644807, + "grad_norm": 12.265705108642578, + "learning_rate": 3.731026108075288e-05, + "loss": 0.3303, + "step": 3005 + }, + { + "epoch": 3.289617486338798, + "grad_norm": 0.09155070781707764, + "learning_rate": 3.727990285367335e-05, + "loss": 0.0004, + "step": 3010 + }, + { + "epoch": 3.2950819672131146, + "grad_norm": 0.0008653182885609567, + "learning_rate": 3.7249544626593805e-05, + "loss": 0.0, + "step": 3015 + }, + { + "epoch": 3.300546448087432, + "grad_norm": 0.0018131888937205076, + "learning_rate": 3.721918639951427e-05, + "loss": 0.0001, + "step": 3020 + }, + { + "epoch": 3.3060109289617485, + "grad_norm": 0.00172705901786685, + "learning_rate": 3.7188828172434734e-05, + "loss": 0.0003, + "step": 3025 + }, + { + "epoch": 3.3114754098360657, + "grad_norm": 0.00302744354121387, + "learning_rate": 3.71584699453552e-05, + "loss": 0.0003, + "step": 3030 + }, + { + "epoch": 3.3169398907103824, + "grad_norm": 0.0013879016041755676, + "learning_rate": 3.7128111718275657e-05, + "loss": 0.0005, + "step": 3035 + }, + { + "epoch": 3.3224043715846996, + "grad_norm": 0.024859091266989708, + "learning_rate": 3.7097753491196114e-05, + "loss": 0.0002, + "step": 3040 + }, + { + "epoch": 3.3278688524590163, + "grad_norm": 0.0013450469123199582, + "learning_rate": 3.706739526411658e-05, + "loss": 0.0, + "step": 3045 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.0011400197399780154, + "learning_rate": 3.7037037037037037e-05, + "loss": 0.0, + "step": 3050 + }, + { + "epoch": 3.33879781420765, + "grad_norm": 0.0009009820641949773, + "learning_rate": 3.70066788099575e-05, + "loss": 0.0002, + "step": 3055 + }, + { + "epoch": 3.3442622950819674, + "grad_norm": 0.005573240574449301, + "learning_rate": 3.697632058287796e-05, + "loss": 0.0001, + "step": 3060 + }, + { + "epoch": 3.349726775956284, + "grad_norm": 0.0005607991479337215, + "learning_rate": 3.694596235579842e-05, + "loss": 0.0, + "step": 3065 + }, + { + "epoch": 3.3551912568306013, + "grad_norm": 0.00025221219402737916, + "learning_rate": 3.691560412871888e-05, + "loss": 0.0001, + "step": 3070 + }, + { + "epoch": 3.360655737704918, + "grad_norm": 0.0004546682757791132, + "learning_rate": 3.6885245901639346e-05, + "loss": 0.0, + "step": 3075 + }, + { + "epoch": 3.366120218579235, + "grad_norm": 0.00032192622893489897, + "learning_rate": 3.685488767455981e-05, + "loss": 0.0, + "step": 3080 + }, + { + "epoch": 3.371584699453552, + "grad_norm": 0.0008003158727660775, + "learning_rate": 3.6824529447480275e-05, + "loss": 0.0, + "step": 3085 + }, + { + "epoch": 3.3770491803278686, + "grad_norm": 0.00030273263109847903, + "learning_rate": 3.679417122040073e-05, + "loss": 0.0, + "step": 3090 + }, + { + "epoch": 3.3825136612021858, + "grad_norm": 0.0005664460477419198, + "learning_rate": 3.676381299332119e-05, + "loss": 0.0, + "step": 3095 + }, + { + "epoch": 3.387978142076503, + "grad_norm": 0.2617197632789612, + "learning_rate": 3.6733454766241655e-05, + "loss": 0.0003, + "step": 3100 + }, + { + "epoch": 3.3934426229508197, + "grad_norm": 0.003910338506102562, + "learning_rate": 3.670309653916211e-05, + "loss": 0.0, + "step": 3105 + }, + { + "epoch": 3.3989071038251364, + "grad_norm": 0.00028743871371261775, + "learning_rate": 3.667273831208258e-05, + "loss": 0.0, + "step": 3110 + }, + { + "epoch": 3.4043715846994536, + "grad_norm": 0.00032127246959134936, + "learning_rate": 3.6642380085003035e-05, + "loss": 0.0, + "step": 3115 + }, + { + "epoch": 3.4098360655737707, + "grad_norm": 0.00023537426022812724, + "learning_rate": 3.66120218579235e-05, + "loss": 0.0, + "step": 3120 + }, + { + "epoch": 3.4153005464480874, + "grad_norm": 0.00016202160622924566, + "learning_rate": 3.658166363084396e-05, + "loss": 0.0, + "step": 3125 + }, + { + "epoch": 3.420765027322404, + "grad_norm": 0.0001964465918717906, + "learning_rate": 3.655130540376442e-05, + "loss": 0.0, + "step": 3130 + }, + { + "epoch": 3.4262295081967213, + "grad_norm": 0.0026353199500590563, + "learning_rate": 3.6520947176684886e-05, + "loss": 0.0, + "step": 3135 + }, + { + "epoch": 3.431693989071038, + "grad_norm": 0.0001332706306129694, + "learning_rate": 3.6490588949605344e-05, + "loss": 0.0, + "step": 3140 + }, + { + "epoch": 3.4371584699453552, + "grad_norm": 0.00010366823698859662, + "learning_rate": 3.646023072252581e-05, + "loss": 0.0, + "step": 3145 + }, + { + "epoch": 3.442622950819672, + "grad_norm": 0.0001762459141900763, + "learning_rate": 3.6429872495446266e-05, + "loss": 0.0, + "step": 3150 + }, + { + "epoch": 3.448087431693989, + "grad_norm": 0.00022949972481001168, + "learning_rate": 3.639951426836673e-05, + "loss": 0.0, + "step": 3155 + }, + { + "epoch": 3.453551912568306, + "grad_norm": 0.014571224339306355, + "learning_rate": 3.636915604128719e-05, + "loss": 0.0001, + "step": 3160 + }, + { + "epoch": 3.459016393442623, + "grad_norm": 0.003986168187111616, + "learning_rate": 3.633879781420765e-05, + "loss": 0.0, + "step": 3165 + }, + { + "epoch": 3.4644808743169397, + "grad_norm": 0.00015372096095234156, + "learning_rate": 3.630843958712811e-05, + "loss": 0.0001, + "step": 3170 + }, + { + "epoch": 3.469945355191257, + "grad_norm": 0.00022497742611449212, + "learning_rate": 3.6278081360048575e-05, + "loss": 0.0, + "step": 3175 + }, + { + "epoch": 3.4754098360655736, + "grad_norm": 0.00011189820361323655, + "learning_rate": 3.624772313296903e-05, + "loss": 0.0002, + "step": 3180 + }, + { + "epoch": 3.480874316939891, + "grad_norm": 0.00029375962913036346, + "learning_rate": 3.62173649058895e-05, + "loss": 0.0, + "step": 3185 + }, + { + "epoch": 3.4863387978142075, + "grad_norm": 0.00013606030552182347, + "learning_rate": 3.618700667880996e-05, + "loss": 0.0, + "step": 3190 + }, + { + "epoch": 3.4918032786885247, + "grad_norm": 0.00935420859605074, + "learning_rate": 3.615664845173042e-05, + "loss": 0.0, + "step": 3195 + }, + { + "epoch": 3.4972677595628414, + "grad_norm": 8.235384302679449e-05, + "learning_rate": 3.6126290224650884e-05, + "loss": 0.0, + "step": 3200 + }, + { + "epoch": 3.5027322404371586, + "grad_norm": 0.00010109972208738327, + "learning_rate": 3.609593199757134e-05, + "loss": 0.0, + "step": 3205 + }, + { + "epoch": 3.5081967213114753, + "grad_norm": 7.307323539862409e-05, + "learning_rate": 3.6065573770491806e-05, + "loss": 0.0, + "step": 3210 + }, + { + "epoch": 3.5136612021857925, + "grad_norm": 0.0001078033892554231, + "learning_rate": 3.6035215543412264e-05, + "loss": 0.0, + "step": 3215 + }, + { + "epoch": 3.519125683060109, + "grad_norm": 0.000121029355796054, + "learning_rate": 3.600485731633273e-05, + "loss": 0.0, + "step": 3220 + }, + { + "epoch": 3.5245901639344264, + "grad_norm": 0.0001414872967870906, + "learning_rate": 3.5974499089253186e-05, + "loss": 0.0001, + "step": 3225 + }, + { + "epoch": 3.530054644808743, + "grad_norm": 0.00011328158143442124, + "learning_rate": 3.594414086217365e-05, + "loss": 0.0, + "step": 3230 + }, + { + "epoch": 3.5355191256830603, + "grad_norm": 0.00020013573521282524, + "learning_rate": 3.5913782635094115e-05, + "loss": 0.0, + "step": 3235 + }, + { + "epoch": 3.540983606557377, + "grad_norm": 0.00011696962610585615, + "learning_rate": 3.588342440801457e-05, + "loss": 0.0, + "step": 3240 + }, + { + "epoch": 3.546448087431694, + "grad_norm": 7.634115172550082e-05, + "learning_rate": 3.585306618093504e-05, + "loss": 0.0, + "step": 3245 + }, + { + "epoch": 3.551912568306011, + "grad_norm": 0.006965294014662504, + "learning_rate": 3.5822707953855495e-05, + "loss": 0.0, + "step": 3250 + }, + { + "epoch": 3.557377049180328, + "grad_norm": 0.0001289156498387456, + "learning_rate": 3.579234972677596e-05, + "loss": 0.0, + "step": 3255 + }, + { + "epoch": 3.5628415300546448, + "grad_norm": 0.00012060425797244534, + "learning_rate": 3.576199149969642e-05, + "loss": 0.0, + "step": 3260 + }, + { + "epoch": 3.5683060109289615, + "grad_norm": 0.0001801663893274963, + "learning_rate": 3.573163327261688e-05, + "loss": 0.0, + "step": 3265 + }, + { + "epoch": 3.5737704918032787, + "grad_norm": 9.889312786981463e-05, + "learning_rate": 3.570127504553734e-05, + "loss": 0.0, + "step": 3270 + }, + { + "epoch": 3.579234972677596, + "grad_norm": 8.248529047705233e-05, + "learning_rate": 3.5670916818457804e-05, + "loss": 0.0, + "step": 3275 + }, + { + "epoch": 3.5846994535519126, + "grad_norm": 0.00016700288688298315, + "learning_rate": 3.564055859137826e-05, + "loss": 0.0, + "step": 3280 + }, + { + "epoch": 3.5901639344262293, + "grad_norm": 8.013186015887186e-05, + "learning_rate": 3.5610200364298727e-05, + "loss": 0.0, + "step": 3285 + }, + { + "epoch": 3.5956284153005464, + "grad_norm": 9.400265844305977e-05, + "learning_rate": 3.557984213721919e-05, + "loss": 0.0, + "step": 3290 + }, + { + "epoch": 3.6010928961748636, + "grad_norm": 0.0011224903864786029, + "learning_rate": 3.554948391013965e-05, + "loss": 0.0, + "step": 3295 + }, + { + "epoch": 3.6065573770491803, + "grad_norm": 0.0001463467488065362, + "learning_rate": 3.551912568306011e-05, + "loss": 0.0, + "step": 3300 + }, + { + "epoch": 3.612021857923497, + "grad_norm": 0.0007439145119860768, + "learning_rate": 3.548876745598057e-05, + "loss": 0.0, + "step": 3305 + }, + { + "epoch": 3.6174863387978142, + "grad_norm": 0.00013679706898983568, + "learning_rate": 3.5458409228901036e-05, + "loss": 0.0, + "step": 3310 + }, + { + "epoch": 3.6229508196721314, + "grad_norm": 0.00010268126789014786, + "learning_rate": 3.542805100182149e-05, + "loss": 0.0, + "step": 3315 + }, + { + "epoch": 3.628415300546448, + "grad_norm": 0.00028116197790950537, + "learning_rate": 3.539769277474196e-05, + "loss": 0.0, + "step": 3320 + }, + { + "epoch": 3.633879781420765, + "grad_norm": 9.449726348975673e-05, + "learning_rate": 3.5367334547662416e-05, + "loss": 0.0, + "step": 3325 + }, + { + "epoch": 3.639344262295082, + "grad_norm": 0.00010181035031564534, + "learning_rate": 3.533697632058288e-05, + "loss": 0.0, + "step": 3330 + }, + { + "epoch": 3.644808743169399, + "grad_norm": 6.961561302887276e-05, + "learning_rate": 3.530661809350334e-05, + "loss": 0.0, + "step": 3335 + }, + { + "epoch": 3.650273224043716, + "grad_norm": 0.00010002183262258768, + "learning_rate": 3.52762598664238e-05, + "loss": 0.0, + "step": 3340 + }, + { + "epoch": 3.6557377049180326, + "grad_norm": 9.894008690025657e-05, + "learning_rate": 3.524590163934427e-05, + "loss": 0.0, + "step": 3345 + }, + { + "epoch": 3.66120218579235, + "grad_norm": 0.00013567868154495955, + "learning_rate": 3.5215543412264725e-05, + "loss": 0.0, + "step": 3350 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.0002798748027998954, + "learning_rate": 3.518518518518519e-05, + "loss": 0.0106, + "step": 3355 + }, + { + "epoch": 3.6721311475409837, + "grad_norm": 0.0008901433320716023, + "learning_rate": 3.515482695810565e-05, + "loss": 0.0, + "step": 3360 + }, + { + "epoch": 3.6775956284153004, + "grad_norm": 0.008999276906251907, + "learning_rate": 3.512446873102611e-05, + "loss": 0.0001, + "step": 3365 + }, + { + "epoch": 3.6830601092896176, + "grad_norm": 0.0022385860793292522, + "learning_rate": 3.509411050394657e-05, + "loss": 0.0, + "step": 3370 + }, + { + "epoch": 3.6885245901639343, + "grad_norm": 0.0003485867637209594, + "learning_rate": 3.5063752276867034e-05, + "loss": 0.0, + "step": 3375 + }, + { + "epoch": 3.6939890710382515, + "grad_norm": 0.00025722169084474444, + "learning_rate": 3.503339404978749e-05, + "loss": 0.0, + "step": 3380 + }, + { + "epoch": 3.699453551912568, + "grad_norm": 7.106779230525717e-05, + "learning_rate": 3.5003035822707956e-05, + "loss": 0.0, + "step": 3385 + }, + { + "epoch": 3.7049180327868854, + "grad_norm": 0.0002384383842581883, + "learning_rate": 3.4972677595628414e-05, + "loss": 0.0, + "step": 3390 + }, + { + "epoch": 3.710382513661202, + "grad_norm": 0.0002946928725577891, + "learning_rate": 3.494231936854888e-05, + "loss": 0.1995, + "step": 3395 + }, + { + "epoch": 3.7158469945355193, + "grad_norm": 0.0007392108091153204, + "learning_rate": 3.491196114146934e-05, + "loss": 0.0, + "step": 3400 + }, + { + "epoch": 3.721311475409836, + "grad_norm": 0.001694119768217206, + "learning_rate": 3.48816029143898e-05, + "loss": 0.0, + "step": 3405 + }, + { + "epoch": 3.726775956284153, + "grad_norm": 0.0009142764611169696, + "learning_rate": 3.4851244687310265e-05, + "loss": 0.0, + "step": 3410 + }, + { + "epoch": 3.73224043715847, + "grad_norm": 0.0035774747375398874, + "learning_rate": 3.482088646023072e-05, + "loss": 0.0001, + "step": 3415 + }, + { + "epoch": 3.737704918032787, + "grad_norm": 0.0033966186456382275, + "learning_rate": 3.479052823315119e-05, + "loss": 0.0009, + "step": 3420 + }, + { + "epoch": 3.7431693989071038, + "grad_norm": 0.0004668271285481751, + "learning_rate": 3.4760170006071645e-05, + "loss": 0.0001, + "step": 3425 + }, + { + "epoch": 3.748633879781421, + "grad_norm": 0.0003218352794647217, + "learning_rate": 3.472981177899211e-05, + "loss": 0.0001, + "step": 3430 + }, + { + "epoch": 3.7540983606557377, + "grad_norm": 0.037057679146528244, + "learning_rate": 3.469945355191257e-05, + "loss": 0.0001, + "step": 3435 + }, + { + "epoch": 3.7595628415300544, + "grad_norm": 0.031188733875751495, + "learning_rate": 3.466909532483303e-05, + "loss": 0.001, + "step": 3440 + }, + { + "epoch": 3.7650273224043715, + "grad_norm": 0.00015825718583073467, + "learning_rate": 3.4638737097753496e-05, + "loss": 0.0005, + "step": 3445 + }, + { + "epoch": 3.7704918032786887, + "grad_norm": 0.0037946077063679695, + "learning_rate": 3.4608378870673954e-05, + "loss": 0.0, + "step": 3450 + }, + { + "epoch": 3.7759562841530054, + "grad_norm": 0.0002360966900596395, + "learning_rate": 3.457802064359442e-05, + "loss": 0.0, + "step": 3455 + }, + { + "epoch": 3.781420765027322, + "grad_norm": 0.024776164442300797, + "learning_rate": 3.4547662416514876e-05, + "loss": 0.0003, + "step": 3460 + }, + { + "epoch": 3.7868852459016393, + "grad_norm": 0.0002828339929692447, + "learning_rate": 3.451730418943534e-05, + "loss": 0.0, + "step": 3465 + }, + { + "epoch": 3.7923497267759565, + "grad_norm": 0.00010138905781786889, + "learning_rate": 3.44869459623558e-05, + "loss": 0.0001, + "step": 3470 + }, + { + "epoch": 3.797814207650273, + "grad_norm": 0.00016995143960230052, + "learning_rate": 3.445658773527626e-05, + "loss": 0.0, + "step": 3475 + }, + { + "epoch": 3.80327868852459, + "grad_norm": 0.00012559971946757287, + "learning_rate": 3.442622950819672e-05, + "loss": 0.0, + "step": 3480 + }, + { + "epoch": 3.808743169398907, + "grad_norm": 0.0006380723789334297, + "learning_rate": 3.4395871281117185e-05, + "loss": 0.0, + "step": 3485 + }, + { + "epoch": 3.8142076502732243, + "grad_norm": 0.00035217360709793866, + "learning_rate": 3.436551305403764e-05, + "loss": 0.0001, + "step": 3490 + }, + { + "epoch": 3.819672131147541, + "grad_norm": 0.0002526458411011845, + "learning_rate": 3.433515482695811e-05, + "loss": 0.0, + "step": 3495 + }, + { + "epoch": 3.8251366120218577, + "grad_norm": 9.788498573470861e-05, + "learning_rate": 3.430479659987857e-05, + "loss": 0.0, + "step": 3500 + }, + { + "epoch": 3.830601092896175, + "grad_norm": 0.00022861824254505336, + "learning_rate": 3.427443837279903e-05, + "loss": 0.0001, + "step": 3505 + }, + { + "epoch": 3.836065573770492, + "grad_norm": 0.00013972212036605924, + "learning_rate": 3.4244080145719494e-05, + "loss": 0.0, + "step": 3510 + }, + { + "epoch": 3.841530054644809, + "grad_norm": 0.0001966755517059937, + "learning_rate": 3.421372191863995e-05, + "loss": 0.0, + "step": 3515 + }, + { + "epoch": 3.8469945355191255, + "grad_norm": 0.0006457020062953234, + "learning_rate": 3.4183363691560417e-05, + "loss": 0.0, + "step": 3520 + }, + { + "epoch": 3.8524590163934427, + "grad_norm": 0.00014496638323180377, + "learning_rate": 3.4153005464480874e-05, + "loss": 0.0, + "step": 3525 + }, + { + "epoch": 3.8579234972677594, + "grad_norm": 7.40105751901865e-05, + "learning_rate": 3.412264723740134e-05, + "loss": 0.0, + "step": 3530 + }, + { + "epoch": 3.8633879781420766, + "grad_norm": 0.041569244116544724, + "learning_rate": 3.4092289010321797e-05, + "loss": 0.0003, + "step": 3535 + }, + { + "epoch": 3.8688524590163933, + "grad_norm": 0.0001943446695804596, + "learning_rate": 3.406193078324226e-05, + "loss": 0.0, + "step": 3540 + }, + { + "epoch": 3.8743169398907105, + "grad_norm": 0.00015503684699069709, + "learning_rate": 3.403157255616272e-05, + "loss": 0.0, + "step": 3545 + }, + { + "epoch": 3.879781420765027, + "grad_norm": 0.000700871052686125, + "learning_rate": 3.400121432908318e-05, + "loss": 0.0, + "step": 3550 + }, + { + "epoch": 3.8852459016393444, + "grad_norm": 5.401040834840387e-05, + "learning_rate": 3.397085610200365e-05, + "loss": 0.0, + "step": 3555 + }, + { + "epoch": 3.890710382513661, + "grad_norm": 0.000344992644386366, + "learning_rate": 3.3940497874924106e-05, + "loss": 0.0, + "step": 3560 + }, + { + "epoch": 3.8961748633879782, + "grad_norm": 6.189793202793226e-05, + "learning_rate": 3.391013964784457e-05, + "loss": 0.0274, + "step": 3565 + }, + { + "epoch": 3.901639344262295, + "grad_norm": 6.884944741614163e-05, + "learning_rate": 3.387978142076503e-05, + "loss": 0.0, + "step": 3570 + }, + { + "epoch": 3.907103825136612, + "grad_norm": 6.29270653007552e-05, + "learning_rate": 3.384942319368549e-05, + "loss": 0.0, + "step": 3575 + }, + { + "epoch": 3.912568306010929, + "grad_norm": 0.00013839226448908448, + "learning_rate": 3.381906496660595e-05, + "loss": 0.0, + "step": 3580 + }, + { + "epoch": 3.918032786885246, + "grad_norm": 0.0001924206007970497, + "learning_rate": 3.3788706739526415e-05, + "loss": 0.0, + "step": 3585 + }, + { + "epoch": 3.9234972677595628, + "grad_norm": 0.013641326688230038, + "learning_rate": 3.375834851244687e-05, + "loss": 0.3372, + "step": 3590 + }, + { + "epoch": 3.92896174863388, + "grad_norm": 0.00014125218149274588, + "learning_rate": 3.372799028536734e-05, + "loss": 0.0001, + "step": 3595 + }, + { + "epoch": 3.9344262295081966, + "grad_norm": 0.005081810522824526, + "learning_rate": 3.36976320582878e-05, + "loss": 0.0, + "step": 3600 + }, + { + "epoch": 3.939890710382514, + "grad_norm": 0.00014312181156128645, + "learning_rate": 3.366727383120826e-05, + "loss": 0.0, + "step": 3605 + }, + { + "epoch": 3.9453551912568305, + "grad_norm": 0.00023116619559004903, + "learning_rate": 3.3636915604128724e-05, + "loss": 0.0, + "step": 3610 + }, + { + "epoch": 3.9508196721311473, + "grad_norm": 0.00011138137051602826, + "learning_rate": 3.360655737704918e-05, + "loss": 0.0, + "step": 3615 + }, + { + "epoch": 3.9562841530054644, + "grad_norm": 0.00024408698664046824, + "learning_rate": 3.3576199149969646e-05, + "loss": 0.0, + "step": 3620 + }, + { + "epoch": 3.9617486338797816, + "grad_norm": 0.012111087329685688, + "learning_rate": 3.3545840922890104e-05, + "loss": 0.0, + "step": 3625 + }, + { + "epoch": 3.9672131147540983, + "grad_norm": 0.0005235079443082213, + "learning_rate": 3.351548269581057e-05, + "loss": 0.0, + "step": 3630 + }, + { + "epoch": 3.972677595628415, + "grad_norm": 0.013302493840456009, + "learning_rate": 3.3485124468731026e-05, + "loss": 0.0004, + "step": 3635 + }, + { + "epoch": 3.978142076502732, + "grad_norm": 0.00025188663857989013, + "learning_rate": 3.345476624165149e-05, + "loss": 0.0, + "step": 3640 + }, + { + "epoch": 3.9836065573770494, + "grad_norm": 0.00011241805623285472, + "learning_rate": 3.342440801457195e-05, + "loss": 0.0, + "step": 3645 + }, + { + "epoch": 3.989071038251366, + "grad_norm": 0.0006041810265742242, + "learning_rate": 3.339404978749241e-05, + "loss": 0.0, + "step": 3650 + }, + { + "epoch": 3.994535519125683, + "grad_norm": 0.00017458312504459172, + "learning_rate": 3.336369156041288e-05, + "loss": 0.0, + "step": 3655 + }, + { + "epoch": 4.0, + "grad_norm": 0.00013060122728347778, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.0, + "step": 3660 + }, + { + "epoch": 4.0, + "eval_loss": 0.002205207943916321, + "eval_runtime": 657.1958, + "eval_samples_per_second": 11.132, + "eval_steps_per_second": 1.392, + "step": 3660 + }, + { + "epoch": 4.005464480874317, + "grad_norm": 0.00011811502918135375, + "learning_rate": 3.33029751062538e-05, + "loss": 0.0, + "step": 3665 + }, + { + "epoch": 4.0109289617486334, + "grad_norm": 8.069877367233858e-05, + "learning_rate": 3.327261687917426e-05, + "loss": 0.0, + "step": 3670 + }, + { + "epoch": 4.016393442622951, + "grad_norm": 0.00011450869351392612, + "learning_rate": 3.324225865209472e-05, + "loss": 0.0, + "step": 3675 + }, + { + "epoch": 4.021857923497268, + "grad_norm": 0.0001386746735079214, + "learning_rate": 3.321190042501518e-05, + "loss": 0.0, + "step": 3680 + }, + { + "epoch": 4.027322404371585, + "grad_norm": 0.00010339313303120434, + "learning_rate": 3.3181542197935644e-05, + "loss": 0.0, + "step": 3685 + }, + { + "epoch": 4.032786885245901, + "grad_norm": 0.0001254361413884908, + "learning_rate": 3.31511839708561e-05, + "loss": 0.0, + "step": 3690 + }, + { + "epoch": 4.038251366120218, + "grad_norm": 0.0004021910426672548, + "learning_rate": 3.312082574377656e-05, + "loss": 0.0, + "step": 3695 + }, + { + "epoch": 4.043715846994536, + "grad_norm": 0.0001132467805291526, + "learning_rate": 3.3090467516697024e-05, + "loss": 0.0, + "step": 3700 + }, + { + "epoch": 4.049180327868853, + "grad_norm": 0.00153514021076262, + "learning_rate": 3.306010928961749e-05, + "loss": 0.0, + "step": 3705 + }, + { + "epoch": 4.054644808743169, + "grad_norm": 9.872686496237293e-05, + "learning_rate": 3.302975106253795e-05, + "loss": 0.0, + "step": 3710 + }, + { + "epoch": 4.060109289617486, + "grad_norm": 0.00011148265184601769, + "learning_rate": 3.299939283545841e-05, + "loss": 0.0, + "step": 3715 + }, + { + "epoch": 4.065573770491803, + "grad_norm": 0.00012307016004342586, + "learning_rate": 3.2969034608378875e-05, + "loss": 0.0, + "step": 3720 + }, + { + "epoch": 4.0710382513661205, + "grad_norm": 8.422257087659091e-05, + "learning_rate": 3.293867638129933e-05, + "loss": 0.0, + "step": 3725 + }, + { + "epoch": 4.076502732240437, + "grad_norm": 0.0008893508929759264, + "learning_rate": 3.29083181542198e-05, + "loss": 0.0001, + "step": 3730 + }, + { + "epoch": 4.081967213114754, + "grad_norm": 8.832754247123376e-05, + "learning_rate": 3.2877959927140255e-05, + "loss": 0.0, + "step": 3735 + }, + { + "epoch": 4.087431693989071, + "grad_norm": 9.743066038936377e-05, + "learning_rate": 3.284760170006072e-05, + "loss": 0.0, + "step": 3740 + }, + { + "epoch": 4.092896174863388, + "grad_norm": 0.00011171947699040174, + "learning_rate": 3.281724347298118e-05, + "loss": 0.0, + "step": 3745 + }, + { + "epoch": 4.098360655737705, + "grad_norm": 0.00016356426931452006, + "learning_rate": 3.2786885245901635e-05, + "loss": 0.0, + "step": 3750 + }, + { + "epoch": 4.103825136612022, + "grad_norm": 0.0012780509423464537, + "learning_rate": 3.27565270188221e-05, + "loss": 0.0, + "step": 3755 + }, + { + "epoch": 4.109289617486339, + "grad_norm": 0.00011072060442529619, + "learning_rate": 3.2726168791742564e-05, + "loss": 0.0, + "step": 3760 + }, + { + "epoch": 4.114754098360656, + "grad_norm": 0.00011896173964487389, + "learning_rate": 3.269581056466303e-05, + "loss": 0.0, + "step": 3765 + }, + { + "epoch": 4.120218579234972, + "grad_norm": 0.00013041883357800543, + "learning_rate": 3.2665452337583487e-05, + "loss": 0.0, + "step": 3770 + }, + { + "epoch": 4.1256830601092895, + "grad_norm": 0.006071773823350668, + "learning_rate": 3.263509411050395e-05, + "loss": 0.0, + "step": 3775 + }, + { + "epoch": 4.131147540983607, + "grad_norm": 8.175130642484874e-05, + "learning_rate": 3.260473588342441e-05, + "loss": 0.0, + "step": 3780 + }, + { + "epoch": 4.136612021857924, + "grad_norm": 0.005459274630993605, + "learning_rate": 3.257437765634487e-05, + "loss": 0.0, + "step": 3785 + }, + { + "epoch": 4.14207650273224, + "grad_norm": 0.00033711790456436574, + "learning_rate": 3.254401942926533e-05, + "loss": 0.0, + "step": 3790 + }, + { + "epoch": 4.147540983606557, + "grad_norm": 0.00011819975770777091, + "learning_rate": 3.251366120218579e-05, + "loss": 0.0, + "step": 3795 + }, + { + "epoch": 4.1530054644808745, + "grad_norm": 4.659785554395057e-05, + "learning_rate": 3.248330297510625e-05, + "loss": 0.0, + "step": 3800 + }, + { + "epoch": 4.158469945355192, + "grad_norm": 0.0001775735872797668, + "learning_rate": 3.245294474802672e-05, + "loss": 0.0, + "step": 3805 + }, + { + "epoch": 4.163934426229508, + "grad_norm": 0.00011735469161067158, + "learning_rate": 3.242258652094718e-05, + "loss": 0.0, + "step": 3810 + }, + { + "epoch": 4.169398907103825, + "grad_norm": 0.00012988239177502692, + "learning_rate": 3.239222829386764e-05, + "loss": 0.0, + "step": 3815 + }, + { + "epoch": 4.174863387978142, + "grad_norm": 0.004161364398896694, + "learning_rate": 3.2361870066788105e-05, + "loss": 0.0, + "step": 3820 + }, + { + "epoch": 4.180327868852459, + "grad_norm": 0.0001383601047564298, + "learning_rate": 3.233151183970856e-05, + "loss": 0.0, + "step": 3825 + }, + { + "epoch": 4.185792349726776, + "grad_norm": 8.963741856859997e-05, + "learning_rate": 3.230115361262903e-05, + "loss": 0.0, + "step": 3830 + }, + { + "epoch": 4.191256830601093, + "grad_norm": 8.04738374426961e-05, + "learning_rate": 3.2270795385549485e-05, + "loss": 0.0, + "step": 3835 + }, + { + "epoch": 4.19672131147541, + "grad_norm": 0.00012023324961774051, + "learning_rate": 3.224043715846995e-05, + "loss": 0.0, + "step": 3840 + }, + { + "epoch": 4.202185792349727, + "grad_norm": 8.154928946169093e-05, + "learning_rate": 3.221007893139041e-05, + "loss": 0.0, + "step": 3845 + }, + { + "epoch": 4.2076502732240435, + "grad_norm": 9.04480621102266e-05, + "learning_rate": 3.2179720704310865e-05, + "loss": 0.0, + "step": 3850 + }, + { + "epoch": 4.213114754098361, + "grad_norm": 0.0001085352196241729, + "learning_rate": 3.214936247723133e-05, + "loss": 0.0, + "step": 3855 + }, + { + "epoch": 4.218579234972678, + "grad_norm": 8.086708112386987e-05, + "learning_rate": 3.2119004250151794e-05, + "loss": 0.0, + "step": 3860 + }, + { + "epoch": 4.224043715846994, + "grad_norm": 0.00012486812192946672, + "learning_rate": 3.208864602307226e-05, + "loss": 0.0, + "step": 3865 + }, + { + "epoch": 4.229508196721311, + "grad_norm": 5.529926420422271e-05, + "learning_rate": 3.2058287795992716e-05, + "loss": 0.0, + "step": 3870 + }, + { + "epoch": 4.2349726775956285, + "grad_norm": 0.001104644499719143, + "learning_rate": 3.202792956891318e-05, + "loss": 0.0, + "step": 3875 + }, + { + "epoch": 4.240437158469946, + "grad_norm": 8.401143713854253e-05, + "learning_rate": 3.199757134183364e-05, + "loss": 0.0, + "step": 3880 + }, + { + "epoch": 4.245901639344262, + "grad_norm": 7.888235995778814e-05, + "learning_rate": 3.19672131147541e-05, + "loss": 0.0, + "step": 3885 + }, + { + "epoch": 4.251366120218579, + "grad_norm": 0.0001080270521924831, + "learning_rate": 3.193685488767456e-05, + "loss": 0.0, + "step": 3890 + }, + { + "epoch": 4.256830601092896, + "grad_norm": 5.707483796868473e-05, + "learning_rate": 3.1906496660595025e-05, + "loss": 0.0, + "step": 3895 + }, + { + "epoch": 4.262295081967213, + "grad_norm": 7.05120837665163e-05, + "learning_rate": 3.187613843351548e-05, + "loss": 0.0, + "step": 3900 + }, + { + "epoch": 4.26775956284153, + "grad_norm": 6.852354272268713e-05, + "learning_rate": 3.184578020643594e-05, + "loss": 0.0, + "step": 3905 + }, + { + "epoch": 4.273224043715847, + "grad_norm": 0.0026433016173541546, + "learning_rate": 3.1815421979356405e-05, + "loss": 0.0, + "step": 3910 + }, + { + "epoch": 4.278688524590164, + "grad_norm": 8.075160440057516e-05, + "learning_rate": 3.178506375227687e-05, + "loss": 0.0, + "step": 3915 + }, + { + "epoch": 4.284153005464481, + "grad_norm": 8.570487989345565e-05, + "learning_rate": 3.1754705525197334e-05, + "loss": 0.0, + "step": 3920 + }, + { + "epoch": 4.2896174863387975, + "grad_norm": 0.0025646265130490065, + "learning_rate": 3.172434729811779e-05, + "loss": 0.0, + "step": 3925 + }, + { + "epoch": 4.295081967213115, + "grad_norm": 6.75572082400322e-05, + "learning_rate": 3.1693989071038256e-05, + "loss": 0.0, + "step": 3930 + }, + { + "epoch": 4.300546448087432, + "grad_norm": 7.079667557263747e-05, + "learning_rate": 3.1663630843958714e-05, + "loss": 0.0, + "step": 3935 + }, + { + "epoch": 4.306010928961749, + "grad_norm": 4.038075712742284e-05, + "learning_rate": 3.163327261687918e-05, + "loss": 0.0, + "step": 3940 + }, + { + "epoch": 4.311475409836065, + "grad_norm": 5.5980359320528805e-05, + "learning_rate": 3.1602914389799636e-05, + "loss": 0.0, + "step": 3945 + }, + { + "epoch": 4.316939890710382, + "grad_norm": 0.00010364054469391704, + "learning_rate": 3.1572556162720094e-05, + "loss": 0.0, + "step": 3950 + }, + { + "epoch": 4.3224043715847, + "grad_norm": 5.6818211305653676e-05, + "learning_rate": 3.154219793564056e-05, + "loss": 0.0, + "step": 3955 + }, + { + "epoch": 4.327868852459017, + "grad_norm": 9.114974091062322e-05, + "learning_rate": 3.1511839708561016e-05, + "loss": 0.0, + "step": 3960 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 7.907680992502719e-05, + "learning_rate": 3.148148148148148e-05, + "loss": 0.0, + "step": 3965 + }, + { + "epoch": 4.33879781420765, + "grad_norm": 7.477829058188945e-05, + "learning_rate": 3.1451123254401945e-05, + "loss": 0.0, + "step": 3970 + }, + { + "epoch": 4.344262295081967, + "grad_norm": 0.0014824882382526994, + "learning_rate": 3.142076502732241e-05, + "loss": 0.0, + "step": 3975 + }, + { + "epoch": 4.3497267759562845, + "grad_norm": 9.136074368143454e-05, + "learning_rate": 3.139040680024287e-05, + "loss": 0.0, + "step": 3980 + }, + { + "epoch": 4.355191256830601, + "grad_norm": 0.00038130677421577275, + "learning_rate": 3.136004857316333e-05, + "loss": 0.0, + "step": 3985 + }, + { + "epoch": 4.360655737704918, + "grad_norm": 7.651744090253487e-05, + "learning_rate": 3.132969034608379e-05, + "loss": 0.0, + "step": 3990 + }, + { + "epoch": 4.366120218579235, + "grad_norm": 7.331320375669748e-05, + "learning_rate": 3.1299332119004254e-05, + "loss": 0.0, + "step": 3995 + }, + { + "epoch": 4.371584699453552, + "grad_norm": 9.19502999749966e-05, + "learning_rate": 3.126897389192471e-05, + "loss": 0.0, + "step": 4000 + }, + { + "epoch": 4.377049180327869, + "grad_norm": 7.570043089799583e-05, + "learning_rate": 3.123861566484517e-05, + "loss": 0.0, + "step": 4005 + }, + { + "epoch": 4.382513661202186, + "grad_norm": 7.069893035804853e-05, + "learning_rate": 3.1208257437765634e-05, + "loss": 0.0, + "step": 4010 + }, + { + "epoch": 4.387978142076503, + "grad_norm": 5.1304690714459866e-05, + "learning_rate": 3.11778992106861e-05, + "loss": 0.0, + "step": 4015 + }, + { + "epoch": 4.39344262295082, + "grad_norm": 8.647384674986824e-05, + "learning_rate": 3.114754098360656e-05, + "loss": 0.0, + "step": 4020 + }, + { + "epoch": 4.398907103825136, + "grad_norm": 7.207443559309468e-05, + "learning_rate": 3.111718275652702e-05, + "loss": 0.0, + "step": 4025 + }, + { + "epoch": 4.404371584699454, + "grad_norm": 0.00024384768039453775, + "learning_rate": 3.1086824529447486e-05, + "loss": 0.0, + "step": 4030 + }, + { + "epoch": 4.409836065573771, + "grad_norm": 6.280928937485442e-05, + "learning_rate": 3.105646630236794e-05, + "loss": 0.0, + "step": 4035 + }, + { + "epoch": 4.415300546448087, + "grad_norm": 5.470717223943211e-05, + "learning_rate": 3.102610807528841e-05, + "loss": 0.0, + "step": 4040 + }, + { + "epoch": 4.420765027322404, + "grad_norm": 4.7346966312034056e-05, + "learning_rate": 3.0995749848208866e-05, + "loss": 0.0, + "step": 4045 + }, + { + "epoch": 4.426229508196721, + "grad_norm": 0.00017950611072592437, + "learning_rate": 3.096539162112932e-05, + "loss": 0.0, + "step": 4050 + }, + { + "epoch": 4.4316939890710385, + "grad_norm": 5.147058254806325e-05, + "learning_rate": 3.093503339404979e-05, + "loss": 0.0, + "step": 4055 + }, + { + "epoch": 4.437158469945355, + "grad_norm": 7.883601210778579e-05, + "learning_rate": 3.0904675166970246e-05, + "loss": 0.0, + "step": 4060 + }, + { + "epoch": 4.442622950819672, + "grad_norm": 6.258031498873606e-05, + "learning_rate": 3.087431693989071e-05, + "loss": 0.0, + "step": 4065 + }, + { + "epoch": 4.448087431693989, + "grad_norm": 0.0001280458818655461, + "learning_rate": 3.0843958712811175e-05, + "loss": 0.0, + "step": 4070 + }, + { + "epoch": 4.453551912568306, + "grad_norm": 5.015691931475885e-05, + "learning_rate": 3.081360048573164e-05, + "loss": 0.0, + "step": 4075 + }, + { + "epoch": 4.459016393442623, + "grad_norm": 5.9115078329341486e-05, + "learning_rate": 3.07832422586521e-05, + "loss": 0.0, + "step": 4080 + }, + { + "epoch": 4.46448087431694, + "grad_norm": 6.528561789309606e-05, + "learning_rate": 3.075288403157256e-05, + "loss": 0.001, + "step": 4085 + }, + { + "epoch": 4.469945355191257, + "grad_norm": 0.00010480813944013789, + "learning_rate": 3.072252580449302e-05, + "loss": 0.0, + "step": 4090 + }, + { + "epoch": 4.475409836065574, + "grad_norm": 3.688339711516164e-05, + "learning_rate": 3.0692167577413484e-05, + "loss": 0.0, + "step": 4095 + }, + { + "epoch": 4.48087431693989, + "grad_norm": 0.0001360525202471763, + "learning_rate": 3.066180935033394e-05, + "loss": 0.0, + "step": 4100 + }, + { + "epoch": 4.4863387978142075, + "grad_norm": 3.33041389239952e-05, + "learning_rate": 3.06314511232544e-05, + "loss": 0.0, + "step": 4105 + }, + { + "epoch": 4.491803278688525, + "grad_norm": 3.9832579204812646e-05, + "learning_rate": 3.0601092896174864e-05, + "loss": 0.0, + "step": 4110 + }, + { + "epoch": 4.497267759562842, + "grad_norm": 2.829811637639068e-05, + "learning_rate": 3.057073466909532e-05, + "loss": 0.0001, + "step": 4115 + }, + { + "epoch": 4.502732240437158, + "grad_norm": 0.00016292250074911863, + "learning_rate": 3.0540376442015786e-05, + "loss": 0.0, + "step": 4120 + }, + { + "epoch": 4.508196721311475, + "grad_norm": 2.8942140488652512e-05, + "learning_rate": 3.0510018214936247e-05, + "loss": 0.0, + "step": 4125 + }, + { + "epoch": 4.5136612021857925, + "grad_norm": 5.0521703087724745e-05, + "learning_rate": 3.047965998785671e-05, + "loss": 0.0, + "step": 4130 + }, + { + "epoch": 4.51912568306011, + "grad_norm": 2.7378995582694188e-05, + "learning_rate": 3.0449301760777173e-05, + "loss": 0.0, + "step": 4135 + }, + { + "epoch": 4.524590163934426, + "grad_norm": 2.603435677883681e-05, + "learning_rate": 3.0418943533697637e-05, + "loss": 0.0, + "step": 4140 + }, + { + "epoch": 4.530054644808743, + "grad_norm": 2.8875801945105195e-05, + "learning_rate": 3.0388585306618095e-05, + "loss": 0.0, + "step": 4145 + }, + { + "epoch": 4.53551912568306, + "grad_norm": 2.562619192758575e-05, + "learning_rate": 3.0358227079538553e-05, + "loss": 0.0, + "step": 4150 + }, + { + "epoch": 4.540983606557377, + "grad_norm": 3.5220946301706135e-05, + "learning_rate": 3.0327868852459017e-05, + "loss": 0.0003, + "step": 4155 + }, + { + "epoch": 4.546448087431694, + "grad_norm": 2.3912976757856086e-05, + "learning_rate": 3.029751062537948e-05, + "loss": 0.0001, + "step": 4160 + }, + { + "epoch": 4.551912568306011, + "grad_norm": 2.4303217287524603e-05, + "learning_rate": 3.0267152398299943e-05, + "loss": 0.0, + "step": 4165 + }, + { + "epoch": 4.557377049180328, + "grad_norm": 2.267379932163749e-05, + "learning_rate": 3.02367941712204e-05, + "loss": 0.0, + "step": 4170 + }, + { + "epoch": 4.562841530054644, + "grad_norm": 2.6442226953804493e-05, + "learning_rate": 3.0206435944140865e-05, + "loss": 0.0, + "step": 4175 + }, + { + "epoch": 4.5683060109289615, + "grad_norm": 2.2860698663862422e-05, + "learning_rate": 3.0176077717061323e-05, + "loss": 0.0, + "step": 4180 + }, + { + "epoch": 4.573770491803279, + "grad_norm": 2.834216684277635e-05, + "learning_rate": 3.0145719489981787e-05, + "loss": 0.0, + "step": 4185 + }, + { + "epoch": 4.579234972677596, + "grad_norm": 2.5860501409624703e-05, + "learning_rate": 3.011536126290225e-05, + "loss": 0.0, + "step": 4190 + }, + { + "epoch": 4.584699453551913, + "grad_norm": 2.1242882212391123e-05, + "learning_rate": 3.0085003035822713e-05, + "loss": 0.0, + "step": 4195 + }, + { + "epoch": 4.590163934426229, + "grad_norm": 2.523986404412426e-05, + "learning_rate": 3.005464480874317e-05, + "loss": 0.0, + "step": 4200 + }, + { + "epoch": 4.595628415300546, + "grad_norm": 2.213427251263056e-05, + "learning_rate": 3.002428658166363e-05, + "loss": 0.0, + "step": 4205 + }, + { + "epoch": 4.601092896174864, + "grad_norm": 2.5897697923937812e-05, + "learning_rate": 2.9993928354584093e-05, + "loss": 0.0, + "step": 4210 + }, + { + "epoch": 4.60655737704918, + "grad_norm": 0.0009925027843564749, + "learning_rate": 2.9963570127504554e-05, + "loss": 0.0, + "step": 4215 + }, + { + "epoch": 4.612021857923497, + "grad_norm": 2.2528745830641128e-05, + "learning_rate": 2.993321190042502e-05, + "loss": 0.0, + "step": 4220 + }, + { + "epoch": 4.617486338797814, + "grad_norm": 2.1108824512339197e-05, + "learning_rate": 2.9902853673345476e-05, + "loss": 0.0, + "step": 4225 + }, + { + "epoch": 4.622950819672131, + "grad_norm": 2.6829531634575687e-05, + "learning_rate": 2.987249544626594e-05, + "loss": 0.0, + "step": 4230 + }, + { + "epoch": 4.628415300546449, + "grad_norm": 2.457356640661601e-05, + "learning_rate": 2.98421372191864e-05, + "loss": 0.0, + "step": 4235 + }, + { + "epoch": 4.633879781420765, + "grad_norm": 0.0010403507621958852, + "learning_rate": 2.9811778992106863e-05, + "loss": 0.0, + "step": 4240 + }, + { + "epoch": 4.639344262295082, + "grad_norm": 2.311925527465064e-05, + "learning_rate": 2.9781420765027324e-05, + "loss": 0.0, + "step": 4245 + }, + { + "epoch": 4.644808743169399, + "grad_norm": 0.00011339302727719769, + "learning_rate": 2.9751062537947782e-05, + "loss": 0.0, + "step": 4250 + }, + { + "epoch": 4.6502732240437155, + "grad_norm": 2.0847737687290646e-05, + "learning_rate": 2.9720704310868247e-05, + "loss": 0.0, + "step": 4255 + }, + { + "epoch": 4.655737704918033, + "grad_norm": 4.812885163119063e-05, + "learning_rate": 2.9690346083788704e-05, + "loss": 0.0, + "step": 4260 + }, + { + "epoch": 4.66120218579235, + "grad_norm": 2.5263596398872323e-05, + "learning_rate": 2.965998785670917e-05, + "loss": 0.0, + "step": 4265 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 2.3823860828997567e-05, + "learning_rate": 2.962962962962963e-05, + "loss": 0.0, + "step": 4270 + }, + { + "epoch": 4.672131147540983, + "grad_norm": 2.5453253329033032e-05, + "learning_rate": 2.9599271402550094e-05, + "loss": 0.0, + "step": 4275 + }, + { + "epoch": 4.6775956284153, + "grad_norm": 3.0383727789740078e-05, + "learning_rate": 2.9568913175470552e-05, + "loss": 0.0, + "step": 4280 + }, + { + "epoch": 4.683060109289618, + "grad_norm": 2.0236289856256917e-05, + "learning_rate": 2.9538554948391017e-05, + "loss": 0.0, + "step": 4285 + }, + { + "epoch": 4.688524590163935, + "grad_norm": 4.196896406938322e-05, + "learning_rate": 2.9508196721311478e-05, + "loss": 0.0, + "step": 4290 + }, + { + "epoch": 4.693989071038251, + "grad_norm": 2.054395736195147e-05, + "learning_rate": 2.9477838494231942e-05, + "loss": 0.0, + "step": 4295 + }, + { + "epoch": 4.699453551912568, + "grad_norm": 2.2467031158157624e-05, + "learning_rate": 2.94474802671524e-05, + "loss": 0.0, + "step": 4300 + }, + { + "epoch": 4.704918032786885, + "grad_norm": 1.898876143968664e-05, + "learning_rate": 2.9417122040072858e-05, + "loss": 0.0, + "step": 4305 + }, + { + "epoch": 4.7103825136612025, + "grad_norm": 2.9287553843460046e-05, + "learning_rate": 2.9386763812993322e-05, + "loss": 0.0, + "step": 4310 + }, + { + "epoch": 4.715846994535519, + "grad_norm": 2.107855470967479e-05, + "learning_rate": 2.9356405585913783e-05, + "loss": 0.0, + "step": 4315 + }, + { + "epoch": 4.721311475409836, + "grad_norm": 7.70809201640077e-05, + "learning_rate": 2.9326047358834248e-05, + "loss": 0.0, + "step": 4320 + }, + { + "epoch": 4.726775956284153, + "grad_norm": 1.9799354049609974e-05, + "learning_rate": 2.9295689131754706e-05, + "loss": 0.0, + "step": 4325 + }, + { + "epoch": 4.73224043715847, + "grad_norm": 2.4164644855773076e-05, + "learning_rate": 2.926533090467517e-05, + "loss": 0.0, + "step": 4330 + }, + { + "epoch": 4.737704918032787, + "grad_norm": 2.102001235471107e-05, + "learning_rate": 2.9234972677595628e-05, + "loss": 0.0, + "step": 4335 + }, + { + "epoch": 4.743169398907104, + "grad_norm": 0.00021176210429985076, + "learning_rate": 2.9204614450516093e-05, + "loss": 0.0, + "step": 4340 + }, + { + "epoch": 4.748633879781421, + "grad_norm": 0.0021099280565977097, + "learning_rate": 2.9174256223436554e-05, + "loss": 0.0, + "step": 4345 + }, + { + "epoch": 4.754098360655737, + "grad_norm": 3.413944796193391e-05, + "learning_rate": 2.9143897996357018e-05, + "loss": 0.0, + "step": 4350 + }, + { + "epoch": 4.759562841530054, + "grad_norm": 5.8268306020181626e-05, + "learning_rate": 2.9113539769277476e-05, + "loss": 0.0, + "step": 4355 + }, + { + "epoch": 4.7650273224043715, + "grad_norm": 2.4852219212334603e-05, + "learning_rate": 2.9083181542197934e-05, + "loss": 0.0, + "step": 4360 + }, + { + "epoch": 4.770491803278689, + "grad_norm": 2.554810453148093e-05, + "learning_rate": 2.9052823315118398e-05, + "loss": 0.0, + "step": 4365 + }, + { + "epoch": 4.775956284153006, + "grad_norm": 1.9379966033739038e-05, + "learning_rate": 2.902246508803886e-05, + "loss": 0.0, + "step": 4370 + }, + { + "epoch": 4.781420765027322, + "grad_norm": 2.287175811943598e-05, + "learning_rate": 2.8992106860959324e-05, + "loss": 0.0, + "step": 4375 + }, + { + "epoch": 4.786885245901639, + "grad_norm": 1.9639024685602635e-05, + "learning_rate": 2.896174863387978e-05, + "loss": 0.0, + "step": 4380 + }, + { + "epoch": 4.7923497267759565, + "grad_norm": 2.56021576205967e-05, + "learning_rate": 2.8931390406800246e-05, + "loss": 0.0, + "step": 4385 + }, + { + "epoch": 4.797814207650273, + "grad_norm": 3.311937689431943e-05, + "learning_rate": 2.8901032179720704e-05, + "loss": 0.0, + "step": 4390 + }, + { + "epoch": 4.80327868852459, + "grad_norm": 0.00010998953075613827, + "learning_rate": 2.8870673952641168e-05, + "loss": 0.0, + "step": 4395 + }, + { + "epoch": 4.808743169398907, + "grad_norm": 2.0269997548894025e-05, + "learning_rate": 2.884031572556163e-05, + "loss": 0.0, + "step": 4400 + }, + { + "epoch": 4.814207650273224, + "grad_norm": 2.814541949192062e-05, + "learning_rate": 2.8809957498482087e-05, + "loss": 0.0, + "step": 4405 + }, + { + "epoch": 4.8196721311475414, + "grad_norm": 2.220185888290871e-05, + "learning_rate": 2.877959927140255e-05, + "loss": 0.0, + "step": 4410 + }, + { + "epoch": 4.825136612021858, + "grad_norm": 2.1637810277752578e-05, + "learning_rate": 2.874924104432301e-05, + "loss": 0.0, + "step": 4415 + }, + { + "epoch": 4.830601092896175, + "grad_norm": 2.2878619347466156e-05, + "learning_rate": 2.8718882817243474e-05, + "loss": 0.0, + "step": 4420 + }, + { + "epoch": 4.836065573770492, + "grad_norm": 2.0245261111995205e-05, + "learning_rate": 2.8688524590163935e-05, + "loss": 0.0, + "step": 4425 + }, + { + "epoch": 4.841530054644808, + "grad_norm": 2.429804953862913e-05, + "learning_rate": 2.86581663630844e-05, + "loss": 0.0, + "step": 4430 + }, + { + "epoch": 4.8469945355191255, + "grad_norm": 0.007154061924666166, + "learning_rate": 2.8627808136004857e-05, + "loss": 0.2029, + "step": 4435 + }, + { + "epoch": 4.852459016393443, + "grad_norm": 4.056124816997908e-05, + "learning_rate": 2.8597449908925322e-05, + "loss": 0.001, + "step": 4440 + }, + { + "epoch": 4.85792349726776, + "grad_norm": 3.3877342502819374e-05, + "learning_rate": 2.856709168184578e-05, + "loss": 0.0001, + "step": 4445 + }, + { + "epoch": 4.863387978142076, + "grad_norm": 0.1083371564745903, + "learning_rate": 2.8536733454766244e-05, + "loss": 0.0005, + "step": 4450 + }, + { + "epoch": 4.868852459016393, + "grad_norm": 4.0910981624620035e-05, + "learning_rate": 2.8506375227686705e-05, + "loss": 0.0, + "step": 4455 + }, + { + "epoch": 4.8743169398907105, + "grad_norm": 5.547309774556197e-05, + "learning_rate": 2.8476017000607163e-05, + "loss": 0.0885, + "step": 4460 + }, + { + "epoch": 4.879781420765028, + "grad_norm": 0.002973817056044936, + "learning_rate": 2.8445658773527627e-05, + "loss": 0.0, + "step": 4465 + }, + { + "epoch": 4.885245901639344, + "grad_norm": 7.371945685008541e-05, + "learning_rate": 2.841530054644809e-05, + "loss": 0.0, + "step": 4470 + }, + { + "epoch": 4.890710382513661, + "grad_norm": 0.00017757054592948407, + "learning_rate": 2.8384942319368553e-05, + "loss": 0.0005, + "step": 4475 + }, + { + "epoch": 4.896174863387978, + "grad_norm": 3.507310611894354e-05, + "learning_rate": 2.835458409228901e-05, + "loss": 0.0, + "step": 4480 + }, + { + "epoch": 4.901639344262295, + "grad_norm": 0.00014538239338435233, + "learning_rate": 2.8324225865209475e-05, + "loss": 0.1127, + "step": 4485 + }, + { + "epoch": 4.907103825136612, + "grad_norm": 7.141881360439584e-05, + "learning_rate": 2.8293867638129933e-05, + "loss": 0.0005, + "step": 4490 + }, + { + "epoch": 4.912568306010929, + "grad_norm": 5.7522782299201936e-05, + "learning_rate": 2.8263509411050398e-05, + "loss": 0.0, + "step": 4495 + }, + { + "epoch": 4.918032786885246, + "grad_norm": 0.0008930726326070726, + "learning_rate": 2.823315118397086e-05, + "loss": 0.0, + "step": 4500 + }, + { + "epoch": 4.923497267759563, + "grad_norm": 0.0020042534451931715, + "learning_rate": 2.8202792956891317e-05, + "loss": 0.0, + "step": 4505 + }, + { + "epoch": 4.9289617486338795, + "grad_norm": 5.080427217762917e-05, + "learning_rate": 2.817243472981178e-05, + "loss": 0.0002, + "step": 4510 + }, + { + "epoch": 4.934426229508197, + "grad_norm": 5.520815830095671e-05, + "learning_rate": 2.814207650273224e-05, + "loss": 0.0, + "step": 4515 + }, + { + "epoch": 4.939890710382514, + "grad_norm": 3.327699232613668e-05, + "learning_rate": 2.8111718275652703e-05, + "loss": 0.0, + "step": 4520 + }, + { + "epoch": 4.945355191256831, + "grad_norm": 3.853649468510412e-05, + "learning_rate": 2.8081360048573164e-05, + "loss": 0.1111, + "step": 4525 + }, + { + "epoch": 4.950819672131147, + "grad_norm": 4.388443994685076e-05, + "learning_rate": 2.805100182149363e-05, + "loss": 0.1132, + "step": 4530 + }, + { + "epoch": 4.956284153005464, + "grad_norm": 0.00019437754235696048, + "learning_rate": 2.8020643594414087e-05, + "loss": 0.0, + "step": 4535 + }, + { + "epoch": 4.961748633879782, + "grad_norm": 0.00022693238861393183, + "learning_rate": 2.799028536733455e-05, + "loss": 0.0102, + "step": 4540 + }, + { + "epoch": 4.967213114754099, + "grad_norm": 0.00043771814671345055, + "learning_rate": 2.795992714025501e-05, + "loss": 0.0007, + "step": 4545 + }, + { + "epoch": 4.972677595628415, + "grad_norm": 0.0002889480092562735, + "learning_rate": 2.7929568913175473e-05, + "loss": 0.0, + "step": 4550 + }, + { + "epoch": 4.978142076502732, + "grad_norm": 7.144697883632034e-05, + "learning_rate": 2.7899210686095935e-05, + "loss": 0.0, + "step": 4555 + }, + { + "epoch": 4.983606557377049, + "grad_norm": 0.0003368402540218085, + "learning_rate": 2.7868852459016392e-05, + "loss": 0.0, + "step": 4560 + }, + { + "epoch": 4.989071038251366, + "grad_norm": 0.0009206884424202144, + "learning_rate": 2.7838494231936857e-05, + "loss": 0.0, + "step": 4565 + }, + { + "epoch": 4.994535519125683, + "grad_norm": 5.6415323342662305e-05, + "learning_rate": 2.7808136004857315e-05, + "loss": 0.0116, + "step": 4570 + }, + { + "epoch": 5.0, + "grad_norm": 7.074388850014657e-05, + "learning_rate": 2.777777777777778e-05, + "loss": 0.0, + "step": 4575 + }, + { + "epoch": 5.0, + "eval_loss": 0.0017390144057571888, + "eval_runtime": 676.778, + "eval_samples_per_second": 10.81, + "eval_steps_per_second": 1.352, + "step": 4575 + }, + { + "epoch": 5.005464480874317, + "grad_norm": 0.00010769537038868293, + "learning_rate": 2.774741955069824e-05, + "loss": 0.0, + "step": 4580 + }, + { + "epoch": 5.0109289617486334, + "grad_norm": 0.00015485276526305825, + "learning_rate": 2.7717061323618705e-05, + "loss": 0.0, + "step": 4585 + }, + { + "epoch": 5.016393442622951, + "grad_norm": 0.0003547978412825614, + "learning_rate": 2.7686703096539162e-05, + "loss": 0.0, + "step": 4590 + }, + { + "epoch": 5.021857923497268, + "grad_norm": 0.0005272823618724942, + "learning_rate": 2.7656344869459627e-05, + "loss": 0.0, + "step": 4595 + }, + { + "epoch": 5.027322404371585, + "grad_norm": 0.0015367731684818864, + "learning_rate": 2.7625986642380085e-05, + "loss": 0.0, + "step": 4600 + }, + { + "epoch": 5.032786885245901, + "grad_norm": 0.00010357119754189625, + "learning_rate": 2.7595628415300546e-05, + "loss": 0.0, + "step": 4605 + }, + { + "epoch": 5.038251366120218, + "grad_norm": 5.5412652727682143e-05, + "learning_rate": 2.756527018822101e-05, + "loss": 0.0001, + "step": 4610 + }, + { + "epoch": 5.043715846994536, + "grad_norm": 0.0001249344350071624, + "learning_rate": 2.7534911961141468e-05, + "loss": 0.0, + "step": 4615 + }, + { + "epoch": 5.049180327868853, + "grad_norm": 0.0027478511910885572, + "learning_rate": 2.7504553734061933e-05, + "loss": 0.0, + "step": 4620 + }, + { + "epoch": 5.054644808743169, + "grad_norm": 0.0025896786246448755, + "learning_rate": 2.747419550698239e-05, + "loss": 0.073, + "step": 4625 + }, + { + "epoch": 5.060109289617486, + "grad_norm": 0.00015381591219920665, + "learning_rate": 2.7443837279902855e-05, + "loss": 0.0, + "step": 4630 + }, + { + "epoch": 5.065573770491803, + "grad_norm": 0.00017391947039868683, + "learning_rate": 2.7413479052823316e-05, + "loss": 0.0, + "step": 4635 + }, + { + "epoch": 5.0710382513661205, + "grad_norm": 0.00014396451297216117, + "learning_rate": 2.738312082574378e-05, + "loss": 0.0, + "step": 4640 + }, + { + "epoch": 5.076502732240437, + "grad_norm": 0.00019471778068691492, + "learning_rate": 2.7352762598664238e-05, + "loss": 0.0, + "step": 4645 + }, + { + "epoch": 5.081967213114754, + "grad_norm": 0.00017244904302060604, + "learning_rate": 2.7322404371584703e-05, + "loss": 0.0001, + "step": 4650 + }, + { + "epoch": 5.087431693989071, + "grad_norm": 0.00013770755322184414, + "learning_rate": 2.7292046144505164e-05, + "loss": 0.0, + "step": 4655 + }, + { + "epoch": 5.092896174863388, + "grad_norm": 0.0009572531562298536, + "learning_rate": 2.726168791742562e-05, + "loss": 0.0, + "step": 4660 + }, + { + "epoch": 5.098360655737705, + "grad_norm": 0.005346562713384628, + "learning_rate": 2.7231329690346086e-05, + "loss": 0.0, + "step": 4665 + }, + { + "epoch": 5.103825136612022, + "grad_norm": 0.00018971337703987956, + "learning_rate": 2.7200971463266544e-05, + "loss": 0.0, + "step": 4670 + }, + { + "epoch": 5.109289617486339, + "grad_norm": 0.00018444436136633158, + "learning_rate": 2.717061323618701e-05, + "loss": 0.0, + "step": 4675 + }, + { + "epoch": 5.114754098360656, + "grad_norm": 0.00036197478766553104, + "learning_rate": 2.714025500910747e-05, + "loss": 0.0961, + "step": 4680 + }, + { + "epoch": 5.120218579234972, + "grad_norm": 0.00030608210363425314, + "learning_rate": 2.7109896782027934e-05, + "loss": 0.0, + "step": 4685 + }, + { + "epoch": 5.1256830601092895, + "grad_norm": 0.010481505654752254, + "learning_rate": 2.7079538554948392e-05, + "loss": 0.0, + "step": 4690 + }, + { + "epoch": 5.131147540983607, + "grad_norm": 0.0004171307082287967, + "learning_rate": 2.7049180327868856e-05, + "loss": 0.0001, + "step": 4695 + }, + { + "epoch": 5.136612021857924, + "grad_norm": 0.00030121684540063143, + "learning_rate": 2.7018822100789314e-05, + "loss": 0.0, + "step": 4700 + }, + { + "epoch": 5.14207650273224, + "grad_norm": 8.655495184939355e-05, + "learning_rate": 2.6988463873709775e-05, + "loss": 0.0, + "step": 4705 + }, + { + "epoch": 5.147540983606557, + "grad_norm": 0.00010451547859702259, + "learning_rate": 2.695810564663024e-05, + "loss": 0.0, + "step": 4710 + }, + { + "epoch": 5.1530054644808745, + "grad_norm": 0.013965661637485027, + "learning_rate": 2.6927747419550697e-05, + "loss": 0.0001, + "step": 4715 + }, + { + "epoch": 5.158469945355192, + "grad_norm": 0.00010520996147533879, + "learning_rate": 2.6897389192471162e-05, + "loss": 0.0, + "step": 4720 + }, + { + "epoch": 5.163934426229508, + "grad_norm": 9.894570393953472e-05, + "learning_rate": 2.686703096539162e-05, + "loss": 0.0, + "step": 4725 + }, + { + "epoch": 5.169398907103825, + "grad_norm": 7.636708323843777e-05, + "learning_rate": 2.6836672738312084e-05, + "loss": 0.0001, + "step": 4730 + }, + { + "epoch": 5.174863387978142, + "grad_norm": 6.734576891176403e-05, + "learning_rate": 2.6806314511232545e-05, + "loss": 0.0, + "step": 4735 + }, + { + "epoch": 5.180327868852459, + "grad_norm": 8.934761717682704e-05, + "learning_rate": 2.677595628415301e-05, + "loss": 0.0001, + "step": 4740 + }, + { + "epoch": 5.185792349726776, + "grad_norm": 0.0001079771900549531, + "learning_rate": 2.6745598057073468e-05, + "loss": 0.0, + "step": 4745 + }, + { + "epoch": 5.191256830601093, + "grad_norm": 8.022497786441818e-05, + "learning_rate": 2.6715239829993932e-05, + "loss": 0.0462, + "step": 4750 + }, + { + "epoch": 5.19672131147541, + "grad_norm": 0.0001275836257264018, + "learning_rate": 2.668488160291439e-05, + "loss": 0.0001, + "step": 4755 + }, + { + "epoch": 5.202185792349727, + "grad_norm": 0.00036970950895920396, + "learning_rate": 2.665452337583485e-05, + "loss": 0.0, + "step": 4760 + }, + { + "epoch": 5.2076502732240435, + "grad_norm": 0.0001368650555377826, + "learning_rate": 2.6624165148755316e-05, + "loss": 0.0, + "step": 4765 + }, + { + "epoch": 5.213114754098361, + "grad_norm": 9.971411782316864e-05, + "learning_rate": 2.6593806921675773e-05, + "loss": 0.0, + "step": 4770 + }, + { + "epoch": 5.218579234972678, + "grad_norm": 7.680917769903317e-05, + "learning_rate": 2.6563448694596238e-05, + "loss": 0.0, + "step": 4775 + }, + { + "epoch": 5.224043715846994, + "grad_norm": 9.601003694115207e-05, + "learning_rate": 2.6533090467516696e-05, + "loss": 0.0, + "step": 4780 + }, + { + "epoch": 5.229508196721311, + "grad_norm": 9.728920849738643e-05, + "learning_rate": 2.650273224043716e-05, + "loss": 0.0, + "step": 4785 + }, + { + "epoch": 5.2349726775956285, + "grad_norm": 8.36860272102058e-05, + "learning_rate": 2.647237401335762e-05, + "loss": 0.0, + "step": 4790 + }, + { + "epoch": 5.240437158469946, + "grad_norm": 0.00011909649037988856, + "learning_rate": 2.6442015786278086e-05, + "loss": 0.0001, + "step": 4795 + }, + { + "epoch": 5.245901639344262, + "grad_norm": 6.814736843807623e-05, + "learning_rate": 2.6411657559198543e-05, + "loss": 0.0, + "step": 4800 + }, + { + "epoch": 5.251366120218579, + "grad_norm": 0.00010630305769154802, + "learning_rate": 2.6381299332119008e-05, + "loss": 0.0001, + "step": 4805 + }, + { + "epoch": 5.256830601092896, + "grad_norm": 0.00012657047773245722, + "learning_rate": 2.6350941105039466e-05, + "loss": 0.0001, + "step": 4810 + }, + { + "epoch": 5.262295081967213, + "grad_norm": 9.68086751527153e-05, + "learning_rate": 2.6320582877959927e-05, + "loss": 0.0, + "step": 4815 + }, + { + "epoch": 5.26775956284153, + "grad_norm": 0.0001090406221919693, + "learning_rate": 2.629022465088039e-05, + "loss": 0.0, + "step": 4820 + }, + { + "epoch": 5.273224043715847, + "grad_norm": 0.00010317601845599711, + "learning_rate": 2.625986642380085e-05, + "loss": 0.0, + "step": 4825 + }, + { + "epoch": 5.278688524590164, + "grad_norm": 6.058696817490272e-05, + "learning_rate": 2.6229508196721314e-05, + "loss": 0.0, + "step": 4830 + }, + { + "epoch": 5.284153005464481, + "grad_norm": 8.128488116199151e-05, + "learning_rate": 2.619914996964177e-05, + "loss": 0.0, + "step": 4835 + }, + { + "epoch": 5.2896174863387975, + "grad_norm": 7.313517562579364e-05, + "learning_rate": 2.6168791742562236e-05, + "loss": 0.0, + "step": 4840 + }, + { + "epoch": 5.295081967213115, + "grad_norm": 8.592737140133977e-05, + "learning_rate": 2.6138433515482697e-05, + "loss": 0.0, + "step": 4845 + }, + { + "epoch": 5.300546448087432, + "grad_norm": 0.00022616136993747205, + "learning_rate": 2.610807528840316e-05, + "loss": 0.0, + "step": 4850 + }, + { + "epoch": 5.306010928961749, + "grad_norm": 0.005083122756332159, + "learning_rate": 2.607771706132362e-05, + "loss": 0.0, + "step": 4855 + }, + { + "epoch": 5.311475409836065, + "grad_norm": 0.00020489096641540527, + "learning_rate": 2.604735883424408e-05, + "loss": 0.0, + "step": 4860 + }, + { + "epoch": 5.316939890710382, + "grad_norm": 0.00010977088822983205, + "learning_rate": 2.6017000607164545e-05, + "loss": 0.0, + "step": 4865 + }, + { + "epoch": 5.3224043715847, + "grad_norm": 7.336941052926704e-05, + "learning_rate": 2.5986642380085003e-05, + "loss": 0.0, + "step": 4870 + }, + { + "epoch": 5.327868852459017, + "grad_norm": 5.920181138208136e-05, + "learning_rate": 2.5956284153005467e-05, + "loss": 0.0, + "step": 4875 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.00023859695647843182, + "learning_rate": 2.5925925925925925e-05, + "loss": 0.0, + "step": 4880 + }, + { + "epoch": 5.33879781420765, + "grad_norm": 7.648219616385177e-05, + "learning_rate": 2.589556769884639e-05, + "loss": 0.0, + "step": 4885 + }, + { + "epoch": 5.344262295081967, + "grad_norm": 5.340289862942882e-05, + "learning_rate": 2.586520947176685e-05, + "loss": 0.0, + "step": 4890 + }, + { + "epoch": 5.3497267759562845, + "grad_norm": 0.00011013679613824934, + "learning_rate": 2.5834851244687315e-05, + "loss": 0.0001, + "step": 4895 + }, + { + "epoch": 5.355191256830601, + "grad_norm": 8.648641960462555e-05, + "learning_rate": 2.5804493017607773e-05, + "loss": 0.0, + "step": 4900 + }, + { + "epoch": 5.360655737704918, + "grad_norm": 0.00014422819367609918, + "learning_rate": 2.5774134790528237e-05, + "loss": 0.0, + "step": 4905 + }, + { + "epoch": 5.366120218579235, + "grad_norm": 9.631054126657546e-05, + "learning_rate": 2.5743776563448695e-05, + "loss": 0.1501, + "step": 4910 + }, + { + "epoch": 5.371584699453552, + "grad_norm": 0.0006303318659774959, + "learning_rate": 2.5713418336369156e-05, + "loss": 0.0002, + "step": 4915 + }, + { + "epoch": 5.377049180327869, + "grad_norm": 0.0002542531001381576, + "learning_rate": 2.568306010928962e-05, + "loss": 0.0005, + "step": 4920 + }, + { + "epoch": 5.382513661202186, + "grad_norm": 0.00015373634232673794, + "learning_rate": 2.565270188221008e-05, + "loss": 0.0006, + "step": 4925 + }, + { + "epoch": 5.387978142076503, + "grad_norm": 0.0001260093122255057, + "learning_rate": 2.5622343655130543e-05, + "loss": 0.0, + "step": 4930 + }, + { + "epoch": 5.39344262295082, + "grad_norm": 0.00012952568067703396, + "learning_rate": 2.5591985428051e-05, + "loss": 0.0002, + "step": 4935 + }, + { + "epoch": 5.398907103825136, + "grad_norm": 0.00015524527407251298, + "learning_rate": 2.5561627200971465e-05, + "loss": 0.0014, + "step": 4940 + }, + { + "epoch": 5.404371584699454, + "grad_norm": 5.933045395067893e-05, + "learning_rate": 2.5531268973891926e-05, + "loss": 0.0, + "step": 4945 + }, + { + "epoch": 5.409836065573771, + "grad_norm": 9.225990652339533e-05, + "learning_rate": 2.550091074681239e-05, + "loss": 0.0, + "step": 4950 + }, + { + "epoch": 5.415300546448087, + "grad_norm": 9.452126687392592e-05, + "learning_rate": 2.547055251973285e-05, + "loss": 0.0, + "step": 4955 + }, + { + "epoch": 5.420765027322404, + "grad_norm": 9.709106961963698e-05, + "learning_rate": 2.5440194292653306e-05, + "loss": 0.0001, + "step": 4960 + }, + { + "epoch": 5.426229508196721, + "grad_norm": 5.982341099297628e-05, + "learning_rate": 2.540983606557377e-05, + "loss": 0.0, + "step": 4965 + }, + { + "epoch": 5.4316939890710385, + "grad_norm": 8.735879964660853e-05, + "learning_rate": 2.5379477838494232e-05, + "loss": 0.0, + "step": 4970 + }, + { + "epoch": 5.437158469945355, + "grad_norm": 9.017515549203381e-05, + "learning_rate": 2.5349119611414697e-05, + "loss": 0.0, + "step": 4975 + }, + { + "epoch": 5.442622950819672, + "grad_norm": 0.00011913449998246506, + "learning_rate": 2.5318761384335154e-05, + "loss": 0.0, + "step": 4980 + }, + { + "epoch": 5.448087431693989, + "grad_norm": 7.266786269610748e-05, + "learning_rate": 2.528840315725562e-05, + "loss": 0.0, + "step": 4985 + }, + { + "epoch": 5.453551912568306, + "grad_norm": 4.6318815293489024e-05, + "learning_rate": 2.5258044930176077e-05, + "loss": 0.0, + "step": 4990 + }, + { + "epoch": 5.459016393442623, + "grad_norm": 0.00011023716069757938, + "learning_rate": 2.522768670309654e-05, + "loss": 0.0, + "step": 4995 + }, + { + "epoch": 5.46448087431694, + "grad_norm": 0.019288262352347374, + "learning_rate": 2.5197328476017002e-05, + "loss": 0.0001, + "step": 5000 + }, + { + "epoch": 5.469945355191257, + "grad_norm": 8.01812275312841e-05, + "learning_rate": 2.5166970248937467e-05, + "loss": 0.0, + "step": 5005 + }, + { + "epoch": 5.475409836065574, + "grad_norm": 5.575180330197327e-05, + "learning_rate": 2.5136612021857924e-05, + "loss": 0.0, + "step": 5010 + }, + { + "epoch": 5.48087431693989, + "grad_norm": 6.652524461969733e-05, + "learning_rate": 2.5106253794778382e-05, + "loss": 0.0, + "step": 5015 + }, + { + "epoch": 5.4863387978142075, + "grad_norm": 5.448004958452657e-05, + "learning_rate": 2.5075895567698847e-05, + "loss": 0.0001, + "step": 5020 + }, + { + "epoch": 5.491803278688525, + "grad_norm": 5.704570867237635e-05, + "learning_rate": 2.5045537340619308e-05, + "loss": 0.0063, + "step": 5025 + }, + { + "epoch": 5.497267759562842, + "grad_norm": 4.593836274580099e-05, + "learning_rate": 2.5015179113539772e-05, + "loss": 0.0002, + "step": 5030 + }, + { + "epoch": 5.502732240437158, + "grad_norm": 3.784838554565795e-05, + "learning_rate": 2.498482088646023e-05, + "loss": 0.0, + "step": 5035 + }, + { + "epoch": 5.508196721311475, + "grad_norm": 0.00014102249406278133, + "learning_rate": 2.495446265938069e-05, + "loss": 0.0, + "step": 5040 + }, + { + "epoch": 5.5136612021857925, + "grad_norm": 0.00031027215300127864, + "learning_rate": 2.4924104432301156e-05, + "loss": 0.0, + "step": 5045 + }, + { + "epoch": 5.51912568306011, + "grad_norm": 3.274340997450054e-05, + "learning_rate": 2.4893746205221617e-05, + "loss": 0.0, + "step": 5050 + }, + { + "epoch": 5.524590163934426, + "grad_norm": 5.2845189202344045e-05, + "learning_rate": 2.4863387978142078e-05, + "loss": 0.0, + "step": 5055 + }, + { + "epoch": 5.530054644808743, + "grad_norm": 25.070024490356445, + "learning_rate": 2.483302975106254e-05, + "loss": 0.1856, + "step": 5060 + }, + { + "epoch": 5.53551912568306, + "grad_norm": 0.0268250722438097, + "learning_rate": 2.4802671523983e-05, + "loss": 0.0, + "step": 5065 + }, + { + "epoch": 5.540983606557377, + "grad_norm": 0.0001650653430260718, + "learning_rate": 2.477231329690346e-05, + "loss": 0.0041, + "step": 5070 + }, + { + "epoch": 5.546448087431694, + "grad_norm": 0.0002502961433492601, + "learning_rate": 2.4741955069823926e-05, + "loss": 0.0, + "step": 5075 + }, + { + "epoch": 5.551912568306011, + "grad_norm": 0.00017075585492420942, + "learning_rate": 2.4711596842744387e-05, + "loss": 0.0001, + "step": 5080 + }, + { + "epoch": 5.557377049180328, + "grad_norm": 7.287297194125131e-05, + "learning_rate": 2.4681238615664845e-05, + "loss": 0.0, + "step": 5085 + }, + { + "epoch": 5.562841530054644, + "grad_norm": 0.00028358641429804265, + "learning_rate": 2.4650880388585306e-05, + "loss": 0.0, + "step": 5090 + }, + { + "epoch": 5.5683060109289615, + "grad_norm": 0.0002642961626406759, + "learning_rate": 2.4620522161505767e-05, + "loss": 0.0, + "step": 5095 + }, + { + "epoch": 5.573770491803279, + "grad_norm": 0.0006491028470918536, + "learning_rate": 2.459016393442623e-05, + "loss": 0.0, + "step": 5100 + }, + { + "epoch": 5.579234972677596, + "grad_norm": 0.00018292821187060326, + "learning_rate": 2.4559805707346693e-05, + "loss": 0.0001, + "step": 5105 + }, + { + "epoch": 5.584699453551913, + "grad_norm": 0.0009338250965811312, + "learning_rate": 2.4529447480267154e-05, + "loss": 0.0001, + "step": 5110 + }, + { + "epoch": 5.590163934426229, + "grad_norm": 0.03507191687822342, + "learning_rate": 2.4499089253187615e-05, + "loss": 0.0001, + "step": 5115 + }, + { + "epoch": 5.595628415300546, + "grad_norm": 8.738631731830537e-05, + "learning_rate": 2.4468731026108076e-05, + "loss": 0.0, + "step": 5120 + }, + { + "epoch": 5.601092896174864, + "grad_norm": 7.589610322611406e-05, + "learning_rate": 2.4438372799028537e-05, + "loss": 0.0, + "step": 5125 + }, + { + "epoch": 5.60655737704918, + "grad_norm": 0.0003264937549829483, + "learning_rate": 2.4408014571949e-05, + "loss": 0.0, + "step": 5130 + }, + { + "epoch": 5.612021857923497, + "grad_norm": 0.00025950855342671275, + "learning_rate": 2.437765634486946e-05, + "loss": 0.0, + "step": 5135 + }, + { + "epoch": 5.617486338797814, + "grad_norm": 0.00013693803339265287, + "learning_rate": 2.434729811778992e-05, + "loss": 0.0001, + "step": 5140 + }, + { + "epoch": 5.622950819672131, + "grad_norm": 0.0001310525694862008, + "learning_rate": 2.431693989071038e-05, + "loss": 0.0, + "step": 5145 + }, + { + "epoch": 5.628415300546449, + "grad_norm": 6.411856156773865e-05, + "learning_rate": 2.4286581663630846e-05, + "loss": 0.0, + "step": 5150 + }, + { + "epoch": 5.633879781420765, + "grad_norm": 0.00021306371490936726, + "learning_rate": 2.4256223436551307e-05, + "loss": 0.0146, + "step": 5155 + }, + { + "epoch": 5.639344262295082, + "grad_norm": 0.00023212283849716187, + "learning_rate": 2.422586520947177e-05, + "loss": 0.0001, + "step": 5160 + }, + { + "epoch": 5.644808743169399, + "grad_norm": 0.0002247579104732722, + "learning_rate": 2.419550698239223e-05, + "loss": 0.0, + "step": 5165 + }, + { + "epoch": 5.6502732240437155, + "grad_norm": 0.0003208070411346853, + "learning_rate": 2.416514875531269e-05, + "loss": 0.0, + "step": 5170 + }, + { + "epoch": 5.655737704918033, + "grad_norm": 0.0003484230546746403, + "learning_rate": 2.4134790528233152e-05, + "loss": 0.0, + "step": 5175 + }, + { + "epoch": 5.66120218579235, + "grad_norm": 0.010849053040146828, + "learning_rate": 2.4104432301153616e-05, + "loss": 0.0001, + "step": 5180 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 0.000284497975371778, + "learning_rate": 2.4074074074074074e-05, + "loss": 0.0831, + "step": 5185 + }, + { + "epoch": 5.672131147540983, + "grad_norm": 0.0001850848348112777, + "learning_rate": 2.4043715846994535e-05, + "loss": 0.2457, + "step": 5190 + }, + { + "epoch": 5.6775956284153, + "grad_norm": 0.0009068201761692762, + "learning_rate": 2.4013357619914996e-05, + "loss": 0.0, + "step": 5195 + }, + { + "epoch": 5.683060109289618, + "grad_norm": 0.0002595623955130577, + "learning_rate": 2.3982999392835457e-05, + "loss": 0.0502, + "step": 5200 + }, + { + "epoch": 5.688524590163935, + "grad_norm": 0.00040304780122824013, + "learning_rate": 2.3952641165755922e-05, + "loss": 0.003, + "step": 5205 + }, + { + "epoch": 5.693989071038251, + "grad_norm": 0.0006999442121013999, + "learning_rate": 2.3922282938676383e-05, + "loss": 0.0, + "step": 5210 + }, + { + "epoch": 5.699453551912568, + "grad_norm": 0.0008923182031139731, + "learning_rate": 2.3891924711596844e-05, + "loss": 0.0001, + "step": 5215 + }, + { + "epoch": 5.704918032786885, + "grad_norm": 0.0028996258042752743, + "learning_rate": 2.3861566484517305e-05, + "loss": 0.0001, + "step": 5220 + }, + { + "epoch": 5.7103825136612025, + "grad_norm": 0.011778953485190868, + "learning_rate": 2.3831208257437767e-05, + "loss": 0.0001, + "step": 5225 + }, + { + "epoch": 5.715846994535519, + "grad_norm": 0.0009088412043638527, + "learning_rate": 2.380085003035823e-05, + "loss": 0.0004, + "step": 5230 + }, + { + "epoch": 5.721311475409836, + "grad_norm": 0.0005468827439472079, + "learning_rate": 2.377049180327869e-05, + "loss": 0.0, + "step": 5235 + }, + { + "epoch": 5.726775956284153, + "grad_norm": 0.0005143904127180576, + "learning_rate": 2.374013357619915e-05, + "loss": 0.0, + "step": 5240 + }, + { + "epoch": 5.73224043715847, + "grad_norm": 0.012319618836045265, + "learning_rate": 2.370977534911961e-05, + "loss": 0.0001, + "step": 5245 + }, + { + "epoch": 5.737704918032787, + "grad_norm": 0.00032935780473053455, + "learning_rate": 2.3679417122040072e-05, + "loss": 0.0, + "step": 5250 + }, + { + "epoch": 5.743169398907104, + "grad_norm": 0.00028610375011339784, + "learning_rate": 2.3649058894960537e-05, + "loss": 0.0001, + "step": 5255 + }, + { + "epoch": 5.748633879781421, + "grad_norm": 0.0014600688591599464, + "learning_rate": 2.3618700667880998e-05, + "loss": 0.0, + "step": 5260 + }, + { + "epoch": 5.754098360655737, + "grad_norm": 0.0003803087747655809, + "learning_rate": 2.358834244080146e-05, + "loss": 0.0, + "step": 5265 + }, + { + "epoch": 5.759562841530054, + "grad_norm": 0.0012569596292451024, + "learning_rate": 2.355798421372192e-05, + "loss": 0.0, + "step": 5270 + }, + { + "epoch": 5.7650273224043715, + "grad_norm": 0.00016804836923256516, + "learning_rate": 2.352762598664238e-05, + "loss": 0.0001, + "step": 5275 + }, + { + "epoch": 5.770491803278689, + "grad_norm": 0.00032320586615242064, + "learning_rate": 2.3497267759562842e-05, + "loss": 0.0, + "step": 5280 + }, + { + "epoch": 5.775956284153006, + "grad_norm": 0.00043845665641129017, + "learning_rate": 2.3466909532483307e-05, + "loss": 0.0, + "step": 5285 + }, + { + "epoch": 5.781420765027322, + "grad_norm": 0.0004221588606014848, + "learning_rate": 2.3436551305403765e-05, + "loss": 0.0, + "step": 5290 + }, + { + "epoch": 5.786885245901639, + "grad_norm": 0.00026679132133722305, + "learning_rate": 2.3406193078324226e-05, + "loss": 0.0, + "step": 5295 + }, + { + "epoch": 5.7923497267759565, + "grad_norm": 0.00012715437333099544, + "learning_rate": 2.3375834851244687e-05, + "loss": 0.0, + "step": 5300 + }, + { + "epoch": 5.797814207650273, + "grad_norm": 0.00016335255349986255, + "learning_rate": 2.3345476624165148e-05, + "loss": 0.0, + "step": 5305 + }, + { + "epoch": 5.80327868852459, + "grad_norm": 0.0005586635088548064, + "learning_rate": 2.3315118397085612e-05, + "loss": 0.0, + "step": 5310 + }, + { + "epoch": 5.808743169398907, + "grad_norm": 0.0017712228000164032, + "learning_rate": 2.3284760170006074e-05, + "loss": 0.0, + "step": 5315 + }, + { + "epoch": 5.814207650273224, + "grad_norm": 0.000200931157451123, + "learning_rate": 2.3254401942926535e-05, + "loss": 0.0, + "step": 5320 + }, + { + "epoch": 5.8196721311475414, + "grad_norm": 0.00016768582281656563, + "learning_rate": 2.3224043715846996e-05, + "loss": 0.0, + "step": 5325 + }, + { + "epoch": 5.825136612021858, + "grad_norm": 0.0001333783147856593, + "learning_rate": 2.3193685488767457e-05, + "loss": 0.0, + "step": 5330 + }, + { + "epoch": 5.830601092896175, + "grad_norm": 0.00022269372129812837, + "learning_rate": 2.316332726168792e-05, + "loss": 0.0, + "step": 5335 + }, + { + "epoch": 5.836065573770492, + "grad_norm": 0.00021091346570756286, + "learning_rate": 2.313296903460838e-05, + "loss": 0.0001, + "step": 5340 + }, + { + "epoch": 5.841530054644808, + "grad_norm": 0.0034973840229213238, + "learning_rate": 2.310261080752884e-05, + "loss": 0.0, + "step": 5345 + }, + { + "epoch": 5.8469945355191255, + "grad_norm": 0.00016680124099366367, + "learning_rate": 2.30722525804493e-05, + "loss": 0.0, + "step": 5350 + }, + { + "epoch": 5.852459016393443, + "grad_norm": 0.00016794119437690824, + "learning_rate": 2.3041894353369763e-05, + "loss": 0.0, + "step": 5355 + }, + { + "epoch": 5.85792349726776, + "grad_norm": 0.0001823761558625847, + "learning_rate": 2.3011536126290227e-05, + "loss": 0.0, + "step": 5360 + }, + { + "epoch": 5.863387978142076, + "grad_norm": 0.00018940315931104124, + "learning_rate": 2.2981177899210688e-05, + "loss": 0.0, + "step": 5365 + }, + { + "epoch": 5.868852459016393, + "grad_norm": 0.0001646766031626612, + "learning_rate": 2.295081967213115e-05, + "loss": 0.0, + "step": 5370 + }, + { + "epoch": 5.8743169398907105, + "grad_norm": 0.00026264862390235066, + "learning_rate": 2.292046144505161e-05, + "loss": 0.0, + "step": 5375 + }, + { + "epoch": 5.879781420765028, + "grad_norm": 0.00020386996038723737, + "learning_rate": 2.289010321797207e-05, + "loss": 0.0, + "step": 5380 + }, + { + "epoch": 5.885245901639344, + "grad_norm": 0.00019918520411010832, + "learning_rate": 2.2859744990892533e-05, + "loss": 0.0, + "step": 5385 + }, + { + "epoch": 5.890710382513661, + "grad_norm": 0.0001257824624190107, + "learning_rate": 2.2829386763812994e-05, + "loss": 0.0, + "step": 5390 + }, + { + "epoch": 5.896174863387978, + "grad_norm": 0.00011864578118547797, + "learning_rate": 2.2799028536733455e-05, + "loss": 0.0, + "step": 5395 + }, + { + "epoch": 5.901639344262295, + "grad_norm": 0.00027177087031304836, + "learning_rate": 2.2768670309653916e-05, + "loss": 0.0, + "step": 5400 + }, + { + "epoch": 5.907103825136612, + "grad_norm": 0.014322335831820965, + "learning_rate": 2.2738312082574377e-05, + "loss": 0.0, + "step": 5405 + }, + { + "epoch": 5.912568306010929, + "grad_norm": 0.00014552676293533295, + "learning_rate": 2.270795385549484e-05, + "loss": 0.0, + "step": 5410 + }, + { + "epoch": 5.918032786885246, + "grad_norm": 0.0001333577383775264, + "learning_rate": 2.2677595628415303e-05, + "loss": 0.0, + "step": 5415 + }, + { + "epoch": 5.923497267759563, + "grad_norm": 9.596958989277482e-05, + "learning_rate": 2.2647237401335764e-05, + "loss": 0.0, + "step": 5420 + }, + { + "epoch": 5.9289617486338795, + "grad_norm": 0.0062270862981677055, + "learning_rate": 2.2616879174256225e-05, + "loss": 0.0, + "step": 5425 + }, + { + "epoch": 5.934426229508197, + "grad_norm": 0.00010670346091501415, + "learning_rate": 2.2586520947176686e-05, + "loss": 0.0, + "step": 5430 + }, + { + "epoch": 5.939890710382514, + "grad_norm": 0.0011700303293764591, + "learning_rate": 2.2556162720097147e-05, + "loss": 0.0, + "step": 5435 + }, + { + "epoch": 5.945355191256831, + "grad_norm": 0.0001294987159781158, + "learning_rate": 2.252580449301761e-05, + "loss": 0.0, + "step": 5440 + }, + { + "epoch": 5.950819672131147, + "grad_norm": 8.402008097618818e-05, + "learning_rate": 2.249544626593807e-05, + "loss": 0.0, + "step": 5445 + }, + { + "epoch": 5.956284153005464, + "grad_norm": 6.773810309823602e-05, + "learning_rate": 2.246508803885853e-05, + "loss": 0.0, + "step": 5450 + }, + { + "epoch": 5.961748633879782, + "grad_norm": 0.00012332104961387813, + "learning_rate": 2.2434729811778992e-05, + "loss": 0.0, + "step": 5455 + }, + { + "epoch": 5.967213114754099, + "grad_norm": 9.364177822135389e-05, + "learning_rate": 2.2404371584699453e-05, + "loss": 0.0, + "step": 5460 + }, + { + "epoch": 5.972677595628415, + "grad_norm": 0.0001126359638874419, + "learning_rate": 2.2374013357619918e-05, + "loss": 0.0, + "step": 5465 + }, + { + "epoch": 5.978142076502732, + "grad_norm": 0.0001698069245321676, + "learning_rate": 2.234365513054038e-05, + "loss": 0.0, + "step": 5470 + }, + { + "epoch": 5.983606557377049, + "grad_norm": 0.00012718628568109125, + "learning_rate": 2.231329690346084e-05, + "loss": 0.0, + "step": 5475 + }, + { + "epoch": 5.989071038251366, + "grad_norm": 0.14746598899364471, + "learning_rate": 2.22829386763813e-05, + "loss": 0.0004, + "step": 5480 + }, + { + "epoch": 5.994535519125683, + "grad_norm": 0.00011750194244086742, + "learning_rate": 2.2252580449301762e-05, + "loss": 0.0, + "step": 5485 + }, + { + "epoch": 6.0, + "grad_norm": 0.00011425597767811269, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.0047, + "step": 5490 + }, + { + "epoch": 6.0, + "eval_loss": 0.0026069784071296453, + "eval_runtime": 667.3626, + "eval_samples_per_second": 10.963, + "eval_steps_per_second": 1.371, + "step": 5490 + }, + { + "epoch": 6.005464480874317, + "grad_norm": 0.011173485778272152, + "learning_rate": 2.2191863995142684e-05, + "loss": 0.0, + "step": 5495 + }, + { + "epoch": 6.0109289617486334, + "grad_norm": 8.967368921730667e-05, + "learning_rate": 2.2161505768063146e-05, + "loss": 0.0, + "step": 5500 + }, + { + "epoch": 6.016393442622951, + "grad_norm": 4.766190977534279e-05, + "learning_rate": 2.2131147540983607e-05, + "loss": 0.0, + "step": 5505 + }, + { + "epoch": 6.021857923497268, + "grad_norm": 0.002877232152968645, + "learning_rate": 2.2100789313904068e-05, + "loss": 0.0, + "step": 5510 + }, + { + "epoch": 6.027322404371585, + "grad_norm": 5.845217310707085e-05, + "learning_rate": 2.2070431086824532e-05, + "loss": 0.0, + "step": 5515 + }, + { + "epoch": 6.032786885245901, + "grad_norm": 5.649280865327455e-05, + "learning_rate": 2.2040072859744993e-05, + "loss": 0.0, + "step": 5520 + }, + { + "epoch": 6.038251366120218, + "grad_norm": 0.00013220343680586666, + "learning_rate": 2.2009714632665455e-05, + "loss": 0.0, + "step": 5525 + }, + { + "epoch": 6.043715846994536, + "grad_norm": 5.454764686874114e-05, + "learning_rate": 2.1979356405585916e-05, + "loss": 0.0, + "step": 5530 + }, + { + "epoch": 6.049180327868853, + "grad_norm": 7.190610631369054e-05, + "learning_rate": 2.1948998178506377e-05, + "loss": 0.0, + "step": 5535 + }, + { + "epoch": 6.054644808743169, + "grad_norm": 0.00018310271843802184, + "learning_rate": 2.1918639951426838e-05, + "loss": 0.0, + "step": 5540 + }, + { + "epoch": 6.060109289617486, + "grad_norm": 5.9327132476028055e-05, + "learning_rate": 2.18882817243473e-05, + "loss": 0.0, + "step": 5545 + }, + { + "epoch": 6.065573770491803, + "grad_norm": 0.00018289749277755618, + "learning_rate": 2.185792349726776e-05, + "loss": 0.176, + "step": 5550 + }, + { + "epoch": 6.0710382513661205, + "grad_norm": 0.000174393251654692, + "learning_rate": 2.182756527018822e-05, + "loss": 0.0, + "step": 5555 + }, + { + "epoch": 6.076502732240437, + "grad_norm": 0.015763528645038605, + "learning_rate": 2.1797207043108682e-05, + "loss": 0.0001, + "step": 5560 + }, + { + "epoch": 6.081967213114754, + "grad_norm": 0.0009447432239539921, + "learning_rate": 2.1766848816029144e-05, + "loss": 0.0, + "step": 5565 + }, + { + "epoch": 6.087431693989071, + "grad_norm": 0.0006745553691871464, + "learning_rate": 2.1736490588949608e-05, + "loss": 0.0, + "step": 5570 + }, + { + "epoch": 6.092896174863388, + "grad_norm": 0.0007902304059825838, + "learning_rate": 2.170613236187007e-05, + "loss": 0.0, + "step": 5575 + }, + { + "epoch": 6.098360655737705, + "grad_norm": 0.003297103103250265, + "learning_rate": 2.167577413479053e-05, + "loss": 0.0, + "step": 5580 + }, + { + "epoch": 6.103825136612022, + "grad_norm": 0.0008036267245188355, + "learning_rate": 2.164541590771099e-05, + "loss": 0.0002, + "step": 5585 + }, + { + "epoch": 6.109289617486339, + "grad_norm": 0.0005939893890172243, + "learning_rate": 2.161505768063145e-05, + "loss": 0.0001, + "step": 5590 + }, + { + "epoch": 6.114754098360656, + "grad_norm": 9.905810293275863e-05, + "learning_rate": 2.1584699453551914e-05, + "loss": 0.0, + "step": 5595 + }, + { + "epoch": 6.120218579234972, + "grad_norm": 0.0003050408558920026, + "learning_rate": 2.1554341226472375e-05, + "loss": 0.0, + "step": 5600 + }, + { + "epoch": 6.1256830601092895, + "grad_norm": 0.001176638645119965, + "learning_rate": 2.1523982999392836e-05, + "loss": 0.0001, + "step": 5605 + }, + { + "epoch": 6.131147540983607, + "grad_norm": 0.0001844777725636959, + "learning_rate": 2.1493624772313297e-05, + "loss": 0.0, + "step": 5610 + }, + { + "epoch": 6.136612021857924, + "grad_norm": 0.0003056370187550783, + "learning_rate": 2.1463266545233758e-05, + "loss": 0.0, + "step": 5615 + }, + { + "epoch": 6.14207650273224, + "grad_norm": 0.01378646306693554, + "learning_rate": 2.1432908318154223e-05, + "loss": 0.0001, + "step": 5620 + }, + { + "epoch": 6.147540983606557, + "grad_norm": 0.0001626972807571292, + "learning_rate": 2.1402550091074684e-05, + "loss": 0.0, + "step": 5625 + }, + { + "epoch": 6.1530054644808745, + "grad_norm": 0.0002374864707235247, + "learning_rate": 2.1372191863995145e-05, + "loss": 0.0, + "step": 5630 + }, + { + "epoch": 6.158469945355192, + "grad_norm": 0.0007631028420291841, + "learning_rate": 2.1341833636915606e-05, + "loss": 0.0, + "step": 5635 + }, + { + "epoch": 6.163934426229508, + "grad_norm": 0.0007890752167440951, + "learning_rate": 2.1311475409836064e-05, + "loss": 0.0001, + "step": 5640 + }, + { + "epoch": 6.169398907103825, + "grad_norm": 0.00014803047815803438, + "learning_rate": 2.128111718275653e-05, + "loss": 0.0, + "step": 5645 + }, + { + "epoch": 6.174863387978142, + "grad_norm": 0.0001815713185351342, + "learning_rate": 2.125075895567699e-05, + "loss": 0.0, + "step": 5650 + }, + { + "epoch": 6.180327868852459, + "grad_norm": 0.0002662826154846698, + "learning_rate": 2.122040072859745e-05, + "loss": 0.0, + "step": 5655 + }, + { + "epoch": 6.185792349726776, + "grad_norm": 0.00023396898177452385, + "learning_rate": 2.1190042501517912e-05, + "loss": 0.0, + "step": 5660 + }, + { + "epoch": 6.191256830601093, + "grad_norm": 0.0009009042987599969, + "learning_rate": 2.1159684274438373e-05, + "loss": 0.0, + "step": 5665 + }, + { + "epoch": 6.19672131147541, + "grad_norm": 0.00019553759193513542, + "learning_rate": 2.1129326047358834e-05, + "loss": 0.0, + "step": 5670 + }, + { + "epoch": 6.202185792349727, + "grad_norm": 0.0001129544252762571, + "learning_rate": 2.10989678202793e-05, + "loss": 0.0, + "step": 5675 + }, + { + "epoch": 6.2076502732240435, + "grad_norm": 0.0001647875178605318, + "learning_rate": 2.106860959319976e-05, + "loss": 0.0, + "step": 5680 + }, + { + "epoch": 6.213114754098361, + "grad_norm": 7.417155575240031e-05, + "learning_rate": 2.103825136612022e-05, + "loss": 0.0, + "step": 5685 + }, + { + "epoch": 6.218579234972678, + "grad_norm": 4.739851283375174e-05, + "learning_rate": 2.100789313904068e-05, + "loss": 0.0, + "step": 5690 + }, + { + "epoch": 6.224043715846994, + "grad_norm": 0.31114462018013, + "learning_rate": 2.097753491196114e-05, + "loss": 0.0014, + "step": 5695 + }, + { + "epoch": 6.229508196721311, + "grad_norm": 0.00010993971227435395, + "learning_rate": 2.0947176684881604e-05, + "loss": 0.0, + "step": 5700 + }, + { + "epoch": 6.2349726775956285, + "grad_norm": 0.00012361927656456828, + "learning_rate": 2.0916818457802065e-05, + "loss": 0.0, + "step": 5705 + }, + { + "epoch": 6.240437158469946, + "grad_norm": 3.7131321732886136e-05, + "learning_rate": 2.0886460230722527e-05, + "loss": 0.0, + "step": 5710 + }, + { + "epoch": 6.245901639344262, + "grad_norm": 8.455519855488092e-05, + "learning_rate": 2.0856102003642988e-05, + "loss": 0.0, + "step": 5715 + }, + { + "epoch": 6.251366120218579, + "grad_norm": 4.8671347030904144e-05, + "learning_rate": 2.082574377656345e-05, + "loss": 0.0, + "step": 5720 + }, + { + "epoch": 6.256830601092896, + "grad_norm": 2.6576733944239095e-05, + "learning_rate": 2.0795385549483913e-05, + "loss": 0.0, + "step": 5725 + }, + { + "epoch": 6.262295081967213, + "grad_norm": 0.00012619678454939276, + "learning_rate": 2.0765027322404374e-05, + "loss": 0.0, + "step": 5730 + }, + { + "epoch": 6.26775956284153, + "grad_norm": 0.00012123715714551508, + "learning_rate": 2.0734669095324836e-05, + "loss": 0.0, + "step": 5735 + }, + { + "epoch": 6.273224043715847, + "grad_norm": 6.571730773430318e-05, + "learning_rate": 2.0704310868245293e-05, + "loss": 0.0, + "step": 5740 + }, + { + "epoch": 6.278688524590164, + "grad_norm": 0.00016342464368790388, + "learning_rate": 2.0673952641165754e-05, + "loss": 0.0, + "step": 5745 + }, + { + "epoch": 6.284153005464481, + "grad_norm": 7.798125443514436e-05, + "learning_rate": 2.064359441408622e-05, + "loss": 0.0, + "step": 5750 + }, + { + "epoch": 6.2896174863387975, + "grad_norm": 4.3517509766388685e-05, + "learning_rate": 2.061323618700668e-05, + "loss": 0.0, + "step": 5755 + }, + { + "epoch": 6.295081967213115, + "grad_norm": 5.0315840780967847e-05, + "learning_rate": 2.058287795992714e-05, + "loss": 0.0, + "step": 5760 + }, + { + "epoch": 6.300546448087432, + "grad_norm": 5.3943444072501734e-05, + "learning_rate": 2.0552519732847602e-05, + "loss": 0.0, + "step": 5765 + }, + { + "epoch": 6.306010928961749, + "grad_norm": 0.00030482446891255677, + "learning_rate": 2.0522161505768063e-05, + "loss": 0.0, + "step": 5770 + }, + { + "epoch": 6.311475409836065, + "grad_norm": 6.828513869550079e-05, + "learning_rate": 2.0491803278688525e-05, + "loss": 0.0, + "step": 5775 + }, + { + "epoch": 6.316939890710382, + "grad_norm": 9.909820801112801e-05, + "learning_rate": 2.046144505160899e-05, + "loss": 0.0, + "step": 5780 + }, + { + "epoch": 6.3224043715847, + "grad_norm": 9.688820864539593e-05, + "learning_rate": 2.043108682452945e-05, + "loss": 0.0, + "step": 5785 + }, + { + "epoch": 6.327868852459017, + "grad_norm": 3.0218470783438534e-05, + "learning_rate": 2.040072859744991e-05, + "loss": 0.0, + "step": 5790 + }, + { + "epoch": 6.333333333333333, + "grad_norm": 0.002815735526382923, + "learning_rate": 2.037037037037037e-05, + "loss": 0.0, + "step": 5795 + }, + { + "epoch": 6.33879781420765, + "grad_norm": 5.187254646443762e-05, + "learning_rate": 2.0340012143290834e-05, + "loss": 0.0, + "step": 5800 + }, + { + "epoch": 6.344262295081967, + "grad_norm": 3.162359644193202e-05, + "learning_rate": 2.0309653916211295e-05, + "loss": 0.0, + "step": 5805 + }, + { + "epoch": 6.3497267759562845, + "grad_norm": 0.0004508834390435368, + "learning_rate": 2.0279295689131756e-05, + "loss": 0.0, + "step": 5810 + }, + { + "epoch": 6.355191256830601, + "grad_norm": 5.048835009802133e-05, + "learning_rate": 2.0248937462052217e-05, + "loss": 0.0, + "step": 5815 + }, + { + "epoch": 6.360655737704918, + "grad_norm": 4.7119614464463666e-05, + "learning_rate": 2.0218579234972678e-05, + "loss": 0.0, + "step": 5820 + }, + { + "epoch": 6.366120218579235, + "grad_norm": 6.0273188864812255e-05, + "learning_rate": 2.018822100789314e-05, + "loss": 0.0001, + "step": 5825 + }, + { + "epoch": 6.371584699453552, + "grad_norm": 3.4625056287040934e-05, + "learning_rate": 2.0157862780813604e-05, + "loss": 0.0, + "step": 5830 + }, + { + "epoch": 6.377049180327869, + "grad_norm": 0.00014075507351662964, + "learning_rate": 2.0127504553734065e-05, + "loss": 0.0, + "step": 5835 + }, + { + "epoch": 6.382513661202186, + "grad_norm": 0.0014786600368097425, + "learning_rate": 2.0097146326654526e-05, + "loss": 0.0, + "step": 5840 + }, + { + "epoch": 6.387978142076503, + "grad_norm": 3.1873085390543565e-05, + "learning_rate": 2.0066788099574984e-05, + "loss": 0.0, + "step": 5845 + }, + { + "epoch": 6.39344262295082, + "grad_norm": 5.925606092205271e-05, + "learning_rate": 2.0036429872495445e-05, + "loss": 0.0, + "step": 5850 + }, + { + "epoch": 6.398907103825136, + "grad_norm": 7.679805275984108e-05, + "learning_rate": 2.000607164541591e-05, + "loss": 0.0, + "step": 5855 + }, + { + "epoch": 6.404371584699454, + "grad_norm": 0.00013544733519665897, + "learning_rate": 1.997571341833637e-05, + "loss": 0.0, + "step": 5860 + }, + { + "epoch": 6.409836065573771, + "grad_norm": 0.0003026534104719758, + "learning_rate": 1.994535519125683e-05, + "loss": 0.0, + "step": 5865 + }, + { + "epoch": 6.415300546448087, + "grad_norm": 0.00010233583452645689, + "learning_rate": 1.9914996964177293e-05, + "loss": 0.0, + "step": 5870 + }, + { + "epoch": 6.420765027322404, + "grad_norm": 0.0002885489957407117, + "learning_rate": 1.9884638737097754e-05, + "loss": 0.0, + "step": 5875 + }, + { + "epoch": 6.426229508196721, + "grad_norm": 7.025294326012954e-05, + "learning_rate": 1.9854280510018215e-05, + "loss": 0.0, + "step": 5880 + }, + { + "epoch": 6.4316939890710385, + "grad_norm": 5.036436778027564e-05, + "learning_rate": 1.982392228293868e-05, + "loss": 0.0, + "step": 5885 + }, + { + "epoch": 6.437158469945355, + "grad_norm": 4.766888378071599e-05, + "learning_rate": 1.979356405585914e-05, + "loss": 0.0, + "step": 5890 + }, + { + "epoch": 6.442622950819672, + "grad_norm": 2.0245690393494442e-05, + "learning_rate": 1.97632058287796e-05, + "loss": 0.0, + "step": 5895 + }, + { + "epoch": 6.448087431693989, + "grad_norm": 0.0019478934118524194, + "learning_rate": 1.973284760170006e-05, + "loss": 0.0001, + "step": 5900 + }, + { + "epoch": 6.453551912568306, + "grad_norm": 5.526033783098683e-05, + "learning_rate": 1.9702489374620524e-05, + "loss": 0.0, + "step": 5905 + }, + { + "epoch": 6.459016393442623, + "grad_norm": 4.0888528019422665e-05, + "learning_rate": 1.9672131147540985e-05, + "loss": 0.0, + "step": 5910 + }, + { + "epoch": 6.46448087431694, + "grad_norm": 4.284735405235551e-05, + "learning_rate": 1.9641772920461446e-05, + "loss": 0.0, + "step": 5915 + }, + { + "epoch": 6.469945355191257, + "grad_norm": 3.235202530049719e-05, + "learning_rate": 1.9611414693381907e-05, + "loss": 0.0, + "step": 5920 + }, + { + "epoch": 6.475409836065574, + "grad_norm": 8.918684761738405e-05, + "learning_rate": 1.958105646630237e-05, + "loss": 0.0, + "step": 5925 + }, + { + "epoch": 6.48087431693989, + "grad_norm": 2.8192551326355897e-05, + "learning_rate": 1.955069823922283e-05, + "loss": 0.0, + "step": 5930 + }, + { + "epoch": 6.4863387978142075, + "grad_norm": 6.065002526156604e-05, + "learning_rate": 1.9520340012143294e-05, + "loss": 0.0, + "step": 5935 + }, + { + "epoch": 6.491803278688525, + "grad_norm": 8.560925198253244e-05, + "learning_rate": 1.9489981785063755e-05, + "loss": 0.0, + "step": 5940 + }, + { + "epoch": 6.497267759562842, + "grad_norm": 3.3522384910611436e-05, + "learning_rate": 1.9459623557984213e-05, + "loss": 0.0, + "step": 5945 + }, + { + "epoch": 6.502732240437158, + "grad_norm": 8.88680006028153e-05, + "learning_rate": 1.9429265330904674e-05, + "loss": 0.0, + "step": 5950 + }, + { + "epoch": 6.508196721311475, + "grad_norm": 6.100683458498679e-05, + "learning_rate": 1.9398907103825135e-05, + "loss": 0.0, + "step": 5955 + }, + { + "epoch": 6.5136612021857925, + "grad_norm": 1.864024488895666e-05, + "learning_rate": 1.93685488767456e-05, + "loss": 0.0, + "step": 5960 + }, + { + "epoch": 6.51912568306011, + "grad_norm": 0.00014385611575562507, + "learning_rate": 1.933819064966606e-05, + "loss": 0.0, + "step": 5965 + }, + { + "epoch": 6.524590163934426, + "grad_norm": 3.123824717476964e-05, + "learning_rate": 1.9307832422586522e-05, + "loss": 0.0, + "step": 5970 + }, + { + "epoch": 6.530054644808743, + "grad_norm": 2.9815590096404776e-05, + "learning_rate": 1.9277474195506983e-05, + "loss": 0.0, + "step": 5975 + }, + { + "epoch": 6.53551912568306, + "grad_norm": 2.5311090212198906e-05, + "learning_rate": 1.9247115968427444e-05, + "loss": 0.0, + "step": 5980 + }, + { + "epoch": 6.540983606557377, + "grad_norm": 6.320253305602819e-05, + "learning_rate": 1.9216757741347906e-05, + "loss": 0.0, + "step": 5985 + }, + { + "epoch": 6.546448087431694, + "grad_norm": 6.124599894974381e-05, + "learning_rate": 1.918639951426837e-05, + "loss": 0.0, + "step": 5990 + }, + { + "epoch": 6.551912568306011, + "grad_norm": 0.0008633544784970582, + "learning_rate": 1.9156041287188828e-05, + "loss": 0.0, + "step": 5995 + }, + { + "epoch": 6.557377049180328, + "grad_norm": 6.350692274281755e-05, + "learning_rate": 1.912568306010929e-05, + "loss": 0.0, + "step": 6000 + }, + { + "epoch": 6.562841530054644, + "grad_norm": 3.356828892719932e-05, + "learning_rate": 1.909532483302975e-05, + "loss": 0.0, + "step": 6005 + }, + { + "epoch": 6.5683060109289615, + "grad_norm": 0.00026475227787159383, + "learning_rate": 1.9064966605950215e-05, + "loss": 0.0, + "step": 6010 + }, + { + "epoch": 6.573770491803279, + "grad_norm": 2.997915908053983e-05, + "learning_rate": 1.9034608378870676e-05, + "loss": 0.0, + "step": 6015 + }, + { + "epoch": 6.579234972677596, + "grad_norm": 4.210296174278483e-05, + "learning_rate": 1.9004250151791137e-05, + "loss": 0.0, + "step": 6020 + }, + { + "epoch": 6.584699453551913, + "grad_norm": 4.2359999497421086e-05, + "learning_rate": 1.8973891924711598e-05, + "loss": 0.0, + "step": 6025 + }, + { + "epoch": 6.590163934426229, + "grad_norm": 4.0853054088074714e-05, + "learning_rate": 1.894353369763206e-05, + "loss": 0.0, + "step": 6030 + }, + { + "epoch": 6.595628415300546, + "grad_norm": 1.567585968587082e-05, + "learning_rate": 1.891317547055252e-05, + "loss": 0.0, + "step": 6035 + }, + { + "epoch": 6.601092896174864, + "grad_norm": 3.672630919027142e-05, + "learning_rate": 1.8882817243472985e-05, + "loss": 0.0, + "step": 6040 + }, + { + "epoch": 6.60655737704918, + "grad_norm": 2.127391599060502e-05, + "learning_rate": 1.8852459016393442e-05, + "loss": 0.0, + "step": 6045 + }, + { + "epoch": 6.612021857923497, + "grad_norm": 4.033459481433965e-05, + "learning_rate": 1.8822100789313904e-05, + "loss": 0.0, + "step": 6050 + }, + { + "epoch": 6.617486338797814, + "grad_norm": 3.772424315684475e-05, + "learning_rate": 1.8791742562234365e-05, + "loss": 0.0, + "step": 6055 + }, + { + "epoch": 6.622950819672131, + "grad_norm": 4.5684719225391746e-05, + "learning_rate": 1.8761384335154826e-05, + "loss": 0.0, + "step": 6060 + }, + { + "epoch": 6.628415300546449, + "grad_norm": 4.037997859995812e-05, + "learning_rate": 1.873102610807529e-05, + "loss": 0.0, + "step": 6065 + }, + { + "epoch": 6.633879781420765, + "grad_norm": 2.9085753340041265e-05, + "learning_rate": 1.870066788099575e-05, + "loss": 0.0, + "step": 6070 + }, + { + "epoch": 6.639344262295082, + "grad_norm": 4.5837143261451274e-05, + "learning_rate": 1.8670309653916213e-05, + "loss": 0.0, + "step": 6075 + }, + { + "epoch": 6.644808743169399, + "grad_norm": 2.149335887224879e-05, + "learning_rate": 1.8639951426836674e-05, + "loss": 0.0, + "step": 6080 + }, + { + "epoch": 6.6502732240437155, + "grad_norm": 2.6230225557810627e-05, + "learning_rate": 1.8609593199757135e-05, + "loss": 0.0, + "step": 6085 + }, + { + "epoch": 6.655737704918033, + "grad_norm": 5.128745760885067e-05, + "learning_rate": 1.85792349726776e-05, + "loss": 0.0, + "step": 6090 + }, + { + "epoch": 6.66120218579235, + "grad_norm": 2.8409345759428106e-05, + "learning_rate": 1.8548876745598057e-05, + "loss": 0.0, + "step": 6095 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.829441862355452e-05, + "learning_rate": 1.8518518518518518e-05, + "loss": 0.0, + "step": 6100 + }, + { + "epoch": 6.672131147540983, + "grad_norm": 2.8088525141356513e-05, + "learning_rate": 1.848816029143898e-05, + "loss": 0.0, + "step": 6105 + }, + { + "epoch": 6.6775956284153, + "grad_norm": 0.00019966061518061906, + "learning_rate": 1.845780206435944e-05, + "loss": 0.0, + "step": 6110 + }, + { + "epoch": 6.683060109289618, + "grad_norm": 3.2912070309976116e-05, + "learning_rate": 1.8427443837279905e-05, + "loss": 0.0, + "step": 6115 + }, + { + "epoch": 6.688524590163935, + "grad_norm": 3.402951915632002e-05, + "learning_rate": 1.8397085610200366e-05, + "loss": 0.0, + "step": 6120 + }, + { + "epoch": 6.693989071038251, + "grad_norm": 5.13341037731152e-05, + "learning_rate": 1.8366727383120827e-05, + "loss": 0.0, + "step": 6125 + }, + { + "epoch": 6.699453551912568, + "grad_norm": 3.986077354056761e-05, + "learning_rate": 1.833636915604129e-05, + "loss": 0.0, + "step": 6130 + }, + { + "epoch": 6.704918032786885, + "grad_norm": 4.487119076657109e-05, + "learning_rate": 1.830601092896175e-05, + "loss": 0.0, + "step": 6135 + }, + { + "epoch": 6.7103825136612025, + "grad_norm": 1.591966429259628e-05, + "learning_rate": 1.827565270188221e-05, + "loss": 0.0, + "step": 6140 + }, + { + "epoch": 6.715846994535519, + "grad_norm": 2.8471571567934006e-05, + "learning_rate": 1.8245294474802672e-05, + "loss": 0.0, + "step": 6145 + }, + { + "epoch": 6.721311475409836, + "grad_norm": 2.545778625062667e-05, + "learning_rate": 1.8214936247723133e-05, + "loss": 0.0, + "step": 6150 + }, + { + "epoch": 6.726775956284153, + "grad_norm": 4.5180757297202945e-05, + "learning_rate": 1.8184578020643594e-05, + "loss": 0.0, + "step": 6155 + }, + { + "epoch": 6.73224043715847, + "grad_norm": 3.133527570753358e-05, + "learning_rate": 1.8154219793564055e-05, + "loss": 0.0, + "step": 6160 + }, + { + "epoch": 6.737704918032787, + "grad_norm": 2.0645300537580624e-05, + "learning_rate": 1.8123861566484516e-05, + "loss": 0.0, + "step": 6165 + }, + { + "epoch": 6.743169398907104, + "grad_norm": 3.279056545579806e-05, + "learning_rate": 1.809350333940498e-05, + "loss": 0.0, + "step": 6170 + }, + { + "epoch": 6.748633879781421, + "grad_norm": 3.055525303352624e-05, + "learning_rate": 1.8063145112325442e-05, + "loss": 0.0, + "step": 6175 + }, + { + "epoch": 6.754098360655737, + "grad_norm": 1.9079752746620215e-05, + "learning_rate": 1.8032786885245903e-05, + "loss": 0.0, + "step": 6180 + }, + { + "epoch": 6.759562841530054, + "grad_norm": 0.00016503347433172166, + "learning_rate": 1.8002428658166364e-05, + "loss": 0.0, + "step": 6185 + }, + { + "epoch": 6.7650273224043715, + "grad_norm": 2.6253288524458185e-05, + "learning_rate": 1.7972070431086825e-05, + "loss": 0.0, + "step": 6190 + }, + { + "epoch": 6.770491803278689, + "grad_norm": 2.5179529984598048e-05, + "learning_rate": 1.7941712204007287e-05, + "loss": 0.0, + "step": 6195 + }, + { + "epoch": 6.775956284153006, + "grad_norm": 3.118176755378954e-05, + "learning_rate": 1.7911353976927748e-05, + "loss": 0.0, + "step": 6200 + }, + { + "epoch": 6.781420765027322, + "grad_norm": 2.4124465198838152e-05, + "learning_rate": 1.788099574984821e-05, + "loss": 0.0, + "step": 6205 + }, + { + "epoch": 6.786885245901639, + "grad_norm": 0.2607771158218384, + "learning_rate": 1.785063752276867e-05, + "loss": 0.0002, + "step": 6210 + }, + { + "epoch": 6.7923497267759565, + "grad_norm": 1.626986158953514e-05, + "learning_rate": 1.782027929568913e-05, + "loss": 0.0, + "step": 6215 + }, + { + "epoch": 6.797814207650273, + "grad_norm": 0.0010127287823706865, + "learning_rate": 1.7789921068609596e-05, + "loss": 0.0, + "step": 6220 + }, + { + "epoch": 6.80327868852459, + "grad_norm": 3.336209192639217e-05, + "learning_rate": 1.7759562841530057e-05, + "loss": 0.0, + "step": 6225 + }, + { + "epoch": 6.808743169398907, + "grad_norm": 3.142928471788764e-05, + "learning_rate": 1.7729204614450518e-05, + "loss": 0.0, + "step": 6230 + }, + { + "epoch": 6.814207650273224, + "grad_norm": 2.5947849280782975e-05, + "learning_rate": 1.769884638737098e-05, + "loss": 0.0, + "step": 6235 + }, + { + "epoch": 6.8196721311475414, + "grad_norm": 0.001310658873990178, + "learning_rate": 1.766848816029144e-05, + "loss": 0.0, + "step": 6240 + }, + { + "epoch": 6.825136612021858, + "grad_norm": 7.160162931540981e-05, + "learning_rate": 1.76381299332119e-05, + "loss": 0.0, + "step": 6245 + }, + { + "epoch": 6.830601092896175, + "grad_norm": 2.0879228031844832e-05, + "learning_rate": 1.7607771706132362e-05, + "loss": 0.0, + "step": 6250 + }, + { + "epoch": 6.836065573770492, + "grad_norm": 9.69652392086573e-05, + "learning_rate": 1.7577413479052823e-05, + "loss": 0.0, + "step": 6255 + }, + { + "epoch": 6.841530054644808, + "grad_norm": 1.9431065084063448e-05, + "learning_rate": 1.7547055251973285e-05, + "loss": 0.0, + "step": 6260 + }, + { + "epoch": 6.8469945355191255, + "grad_norm": 5.2403873269213364e-05, + "learning_rate": 1.7516697024893746e-05, + "loss": 0.0, + "step": 6265 + }, + { + "epoch": 6.852459016393443, + "grad_norm": 8.865645213518292e-05, + "learning_rate": 1.7486338797814207e-05, + "loss": 0.0, + "step": 6270 + }, + { + "epoch": 6.85792349726776, + "grad_norm": 2.442375807731878e-05, + "learning_rate": 1.745598057073467e-05, + "loss": 0.0, + "step": 6275 + }, + { + "epoch": 6.863387978142076, + "grad_norm": 2.5425528292544186e-05, + "learning_rate": 1.7425622343655132e-05, + "loss": 0.0, + "step": 6280 + }, + { + "epoch": 6.868852459016393, + "grad_norm": 5.923685966990888e-05, + "learning_rate": 1.7395264116575594e-05, + "loss": 0.0, + "step": 6285 + }, + { + "epoch": 6.8743169398907105, + "grad_norm": 2.9877415727241896e-05, + "learning_rate": 1.7364905889496055e-05, + "loss": 0.0, + "step": 6290 + }, + { + "epoch": 6.879781420765028, + "grad_norm": 6.036084596416913e-05, + "learning_rate": 1.7334547662416516e-05, + "loss": 0.0001, + "step": 6295 + }, + { + "epoch": 6.885245901639344, + "grad_norm": 3.035015834029764e-05, + "learning_rate": 1.7304189435336977e-05, + "loss": 0.0, + "step": 6300 + }, + { + "epoch": 6.890710382513661, + "grad_norm": 1.9475093722576275e-05, + "learning_rate": 1.7273831208257438e-05, + "loss": 0.0, + "step": 6305 + }, + { + "epoch": 6.896174863387978, + "grad_norm": 2.220989335910417e-05, + "learning_rate": 1.72434729811779e-05, + "loss": 0.0, + "step": 6310 + }, + { + "epoch": 6.901639344262295, + "grad_norm": 2.0596038666553795e-05, + "learning_rate": 1.721311475409836e-05, + "loss": 0.0, + "step": 6315 + }, + { + "epoch": 6.907103825136612, + "grad_norm": 0.00045092753134667873, + "learning_rate": 1.718275652701882e-05, + "loss": 0.0, + "step": 6320 + }, + { + "epoch": 6.912568306010929, + "grad_norm": 1.8349317542742938e-05, + "learning_rate": 1.7152398299939286e-05, + "loss": 0.0, + "step": 6325 + }, + { + "epoch": 6.918032786885246, + "grad_norm": 1.6973086530924775e-05, + "learning_rate": 1.7122040072859747e-05, + "loss": 0.0, + "step": 6330 + }, + { + "epoch": 6.923497267759563, + "grad_norm": 3.073631160077639e-05, + "learning_rate": 1.7091681845780208e-05, + "loss": 0.0, + "step": 6335 + }, + { + "epoch": 6.9289617486338795, + "grad_norm": 1.8282253222423606e-05, + "learning_rate": 1.706132361870067e-05, + "loss": 0.0, + "step": 6340 + }, + { + "epoch": 6.934426229508197, + "grad_norm": 1.3297793884703424e-05, + "learning_rate": 1.703096539162113e-05, + "loss": 0.0, + "step": 6345 + }, + { + "epoch": 6.939890710382514, + "grad_norm": 1.68807619047584e-05, + "learning_rate": 1.700060716454159e-05, + "loss": 0.0, + "step": 6350 + }, + { + "epoch": 6.945355191256831, + "grad_norm": 1.75243585545104e-05, + "learning_rate": 1.6970248937462053e-05, + "loss": 0.0, + "step": 6355 + }, + { + "epoch": 6.950819672131147, + "grad_norm": 1.8891807485488243e-05, + "learning_rate": 1.6939890710382514e-05, + "loss": 0.0, + "step": 6360 + }, + { + "epoch": 6.956284153005464, + "grad_norm": 1.1831501069536898e-05, + "learning_rate": 1.6909532483302975e-05, + "loss": 0.0, + "step": 6365 + }, + { + "epoch": 6.961748633879782, + "grad_norm": 3.065099008381367e-05, + "learning_rate": 1.6879174256223436e-05, + "loss": 0.0, + "step": 6370 + }, + { + "epoch": 6.967213114754099, + "grad_norm": 2.236567161162384e-05, + "learning_rate": 1.68488160291439e-05, + "loss": 0.0, + "step": 6375 + }, + { + "epoch": 6.972677595628415, + "grad_norm": 2.7795680580311455e-05, + "learning_rate": 1.6818457802064362e-05, + "loss": 0.0, + "step": 6380 + }, + { + "epoch": 6.978142076502732, + "grad_norm": 4.299731517676264e-05, + "learning_rate": 1.6788099574984823e-05, + "loss": 0.0, + "step": 6385 + }, + { + "epoch": 6.983606557377049, + "grad_norm": 2.10926154977642e-05, + "learning_rate": 1.6757741347905284e-05, + "loss": 0.0, + "step": 6390 + }, + { + "epoch": 6.989071038251366, + "grad_norm": 3.6098430427955464e-05, + "learning_rate": 1.6727383120825745e-05, + "loss": 0.0, + "step": 6395 + }, + { + "epoch": 6.994535519125683, + "grad_norm": 1.9995864931843244e-05, + "learning_rate": 1.6697024893746206e-05, + "loss": 0.0, + "step": 6400 + }, + { + "epoch": 7.0, + "grad_norm": 1.6834292182466015e-05, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.0, + "step": 6405 + }, + { + "epoch": 7.0, + "eval_loss": 0.0011112982174381614, + "eval_runtime": 697.2626, + "eval_samples_per_second": 10.492, + "eval_steps_per_second": 1.312, + "step": 6405 + }, + { + "epoch": 7.005464480874317, + "grad_norm": 1.7212223610840738e-05, + "learning_rate": 1.663630843958713e-05, + "loss": 0.0, + "step": 6410 + }, + { + "epoch": 7.0109289617486334, + "grad_norm": 2.5316898245364428e-05, + "learning_rate": 1.660595021250759e-05, + "loss": 0.0, + "step": 6415 + }, + { + "epoch": 7.016393442622951, + "grad_norm": 1.4173186173138674e-05, + "learning_rate": 1.657559198542805e-05, + "loss": 0.0, + "step": 6420 + }, + { + "epoch": 7.021857923497268, + "grad_norm": 1.2397636965033598e-05, + "learning_rate": 1.6545233758348512e-05, + "loss": 0.0, + "step": 6425 + }, + { + "epoch": 7.027322404371585, + "grad_norm": 8.636420716356952e-06, + "learning_rate": 1.6514875531268976e-05, + "loss": 0.0, + "step": 6430 + }, + { + "epoch": 7.032786885245901, + "grad_norm": 1.855434493336361e-05, + "learning_rate": 1.6484517304189438e-05, + "loss": 0.0, + "step": 6435 + }, + { + "epoch": 7.038251366120218, + "grad_norm": 2.6530422474024817e-05, + "learning_rate": 1.64541590771099e-05, + "loss": 0.0, + "step": 6440 + }, + { + "epoch": 7.043715846994536, + "grad_norm": 1.7378168195136823e-05, + "learning_rate": 1.642380085003036e-05, + "loss": 0.0, + "step": 6445 + }, + { + "epoch": 7.049180327868853, + "grad_norm": 2.0945200958522037e-05, + "learning_rate": 1.6393442622950818e-05, + "loss": 0.0, + "step": 6450 + }, + { + "epoch": 7.054644808743169, + "grad_norm": 9.741668327478692e-05, + "learning_rate": 1.6363084395871282e-05, + "loss": 0.0, + "step": 6455 + }, + { + "epoch": 7.060109289617486, + "grad_norm": 1.0040616871265229e-05, + "learning_rate": 1.6332726168791743e-05, + "loss": 0.0, + "step": 6460 + }, + { + "epoch": 7.065573770491803, + "grad_norm": 1.7955384464585222e-05, + "learning_rate": 1.6302367941712204e-05, + "loss": 0.0, + "step": 6465 + }, + { + "epoch": 7.0710382513661205, + "grad_norm": 1.3347224921744782e-05, + "learning_rate": 1.6272009714632666e-05, + "loss": 0.0, + "step": 6470 + }, + { + "epoch": 7.076502732240437, + "grad_norm": 2.3174603484221734e-05, + "learning_rate": 1.6241651487553127e-05, + "loss": 0.0, + "step": 6475 + }, + { + "epoch": 7.081967213114754, + "grad_norm": 0.00023038995277602226, + "learning_rate": 1.621129326047359e-05, + "loss": 0.0, + "step": 6480 + }, + { + "epoch": 7.087431693989071, + "grad_norm": 1.3241274245956447e-05, + "learning_rate": 1.6180935033394052e-05, + "loss": 0.0, + "step": 6485 + }, + { + "epoch": 7.092896174863388, + "grad_norm": 2.079802652588114e-05, + "learning_rate": 1.6150576806314513e-05, + "loss": 0.0, + "step": 6490 + }, + { + "epoch": 7.098360655737705, + "grad_norm": 2.717071765800938e-05, + "learning_rate": 1.6120218579234975e-05, + "loss": 0.0, + "step": 6495 + }, + { + "epoch": 7.103825136612022, + "grad_norm": 0.0020049286540597677, + "learning_rate": 1.6089860352155432e-05, + "loss": 0.0, + "step": 6500 + }, + { + "epoch": 7.109289617486339, + "grad_norm": 2.1362251573009416e-05, + "learning_rate": 1.6059502125075897e-05, + "loss": 0.0, + "step": 6505 + }, + { + "epoch": 7.114754098360656, + "grad_norm": 1.807320222724229e-05, + "learning_rate": 1.6029143897996358e-05, + "loss": 0.0, + "step": 6510 + }, + { + "epoch": 7.120218579234972, + "grad_norm": 1.5250131582433823e-05, + "learning_rate": 1.599878567091682e-05, + "loss": 0.0, + "step": 6515 + }, + { + "epoch": 7.1256830601092895, + "grad_norm": 2.2128344426164404e-05, + "learning_rate": 1.596842744383728e-05, + "loss": 0.0, + "step": 6520 + }, + { + "epoch": 7.131147540983607, + "grad_norm": 2.122944533766713e-05, + "learning_rate": 1.593806921675774e-05, + "loss": 0.0, + "step": 6525 + }, + { + "epoch": 7.136612021857924, + "grad_norm": 1.889590930659324e-05, + "learning_rate": 1.5907710989678202e-05, + "loss": 0.0, + "step": 6530 + }, + { + "epoch": 7.14207650273224, + "grad_norm": 1.700487337075174e-05, + "learning_rate": 1.5877352762598667e-05, + "loss": 0.0, + "step": 6535 + }, + { + "epoch": 7.147540983606557, + "grad_norm": 1.6051513739512302e-05, + "learning_rate": 1.5846994535519128e-05, + "loss": 0.0, + "step": 6540 + }, + { + "epoch": 7.1530054644808745, + "grad_norm": 5.704201976186596e-05, + "learning_rate": 1.581663630843959e-05, + "loss": 0.0, + "step": 6545 + }, + { + "epoch": 7.158469945355192, + "grad_norm": 1.7098071111831814e-05, + "learning_rate": 1.5786278081360047e-05, + "loss": 0.0, + "step": 6550 + }, + { + "epoch": 7.163934426229508, + "grad_norm": 0.00030831946060061455, + "learning_rate": 1.5755919854280508e-05, + "loss": 0.0, + "step": 6555 + }, + { + "epoch": 7.169398907103825, + "grad_norm": 8.064251596806571e-05, + "learning_rate": 1.5725561627200973e-05, + "loss": 0.0, + "step": 6560 + }, + { + "epoch": 7.174863387978142, + "grad_norm": 2.111588401021436e-05, + "learning_rate": 1.5695203400121434e-05, + "loss": 0.0, + "step": 6565 + }, + { + "epoch": 7.180327868852459, + "grad_norm": 2.722311364777852e-05, + "learning_rate": 1.5664845173041895e-05, + "loss": 0.0, + "step": 6570 + }, + { + "epoch": 7.185792349726776, + "grad_norm": 2.8191749152028933e-05, + "learning_rate": 1.5634486945962356e-05, + "loss": 0.0, + "step": 6575 + }, + { + "epoch": 7.191256830601093, + "grad_norm": 5.410460289567709e-05, + "learning_rate": 1.5604128718882817e-05, + "loss": 0.0, + "step": 6580 + }, + { + "epoch": 7.19672131147541, + "grad_norm": 0.0008463920094072819, + "learning_rate": 1.557377049180328e-05, + "loss": 0.0, + "step": 6585 + }, + { + "epoch": 7.202185792349727, + "grad_norm": 1.6411495380452834e-05, + "learning_rate": 1.5543412264723743e-05, + "loss": 0.0, + "step": 6590 + }, + { + "epoch": 7.2076502732240435, + "grad_norm": 4.104662366444245e-05, + "learning_rate": 1.5513054037644204e-05, + "loss": 0.0, + "step": 6595 + }, + { + "epoch": 7.213114754098361, + "grad_norm": 2.1741001546615735e-05, + "learning_rate": 1.548269581056466e-05, + "loss": 0.0, + "step": 6600 + }, + { + "epoch": 7.218579234972678, + "grad_norm": 2.477094494679477e-05, + "learning_rate": 1.5452337583485123e-05, + "loss": 0.0, + "step": 6605 + }, + { + "epoch": 7.224043715846994, + "grad_norm": 1.5612909919582307e-05, + "learning_rate": 1.5421979356405587e-05, + "loss": 0.0, + "step": 6610 + }, + { + "epoch": 7.229508196721311, + "grad_norm": 1.5115789210540242e-05, + "learning_rate": 1.539162112932605e-05, + "loss": 0.0, + "step": 6615 + }, + { + "epoch": 7.2349726775956285, + "grad_norm": 2.655894604686182e-05, + "learning_rate": 1.536126290224651e-05, + "loss": 0.0, + "step": 6620 + }, + { + "epoch": 7.240437158469946, + "grad_norm": 2.9735285352217034e-05, + "learning_rate": 1.533090467516697e-05, + "loss": 0.0, + "step": 6625 + }, + { + "epoch": 7.245901639344262, + "grad_norm": 0.0004776114656124264, + "learning_rate": 1.5300546448087432e-05, + "loss": 0.0, + "step": 6630 + }, + { + "epoch": 7.251366120218579, + "grad_norm": 1.4129350347502623e-05, + "learning_rate": 1.5270188221007893e-05, + "loss": 0.0, + "step": 6635 + }, + { + "epoch": 7.256830601092896, + "grad_norm": 4.216094748699106e-05, + "learning_rate": 1.5239829993928356e-05, + "loss": 0.0, + "step": 6640 + }, + { + "epoch": 7.262295081967213, + "grad_norm": 2.8377304261084646e-05, + "learning_rate": 1.5209471766848819e-05, + "loss": 0.0, + "step": 6645 + }, + { + "epoch": 7.26775956284153, + "grad_norm": 1.1897628610313404e-05, + "learning_rate": 1.5179113539769276e-05, + "loss": 0.0, + "step": 6650 + }, + { + "epoch": 7.273224043715847, + "grad_norm": 3.0133123800624162e-05, + "learning_rate": 1.514875531268974e-05, + "loss": 0.0, + "step": 6655 + }, + { + "epoch": 7.278688524590164, + "grad_norm": 1.8536607967689633e-05, + "learning_rate": 1.51183970856102e-05, + "loss": 0.0, + "step": 6660 + }, + { + "epoch": 7.284153005464481, + "grad_norm": 4.778361471835524e-05, + "learning_rate": 1.5088038858530661e-05, + "loss": 0.0, + "step": 6665 + }, + { + "epoch": 7.2896174863387975, + "grad_norm": 1.5681132936151698e-05, + "learning_rate": 1.5057680631451124e-05, + "loss": 0.0, + "step": 6670 + }, + { + "epoch": 7.295081967213115, + "grad_norm": 2.1964269762975164e-05, + "learning_rate": 1.5027322404371585e-05, + "loss": 0.0, + "step": 6675 + }, + { + "epoch": 7.300546448087432, + "grad_norm": 0.0007649378385394812, + "learning_rate": 1.4996964177292046e-05, + "loss": 0.0, + "step": 6680 + }, + { + "epoch": 7.306010928961749, + "grad_norm": 1.5521494788117707e-05, + "learning_rate": 1.496660595021251e-05, + "loss": 0.0, + "step": 6685 + }, + { + "epoch": 7.311475409836065, + "grad_norm": 1.1767200703616254e-05, + "learning_rate": 1.493624772313297e-05, + "loss": 0.0, + "step": 6690 + }, + { + "epoch": 7.316939890710382, + "grad_norm": 2.8079559342586435e-05, + "learning_rate": 1.4905889496053432e-05, + "loss": 0.0, + "step": 6695 + }, + { + "epoch": 7.3224043715847, + "grad_norm": 4.105495463591069e-05, + "learning_rate": 1.4875531268973891e-05, + "loss": 0.0, + "step": 6700 + }, + { + "epoch": 7.327868852459017, + "grad_norm": 2.0951249098288827e-05, + "learning_rate": 1.4845173041894352e-05, + "loss": 0.0, + "step": 6705 + }, + { + "epoch": 7.333333333333333, + "grad_norm": 1.3757561646343675e-05, + "learning_rate": 1.4814814814814815e-05, + "loss": 0.0, + "step": 6710 + }, + { + "epoch": 7.33879781420765, + "grad_norm": 1.4121252206678037e-05, + "learning_rate": 1.4784456587735276e-05, + "loss": 0.0, + "step": 6715 + }, + { + "epoch": 7.344262295081967, + "grad_norm": 1.3257107639219612e-05, + "learning_rate": 1.4754098360655739e-05, + "loss": 0.0, + "step": 6720 + }, + { + "epoch": 7.3497267759562845, + "grad_norm": 2.558304367994424e-05, + "learning_rate": 1.47237401335762e-05, + "loss": 0.0, + "step": 6725 + }, + { + "epoch": 7.355191256830601, + "grad_norm": 1.6294507076963782e-05, + "learning_rate": 1.4693381906496661e-05, + "loss": 0.0, + "step": 6730 + }, + { + "epoch": 7.360655737704918, + "grad_norm": 0.0001479545171605423, + "learning_rate": 1.4663023679417124e-05, + "loss": 0.0, + "step": 6735 + }, + { + "epoch": 7.366120218579235, + "grad_norm": 1.7503914932603948e-05, + "learning_rate": 1.4632665452337585e-05, + "loss": 0.0, + "step": 6740 + }, + { + "epoch": 7.371584699453552, + "grad_norm": 1.6494923329446465e-05, + "learning_rate": 1.4602307225258046e-05, + "loss": 0.0, + "step": 6745 + }, + { + "epoch": 7.377049180327869, + "grad_norm": 0.000659099780023098, + "learning_rate": 1.4571948998178509e-05, + "loss": 0.0, + "step": 6750 + }, + { + "epoch": 7.382513661202186, + "grad_norm": 1.7199408830492757e-05, + "learning_rate": 1.4541590771098967e-05, + "loss": 0.0, + "step": 6755 + }, + { + "epoch": 7.387978142076503, + "grad_norm": 3.6840476241195574e-05, + "learning_rate": 1.451123254401943e-05, + "loss": 0.0, + "step": 6760 + }, + { + "epoch": 7.39344262295082, + "grad_norm": 2.0265113562345505e-05, + "learning_rate": 1.448087431693989e-05, + "loss": 0.0, + "step": 6765 + }, + { + "epoch": 7.398907103825136, + "grad_norm": 0.00412601325660944, + "learning_rate": 1.4450516089860352e-05, + "loss": 0.0, + "step": 6770 + }, + { + "epoch": 7.404371584699454, + "grad_norm": 7.587561412947252e-05, + "learning_rate": 1.4420157862780815e-05, + "loss": 0.0, + "step": 6775 + }, + { + "epoch": 7.409836065573771, + "grad_norm": 1.973733196791727e-05, + "learning_rate": 1.4389799635701276e-05, + "loss": 0.0, + "step": 6780 + }, + { + "epoch": 7.415300546448087, + "grad_norm": 3.874331741826609e-05, + "learning_rate": 1.4359441408621737e-05, + "loss": 0.0, + "step": 6785 + }, + { + "epoch": 7.420765027322404, + "grad_norm": 8.414830517722294e-05, + "learning_rate": 1.43290831815422e-05, + "loss": 0.0, + "step": 6790 + }, + { + "epoch": 7.426229508196721, + "grad_norm": 1.557487848913297e-05, + "learning_rate": 1.4298724954462661e-05, + "loss": 0.0, + "step": 6795 + }, + { + "epoch": 7.4316939890710385, + "grad_norm": 1.9917782992706634e-05, + "learning_rate": 1.4268366727383122e-05, + "loss": 0.0, + "step": 6800 + }, + { + "epoch": 7.437158469945355, + "grad_norm": 8.511933992849663e-05, + "learning_rate": 1.4238008500303581e-05, + "loss": 0.0, + "step": 6805 + }, + { + "epoch": 7.442622950819672, + "grad_norm": 1.9273149518994614e-05, + "learning_rate": 1.4207650273224044e-05, + "loss": 0.0, + "step": 6810 + }, + { + "epoch": 7.448087431693989, + "grad_norm": 0.0007372593972831964, + "learning_rate": 1.4177292046144505e-05, + "loss": 0.0, + "step": 6815 + }, + { + "epoch": 7.453551912568306, + "grad_norm": 1.7892272808239795e-05, + "learning_rate": 1.4146933819064967e-05, + "loss": 0.0, + "step": 6820 + }, + { + "epoch": 7.459016393442623, + "grad_norm": 1.3487725482264068e-05, + "learning_rate": 1.411657559198543e-05, + "loss": 0.0, + "step": 6825 + }, + { + "epoch": 7.46448087431694, + "grad_norm": 1.5921907106530853e-05, + "learning_rate": 1.408621736490589e-05, + "loss": 0.0, + "step": 6830 + }, + { + "epoch": 7.469945355191257, + "grad_norm": 1.7433028915547766e-05, + "learning_rate": 1.4055859137826352e-05, + "loss": 0.0, + "step": 6835 + }, + { + "epoch": 7.475409836065574, + "grad_norm": 2.843215588654857e-05, + "learning_rate": 1.4025500910746814e-05, + "loss": 0.0, + "step": 6840 + }, + { + "epoch": 7.48087431693989, + "grad_norm": 5.431610770756379e-05, + "learning_rate": 1.3995142683667276e-05, + "loss": 0.0, + "step": 6845 + }, + { + "epoch": 7.4863387978142075, + "grad_norm": 1.080147376342211e-05, + "learning_rate": 1.3964784456587737e-05, + "loss": 0.0, + "step": 6850 + }, + { + "epoch": 7.491803278688525, + "grad_norm": 1.6685771697666496e-05, + "learning_rate": 1.3934426229508196e-05, + "loss": 0.0, + "step": 6855 + }, + { + "epoch": 7.497267759562842, + "grad_norm": 2.097435026371386e-05, + "learning_rate": 1.3904068002428657e-05, + "loss": 0.0, + "step": 6860 + }, + { + "epoch": 7.502732240437158, + "grad_norm": 1.7133286746684462e-05, + "learning_rate": 1.387370977534912e-05, + "loss": 0.0, + "step": 6865 + }, + { + "epoch": 7.508196721311475, + "grad_norm": 0.0006649263086728752, + "learning_rate": 1.3843351548269581e-05, + "loss": 0.0, + "step": 6870 + }, + { + "epoch": 7.5136612021857925, + "grad_norm": 2.0749612303916365e-05, + "learning_rate": 1.3812993321190042e-05, + "loss": 0.0, + "step": 6875 + }, + { + "epoch": 7.51912568306011, + "grad_norm": 2.099968696711585e-05, + "learning_rate": 1.3782635094110505e-05, + "loss": 0.0, + "step": 6880 + }, + { + "epoch": 7.524590163934426, + "grad_norm": 1.3263029359222855e-05, + "learning_rate": 1.3752276867030966e-05, + "loss": 0.0, + "step": 6885 + }, + { + "epoch": 7.530054644808743, + "grad_norm": 1.159540352091426e-05, + "learning_rate": 1.3721918639951427e-05, + "loss": 0.0, + "step": 6890 + }, + { + "epoch": 7.53551912568306, + "grad_norm": 1.8703914975048974e-05, + "learning_rate": 1.369156041287189e-05, + "loss": 0.0, + "step": 6895 + }, + { + "epoch": 7.540983606557377, + "grad_norm": 1.2056356354150921e-05, + "learning_rate": 1.3661202185792351e-05, + "loss": 0.0, + "step": 6900 + }, + { + "epoch": 7.546448087431694, + "grad_norm": 6.520111492136493e-05, + "learning_rate": 1.363084395871281e-05, + "loss": 0.0, + "step": 6905 + }, + { + "epoch": 7.551912568306011, + "grad_norm": 1.4154918972053565e-05, + "learning_rate": 1.3600485731633272e-05, + "loss": 0.0, + "step": 6910 + }, + { + "epoch": 7.557377049180328, + "grad_norm": 7.266816828632727e-05, + "learning_rate": 1.3570127504553735e-05, + "loss": 0.0, + "step": 6915 + }, + { + "epoch": 7.562841530054644, + "grad_norm": 1.5945193808875047e-05, + "learning_rate": 1.3539769277474196e-05, + "loss": 0.0, + "step": 6920 + }, + { + "epoch": 7.5683060109289615, + "grad_norm": 9.222345397574827e-06, + "learning_rate": 1.3509411050394657e-05, + "loss": 0.0, + "step": 6925 + }, + { + "epoch": 7.573770491803279, + "grad_norm": 1.5671421351726167e-05, + "learning_rate": 1.347905282331512e-05, + "loss": 0.0, + "step": 6930 + }, + { + "epoch": 7.579234972677596, + "grad_norm": 2.936273995146621e-05, + "learning_rate": 1.3448694596235581e-05, + "loss": 0.0, + "step": 6935 + }, + { + "epoch": 7.584699453551913, + "grad_norm": 0.0006553413695655763, + "learning_rate": 1.3418336369156042e-05, + "loss": 0.0, + "step": 6940 + }, + { + "epoch": 7.590163934426229, + "grad_norm": 1.565740785736125e-05, + "learning_rate": 1.3387978142076505e-05, + "loss": 0.0, + "step": 6945 + }, + { + "epoch": 7.595628415300546, + "grad_norm": 9.471644261793699e-06, + "learning_rate": 1.3357619914996966e-05, + "loss": 0.0, + "step": 6950 + }, + { + "epoch": 7.601092896174864, + "grad_norm": 1.3549584764405154e-05, + "learning_rate": 1.3327261687917426e-05, + "loss": 0.0, + "step": 6955 + }, + { + "epoch": 7.60655737704918, + "grad_norm": 1.6184301784960553e-05, + "learning_rate": 1.3296903460837887e-05, + "loss": 0.0, + "step": 6960 + }, + { + "epoch": 7.612021857923497, + "grad_norm": 8.797870577836875e-06, + "learning_rate": 1.3266545233758348e-05, + "loss": 0.0, + "step": 6965 + }, + { + "epoch": 7.617486338797814, + "grad_norm": 1.9648776287795044e-05, + "learning_rate": 1.323618700667881e-05, + "loss": 0.0, + "step": 6970 + }, + { + "epoch": 7.622950819672131, + "grad_norm": 0.0006378691759891808, + "learning_rate": 1.3205828779599272e-05, + "loss": 0.0, + "step": 6975 + }, + { + "epoch": 7.628415300546449, + "grad_norm": 2.147053055523429e-05, + "learning_rate": 1.3175470552519733e-05, + "loss": 0.0, + "step": 6980 + }, + { + "epoch": 7.633879781420765, + "grad_norm": 1.374123621644685e-05, + "learning_rate": 1.3145112325440196e-05, + "loss": 0.0, + "step": 6985 + }, + { + "epoch": 7.639344262295082, + "grad_norm": 1.2073891412001103e-05, + "learning_rate": 1.3114754098360657e-05, + "loss": 0.0, + "step": 6990 + }, + { + "epoch": 7.644808743169399, + "grad_norm": 2.7077034246758558e-05, + "learning_rate": 1.3084395871281118e-05, + "loss": 0.0, + "step": 6995 + }, + { + "epoch": 7.6502732240437155, + "grad_norm": 1.192458876175806e-05, + "learning_rate": 1.305403764420158e-05, + "loss": 0.0, + "step": 7000 + }, + { + "epoch": 7.655737704918033, + "grad_norm": 1.2989978131372482e-05, + "learning_rate": 1.302367941712204e-05, + "loss": 0.0, + "step": 7005 + }, + { + "epoch": 7.66120218579235, + "grad_norm": 7.656402158318087e-05, + "learning_rate": 1.2993321190042501e-05, + "loss": 0.0, + "step": 7010 + }, + { + "epoch": 7.666666666666667, + "grad_norm": 1.8988159354194067e-05, + "learning_rate": 1.2962962962962962e-05, + "loss": 0.0, + "step": 7015 + }, + { + "epoch": 7.672131147540983, + "grad_norm": 1.1558569894987158e-05, + "learning_rate": 1.2932604735883425e-05, + "loss": 0.0, + "step": 7020 + }, + { + "epoch": 7.6775956284153, + "grad_norm": 1.1080450349254534e-05, + "learning_rate": 1.2902246508803886e-05, + "loss": 0.0, + "step": 7025 + }, + { + "epoch": 7.683060109289618, + "grad_norm": 9.0610474217101e-06, + "learning_rate": 1.2871888281724348e-05, + "loss": 0.0, + "step": 7030 + }, + { + "epoch": 7.688524590163935, + "grad_norm": 2.1570349417743273e-05, + "learning_rate": 1.284153005464481e-05, + "loss": 0.0, + "step": 7035 + }, + { + "epoch": 7.693989071038251, + "grad_norm": 8.527462341589853e-06, + "learning_rate": 1.2811171827565271e-05, + "loss": 0.0, + "step": 7040 + }, + { + "epoch": 7.699453551912568, + "grad_norm": 3.9674927393207327e-05, + "learning_rate": 1.2780813600485733e-05, + "loss": 0.191, + "step": 7045 + }, + { + "epoch": 7.704918032786885, + "grad_norm": 1.570533640915528e-05, + "learning_rate": 1.2750455373406195e-05, + "loss": 0.0, + "step": 7050 + }, + { + "epoch": 7.7103825136612025, + "grad_norm": 9.140562178799883e-05, + "learning_rate": 1.2720097146326653e-05, + "loss": 0.0, + "step": 7055 + }, + { + "epoch": 7.715846994535519, + "grad_norm": 4.640642509912141e-05, + "learning_rate": 1.2689738919247116e-05, + "loss": 0.0, + "step": 7060 + }, + { + "epoch": 7.721311475409836, + "grad_norm": 2.3676704586250708e-05, + "learning_rate": 1.2659380692167577e-05, + "loss": 0.0, + "step": 7065 + }, + { + "epoch": 7.726775956284153, + "grad_norm": 0.049377378076314926, + "learning_rate": 1.2629022465088038e-05, + "loss": 0.0001, + "step": 7070 + }, + { + "epoch": 7.73224043715847, + "grad_norm": 1.8265061953570694e-05, + "learning_rate": 1.2598664238008501e-05, + "loss": 0.0, + "step": 7075 + }, + { + "epoch": 7.737704918032787, + "grad_norm": 1.7306145309703425e-05, + "learning_rate": 1.2568306010928962e-05, + "loss": 0.0, + "step": 7080 + }, + { + "epoch": 7.743169398907104, + "grad_norm": 0.0007131117163226008, + "learning_rate": 1.2537947783849423e-05, + "loss": 0.0, + "step": 7085 + }, + { + "epoch": 7.748633879781421, + "grad_norm": 4.315848491387442e-05, + "learning_rate": 1.2507589556769886e-05, + "loss": 0.0, + "step": 7090 + }, + { + "epoch": 7.754098360655737, + "grad_norm": 0.0017947383457794785, + "learning_rate": 1.2477231329690346e-05, + "loss": 0.0, + "step": 7095 + }, + { + "epoch": 7.759562841530054, + "grad_norm": 0.0001714023237582296, + "learning_rate": 1.2446873102610808e-05, + "loss": 0.0002, + "step": 7100 + }, + { + "epoch": 7.7650273224043715, + "grad_norm": 3.838367047137581e-05, + "learning_rate": 1.241651487553127e-05, + "loss": 0.0, + "step": 7105 + }, + { + "epoch": 7.770491803278689, + "grad_norm": 4.043810986331664e-05, + "learning_rate": 1.238615664845173e-05, + "loss": 0.0, + "step": 7110 + }, + { + "epoch": 7.775956284153006, + "grad_norm": 1.842474921431858e-05, + "learning_rate": 1.2355798421372194e-05, + "loss": 0.0, + "step": 7115 + }, + { + "epoch": 7.781420765027322, + "grad_norm": 1.2997180419915821e-05, + "learning_rate": 1.2325440194292653e-05, + "loss": 0.0, + "step": 7120 + }, + { + "epoch": 7.786885245901639, + "grad_norm": 1.204277214128524e-05, + "learning_rate": 1.2295081967213116e-05, + "loss": 0.0, + "step": 7125 + }, + { + "epoch": 7.7923497267759565, + "grad_norm": 1.7841040971688926e-05, + "learning_rate": 1.2264723740133577e-05, + "loss": 0.0, + "step": 7130 + }, + { + "epoch": 7.797814207650273, + "grad_norm": 8.681177132530138e-05, + "learning_rate": 1.2234365513054038e-05, + "loss": 0.0, + "step": 7135 + }, + { + "epoch": 7.80327868852459, + "grad_norm": 1.538177821203135e-05, + "learning_rate": 1.22040072859745e-05, + "loss": 0.0, + "step": 7140 + }, + { + "epoch": 7.808743169398907, + "grad_norm": 1.1661175449262373e-05, + "learning_rate": 1.217364905889496e-05, + "loss": 0.0, + "step": 7145 + }, + { + "epoch": 7.814207650273224, + "grad_norm": 1.3177917026041541e-05, + "learning_rate": 1.2143290831815423e-05, + "loss": 0.0001, + "step": 7150 + }, + { + "epoch": 7.8196721311475414, + "grad_norm": 1.9387889551580884e-05, + "learning_rate": 1.2112932604735884e-05, + "loss": 0.0, + "step": 7155 + }, + { + "epoch": 7.825136612021858, + "grad_norm": 1.555710332468152e-05, + "learning_rate": 1.2082574377656345e-05, + "loss": 0.0, + "step": 7160 + }, + { + "epoch": 7.830601092896175, + "grad_norm": 1.2738814803014975e-05, + "learning_rate": 1.2052216150576808e-05, + "loss": 0.0, + "step": 7165 + }, + { + "epoch": 7.836065573770492, + "grad_norm": 0.00013192101323511451, + "learning_rate": 1.2021857923497268e-05, + "loss": 0.0, + "step": 7170 + }, + { + "epoch": 7.841530054644808, + "grad_norm": 1.0073224075313192e-05, + "learning_rate": 1.1991499696417729e-05, + "loss": 0.0, + "step": 7175 + }, + { + "epoch": 7.8469945355191255, + "grad_norm": 1.0778838259284385e-05, + "learning_rate": 1.1961141469338192e-05, + "loss": 0.0, + "step": 7180 + }, + { + "epoch": 7.852459016393443, + "grad_norm": 2.35039769904688e-05, + "learning_rate": 1.1930783242258653e-05, + "loss": 0.0, + "step": 7185 + }, + { + "epoch": 7.85792349726776, + "grad_norm": 1.8142010958399624e-05, + "learning_rate": 1.1900425015179116e-05, + "loss": 0.0, + "step": 7190 + }, + { + "epoch": 7.863387978142076, + "grad_norm": 1.2536640497273766e-05, + "learning_rate": 1.1870066788099575e-05, + "loss": 0.0, + "step": 7195 + }, + { + "epoch": 7.868852459016393, + "grad_norm": 1.3795261111226864e-05, + "learning_rate": 1.1839708561020036e-05, + "loss": 0.0, + "step": 7200 + }, + { + "epoch": 7.8743169398907105, + "grad_norm": 1.054769836628111e-05, + "learning_rate": 1.1809350333940499e-05, + "loss": 0.0, + "step": 7205 + }, + { + "epoch": 7.879781420765028, + "grad_norm": 3.637406916823238e-05, + "learning_rate": 1.177899210686096e-05, + "loss": 0.0, + "step": 7210 + }, + { + "epoch": 7.885245901639344, + "grad_norm": 1.304059424001025e-05, + "learning_rate": 1.1748633879781421e-05, + "loss": 0.0, + "step": 7215 + }, + { + "epoch": 7.890710382513661, + "grad_norm": 1.0693567674024962e-05, + "learning_rate": 1.1718275652701882e-05, + "loss": 0.0, + "step": 7220 + }, + { + "epoch": 7.896174863387978, + "grad_norm": 1.3089750609651674e-05, + "learning_rate": 1.1687917425622343e-05, + "loss": 0.0, + "step": 7225 + }, + { + "epoch": 7.901639344262295, + "grad_norm": 1.2414632692525629e-05, + "learning_rate": 1.1657559198542806e-05, + "loss": 0.0, + "step": 7230 + }, + { + "epoch": 7.907103825136612, + "grad_norm": 0.0006229742430150509, + "learning_rate": 1.1627200971463267e-05, + "loss": 0.0, + "step": 7235 + }, + { + "epoch": 7.912568306010929, + "grad_norm": 1.361664089927217e-05, + "learning_rate": 1.1596842744383728e-05, + "loss": 0.0, + "step": 7240 + }, + { + "epoch": 7.918032786885246, + "grad_norm": 1.4006955098011531e-05, + "learning_rate": 1.156648451730419e-05, + "loss": 0.0, + "step": 7245 + }, + { + "epoch": 7.923497267759563, + "grad_norm": 1.6826483260956593e-05, + "learning_rate": 1.153612629022465e-05, + "loss": 0.0, + "step": 7250 + }, + { + "epoch": 7.9289617486338795, + "grad_norm": 1.3367481187742669e-05, + "learning_rate": 1.1505768063145114e-05, + "loss": 0.0, + "step": 7255 + }, + { + "epoch": 7.934426229508197, + "grad_norm": 2.5410168746020645e-05, + "learning_rate": 1.1475409836065575e-05, + "loss": 0.0, + "step": 7260 + }, + { + "epoch": 7.939890710382514, + "grad_norm": 1.2079419320798479e-05, + "learning_rate": 1.1445051608986036e-05, + "loss": 0.0001, + "step": 7265 + }, + { + "epoch": 7.945355191256831, + "grad_norm": 8.898069609131198e-06, + "learning_rate": 1.1414693381906497e-05, + "loss": 0.0, + "step": 7270 + }, + { + "epoch": 7.950819672131147, + "grad_norm": 1.2275562767172232e-05, + "learning_rate": 1.1384335154826958e-05, + "loss": 0.0, + "step": 7275 + }, + { + "epoch": 7.956284153005464, + "grad_norm": 1.0254681910737418e-05, + "learning_rate": 1.135397692774742e-05, + "loss": 0.0, + "step": 7280 + }, + { + "epoch": 7.961748633879782, + "grad_norm": 1.1392396118026227e-05, + "learning_rate": 1.1323618700667882e-05, + "loss": 0.0, + "step": 7285 + }, + { + "epoch": 7.967213114754099, + "grad_norm": 3.0942966986913234e-05, + "learning_rate": 1.1293260473588343e-05, + "loss": 0.0, + "step": 7290 + }, + { + "epoch": 7.972677595628415, + "grad_norm": 8.617805178801063e-06, + "learning_rate": 1.1262902246508804e-05, + "loss": 0.0, + "step": 7295 + }, + { + "epoch": 7.978142076502732, + "grad_norm": 1.213542327604955e-05, + "learning_rate": 1.1232544019429265e-05, + "loss": 0.0, + "step": 7300 + }, + { + "epoch": 7.983606557377049, + "grad_norm": 9.413888619747013e-06, + "learning_rate": 1.1202185792349727e-05, + "loss": 0.0, + "step": 7305 + }, + { + "epoch": 7.989071038251366, + "grad_norm": 1.631492887099739e-05, + "learning_rate": 1.117182756527019e-05, + "loss": 0.0, + "step": 7310 + }, + { + "epoch": 7.994535519125683, + "grad_norm": 2.603645043564029e-05, + "learning_rate": 1.114146933819065e-05, + "loss": 0.0, + "step": 7315 + }, + { + "epoch": 8.0, + "grad_norm": 1.8122764231520705e-05, + "learning_rate": 1.1111111111111112e-05, + "loss": 0.0, + "step": 7320 + }, + { + "epoch": 8.0, + "eval_loss": 1.6960844959612587e-06, + "eval_runtime": 658.0485, + "eval_samples_per_second": 11.118, + "eval_steps_per_second": 1.39, + "step": 7320 + }, + { + "epoch": 8.005464480874316, + "grad_norm": 1.7527658201288432e-05, + "learning_rate": 1.1080752884031573e-05, + "loss": 0.0, + "step": 7325 + }, + { + "epoch": 8.010928961748634, + "grad_norm": 1.5412620996357873e-05, + "learning_rate": 1.1050394656952034e-05, + "loss": 0.0, + "step": 7330 + }, + { + "epoch": 8.01639344262295, + "grad_norm": 1.1587193512241356e-05, + "learning_rate": 1.1020036429872497e-05, + "loss": 0.0, + "step": 7335 + }, + { + "epoch": 8.021857923497267, + "grad_norm": 2.150508043996524e-05, + "learning_rate": 1.0989678202792958e-05, + "loss": 0.0, + "step": 7340 + }, + { + "epoch": 8.027322404371585, + "grad_norm": 9.768027666723356e-06, + "learning_rate": 1.0959319975713419e-05, + "loss": 0.0, + "step": 7345 + }, + { + "epoch": 8.032786885245901, + "grad_norm": 1.269577751372708e-05, + "learning_rate": 1.092896174863388e-05, + "loss": 0.0, + "step": 7350 + }, + { + "epoch": 8.03825136612022, + "grad_norm": 1.0717486475186888e-05, + "learning_rate": 1.0898603521554341e-05, + "loss": 0.0, + "step": 7355 + }, + { + "epoch": 8.043715846994536, + "grad_norm": 1.7090585970436223e-05, + "learning_rate": 1.0868245294474804e-05, + "loss": 0.0, + "step": 7360 + }, + { + "epoch": 8.049180327868852, + "grad_norm": 1.6057352695497684e-05, + "learning_rate": 1.0837887067395265e-05, + "loss": 0.0, + "step": 7365 + }, + { + "epoch": 8.05464480874317, + "grad_norm": 1.3813902114634402e-05, + "learning_rate": 1.0807528840315725e-05, + "loss": 0.0, + "step": 7370 + }, + { + "epoch": 8.060109289617486, + "grad_norm": 1.1777274266933091e-05, + "learning_rate": 1.0777170613236187e-05, + "loss": 0.0, + "step": 7375 + }, + { + "epoch": 8.065573770491802, + "grad_norm": 1.597378468431998e-05, + "learning_rate": 1.0746812386156649e-05, + "loss": 0.0, + "step": 7380 + }, + { + "epoch": 8.07103825136612, + "grad_norm": 5.7681496400618926e-05, + "learning_rate": 1.0716454159077111e-05, + "loss": 0.0, + "step": 7385 + }, + { + "epoch": 8.076502732240437, + "grad_norm": 2.5443077902309597e-05, + "learning_rate": 1.0686095931997573e-05, + "loss": 0.0, + "step": 7390 + }, + { + "epoch": 8.081967213114755, + "grad_norm": 1.2299603440624196e-05, + "learning_rate": 1.0655737704918032e-05, + "loss": 0.0, + "step": 7395 + }, + { + "epoch": 8.087431693989071, + "grad_norm": 1.8994456695509143e-05, + "learning_rate": 1.0625379477838495e-05, + "loss": 0.0, + "step": 7400 + }, + { + "epoch": 8.092896174863387, + "grad_norm": 1.2942989997100085e-05, + "learning_rate": 1.0595021250758956e-05, + "loss": 0.0, + "step": 7405 + }, + { + "epoch": 8.098360655737705, + "grad_norm": 1.2755004718201235e-05, + "learning_rate": 1.0564663023679417e-05, + "loss": 0.0, + "step": 7410 + }, + { + "epoch": 8.103825136612022, + "grad_norm": 1.5904519386822358e-05, + "learning_rate": 1.053430479659988e-05, + "loss": 0.0, + "step": 7415 + }, + { + "epoch": 8.109289617486338, + "grad_norm": 8.2708374975482e-06, + "learning_rate": 1.050394656952034e-05, + "loss": 0.0, + "step": 7420 + }, + { + "epoch": 8.114754098360656, + "grad_norm": 0.0006852184887975454, + "learning_rate": 1.0473588342440802e-05, + "loss": 0.0, + "step": 7425 + }, + { + "epoch": 8.120218579234972, + "grad_norm": 0.00019752304069697857, + "learning_rate": 1.0443230115361263e-05, + "loss": 0.0, + "step": 7430 + }, + { + "epoch": 8.12568306010929, + "grad_norm": 5.792970841866918e-05, + "learning_rate": 1.0412871888281724e-05, + "loss": 0.0, + "step": 7435 + }, + { + "epoch": 8.131147540983607, + "grad_norm": 8.440183592028916e-06, + "learning_rate": 1.0382513661202187e-05, + "loss": 0.0, + "step": 7440 + }, + { + "epoch": 8.136612021857923, + "grad_norm": 1.4333023500512354e-05, + "learning_rate": 1.0352155434122647e-05, + "loss": 0.0, + "step": 7445 + }, + { + "epoch": 8.142076502732241, + "grad_norm": 2.101344034599606e-05, + "learning_rate": 1.032179720704311e-05, + "loss": 0.0, + "step": 7450 + }, + { + "epoch": 8.147540983606557, + "grad_norm": 1.0169248525926378e-05, + "learning_rate": 1.029143897996357e-05, + "loss": 0.0, + "step": 7455 + }, + { + "epoch": 8.153005464480874, + "grad_norm": 1.284426252823323e-05, + "learning_rate": 1.0261080752884032e-05, + "loss": 0.0, + "step": 7460 + }, + { + "epoch": 8.158469945355192, + "grad_norm": 2.392159149167128e-05, + "learning_rate": 1.0230722525804495e-05, + "loss": 0.0, + "step": 7465 + }, + { + "epoch": 8.163934426229508, + "grad_norm": 2.8029156965203583e-05, + "learning_rate": 1.0200364298724956e-05, + "loss": 0.0, + "step": 7470 + }, + { + "epoch": 8.169398907103826, + "grad_norm": 0.0004401684273034334, + "learning_rate": 1.0170006071645417e-05, + "loss": 0.0, + "step": 7475 + }, + { + "epoch": 8.174863387978142, + "grad_norm": 8.851263373799156e-06, + "learning_rate": 1.0139647844565878e-05, + "loss": 0.0, + "step": 7480 + }, + { + "epoch": 8.180327868852459, + "grad_norm": 0.00010571895109023899, + "learning_rate": 1.0109289617486339e-05, + "loss": 0.0, + "step": 7485 + }, + { + "epoch": 8.185792349726777, + "grad_norm": 1.1960997653659433e-05, + "learning_rate": 1.0078931390406802e-05, + "loss": 0.0, + "step": 7490 + }, + { + "epoch": 8.191256830601093, + "grad_norm": 2.6386380341136828e-05, + "learning_rate": 1.0048573163327263e-05, + "loss": 0.0, + "step": 7495 + }, + { + "epoch": 8.19672131147541, + "grad_norm": 2.395056617388036e-05, + "learning_rate": 1.0018214936247722e-05, + "loss": 0.0, + "step": 7500 + }, + { + "epoch": 8.202185792349727, + "grad_norm": 7.888736035965849e-06, + "learning_rate": 9.987856709168185e-06, + "loss": 0.0, + "step": 7505 + }, + { + "epoch": 8.207650273224044, + "grad_norm": 1.250679724762449e-05, + "learning_rate": 9.957498482088646e-06, + "loss": 0.0, + "step": 7510 + }, + { + "epoch": 8.21311475409836, + "grad_norm": 6.540030153701082e-05, + "learning_rate": 9.927140255009108e-06, + "loss": 0.0, + "step": 7515 + }, + { + "epoch": 8.218579234972678, + "grad_norm": 2.1273477614158764e-05, + "learning_rate": 9.89678202792957e-06, + "loss": 0.0, + "step": 7520 + }, + { + "epoch": 8.224043715846994, + "grad_norm": 1.247999352926854e-05, + "learning_rate": 9.86642380085003e-06, + "loss": 0.0, + "step": 7525 + }, + { + "epoch": 8.229508196721312, + "grad_norm": 1.9308432456455193e-05, + "learning_rate": 9.836065573770493e-06, + "loss": 0.0, + "step": 7530 + }, + { + "epoch": 8.234972677595628, + "grad_norm": 1.2912430975120515e-05, + "learning_rate": 9.805707346690954e-06, + "loss": 0.0, + "step": 7535 + }, + { + "epoch": 8.240437158469945, + "grad_norm": 1.1097822607553098e-05, + "learning_rate": 9.775349119611415e-06, + "loss": 0.0, + "step": 7540 + }, + { + "epoch": 8.245901639344263, + "grad_norm": 3.416691470192745e-05, + "learning_rate": 9.744990892531878e-06, + "loss": 0.0, + "step": 7545 + }, + { + "epoch": 8.251366120218579, + "grad_norm": 1.2960584172105882e-05, + "learning_rate": 9.714632665452337e-06, + "loss": 0.0, + "step": 7550 + }, + { + "epoch": 8.256830601092895, + "grad_norm": 5.391754166339524e-05, + "learning_rate": 9.6842744383728e-06, + "loss": 0.0, + "step": 7555 + }, + { + "epoch": 8.262295081967213, + "grad_norm": 1.9058212274103425e-05, + "learning_rate": 9.653916211293261e-06, + "loss": 0.0, + "step": 7560 + }, + { + "epoch": 8.26775956284153, + "grad_norm": 8.722054190002382e-06, + "learning_rate": 9.623557984213722e-06, + "loss": 0.0, + "step": 7565 + }, + { + "epoch": 8.273224043715848, + "grad_norm": 1.2237047485541552e-05, + "learning_rate": 9.593199757134185e-06, + "loss": 0.0, + "step": 7570 + }, + { + "epoch": 8.278688524590164, + "grad_norm": 5.237998266238719e-05, + "learning_rate": 9.562841530054644e-06, + "loss": 0.0, + "step": 7575 + }, + { + "epoch": 8.28415300546448, + "grad_norm": 9.937410140992142e-06, + "learning_rate": 9.532483302975107e-06, + "loss": 0.0, + "step": 7580 + }, + { + "epoch": 8.289617486338798, + "grad_norm": 1.3053542716079392e-05, + "learning_rate": 9.502125075895568e-06, + "loss": 0.0, + "step": 7585 + }, + { + "epoch": 8.295081967213115, + "grad_norm": 2.3322838387684897e-05, + "learning_rate": 9.47176684881603e-06, + "loss": 0.0, + "step": 7590 + }, + { + "epoch": 8.300546448087431, + "grad_norm": 0.000983793055638671, + "learning_rate": 9.441408621736492e-06, + "loss": 0.0, + "step": 7595 + }, + { + "epoch": 8.306010928961749, + "grad_norm": 0.00012811202032025903, + "learning_rate": 9.411050394656952e-06, + "loss": 0.0, + "step": 7600 + }, + { + "epoch": 8.311475409836065, + "grad_norm": 1.6763411622378044e-05, + "learning_rate": 9.380692167577413e-06, + "loss": 0.0, + "step": 7605 + }, + { + "epoch": 8.316939890710383, + "grad_norm": 3.3423602872062474e-05, + "learning_rate": 9.350333940497876e-06, + "loss": 0.0, + "step": 7610 + }, + { + "epoch": 8.3224043715847, + "grad_norm": 1.1610113688220736e-05, + "learning_rate": 9.319975713418337e-06, + "loss": 0.0, + "step": 7615 + }, + { + "epoch": 8.327868852459016, + "grad_norm": 2.3615129975951277e-05, + "learning_rate": 9.2896174863388e-06, + "loss": 0.0, + "step": 7620 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 7.937882401165552e-06, + "learning_rate": 9.259259259259259e-06, + "loss": 0.0, + "step": 7625 + }, + { + "epoch": 8.33879781420765, + "grad_norm": 1.095014886232093e-05, + "learning_rate": 9.22890103217972e-06, + "loss": 0.0, + "step": 7630 + }, + { + "epoch": 8.344262295081966, + "grad_norm": 7.690862730669323e-06, + "learning_rate": 9.198542805100183e-06, + "loss": 0.0, + "step": 7635 + }, + { + "epoch": 8.349726775956285, + "grad_norm": 1.6766489352448843e-05, + "learning_rate": 9.168184578020644e-06, + "loss": 0.0, + "step": 7640 + }, + { + "epoch": 8.3551912568306, + "grad_norm": 0.0001764387561706826, + "learning_rate": 9.137826350941105e-06, + "loss": 0.0001, + "step": 7645 + }, + { + "epoch": 8.360655737704919, + "grad_norm": 7.887566607678309e-05, + "learning_rate": 9.107468123861566e-06, + "loss": 0.0, + "step": 7650 + }, + { + "epoch": 8.366120218579235, + "grad_norm": 1.2673816854658071e-05, + "learning_rate": 9.077109896782028e-06, + "loss": 0.0, + "step": 7655 + }, + { + "epoch": 8.371584699453551, + "grad_norm": 4.3066185753559694e-05, + "learning_rate": 9.04675166970249e-06, + "loss": 0.0, + "step": 7660 + }, + { + "epoch": 8.37704918032787, + "grad_norm": 1.3214439604780637e-05, + "learning_rate": 9.016393442622952e-06, + "loss": 0.0, + "step": 7665 + }, + { + "epoch": 8.382513661202186, + "grad_norm": 1.4271675354393665e-05, + "learning_rate": 8.986035215543413e-06, + "loss": 0.0, + "step": 7670 + }, + { + "epoch": 8.387978142076502, + "grad_norm": 0.00020471119205467403, + "learning_rate": 8.955676988463874e-06, + "loss": 0.0, + "step": 7675 + }, + { + "epoch": 8.39344262295082, + "grad_norm": 5.713020800612867e-05, + "learning_rate": 8.925318761384335e-06, + "loss": 0.0, + "step": 7680 + }, + { + "epoch": 8.398907103825136, + "grad_norm": 0.0001489708956796676, + "learning_rate": 8.894960534304798e-06, + "loss": 0.0, + "step": 7685 + }, + { + "epoch": 8.404371584699454, + "grad_norm": 2.921839040936902e-05, + "learning_rate": 8.864602307225259e-06, + "loss": 0.0, + "step": 7690 + }, + { + "epoch": 8.40983606557377, + "grad_norm": 9.445561772736255e-06, + "learning_rate": 8.83424408014572e-06, + "loss": 0.0, + "step": 7695 + }, + { + "epoch": 8.415300546448087, + "grad_norm": 1.582982440595515e-05, + "learning_rate": 8.803885853066181e-06, + "loss": 0.0, + "step": 7700 + }, + { + "epoch": 8.420765027322405, + "grad_norm": 1.92226125363959e-05, + "learning_rate": 8.773527625986642e-06, + "loss": 0.0, + "step": 7705 + }, + { + "epoch": 8.426229508196721, + "grad_norm": 5.412556856754236e-05, + "learning_rate": 8.743169398907103e-06, + "loss": 0.0, + "step": 7710 + }, + { + "epoch": 8.431693989071038, + "grad_norm": 2.1962119717500173e-05, + "learning_rate": 8.712811171827566e-06, + "loss": 0.0, + "step": 7715 + }, + { + "epoch": 8.437158469945356, + "grad_norm": 0.0006890874356031418, + "learning_rate": 8.682452944748027e-06, + "loss": 0.0, + "step": 7720 + }, + { + "epoch": 8.442622950819672, + "grad_norm": 2.5473029381828383e-05, + "learning_rate": 8.652094717668488e-06, + "loss": 0.0, + "step": 7725 + }, + { + "epoch": 8.448087431693988, + "grad_norm": 8.04679439170286e-06, + "learning_rate": 8.62173649058895e-06, + "loss": 0.0, + "step": 7730 + }, + { + "epoch": 8.453551912568306, + "grad_norm": 0.0006084745400585234, + "learning_rate": 8.59137826350941e-06, + "loss": 0.0, + "step": 7735 + }, + { + "epoch": 8.459016393442623, + "grad_norm": 1.0556545021245256e-05, + "learning_rate": 8.561020036429874e-06, + "loss": 0.0, + "step": 7740 + }, + { + "epoch": 8.46448087431694, + "grad_norm": 2.4723283786443062e-05, + "learning_rate": 8.530661809350335e-06, + "loss": 0.0, + "step": 7745 + }, + { + "epoch": 8.469945355191257, + "grad_norm": 1.2455606338335201e-05, + "learning_rate": 8.500303582270796e-06, + "loss": 0.0, + "step": 7750 + }, + { + "epoch": 8.475409836065573, + "grad_norm": 4.926376277580857e-05, + "learning_rate": 8.469945355191257e-06, + "loss": 0.0, + "step": 7755 + }, + { + "epoch": 8.480874316939891, + "grad_norm": 1.2116083780711051e-05, + "learning_rate": 8.439587128111718e-06, + "loss": 0.0, + "step": 7760 + }, + { + "epoch": 8.486338797814208, + "grad_norm": 0.0004952167510055006, + "learning_rate": 8.409228901032181e-06, + "loss": 0.0, + "step": 7765 + }, + { + "epoch": 8.491803278688524, + "grad_norm": 1.5964784324751236e-05, + "learning_rate": 8.378870673952642e-06, + "loss": 0.0, + "step": 7770 + }, + { + "epoch": 8.497267759562842, + "grad_norm": 2.0253926777513698e-05, + "learning_rate": 8.348512446873103e-06, + "loss": 0.0, + "step": 7775 + }, + { + "epoch": 8.502732240437158, + "grad_norm": 4.6292418119264767e-05, + "learning_rate": 8.318154219793564e-06, + "loss": 0.0, + "step": 7780 + }, + { + "epoch": 8.508196721311476, + "grad_norm": 1.7785914678825065e-05, + "learning_rate": 8.287795992714025e-06, + "loss": 0.0, + "step": 7785 + }, + { + "epoch": 8.513661202185792, + "grad_norm": 0.0006781655829399824, + "learning_rate": 8.257437765634488e-06, + "loss": 0.0, + "step": 7790 + }, + { + "epoch": 8.519125683060109, + "grad_norm": 9.413172847416718e-06, + "learning_rate": 8.22707953855495e-06, + "loss": 0.0, + "step": 7795 + }, + { + "epoch": 8.524590163934427, + "grad_norm": 0.00011244224879192188, + "learning_rate": 8.196721311475409e-06, + "loss": 0.0, + "step": 7800 + }, + { + "epoch": 8.530054644808743, + "grad_norm": 2.30734749493422e-05, + "learning_rate": 8.166363084395872e-06, + "loss": 0.0, + "step": 7805 + }, + { + "epoch": 8.53551912568306, + "grad_norm": 9.177093488688115e-06, + "learning_rate": 8.136004857316333e-06, + "loss": 0.0, + "step": 7810 + }, + { + "epoch": 8.540983606557377, + "grad_norm": 0.00042046752059832215, + "learning_rate": 8.105646630236796e-06, + "loss": 0.0, + "step": 7815 + }, + { + "epoch": 8.546448087431694, + "grad_norm": 0.0004966892302036285, + "learning_rate": 8.075288403157257e-06, + "loss": 0.0, + "step": 7820 + }, + { + "epoch": 8.551912568306012, + "grad_norm": 3.082709008594975e-05, + "learning_rate": 8.044930176077716e-06, + "loss": 0.0, + "step": 7825 + }, + { + "epoch": 8.557377049180328, + "grad_norm": 2.252473495900631e-05, + "learning_rate": 8.014571948998179e-06, + "loss": 0.0, + "step": 7830 + }, + { + "epoch": 8.562841530054644, + "grad_norm": 8.535890628991183e-06, + "learning_rate": 7.98421372191864e-06, + "loss": 0.0, + "step": 7835 + }, + { + "epoch": 8.568306010928962, + "grad_norm": 9.065177437150851e-06, + "learning_rate": 7.953855494839101e-06, + "loss": 0.0, + "step": 7840 + }, + { + "epoch": 8.573770491803279, + "grad_norm": 9.072325156012084e-06, + "learning_rate": 7.923497267759564e-06, + "loss": 0.0, + "step": 7845 + }, + { + "epoch": 8.579234972677595, + "grad_norm": 3.252259557484649e-05, + "learning_rate": 7.893139040680023e-06, + "loss": 0.0, + "step": 7850 + }, + { + "epoch": 8.584699453551913, + "grad_norm": 4.2116338590858504e-05, + "learning_rate": 7.862780813600486e-06, + "loss": 0.0, + "step": 7855 + }, + { + "epoch": 8.59016393442623, + "grad_norm": 9.784484973351937e-06, + "learning_rate": 7.832422586520947e-06, + "loss": 0.0, + "step": 7860 + }, + { + "epoch": 8.595628415300546, + "grad_norm": 1.4877758985676337e-05, + "learning_rate": 7.802064359441409e-06, + "loss": 0.0, + "step": 7865 + }, + { + "epoch": 8.601092896174864, + "grad_norm": 2.199526716140099e-05, + "learning_rate": 7.771706132361871e-06, + "loss": 0.0, + "step": 7870 + }, + { + "epoch": 8.60655737704918, + "grad_norm": 1.5495361367356963e-05, + "learning_rate": 7.74134790528233e-06, + "loss": 0.0, + "step": 7875 + }, + { + "epoch": 8.612021857923498, + "grad_norm": 3.3597727451706305e-05, + "learning_rate": 7.710989678202794e-06, + "loss": 0.0, + "step": 7880 + }, + { + "epoch": 8.617486338797814, + "grad_norm": 0.00010077483602799475, + "learning_rate": 7.680631451123255e-06, + "loss": 0.0, + "step": 7885 + }, + { + "epoch": 8.62295081967213, + "grad_norm": 1.6386205970775336e-05, + "learning_rate": 7.650273224043716e-06, + "loss": 0.0, + "step": 7890 + }, + { + "epoch": 8.628415300546449, + "grad_norm": 0.0001092545353458263, + "learning_rate": 7.619914996964178e-06, + "loss": 0.0, + "step": 7895 + }, + { + "epoch": 8.633879781420765, + "grad_norm": 1.4560269846697338e-05, + "learning_rate": 7.589556769884638e-06, + "loss": 0.0, + "step": 7900 + }, + { + "epoch": 8.639344262295083, + "grad_norm": 1.3944583770353347e-05, + "learning_rate": 7.5591985428051e-06, + "loss": 0.0, + "step": 7905 + }, + { + "epoch": 8.6448087431694, + "grad_norm": 9.102309559239075e-05, + "learning_rate": 7.528840315725562e-06, + "loss": 0.0, + "step": 7910 + }, + { + "epoch": 8.650273224043715, + "grad_norm": 0.0014770968118682504, + "learning_rate": 7.498482088646023e-06, + "loss": 0.0, + "step": 7915 + }, + { + "epoch": 8.655737704918034, + "grad_norm": 1.5495656043640338e-05, + "learning_rate": 7.468123861566485e-06, + "loss": 0.0, + "step": 7920 + }, + { + "epoch": 8.66120218579235, + "grad_norm": 1.086993779608747e-05, + "learning_rate": 7.4377656344869455e-06, + "loss": 0.0, + "step": 7925 + }, + { + "epoch": 8.666666666666666, + "grad_norm": 2.422315264993813e-05, + "learning_rate": 7.4074074074074075e-06, + "loss": 0.0, + "step": 7930 + }, + { + "epoch": 8.672131147540984, + "grad_norm": 1.2280898772587534e-05, + "learning_rate": 7.3770491803278695e-06, + "loss": 0.0, + "step": 7935 + }, + { + "epoch": 8.6775956284153, + "grad_norm": 7.415174422931159e-06, + "learning_rate": 7.346690953248331e-06, + "loss": 0.0, + "step": 7940 + }, + { + "epoch": 8.683060109289617, + "grad_norm": 1.318469821853796e-05, + "learning_rate": 7.3163327261687926e-06, + "loss": 0.0, + "step": 7945 + }, + { + "epoch": 8.688524590163935, + "grad_norm": 7.0088044594740495e-06, + "learning_rate": 7.2859744990892545e-06, + "loss": 0.0, + "step": 7950 + }, + { + "epoch": 8.693989071038251, + "grad_norm": 1.3353911526792217e-05, + "learning_rate": 7.255616272009715e-06, + "loss": 0.0, + "step": 7955 + }, + { + "epoch": 8.699453551912569, + "grad_norm": 5.0659851694945246e-05, + "learning_rate": 7.225258044930176e-06, + "loss": 0.0, + "step": 7960 + }, + { + "epoch": 8.704918032786885, + "grad_norm": 0.00012321716349106282, + "learning_rate": 7.194899817850638e-06, + "loss": 0.0, + "step": 7965 + }, + { + "epoch": 8.710382513661202, + "grad_norm": 7.799344530212693e-06, + "learning_rate": 7.1645415907711e-06, + "loss": 0.0, + "step": 7970 + }, + { + "epoch": 8.71584699453552, + "grad_norm": 1.204593718284741e-05, + "learning_rate": 7.134183363691561e-06, + "loss": 0.0, + "step": 7975 + }, + { + "epoch": 8.721311475409836, + "grad_norm": 7.936175279610325e-06, + "learning_rate": 7.103825136612022e-06, + "loss": 0.0, + "step": 7980 + }, + { + "epoch": 8.726775956284152, + "grad_norm": 1.0668262802937534e-05, + "learning_rate": 7.073466909532483e-06, + "loss": 0.0, + "step": 7985 + }, + { + "epoch": 8.73224043715847, + "grad_norm": 1.0944058885797858e-05, + "learning_rate": 7.043108682452945e-06, + "loss": 0.0, + "step": 7990 + }, + { + "epoch": 8.737704918032787, + "grad_norm": 3.984873183071613e-05, + "learning_rate": 7.012750455373407e-06, + "loss": 0.0, + "step": 7995 + }, + { + "epoch": 8.743169398907105, + "grad_norm": 2.668611705303192e-05, + "learning_rate": 6.982392228293868e-06, + "loss": 0.0, + "step": 8000 + }, + { + "epoch": 8.748633879781421, + "grad_norm": 3.1083716748980805e-05, + "learning_rate": 6.952034001214329e-06, + "loss": 0.0, + "step": 8005 + }, + { + "epoch": 8.754098360655737, + "grad_norm": 5.123380105942488e-05, + "learning_rate": 6.921675774134791e-06, + "loss": 0.0, + "step": 8010 + }, + { + "epoch": 8.759562841530055, + "grad_norm": 3.057342837564647e-05, + "learning_rate": 6.891317547055253e-06, + "loss": 0.0, + "step": 8015 + }, + { + "epoch": 8.765027322404372, + "grad_norm": 1.0824885066540446e-05, + "learning_rate": 6.860959319975714e-06, + "loss": 0.0, + "step": 8020 + }, + { + "epoch": 8.770491803278688, + "grad_norm": 1.0632065823301673e-05, + "learning_rate": 6.830601092896176e-06, + "loss": 0.0, + "step": 8025 + }, + { + "epoch": 8.775956284153006, + "grad_norm": 0.00019011649419553578, + "learning_rate": 6.800242865816636e-06, + "loss": 0.0, + "step": 8030 + }, + { + "epoch": 8.781420765027322, + "grad_norm": 2.6195617465418763e-05, + "learning_rate": 6.769884638737098e-06, + "loss": 0.0, + "step": 8035 + }, + { + "epoch": 8.78688524590164, + "grad_norm": 1.6233725546044298e-05, + "learning_rate": 6.73952641165756e-06, + "loss": 0.0, + "step": 8040 + }, + { + "epoch": 8.792349726775956, + "grad_norm": 1.057436293194769e-05, + "learning_rate": 6.709168184578021e-06, + "loss": 0.0, + "step": 8045 + }, + { + "epoch": 8.797814207650273, + "grad_norm": 4.19470998167526e-05, + "learning_rate": 6.678809957498483e-06, + "loss": 0.0, + "step": 8050 + }, + { + "epoch": 8.80327868852459, + "grad_norm": 1.0058052794192918e-05, + "learning_rate": 6.648451730418943e-06, + "loss": 0.0, + "step": 8055 + }, + { + "epoch": 8.808743169398907, + "grad_norm": 7.835977885406464e-05, + "learning_rate": 6.618093503339405e-06, + "loss": 0.0, + "step": 8060 + }, + { + "epoch": 8.814207650273223, + "grad_norm": 0.00041921433876268566, + "learning_rate": 6.5877352762598664e-06, + "loss": 0.0, + "step": 8065 + }, + { + "epoch": 8.819672131147541, + "grad_norm": 8.555156455258839e-06, + "learning_rate": 6.557377049180328e-06, + "loss": 0.0, + "step": 8070 + }, + { + "epoch": 8.825136612021858, + "grad_norm": 0.00044895359314978123, + "learning_rate": 6.52701882210079e-06, + "loss": 0.0, + "step": 8075 + }, + { + "epoch": 8.830601092896174, + "grad_norm": 2.7534799301065505e-05, + "learning_rate": 6.496660595021251e-06, + "loss": 0.0, + "step": 8080 + }, + { + "epoch": 8.836065573770492, + "grad_norm": 2.2113057639217004e-05, + "learning_rate": 6.466302367941713e-06, + "loss": 0.0, + "step": 8085 + }, + { + "epoch": 8.841530054644808, + "grad_norm": 7.0613623393001035e-06, + "learning_rate": 6.435944140862174e-06, + "loss": 0.0, + "step": 8090 + }, + { + "epoch": 8.846994535519126, + "grad_norm": 4.0296203223988414e-05, + "learning_rate": 6.405585913782636e-06, + "loss": 0.0, + "step": 8095 + }, + { + "epoch": 8.852459016393443, + "grad_norm": 3.491979805403389e-05, + "learning_rate": 6.375227686703098e-06, + "loss": 0.0, + "step": 8100 + }, + { + "epoch": 8.857923497267759, + "grad_norm": 1.561005956318695e-05, + "learning_rate": 6.344869459623558e-06, + "loss": 0.0, + "step": 8105 + }, + { + "epoch": 8.863387978142077, + "grad_norm": 2.3691472961218096e-05, + "learning_rate": 6.314511232544019e-06, + "loss": 0.0, + "step": 8110 + }, + { + "epoch": 8.868852459016393, + "grad_norm": 1.4078505955694709e-05, + "learning_rate": 6.284153005464481e-06, + "loss": 0.0, + "step": 8115 + }, + { + "epoch": 8.87431693989071, + "grad_norm": 8.696079748915508e-05, + "learning_rate": 6.253794778384943e-06, + "loss": 0.0, + "step": 8120 + }, + { + "epoch": 8.879781420765028, + "grad_norm": 1.038772188621806e-05, + "learning_rate": 6.223436551305404e-06, + "loss": 0.0, + "step": 8125 + }, + { + "epoch": 8.885245901639344, + "grad_norm": 1.992170473386068e-05, + "learning_rate": 6.193078324225865e-06, + "loss": 0.0, + "step": 8130 + }, + { + "epoch": 8.890710382513662, + "grad_norm": 2.0900015442748554e-05, + "learning_rate": 6.1627200971463265e-06, + "loss": 0.0, + "step": 8135 + }, + { + "epoch": 8.896174863387978, + "grad_norm": 1.2317065738898236e-05, + "learning_rate": 6.1323618700667884e-06, + "loss": 0.0, + "step": 8140 + }, + { + "epoch": 8.901639344262295, + "grad_norm": 4.956311022397131e-05, + "learning_rate": 6.10200364298725e-06, + "loss": 0.0, + "step": 8145 + }, + { + "epoch": 8.907103825136613, + "grad_norm": 8.253177838923875e-06, + "learning_rate": 6.0716454159077115e-06, + "loss": 0.0, + "step": 8150 + }, + { + "epoch": 8.912568306010929, + "grad_norm": 6.0874495829921216e-05, + "learning_rate": 6.041287188828173e-06, + "loss": 0.0, + "step": 8155 + }, + { + "epoch": 8.918032786885245, + "grad_norm": 1.0368218681833241e-05, + "learning_rate": 6.010928961748634e-06, + "loss": 0.0, + "step": 8160 + }, + { + "epoch": 8.923497267759563, + "grad_norm": 1.1900097888428718e-05, + "learning_rate": 5.980570734669096e-06, + "loss": 0.0, + "step": 8165 + }, + { + "epoch": 8.92896174863388, + "grad_norm": 5.567036350839771e-05, + "learning_rate": 5.950212507589558e-06, + "loss": 0.0, + "step": 8170 + }, + { + "epoch": 8.934426229508198, + "grad_norm": 1.0399870916444343e-05, + "learning_rate": 5.919854280510018e-06, + "loss": 0.0, + "step": 8175 + }, + { + "epoch": 8.939890710382514, + "grad_norm": 1.9010061805602163e-05, + "learning_rate": 5.88949605343048e-06, + "loss": 0.0, + "step": 8180 + }, + { + "epoch": 8.94535519125683, + "grad_norm": 9.974577551474795e-06, + "learning_rate": 5.859137826350941e-06, + "loss": 0.0, + "step": 8185 + }, + { + "epoch": 8.950819672131148, + "grad_norm": 2.6583495127852075e-05, + "learning_rate": 5.828779599271403e-06, + "loss": 0.0, + "step": 8190 + }, + { + "epoch": 8.956284153005464, + "grad_norm": 0.001321590505540371, + "learning_rate": 5.798421372191864e-06, + "loss": 0.0, + "step": 8195 + }, + { + "epoch": 8.96174863387978, + "grad_norm": 1.875944872153923e-05, + "learning_rate": 5.768063145112325e-06, + "loss": 0.0, + "step": 8200 + }, + { + "epoch": 8.967213114754099, + "grad_norm": 8.05357103672577e-06, + "learning_rate": 5.737704918032787e-06, + "loss": 0.0, + "step": 8205 + }, + { + "epoch": 8.972677595628415, + "grad_norm": 0.0005711750127375126, + "learning_rate": 5.7073466909532485e-06, + "loss": 0.0, + "step": 8210 + }, + { + "epoch": 8.978142076502731, + "grad_norm": 2.2052852727938443e-05, + "learning_rate": 5.67698846387371e-06, + "loss": 0.0, + "step": 8215 + }, + { + "epoch": 8.98360655737705, + "grad_norm": 4.620003528543748e-05, + "learning_rate": 5.646630236794172e-06, + "loss": 0.0, + "step": 8220 + }, + { + "epoch": 8.989071038251366, + "grad_norm": 1.6385482012992725e-05, + "learning_rate": 5.616272009714633e-06, + "loss": 0.0, + "step": 8225 + }, + { + "epoch": 8.994535519125684, + "grad_norm": 9.964457603928167e-06, + "learning_rate": 5.585913782635095e-06, + "loss": 0.0, + "step": 8230 + }, + { + "epoch": 9.0, + "grad_norm": 7.681346687604673e-06, + "learning_rate": 5.555555555555556e-06, + "loss": 0.0, + "step": 8235 + }, + { + "epoch": 9.0, + "eval_loss": 1.2326364640102838e-06, + "eval_runtime": 661.3599, + "eval_samples_per_second": 11.062, + "eval_steps_per_second": 1.384, + "step": 8235 + }, + { + "epoch": 9.005464480874316, + "grad_norm": 1.251523553946754e-05, + "learning_rate": 5.525197328476017e-06, + "loss": 0.0, + "step": 8240 + }, + { + "epoch": 9.010928961748634, + "grad_norm": 1.9697485186043195e-05, + "learning_rate": 5.494839101396479e-06, + "loss": 0.0, + "step": 8245 + }, + { + "epoch": 9.01639344262295, + "grad_norm": 8.097184036159888e-05, + "learning_rate": 5.46448087431694e-06, + "loss": 0.0, + "step": 8250 + }, + { + "epoch": 9.021857923497267, + "grad_norm": 8.922267625166569e-06, + "learning_rate": 5.434122647237402e-06, + "loss": 0.0, + "step": 8255 + }, + { + "epoch": 9.027322404371585, + "grad_norm": 8.49717889650492e-06, + "learning_rate": 5.403764420157862e-06, + "loss": 0.0, + "step": 8260 + }, + { + "epoch": 9.032786885245901, + "grad_norm": 9.96219114313135e-06, + "learning_rate": 5.373406193078324e-06, + "loss": 0.0, + "step": 8265 + }, + { + "epoch": 9.03825136612022, + "grad_norm": 7.731101504759863e-06, + "learning_rate": 5.343047965998786e-06, + "loss": 0.0, + "step": 8270 + }, + { + "epoch": 9.043715846994536, + "grad_norm": 2.623209365992807e-05, + "learning_rate": 5.312689738919247e-06, + "loss": 0.0, + "step": 8275 + }, + { + "epoch": 9.049180327868852, + "grad_norm": 3.5027180274482816e-05, + "learning_rate": 5.2823315118397085e-06, + "loss": 0.0, + "step": 8280 + }, + { + "epoch": 9.05464480874317, + "grad_norm": 5.4955471568973735e-05, + "learning_rate": 5.25197328476017e-06, + "loss": 0.0, + "step": 8285 + }, + { + "epoch": 9.060109289617486, + "grad_norm": 5.6031560234259814e-05, + "learning_rate": 5.221615057680632e-06, + "loss": 0.0, + "step": 8290 + }, + { + "epoch": 9.065573770491802, + "grad_norm": 2.9362845452851616e-05, + "learning_rate": 5.191256830601094e-06, + "loss": 0.0, + "step": 8295 + }, + { + "epoch": 9.07103825136612, + "grad_norm": 1.5086310668266378e-05, + "learning_rate": 5.160898603521555e-06, + "loss": 0.0, + "step": 8300 + }, + { + "epoch": 9.076502732240437, + "grad_norm": 1.0832651241798885e-05, + "learning_rate": 5.130540376442016e-06, + "loss": 0.0, + "step": 8305 + }, + { + "epoch": 9.081967213114755, + "grad_norm": 8.822114068607334e-06, + "learning_rate": 5.100182149362478e-06, + "loss": 0.0, + "step": 8310 + }, + { + "epoch": 9.087431693989071, + "grad_norm": 1.0006731827161275e-05, + "learning_rate": 5.069823922282939e-06, + "loss": 0.0, + "step": 8315 + }, + { + "epoch": 9.092896174863387, + "grad_norm": 8.003929906408302e-06, + "learning_rate": 5.039465695203401e-06, + "loss": 0.0, + "step": 8320 + }, + { + "epoch": 9.098360655737705, + "grad_norm": 9.480178050580435e-06, + "learning_rate": 5.009107468123861e-06, + "loss": 0.0, + "step": 8325 + }, + { + "epoch": 9.103825136612022, + "grad_norm": 9.101410796574783e-06, + "learning_rate": 4.978749241044323e-06, + "loss": 0.0, + "step": 8330 + }, + { + "epoch": 9.109289617486338, + "grad_norm": 1.1568869922484737e-05, + "learning_rate": 4.948391013964785e-06, + "loss": 0.0, + "step": 8335 + }, + { + "epoch": 9.114754098360656, + "grad_norm": 9.305016646976583e-06, + "learning_rate": 4.918032786885246e-06, + "loss": 0.0, + "step": 8340 + }, + { + "epoch": 9.120218579234972, + "grad_norm": 1.3673022294824477e-05, + "learning_rate": 4.8876745598057074e-06, + "loss": 0.0, + "step": 8345 + }, + { + "epoch": 9.12568306010929, + "grad_norm": 0.0009493128163740039, + "learning_rate": 4.8573163327261686e-06, + "loss": 0.0, + "step": 8350 + }, + { + "epoch": 9.131147540983607, + "grad_norm": 1.3566643247031607e-05, + "learning_rate": 4.8269581056466305e-06, + "loss": 0.0, + "step": 8355 + }, + { + "epoch": 9.136612021857923, + "grad_norm": 9.785872862266842e-06, + "learning_rate": 4.7965998785670925e-06, + "loss": 0.0, + "step": 8360 + }, + { + "epoch": 9.142076502732241, + "grad_norm": 3.198983904439956e-05, + "learning_rate": 4.766241651487554e-06, + "loss": 0.0, + "step": 8365 + }, + { + "epoch": 9.147540983606557, + "grad_norm": 1.2671367585426196e-05, + "learning_rate": 4.735883424408015e-06, + "loss": 0.0, + "step": 8370 + }, + { + "epoch": 9.153005464480874, + "grad_norm": 1.4876402019581292e-05, + "learning_rate": 4.705525197328476e-06, + "loss": 0.0, + "step": 8375 + }, + { + "epoch": 9.158469945355192, + "grad_norm": 1.182084724860033e-05, + "learning_rate": 4.675166970248938e-06, + "loss": 0.0, + "step": 8380 + }, + { + "epoch": 9.163934426229508, + "grad_norm": 2.6076608264702372e-05, + "learning_rate": 4.6448087431694e-06, + "loss": 0.0, + "step": 8385 + }, + { + "epoch": 9.169398907103826, + "grad_norm": 1.273778070753906e-05, + "learning_rate": 4.61445051608986e-06, + "loss": 0.0, + "step": 8390 + }, + { + "epoch": 9.174863387978142, + "grad_norm": 0.0007501414511352777, + "learning_rate": 4.584092289010322e-06, + "loss": 0.0, + "step": 8395 + }, + { + "epoch": 9.180327868852459, + "grad_norm": 7.561423444713e-06, + "learning_rate": 4.553734061930783e-06, + "loss": 0.0, + "step": 8400 + }, + { + "epoch": 9.185792349726777, + "grad_norm": 7.892997018643655e-06, + "learning_rate": 4.523375834851245e-06, + "loss": 0.0, + "step": 8405 + }, + { + "epoch": 9.191256830601093, + "grad_norm": 1.2118057384213898e-05, + "learning_rate": 4.493017607771706e-06, + "loss": 0.0, + "step": 8410 + }, + { + "epoch": 9.19672131147541, + "grad_norm": 8.411708222411107e-06, + "learning_rate": 4.4626593806921675e-06, + "loss": 0.0, + "step": 8415 + }, + { + "epoch": 9.202185792349727, + "grad_norm": 8.92472053237725e-06, + "learning_rate": 4.4323011536126294e-06, + "loss": 0.0, + "step": 8420 + }, + { + "epoch": 9.207650273224044, + "grad_norm": 8.844925105222501e-06, + "learning_rate": 4.401942926533091e-06, + "loss": 0.0, + "step": 8425 + }, + { + "epoch": 9.21311475409836, + "grad_norm": 9.090890671359375e-06, + "learning_rate": 4.371584699453552e-06, + "loss": 0.0, + "step": 8430 + }, + { + "epoch": 9.218579234972678, + "grad_norm": 0.00030929245986044407, + "learning_rate": 4.341226472374014e-06, + "loss": 0.0, + "step": 8435 + }, + { + "epoch": 9.224043715846994, + "grad_norm": 9.536963261780329e-06, + "learning_rate": 4.310868245294475e-06, + "loss": 0.0, + "step": 8440 + }, + { + "epoch": 9.229508196721312, + "grad_norm": 1.0459447366883978e-05, + "learning_rate": 4.280510018214937e-06, + "loss": 0.0, + "step": 8445 + }, + { + "epoch": 9.234972677595628, + "grad_norm": 1.430536576663144e-05, + "learning_rate": 4.250151791135398e-06, + "loss": 0.0, + "step": 8450 + }, + { + "epoch": 9.240437158469945, + "grad_norm": 8.668756890983786e-06, + "learning_rate": 4.219793564055859e-06, + "loss": 0.0, + "step": 8455 + }, + { + "epoch": 9.245901639344263, + "grad_norm": 4.2804833356058225e-05, + "learning_rate": 4.189435336976321e-06, + "loss": 0.0, + "step": 8460 + }, + { + "epoch": 9.251366120218579, + "grad_norm": 1.0608948286972009e-05, + "learning_rate": 4.159077109896782e-06, + "loss": 0.0, + "step": 8465 + }, + { + "epoch": 9.256830601092895, + "grad_norm": 0.0008651612442918122, + "learning_rate": 4.128718882817244e-06, + "loss": 0.0, + "step": 8470 + }, + { + "epoch": 9.262295081967213, + "grad_norm": 1.1031417670892552e-05, + "learning_rate": 4.098360655737704e-06, + "loss": 0.0, + "step": 8475 + }, + { + "epoch": 9.26775956284153, + "grad_norm": 1.4733498574059922e-05, + "learning_rate": 4.068002428658166e-06, + "loss": 0.0, + "step": 8480 + }, + { + "epoch": 9.273224043715848, + "grad_norm": 1.1113019354525022e-05, + "learning_rate": 4.037644201578628e-06, + "loss": 0.0, + "step": 8485 + }, + { + "epoch": 9.278688524590164, + "grad_norm": 1.671107384026982e-05, + "learning_rate": 4.0072859744990895e-06, + "loss": 0.0, + "step": 8490 + }, + { + "epoch": 9.28415300546448, + "grad_norm": 1.4919727618689649e-05, + "learning_rate": 3.976927747419551e-06, + "loss": 0.0, + "step": 8495 + }, + { + "epoch": 9.289617486338798, + "grad_norm": 1.0741515325207729e-05, + "learning_rate": 3.946569520340012e-06, + "loss": 0.0, + "step": 8500 + }, + { + "epoch": 9.295081967213115, + "grad_norm": 1.4579722119378857e-05, + "learning_rate": 3.916211293260474e-06, + "loss": 0.0, + "step": 8505 + }, + { + "epoch": 9.300546448087431, + "grad_norm": 0.001307551865465939, + "learning_rate": 3.885853066180936e-06, + "loss": 0.0, + "step": 8510 + }, + { + "epoch": 9.306010928961749, + "grad_norm": 1.936557237058878e-05, + "learning_rate": 3.855494839101397e-06, + "loss": 0.0, + "step": 8515 + }, + { + "epoch": 9.311475409836065, + "grad_norm": 2.149451756849885e-05, + "learning_rate": 3.825136612021858e-06, + "loss": 0.0, + "step": 8520 + }, + { + "epoch": 9.316939890710383, + "grad_norm": 8.021363464649767e-05, + "learning_rate": 3.794778384942319e-06, + "loss": 0.0, + "step": 8525 + }, + { + "epoch": 9.3224043715847, + "grad_norm": 3.3864893339341506e-05, + "learning_rate": 3.764420157862781e-06, + "loss": 0.0, + "step": 8530 + }, + { + "epoch": 9.327868852459016, + "grad_norm": 3.2648382330080494e-05, + "learning_rate": 3.7340619307832426e-06, + "loss": 0.0, + "step": 8535 + }, + { + "epoch": 9.333333333333334, + "grad_norm": 1.936520311573986e-05, + "learning_rate": 3.7037037037037037e-06, + "loss": 0.0, + "step": 8540 + }, + { + "epoch": 9.33879781420765, + "grad_norm": 7.971724699018523e-06, + "learning_rate": 3.6733454766241653e-06, + "loss": 0.0, + "step": 8545 + }, + { + "epoch": 9.344262295081966, + "grad_norm": 1.110977063945029e-05, + "learning_rate": 3.6429872495446273e-06, + "loss": 0.0, + "step": 8550 + }, + { + "epoch": 9.349726775956285, + "grad_norm": 1.2776695257343818e-05, + "learning_rate": 3.612629022465088e-06, + "loss": 0.0, + "step": 8555 + }, + { + "epoch": 9.3551912568306, + "grad_norm": 4.267817348591052e-05, + "learning_rate": 3.58227079538555e-06, + "loss": 0.0, + "step": 8560 + }, + { + "epoch": 9.360655737704919, + "grad_norm": 9.000210411613807e-06, + "learning_rate": 3.551912568306011e-06, + "loss": 0.0, + "step": 8565 + }, + { + "epoch": 9.366120218579235, + "grad_norm": 1.3303446394274943e-05, + "learning_rate": 3.5215543412264726e-06, + "loss": 0.0, + "step": 8570 + }, + { + "epoch": 9.371584699453551, + "grad_norm": 5.926217636442743e-05, + "learning_rate": 3.491196114146934e-06, + "loss": 0.0, + "step": 8575 + }, + { + "epoch": 9.37704918032787, + "grad_norm": 9.163100003206637e-06, + "learning_rate": 3.4608378870673953e-06, + "loss": 0.0, + "step": 8580 + }, + { + "epoch": 9.382513661202186, + "grad_norm": 8.968859219748992e-06, + "learning_rate": 3.430479659987857e-06, + "loss": 0.0, + "step": 8585 + }, + { + "epoch": 9.387978142076502, + "grad_norm": 1.1057231859012973e-05, + "learning_rate": 3.400121432908318e-06, + "loss": 0.0, + "step": 8590 + }, + { + "epoch": 9.39344262295082, + "grad_norm": 9.770934411790222e-06, + "learning_rate": 3.36976320582878e-06, + "loss": 0.0, + "step": 8595 + }, + { + "epoch": 9.398907103825136, + "grad_norm": 0.0007320807198993862, + "learning_rate": 3.3394049787492415e-06, + "loss": 0.0, + "step": 8600 + }, + { + "epoch": 9.404371584699454, + "grad_norm": 0.0010416394798085093, + "learning_rate": 3.3090467516697027e-06, + "loss": 0.0, + "step": 8605 + }, + { + "epoch": 9.40983606557377, + "grad_norm": 1.6940070054261014e-05, + "learning_rate": 3.278688524590164e-06, + "loss": 0.0, + "step": 8610 + }, + { + "epoch": 9.415300546448087, + "grad_norm": 1.8109252778231166e-05, + "learning_rate": 3.2483302975106253e-06, + "loss": 0.0, + "step": 8615 + }, + { + "epoch": 9.420765027322405, + "grad_norm": 1.22038818517467e-05, + "learning_rate": 3.217972070431087e-06, + "loss": 0.0, + "step": 8620 + }, + { + "epoch": 9.426229508196721, + "grad_norm": 9.562543709762394e-06, + "learning_rate": 3.187613843351549e-06, + "loss": 0.0, + "step": 8625 + }, + { + "epoch": 9.431693989071038, + "grad_norm": 1.2685290130320936e-05, + "learning_rate": 3.1572556162720096e-06, + "loss": 0.0, + "step": 8630 + }, + { + "epoch": 9.437158469945356, + "grad_norm": 9.530190254736226e-06, + "learning_rate": 3.1268973891924715e-06, + "loss": 0.0, + "step": 8635 + }, + { + "epoch": 9.442622950819672, + "grad_norm": 3.304435813333839e-05, + "learning_rate": 3.0965391621129327e-06, + "loss": 0.0, + "step": 8640 + }, + { + "epoch": 9.448087431693988, + "grad_norm": 1.0142703104065731e-05, + "learning_rate": 3.0661809350333942e-06, + "loss": 0.0, + "step": 8645 + }, + { + "epoch": 9.453551912568306, + "grad_norm": 3.2079547963803634e-05, + "learning_rate": 3.0358227079538558e-06, + "loss": 0.0, + "step": 8650 + }, + { + "epoch": 9.459016393442623, + "grad_norm": 7.287067546712933e-06, + "learning_rate": 3.005464480874317e-06, + "loss": 0.0, + "step": 8655 + }, + { + "epoch": 9.46448087431694, + "grad_norm": 0.00036683311918750405, + "learning_rate": 2.975106253794779e-06, + "loss": 0.0, + "step": 8660 + }, + { + "epoch": 9.469945355191257, + "grad_norm": 1.1772449397540186e-05, + "learning_rate": 2.94474802671524e-06, + "loss": 0.0, + "step": 8665 + }, + { + "epoch": 9.475409836065573, + "grad_norm": 0.0004519826325122267, + "learning_rate": 2.9143897996357016e-06, + "loss": 0.0, + "step": 8670 + }, + { + "epoch": 9.480874316939891, + "grad_norm": 1.3867033885617275e-05, + "learning_rate": 2.8840315725561627e-06, + "loss": 0.0, + "step": 8675 + }, + { + "epoch": 9.486338797814208, + "grad_norm": 1.1242904292885214e-05, + "learning_rate": 2.8536733454766242e-06, + "loss": 0.0, + "step": 8680 + }, + { + "epoch": 9.491803278688524, + "grad_norm": 1.9089002307737246e-05, + "learning_rate": 2.823315118397086e-06, + "loss": 0.0, + "step": 8685 + }, + { + "epoch": 9.497267759562842, + "grad_norm": 6.980729267525021e-06, + "learning_rate": 2.7929568913175473e-06, + "loss": 0.0, + "step": 8690 + }, + { + "epoch": 9.502732240437158, + "grad_norm": 2.4231898350990377e-05, + "learning_rate": 2.7625986642380085e-06, + "loss": 0.0, + "step": 8695 + }, + { + "epoch": 9.508196721311476, + "grad_norm": 4.8574838729109615e-05, + "learning_rate": 2.73224043715847e-06, + "loss": 0.0, + "step": 8700 + }, + { + "epoch": 9.513661202185792, + "grad_norm": 4.2161831515841186e-05, + "learning_rate": 2.701882210078931e-06, + "loss": 0.0, + "step": 8705 + }, + { + "epoch": 9.519125683060109, + "grad_norm": 9.971013241738547e-06, + "learning_rate": 2.671523982999393e-06, + "loss": 0.0, + "step": 8710 + }, + { + "epoch": 9.524590163934427, + "grad_norm": 2.4025157472351566e-05, + "learning_rate": 2.6411657559198543e-06, + "loss": 0.0, + "step": 8715 + }, + { + "epoch": 9.530054644808743, + "grad_norm": 2.1537007341976278e-05, + "learning_rate": 2.610807528840316e-06, + "loss": 0.0, + "step": 8720 + }, + { + "epoch": 9.53551912568306, + "grad_norm": 0.0013983087847009301, + "learning_rate": 2.5804493017607774e-06, + "loss": 0.0, + "step": 8725 + }, + { + "epoch": 9.540983606557377, + "grad_norm": 8.980196980701294e-06, + "learning_rate": 2.550091074681239e-06, + "loss": 0.0, + "step": 8730 + }, + { + "epoch": 9.546448087431694, + "grad_norm": 8.051301847444847e-06, + "learning_rate": 2.5197328476017005e-06, + "loss": 0.0, + "step": 8735 + }, + { + "epoch": 9.551912568306012, + "grad_norm": 1.0314599421690218e-05, + "learning_rate": 2.4893746205221616e-06, + "loss": 0.0, + "step": 8740 + }, + { + "epoch": 9.557377049180328, + "grad_norm": 9.725902600621339e-06, + "learning_rate": 2.459016393442623e-06, + "loss": 0.0, + "step": 8745 + }, + { + "epoch": 9.562841530054644, + "grad_norm": 2.8515923986560665e-05, + "learning_rate": 2.4286581663630843e-06, + "loss": 0.0, + "step": 8750 + }, + { + "epoch": 9.568306010928962, + "grad_norm": 0.0003265489067416638, + "learning_rate": 2.3982999392835463e-06, + "loss": 0.0, + "step": 8755 + }, + { + "epoch": 9.573770491803279, + "grad_norm": 8.852043720253278e-06, + "learning_rate": 2.3679417122040074e-06, + "loss": 0.0, + "step": 8760 + }, + { + "epoch": 9.579234972677595, + "grad_norm": 7.82777806307422e-06, + "learning_rate": 2.337583485124469e-06, + "loss": 0.0, + "step": 8765 + }, + { + "epoch": 9.584699453551913, + "grad_norm": 1.2773216440109536e-05, + "learning_rate": 2.30722525804493e-06, + "loss": 0.0, + "step": 8770 + }, + { + "epoch": 9.59016393442623, + "grad_norm": 7.689292942814063e-06, + "learning_rate": 2.2768670309653916e-06, + "loss": 0.0, + "step": 8775 + }, + { + "epoch": 9.595628415300546, + "grad_norm": 1.9598150174715556e-05, + "learning_rate": 2.246508803885853e-06, + "loss": 0.0, + "step": 8780 + }, + { + "epoch": 9.601092896174864, + "grad_norm": 1.991266617551446e-05, + "learning_rate": 2.2161505768063147e-06, + "loss": 0.0, + "step": 8785 + }, + { + "epoch": 9.60655737704918, + "grad_norm": 8.718829121789895e-06, + "learning_rate": 2.185792349726776e-06, + "loss": 0.0, + "step": 8790 + }, + { + "epoch": 9.612021857923498, + "grad_norm": 1.2843075637647416e-05, + "learning_rate": 2.1554341226472374e-06, + "loss": 0.0, + "step": 8795 + }, + { + "epoch": 9.617486338797814, + "grad_norm": 8.296131454699207e-06, + "learning_rate": 2.125075895567699e-06, + "loss": 0.0, + "step": 8800 + }, + { + "epoch": 9.62295081967213, + "grad_norm": 9.450495781493373e-06, + "learning_rate": 2.0947176684881605e-06, + "loss": 0.0, + "step": 8805 + }, + { + "epoch": 9.628415300546449, + "grad_norm": 8.672905096318573e-06, + "learning_rate": 2.064359441408622e-06, + "loss": 0.0, + "step": 8810 + }, + { + "epoch": 9.633879781420765, + "grad_norm": 1.4991738680691924e-05, + "learning_rate": 2.034001214329083e-06, + "loss": 0.0, + "step": 8815 + }, + { + "epoch": 9.639344262295083, + "grad_norm": 1.4407401977223344e-05, + "learning_rate": 2.0036429872495447e-06, + "loss": 0.0, + "step": 8820 + }, + { + "epoch": 9.6448087431694, + "grad_norm": 2.0339235561550595e-05, + "learning_rate": 1.973284760170006e-06, + "loss": 0.0, + "step": 8825 + }, + { + "epoch": 9.650273224043715, + "grad_norm": 1.8044936950900592e-05, + "learning_rate": 1.942926533090468e-06, + "loss": 0.0, + "step": 8830 + }, + { + "epoch": 9.655737704918034, + "grad_norm": 8.918646926758811e-06, + "learning_rate": 1.912568306010929e-06, + "loss": 0.0, + "step": 8835 + }, + { + "epoch": 9.66120218579235, + "grad_norm": 1.3933055015513673e-05, + "learning_rate": 1.8822100789313905e-06, + "loss": 0.0, + "step": 8840 + }, + { + "epoch": 9.666666666666666, + "grad_norm": 1.8471546354703605e-05, + "learning_rate": 1.8518518518518519e-06, + "loss": 0.0, + "step": 8845 + }, + { + "epoch": 9.672131147540984, + "grad_norm": 1.658586916164495e-05, + "learning_rate": 1.8214936247723136e-06, + "loss": 0.0, + "step": 8850 + }, + { + "epoch": 9.6775956284153, + "grad_norm": 8.949955372372642e-06, + "learning_rate": 1.791135397692775e-06, + "loss": 0.0, + "step": 8855 + }, + { + "epoch": 9.683060109289617, + "grad_norm": 8.244574019045103e-06, + "learning_rate": 1.7607771706132363e-06, + "loss": 0.0, + "step": 8860 + }, + { + "epoch": 9.688524590163935, + "grad_norm": 1.4154088603390846e-05, + "learning_rate": 1.7304189435336977e-06, + "loss": 0.0, + "step": 8865 + }, + { + "epoch": 9.693989071038251, + "grad_norm": 7.654744877072517e-06, + "learning_rate": 1.700060716454159e-06, + "loss": 0.0, + "step": 8870 + }, + { + "epoch": 9.699453551912569, + "grad_norm": 3.458944411249831e-05, + "learning_rate": 1.6697024893746208e-06, + "loss": 0.0, + "step": 8875 + }, + { + "epoch": 9.704918032786885, + "grad_norm": 3.436910265008919e-05, + "learning_rate": 1.639344262295082e-06, + "loss": 0.0, + "step": 8880 + }, + { + "epoch": 9.710382513661202, + "grad_norm": 1.1345953680574894e-05, + "learning_rate": 1.6089860352155434e-06, + "loss": 0.0, + "step": 8885 + }, + { + "epoch": 9.71584699453552, + "grad_norm": 9.515051715425216e-06, + "learning_rate": 1.5786278081360048e-06, + "loss": 0.0, + "step": 8890 + }, + { + "epoch": 9.721311475409836, + "grad_norm": 1.4450808521360159e-05, + "learning_rate": 1.5482695810564663e-06, + "loss": 0.0, + "step": 8895 + }, + { + "epoch": 9.726775956284152, + "grad_norm": 0.0002483540738467127, + "learning_rate": 1.5179113539769279e-06, + "loss": 0.0, + "step": 8900 + }, + { + "epoch": 9.73224043715847, + "grad_norm": 7.709054443694185e-06, + "learning_rate": 1.4875531268973894e-06, + "loss": 0.0, + "step": 8905 + }, + { + "epoch": 9.737704918032787, + "grad_norm": 0.0009973180713132024, + "learning_rate": 1.4571948998178508e-06, + "loss": 0.0, + "step": 8910 + }, + { + "epoch": 9.743169398907105, + "grad_norm": 6.907128408784047e-05, + "learning_rate": 1.4268366727383121e-06, + "loss": 0.0, + "step": 8915 + }, + { + "epoch": 9.748633879781421, + "grad_norm": 9.242547093890607e-06, + "learning_rate": 1.3964784456587737e-06, + "loss": 0.0, + "step": 8920 + }, + { + "epoch": 9.754098360655737, + "grad_norm": 1.0322552952857222e-05, + "learning_rate": 1.366120218579235e-06, + "loss": 0.0, + "step": 8925 + }, + { + "epoch": 9.759562841530055, + "grad_norm": 7.239196747832466e-06, + "learning_rate": 1.3357619914996966e-06, + "loss": 0.0, + "step": 8930 + }, + { + "epoch": 9.765027322404372, + "grad_norm": 1.5557798178633675e-05, + "learning_rate": 1.305403764420158e-06, + "loss": 0.0, + "step": 8935 + }, + { + "epoch": 9.770491803278688, + "grad_norm": 2.291934288223274e-05, + "learning_rate": 1.2750455373406195e-06, + "loss": 0.0, + "step": 8940 + }, + { + "epoch": 9.775956284153006, + "grad_norm": 7.496446869481588e-06, + "learning_rate": 1.2446873102610808e-06, + "loss": 0.0, + "step": 8945 + }, + { + "epoch": 9.781420765027322, + "grad_norm": 4.650903429137543e-05, + "learning_rate": 1.2143290831815421e-06, + "loss": 0.0, + "step": 8950 + }, + { + "epoch": 9.78688524590164, + "grad_norm": 9.247813977708574e-06, + "learning_rate": 1.1839708561020037e-06, + "loss": 0.0, + "step": 8955 + }, + { + "epoch": 9.792349726775956, + "grad_norm": 0.00309645663946867, + "learning_rate": 1.153612629022465e-06, + "loss": 0.0, + "step": 8960 + }, + { + "epoch": 9.797814207650273, + "grad_norm": 9.541035979054868e-06, + "learning_rate": 1.1232544019429266e-06, + "loss": 0.0, + "step": 8965 + }, + { + "epoch": 9.80327868852459, + "grad_norm": 6.343203949654708e-06, + "learning_rate": 1.092896174863388e-06, + "loss": 0.0, + "step": 8970 + }, + { + "epoch": 9.808743169398907, + "grad_norm": 2.5990406356868334e-05, + "learning_rate": 1.0625379477838495e-06, + "loss": 0.0, + "step": 8975 + }, + { + "epoch": 9.814207650273223, + "grad_norm": 0.0003062534669879824, + "learning_rate": 1.032179720704311e-06, + "loss": 0.0, + "step": 8980 + }, + { + "epoch": 9.819672131147541, + "grad_norm": 1.3689349543710705e-05, + "learning_rate": 1.0018214936247724e-06, + "loss": 0.0, + "step": 8985 + }, + { + "epoch": 9.825136612021858, + "grad_norm": 8.631909622636158e-06, + "learning_rate": 9.71463266545234e-07, + "loss": 0.0, + "step": 8990 + }, + { + "epoch": 9.830601092896174, + "grad_norm": 1.931633050844539e-05, + "learning_rate": 9.411050394656953e-07, + "loss": 0.0, + "step": 8995 + }, + { + "epoch": 9.836065573770492, + "grad_norm": 9.17322267923737e-06, + "learning_rate": 9.107468123861568e-07, + "loss": 0.0, + "step": 9000 + }, + { + "epoch": 9.841530054644808, + "grad_norm": 3.237798227928579e-05, + "learning_rate": 8.803885853066182e-07, + "loss": 0.0, + "step": 9005 + }, + { + "epoch": 9.846994535519126, + "grad_norm": 9.100303941522725e-06, + "learning_rate": 8.500303582270795e-07, + "loss": 0.0, + "step": 9010 + }, + { + "epoch": 9.852459016393443, + "grad_norm": 9.24307187233353e-06, + "learning_rate": 8.19672131147541e-07, + "loss": 0.0, + "step": 9015 + }, + { + "epoch": 9.857923497267759, + "grad_norm": 0.00014155333337839693, + "learning_rate": 7.893139040680024e-07, + "loss": 0.0, + "step": 9020 + }, + { + "epoch": 9.863387978142077, + "grad_norm": 1.3452459825202823e-05, + "learning_rate": 7.589556769884639e-07, + "loss": 0.0, + "step": 9025 + }, + { + "epoch": 9.868852459016393, + "grad_norm": 0.00011776310566347092, + "learning_rate": 7.285974499089254e-07, + "loss": 0.0, + "step": 9030 + }, + { + "epoch": 9.87431693989071, + "grad_norm": 1.1997640285699163e-05, + "learning_rate": 6.982392228293868e-07, + "loss": 0.0, + "step": 9035 + }, + { + "epoch": 9.879781420765028, + "grad_norm": 2.0175972167635337e-05, + "learning_rate": 6.678809957498483e-07, + "loss": 0.0, + "step": 9040 + }, + { + "epoch": 9.885245901639344, + "grad_norm": 0.0003090601530857384, + "learning_rate": 6.375227686703097e-07, + "loss": 0.0, + "step": 9045 + }, + { + "epoch": 9.890710382513662, + "grad_norm": 7.763220310152974e-06, + "learning_rate": 6.071645415907711e-07, + "loss": 0.0, + "step": 9050 + }, + { + "epoch": 9.896174863387978, + "grad_norm": 8.539267582818866e-06, + "learning_rate": 5.768063145112325e-07, + "loss": 0.0, + "step": 9055 + }, + { + "epoch": 9.901639344262295, + "grad_norm": 4.812333645531908e-05, + "learning_rate": 5.46448087431694e-07, + "loss": 0.0, + "step": 9060 + }, + { + "epoch": 9.907103825136613, + "grad_norm": 7.236944838950876e-06, + "learning_rate": 5.160898603521555e-07, + "loss": 0.0, + "step": 9065 + }, + { + "epoch": 9.912568306010929, + "grad_norm": 7.316658411582466e-06, + "learning_rate": 4.85731633272617e-07, + "loss": 0.0, + "step": 9070 + }, + { + "epoch": 9.918032786885245, + "grad_norm": 8.218483344535343e-06, + "learning_rate": 4.553734061930784e-07, + "loss": 0.0, + "step": 9075 + }, + { + "epoch": 9.923497267759563, + "grad_norm": 1.1103961696790066e-05, + "learning_rate": 4.2501517911353975e-07, + "loss": 0.0, + "step": 9080 + }, + { + "epoch": 9.92896174863388, + "grad_norm": 9.260171282221563e-06, + "learning_rate": 3.946569520340012e-07, + "loss": 0.0, + "step": 9085 + }, + { + "epoch": 9.934426229508198, + "grad_norm": 1.0400766768725589e-05, + "learning_rate": 3.642987249544627e-07, + "loss": 0.0, + "step": 9090 + }, + { + "epoch": 9.939890710382514, + "grad_norm": 9.862003025773447e-06, + "learning_rate": 3.3394049787492414e-07, + "loss": 0.0, + "step": 9095 + }, + { + "epoch": 9.94535519125683, + "grad_norm": 7.058705705276225e-06, + "learning_rate": 3.0358227079538554e-07, + "loss": 0.0, + "step": 9100 + }, + { + "epoch": 9.950819672131148, + "grad_norm": 1.4703833585372195e-05, + "learning_rate": 2.73224043715847e-07, + "loss": 0.0, + "step": 9105 + }, + { + "epoch": 9.956284153005464, + "grad_norm": 7.783631190250162e-06, + "learning_rate": 2.428658166363085e-07, + "loss": 0.0, + "step": 9110 + }, + { + "epoch": 9.96174863387978, + "grad_norm": 1.1165495379827917e-05, + "learning_rate": 2.1250758955676987e-07, + "loss": 0.0, + "step": 9115 + }, + { + "epoch": 9.967213114754099, + "grad_norm": 0.0003044393961317837, + "learning_rate": 1.8214936247723135e-07, + "loss": 0.0, + "step": 9120 + }, + { + "epoch": 9.972677595628415, + "grad_norm": 1.3321034202817827e-05, + "learning_rate": 1.5179113539769277e-07, + "loss": 0.0, + "step": 9125 + }, + { + "epoch": 9.978142076502731, + "grad_norm": 2.1599676983896643e-05, + "learning_rate": 1.2143290831815424e-07, + "loss": 0.0, + "step": 9130 + }, + { + "epoch": 9.98360655737705, + "grad_norm": 1.0389105227659456e-05, + "learning_rate": 9.107468123861567e-08, + "loss": 0.0, + "step": 9135 + }, + { + "epoch": 9.989071038251366, + "grad_norm": 6.75558248985908e-06, + "learning_rate": 6.071645415907712e-08, + "loss": 0.0, + "step": 9140 + }, + { + "epoch": 9.994535519125684, + "grad_norm": 0.00030103125027380884, + "learning_rate": 3.035822707953856e-08, + "loss": 0.0, + "step": 9145 + }, + { + "epoch": 10.0, + "grad_norm": 1.5515264749410562e-05, + "learning_rate": 0.0, + "loss": 0.0, + "step": 9150 + }, + { + "epoch": 10.0, + "eval_loss": 9.066787356459827e-07, + "eval_runtime": 670.1167, + "eval_samples_per_second": 10.918, + "eval_steps_per_second": 1.365, + "step": 9150 + } + ], + "logging_steps": 5, + "max_steps": 9150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 2.5121727459832627e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}