{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 4252, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0023518344308560675, "grad_norm": 116.89811794868847, "learning_rate": 2.1126760563380282e-08, "loss": 1.1517, "step": 10 }, { "epoch": 0.004703668861712135, "grad_norm": 277.9710155529788, "learning_rate": 4.460093896713615e-08, "loss": 1.2647, "step": 20 }, { "epoch": 0.0070555032925682035, "grad_norm": 136.25013023553325, "learning_rate": 6.807511737089202e-08, "loss": 1.1512, "step": 30 }, { "epoch": 0.00940733772342427, "grad_norm": 113.12348827889794, "learning_rate": 9.154929577464789e-08, "loss": 1.0798, "step": 40 }, { "epoch": 0.011759172154280339, "grad_norm": 116.51120422837609, "learning_rate": 1.1502347417840374e-07, "loss": 1.0682, "step": 50 }, { "epoch": 0.014111006585136407, "grad_norm": 158.25517802003571, "learning_rate": 1.384976525821596e-07, "loss": 1.0344, "step": 60 }, { "epoch": 0.016462841015992474, "grad_norm": 100.5101888275114, "learning_rate": 1.619718309859155e-07, "loss": 0.9328, "step": 70 }, { "epoch": 0.01881467544684854, "grad_norm": 56.20830713816404, "learning_rate": 1.8544600938967138e-07, "loss": 0.8777, "step": 80 }, { "epoch": 0.02116650987770461, "grad_norm": 240.15395280772944, "learning_rate": 2.089201877934272e-07, "loss": 0.9159, "step": 90 }, { "epoch": 0.023518344308560677, "grad_norm": 203.37555869337805, "learning_rate": 2.323943661971831e-07, "loss": 0.8656, "step": 100 }, { "epoch": 0.025870178739416744, "grad_norm": 118.0083364283108, "learning_rate": 2.5586854460093895e-07, "loss": 0.7863, "step": 110 }, { "epoch": 0.028222013170272814, "grad_norm": 79.77067789322896, "learning_rate": 2.7934272300469483e-07, "loss": 0.8134, "step": 120 }, { "epoch": 0.03057384760112888, "grad_norm": 267.7680667397846, "learning_rate": 3.0281690140845066e-07, "loss": 0.7945, "step": 130 }, { "epoch": 0.03292568203198495, "grad_norm": 189.00926278298175, "learning_rate": 3.2629107981220654e-07, "loss": 0.8096, "step": 140 }, { "epoch": 0.03527751646284102, "grad_norm": 57.70972405124874, "learning_rate": 3.497652582159624e-07, "loss": 0.7752, "step": 150 }, { "epoch": 0.03762935089369708, "grad_norm": 69.99759640153404, "learning_rate": 3.732394366197183e-07, "loss": 0.7083, "step": 160 }, { "epoch": 0.03998118532455315, "grad_norm": 8809.659775787271, "learning_rate": 3.967136150234742e-07, "loss": 0.748, "step": 170 }, { "epoch": 0.04233301975540922, "grad_norm": 71.05386775505833, "learning_rate": 4.2018779342723e-07, "loss": 0.7624, "step": 180 }, { "epoch": 0.044684854186265284, "grad_norm": 35.37716956890071, "learning_rate": 4.436619718309859e-07, "loss": 0.751, "step": 190 }, { "epoch": 0.047036688617121354, "grad_norm": 80.85891409276856, "learning_rate": 4.671361502347418e-07, "loss": 0.7739, "step": 200 }, { "epoch": 0.049388523047977424, "grad_norm": 45.92516752004636, "learning_rate": 4.906103286384976e-07, "loss": 0.7545, "step": 210 }, { "epoch": 0.05174035747883349, "grad_norm": 68.60575847289506, "learning_rate": 5.140845070422535e-07, "loss": 0.726, "step": 220 }, { "epoch": 0.05409219190968956, "grad_norm": 442.4046356251326, "learning_rate": 5.375586854460093e-07, "loss": 0.7692, "step": 230 }, { "epoch": 0.05644402634054563, "grad_norm": 148.22276704875333, "learning_rate": 5.610328638497653e-07, "loss": 0.7193, "step": 240 }, { "epoch": 0.05879586077140169, "grad_norm": 121.42536705157094, "learning_rate": 5.845070422535211e-07, "loss": 0.7235, "step": 250 }, { "epoch": 0.06114769520225776, "grad_norm": 57.2265048995909, "learning_rate": 6.079812206572769e-07, "loss": 0.749, "step": 260 }, { "epoch": 0.06349952963311382, "grad_norm": 142.6135442328223, "learning_rate": 6.314553990610329e-07, "loss": 0.677, "step": 270 }, { "epoch": 0.0658513640639699, "grad_norm": 66.8189535712452, "learning_rate": 6.549295774647887e-07, "loss": 0.7162, "step": 280 }, { "epoch": 0.06820319849482596, "grad_norm": 47.59798393227095, "learning_rate": 6.784037558685446e-07, "loss": 0.7246, "step": 290 }, { "epoch": 0.07055503292568203, "grad_norm": 201.28699484373632, "learning_rate": 7.018779342723005e-07, "loss": 0.6994, "step": 300 }, { "epoch": 0.0729068673565381, "grad_norm": 236.38896159577624, "learning_rate": 7.253521126760564e-07, "loss": 0.6451, "step": 310 }, { "epoch": 0.07525870178739416, "grad_norm": 162.15190996547886, "learning_rate": 7.488262910798122e-07, "loss": 0.6597, "step": 320 }, { "epoch": 0.07761053621825023, "grad_norm": 56.18228731138108, "learning_rate": 7.72300469483568e-07, "loss": 0.6914, "step": 330 }, { "epoch": 0.0799623706491063, "grad_norm": 324.865693829362, "learning_rate": 7.95774647887324e-07, "loss": 0.6758, "step": 340 }, { "epoch": 0.08231420507996237, "grad_norm": 137.83272350854858, "learning_rate": 8.192488262910797e-07, "loss": 0.6552, "step": 350 }, { "epoch": 0.08466603951081844, "grad_norm": 51.74661746915652, "learning_rate": 8.427230046948356e-07, "loss": 0.6257, "step": 360 }, { "epoch": 0.08701787394167451, "grad_norm": 94.85986008371302, "learning_rate": 8.661971830985915e-07, "loss": 0.6168, "step": 370 }, { "epoch": 0.08936970837253057, "grad_norm": 83.03153262845805, "learning_rate": 8.896713615023473e-07, "loss": 0.6406, "step": 380 }, { "epoch": 0.09172154280338664, "grad_norm": 295.36784847538064, "learning_rate": 9.131455399061032e-07, "loss": 0.6493, "step": 390 }, { "epoch": 0.09407337723424271, "grad_norm": 87.86096387558246, "learning_rate": 9.366197183098591e-07, "loss": 0.6093, "step": 400 }, { "epoch": 0.09642521166509878, "grad_norm": 55.21521035928988, "learning_rate": 9.60093896713615e-07, "loss": 0.681, "step": 410 }, { "epoch": 0.09877704609595485, "grad_norm": 149.75943970393467, "learning_rate": 9.83568075117371e-07, "loss": 0.6635, "step": 420 }, { "epoch": 0.10112888052681092, "grad_norm": 96.72986417277598, "learning_rate": 9.999984829771844e-07, "loss": 0.6782, "step": 430 }, { "epoch": 0.10348071495766697, "grad_norm": 42.184520996896104, "learning_rate": 9.999715139387692e-07, "loss": 0.6748, "step": 440 }, { "epoch": 0.10583254938852305, "grad_norm": 93.47313883617879, "learning_rate": 9.99910835375207e-07, "loss": 0.6883, "step": 450 }, { "epoch": 0.10818438381937912, "grad_norm": 51.181550134553135, "learning_rate": 9.99816451377622e-07, "loss": 0.6567, "step": 460 }, { "epoch": 0.11053621825023519, "grad_norm": 88.52012869382804, "learning_rate": 9.996883683096559e-07, "loss": 0.6579, "step": 470 }, { "epoch": 0.11288805268109126, "grad_norm": 56.48549654574926, "learning_rate": 9.995265948070397e-07, "loss": 0.6076, "step": 480 }, { "epoch": 0.11523988711194733, "grad_norm": 50.9131497233012, "learning_rate": 9.99331141777011e-07, "loss": 0.5999, "step": 490 }, { "epoch": 0.11759172154280338, "grad_norm": 70.73723550031531, "learning_rate": 9.991020223975778e-07, "loss": 0.63, "step": 500 }, { "epoch": 0.11994355597365945, "grad_norm": 88.64709002603719, "learning_rate": 9.988392521166315e-07, "loss": 0.6039, "step": 510 }, { "epoch": 0.12229539040451552, "grad_norm": 75.39690510928469, "learning_rate": 9.98542848650904e-07, "loss": 0.586, "step": 520 }, { "epoch": 0.12464722483537159, "grad_norm": 113.20042490187971, "learning_rate": 9.98212831984774e-07, "loss": 0.5865, "step": 530 }, { "epoch": 0.12699905926622765, "grad_norm": 94.68667178533609, "learning_rate": 9.978492243689197e-07, "loss": 0.6069, "step": 540 }, { "epoch": 0.12935089369708372, "grad_norm": 34.52540469666706, "learning_rate": 9.974520503188178e-07, "loss": 0.6206, "step": 550 }, { "epoch": 0.1317027281279398, "grad_norm": 117.14521192427702, "learning_rate": 9.970213366130908e-07, "loss": 0.6274, "step": 560 }, { "epoch": 0.13405456255879586, "grad_norm": 53.776543081710706, "learning_rate": 9.965571122917027e-07, "loss": 0.6426, "step": 570 }, { "epoch": 0.13640639698965193, "grad_norm": 124.09156480152814, "learning_rate": 9.960594086539992e-07, "loss": 0.6068, "step": 580 }, { "epoch": 0.138758231420508, "grad_norm": 58.91257394960272, "learning_rate": 9.95528259256599e-07, "loss": 0.6136, "step": 590 }, { "epoch": 0.14111006585136407, "grad_norm": 44.14364925959302, "learning_rate": 9.949636999111302e-07, "loss": 0.6182, "step": 600 }, { "epoch": 0.14346190028222014, "grad_norm": 102.41392308653452, "learning_rate": 9.94365768681816e-07, "loss": 0.5759, "step": 610 }, { "epoch": 0.1458137347130762, "grad_norm": 105.69613721621276, "learning_rate": 9.937345058829093e-07, "loss": 0.5786, "step": 620 }, { "epoch": 0.14816556914393228, "grad_norm": 38.98828708919228, "learning_rate": 9.930699540759728e-07, "loss": 0.6084, "step": 630 }, { "epoch": 0.15051740357478832, "grad_norm": 354.4940196727006, "learning_rate": 9.923721580670113e-07, "loss": 0.5689, "step": 640 }, { "epoch": 0.1528692380056444, "grad_norm": 153.95231269118221, "learning_rate": 9.916411649034491e-07, "loss": 0.5657, "step": 650 }, { "epoch": 0.15522107243650046, "grad_norm": 928.564170107045, "learning_rate": 9.908770238709592e-07, "loss": 0.6084, "step": 660 }, { "epoch": 0.15757290686735653, "grad_norm": 92.12341812343223, "learning_rate": 9.900797864901394e-07, "loss": 0.5847, "step": 670 }, { "epoch": 0.1599247412982126, "grad_norm": 50.21703372463601, "learning_rate": 9.892495065130394e-07, "loss": 0.6206, "step": 680 }, { "epoch": 0.16227657572906867, "grad_norm": 38.305107297666325, "learning_rate": 9.883862399195357e-07, "loss": 0.6206, "step": 690 }, { "epoch": 0.16462841015992474, "grad_norm": 105.72112815211388, "learning_rate": 9.874900449135582e-07, "loss": 0.5821, "step": 700 }, { "epoch": 0.1669802445907808, "grad_norm": 162.13055243234115, "learning_rate": 9.865609819191659e-07, "loss": 0.5742, "step": 710 }, { "epoch": 0.16933207902163688, "grad_norm": 58.70026610582205, "learning_rate": 9.855991135764718e-07, "loss": 0.547, "step": 720 }, { "epoch": 0.17168391345249295, "grad_norm": 100.43914638169913, "learning_rate": 9.846045047374215e-07, "loss": 0.5794, "step": 730 }, { "epoch": 0.17403574788334902, "grad_norm": 139.51085792167504, "learning_rate": 9.83577222461418e-07, "loss": 0.5835, "step": 740 }, { "epoch": 0.1763875823142051, "grad_norm": 91.55902670992894, "learning_rate": 9.825173360108034e-07, "loss": 0.568, "step": 750 }, { "epoch": 0.17873941674506114, "grad_norm": 49.09585389161329, "learning_rate": 9.814249168461868e-07, "loss": 0.5416, "step": 760 }, { "epoch": 0.1810912511759172, "grad_norm": 39.30524914731049, "learning_rate": 9.80300038621627e-07, "loss": 0.5974, "step": 770 }, { "epoch": 0.18344308560677328, "grad_norm": 437.03477978317704, "learning_rate": 9.79142777179666e-07, "loss": 0.614, "step": 780 }, { "epoch": 0.18579492003762935, "grad_norm": 113.50973293517667, "learning_rate": 9.779532105462173e-07, "loss": 0.5798, "step": 790 }, { "epoch": 0.18814675446848542, "grad_norm": 310.99758984616733, "learning_rate": 9.767314189253023e-07, "loss": 0.5609, "step": 800 }, { "epoch": 0.1904985888993415, "grad_norm": 78.52431684901823, "learning_rate": 9.754774846936455e-07, "loss": 0.582, "step": 810 }, { "epoch": 0.19285042333019756, "grad_norm": 34.79880147456029, "learning_rate": 9.74191492395118e-07, "loss": 0.5538, "step": 820 }, { "epoch": 0.19520225776105363, "grad_norm": 48.17636180244228, "learning_rate": 9.728735287350395e-07, "loss": 0.5686, "step": 830 }, { "epoch": 0.1975540921919097, "grad_norm": 222.11427267611163, "learning_rate": 9.715236825743306e-07, "loss": 0.5695, "step": 840 }, { "epoch": 0.19990592662276577, "grad_norm": 163.48188564313824, "learning_rate": 9.701420449235224e-07, "loss": 0.5993, "step": 850 }, { "epoch": 0.20225776105362184, "grad_norm": 180.8729345274767, "learning_rate": 9.687287089366208e-07, "loss": 0.5765, "step": 860 }, { "epoch": 0.20460959548447788, "grad_norm": 64.88971976462376, "learning_rate": 9.672837699048247e-07, "loss": 0.6076, "step": 870 }, { "epoch": 0.20696142991533395, "grad_norm": 45.66335791522097, "learning_rate": 9.65807325250101e-07, "loss": 0.5102, "step": 880 }, { "epoch": 0.20931326434619002, "grad_norm": 69.3940533519506, "learning_rate": 9.642994745186186e-07, "loss": 0.596, "step": 890 }, { "epoch": 0.2116650987770461, "grad_norm": 90.66599474951411, "learning_rate": 9.627603193740329e-07, "loss": 0.5539, "step": 900 }, { "epoch": 0.21401693320790216, "grad_norm": 203.88445976449455, "learning_rate": 9.611899635906345e-07, "loss": 0.5494, "step": 910 }, { "epoch": 0.21636876763875823, "grad_norm": 35.55143547858949, "learning_rate": 9.595885130463512e-07, "loss": 0.5821, "step": 920 }, { "epoch": 0.2187206020696143, "grad_norm": 135.08153342145144, "learning_rate": 9.579560757156092e-07, "loss": 0.5466, "step": 930 }, { "epoch": 0.22107243650047037, "grad_norm": 89.381610871162, "learning_rate": 9.562927616620534e-07, "loss": 0.532, "step": 940 }, { "epoch": 0.22342427093132644, "grad_norm": 178.58317872348456, "learning_rate": 9.545986830311271e-07, "loss": 0.6014, "step": 950 }, { "epoch": 0.2257761053621825, "grad_norm": 84.70137793347699, "learning_rate": 9.528739540425097e-07, "loss": 0.5751, "step": 960 }, { "epoch": 0.22812793979303858, "grad_norm": 26.339901114155346, "learning_rate": 9.511186909824171e-07, "loss": 0.5783, "step": 970 }, { "epoch": 0.23047977422389465, "grad_norm": 67.79506982367464, "learning_rate": 9.493330121957599e-07, "loss": 0.563, "step": 980 }, { "epoch": 0.2328316086547507, "grad_norm": 119.24074854076687, "learning_rate": 9.475170380781654e-07, "loss": 0.5572, "step": 990 }, { "epoch": 0.23518344308560676, "grad_norm": 146.1667351412439, "learning_rate": 9.456708910678595e-07, "loss": 0.5367, "step": 1000 }, { "epoch": 0.23753527751646283, "grad_norm": 88.88690460802079, "learning_rate": 9.437946956374117e-07, "loss": 0.5375, "step": 1010 }, { "epoch": 0.2398871119473189, "grad_norm": 81.31694958580168, "learning_rate": 9.41888578285343e-07, "loss": 0.5458, "step": 1020 }, { "epoch": 0.24223894637817497, "grad_norm": 45.5082728792024, "learning_rate": 9.399526675275968e-07, "loss": 0.5648, "step": 1030 }, { "epoch": 0.24459078080903104, "grad_norm": 54.62994929806069, "learning_rate": 9.379870938888743e-07, "loss": 0.5464, "step": 1040 }, { "epoch": 0.24694261523988711, "grad_norm": 95.25990550468921, "learning_rate": 9.359919898938336e-07, "loss": 0.5276, "step": 1050 }, { "epoch": 0.24929444967074318, "grad_norm": 56.86836827412844, "learning_rate": 9.33967490058155e-07, "loss": 0.5037, "step": 1060 }, { "epoch": 0.2516462841015992, "grad_norm": 263.1519863594519, "learning_rate": 9.319137308794712e-07, "loss": 0.5763, "step": 1070 }, { "epoch": 0.2539981185324553, "grad_norm": 48.975787835710975, "learning_rate": 9.298308508281645e-07, "loss": 0.5204, "step": 1080 }, { "epoch": 0.25634995296331137, "grad_norm": 618.5840505320624, "learning_rate": 9.277189903380308e-07, "loss": 0.5375, "step": 1090 }, { "epoch": 0.25870178739416744, "grad_norm": 113.884112676073, "learning_rate": 9.255782917968107e-07, "loss": 0.568, "step": 1100 }, { "epoch": 0.2610536218250235, "grad_norm": 385.97433675604555, "learning_rate": 9.234088995365898e-07, "loss": 0.5479, "step": 1110 }, { "epoch": 0.2634054562558796, "grad_norm": 235.00439426458524, "learning_rate": 9.212109598240669e-07, "loss": 0.5667, "step": 1120 }, { "epoch": 0.26575729068673565, "grad_norm": 526.3091500577647, "learning_rate": 9.189846208506931e-07, "loss": 0.519, "step": 1130 }, { "epoch": 0.2681091251175917, "grad_norm": 150.0750656097666, "learning_rate": 9.167300327226794e-07, "loss": 0.4965, "step": 1140 }, { "epoch": 0.2704609595484478, "grad_norm": 30.016558195777204, "learning_rate": 9.144473474508765e-07, "loss": 0.4756, "step": 1150 }, { "epoch": 0.27281279397930386, "grad_norm": 60.193732037518764, "learning_rate": 9.12136718940526e-07, "loss": 0.5377, "step": 1160 }, { "epoch": 0.27516462841015993, "grad_norm": 219.87409938307607, "learning_rate": 9.097983029808831e-07, "loss": 0.5231, "step": 1170 }, { "epoch": 0.277516462841016, "grad_norm": 691.5188548277733, "learning_rate": 9.074322572347135e-07, "loss": 0.5553, "step": 1180 }, { "epoch": 0.27986829727187207, "grad_norm": 58.14856175427267, "learning_rate": 9.050387412276628e-07, "loss": 0.5261, "step": 1190 }, { "epoch": 0.28222013170272814, "grad_norm": 72.18580518142099, "learning_rate": 9.026179163375012e-07, "loss": 0.5083, "step": 1200 }, { "epoch": 0.2845719661335842, "grad_norm": 48.319128442643155, "learning_rate": 9.001699457832425e-07, "loss": 0.5661, "step": 1210 }, { "epoch": 0.2869238005644403, "grad_norm": 85.74869777359726, "learning_rate": 8.976949946141399e-07, "loss": 0.5502, "step": 1220 }, { "epoch": 0.28927563499529635, "grad_norm": 27.470838511127315, "learning_rate": 8.951932296985576e-07, "loss": 0.5194, "step": 1230 }, { "epoch": 0.2916274694261524, "grad_norm": 103.72100326338919, "learning_rate": 8.926648197127202e-07, "loss": 0.5538, "step": 1240 }, { "epoch": 0.2939793038570085, "grad_norm": 37.395281219640474, "learning_rate": 8.901099351293397e-07, "loss": 0.5585, "step": 1250 }, { "epoch": 0.29633113828786456, "grad_norm": 76.92500519691636, "learning_rate": 8.875287482061225e-07, "loss": 0.5556, "step": 1260 }, { "epoch": 0.29868297271872063, "grad_norm": 115.83402029314064, "learning_rate": 8.849214329741542e-07, "loss": 0.5347, "step": 1270 }, { "epoch": 0.30103480714957664, "grad_norm": 26.861221309569398, "learning_rate": 8.822881652261671e-07, "loss": 0.5434, "step": 1280 }, { "epoch": 0.3033866415804327, "grad_norm": 47.06480089234017, "learning_rate": 8.796291225046866e-07, "loss": 0.515, "step": 1290 }, { "epoch": 0.3057384760112888, "grad_norm": 242.58390403927206, "learning_rate": 8.76944484090062e-07, "loss": 0.5204, "step": 1300 }, { "epoch": 0.30809031044214485, "grad_norm": 69.9924587472249, "learning_rate": 8.742344309883776e-07, "loss": 0.5266, "step": 1310 }, { "epoch": 0.3104421448730009, "grad_norm": 174.79009098016476, "learning_rate": 8.7149914591925e-07, "loss": 0.5269, "step": 1320 }, { "epoch": 0.312793979303857, "grad_norm": 51.84251807068747, "learning_rate": 8.687388133035074e-07, "loss": 0.513, "step": 1330 }, { "epoch": 0.31514581373471307, "grad_norm": 101.12451895232418, "learning_rate": 8.659536192507564e-07, "loss": 0.5578, "step": 1340 }, { "epoch": 0.31749764816556914, "grad_norm": 91.89776616663202, "learning_rate": 8.631437515468336e-07, "loss": 0.509, "step": 1350 }, { "epoch": 0.3198494825964252, "grad_norm": 363.4004724877564, "learning_rate": 8.603093996411444e-07, "loss": 0.5398, "step": 1360 }, { "epoch": 0.3222013170272813, "grad_norm": 66.77614323450521, "learning_rate": 8.574507546338895e-07, "loss": 0.4921, "step": 1370 }, { "epoch": 0.32455315145813735, "grad_norm": 49.08933240076176, "learning_rate": 8.545680092631815e-07, "loss": 0.5313, "step": 1380 }, { "epoch": 0.3269049858889934, "grad_norm": 49.554772352006694, "learning_rate": 8.516613578920488e-07, "loss": 0.447, "step": 1390 }, { "epoch": 0.3292568203198495, "grad_norm": 43.55060672795549, "learning_rate": 8.487309964953311e-07, "loss": 0.5278, "step": 1400 }, { "epoch": 0.33160865475070556, "grad_norm": 53.0380482362615, "learning_rate": 8.457771226464674e-07, "loss": 0.505, "step": 1410 }, { "epoch": 0.3339604891815616, "grad_norm": 79.64660023563455, "learning_rate": 8.427999355041735e-07, "loss": 0.5226, "step": 1420 }, { "epoch": 0.3363123236124177, "grad_norm": 55.95587949885615, "learning_rate": 8.397996357990153e-07, "loss": 0.5258, "step": 1430 }, { "epoch": 0.33866415804327377, "grad_norm": 411.2146929812851, "learning_rate": 8.367764258198744e-07, "loss": 0.5709, "step": 1440 }, { "epoch": 0.34101599247412984, "grad_norm": 43.734123642110056, "learning_rate": 8.337305094003091e-07, "loss": 0.5129, "step": 1450 }, { "epoch": 0.3433678269049859, "grad_norm": 227.72574989130484, "learning_rate": 8.306620919048115e-07, "loss": 0.5365, "step": 1460 }, { "epoch": 0.345719661335842, "grad_norm": 230.09797130940922, "learning_rate": 8.275713802149622e-07, "loss": 0.4906, "step": 1470 }, { "epoch": 0.34807149576669805, "grad_norm": 54.19875482542778, "learning_rate": 8.244585827154795e-07, "loss": 0.499, "step": 1480 }, { "epoch": 0.3504233301975541, "grad_norm": 55.791984177409674, "learning_rate": 8.213239092801718e-07, "loss": 0.5155, "step": 1490 }, { "epoch": 0.3527751646284102, "grad_norm": 38.61566415419522, "learning_rate": 8.181675712577864e-07, "loss": 0.4924, "step": 1500 }, { "epoch": 0.3551269990592662, "grad_norm": 59.61394018369017, "learning_rate": 8.149897814577589e-07, "loss": 0.4948, "step": 1510 }, { "epoch": 0.35747883349012227, "grad_norm": 52.530770820466806, "learning_rate": 8.117907541358664e-07, "loss": 0.4625, "step": 1520 }, { "epoch": 0.35983066792097834, "grad_norm": 53.03111212968418, "learning_rate": 8.08570704979781e-07, "loss": 0.4853, "step": 1530 }, { "epoch": 0.3621825023518344, "grad_norm": 65.37886086917705, "learning_rate": 8.053298510945279e-07, "loss": 0.499, "step": 1540 }, { "epoch": 0.3645343367826905, "grad_norm": 274.0211083038785, "learning_rate": 8.020684109878465e-07, "loss": 0.5308, "step": 1550 }, { "epoch": 0.36688617121354655, "grad_norm": 207.52089838136985, "learning_rate": 7.987866045554598e-07, "loss": 0.5111, "step": 1560 }, { "epoch": 0.3692380056444026, "grad_norm": 201.91551525915267, "learning_rate": 7.954846530662467e-07, "loss": 0.5095, "step": 1570 }, { "epoch": 0.3715898400752587, "grad_norm": 66.61996761679119, "learning_rate": 7.921627791473242e-07, "loss": 0.4668, "step": 1580 }, { "epoch": 0.37394167450611476, "grad_norm": 29.688431988926887, "learning_rate": 7.888212067690372e-07, "loss": 0.5093, "step": 1590 }, { "epoch": 0.37629350893697083, "grad_norm": 96.58502652217979, "learning_rate": 7.854601612298577e-07, "loss": 0.5492, "step": 1600 }, { "epoch": 0.3786453433678269, "grad_norm": 160.08236167143332, "learning_rate": 7.820798691411945e-07, "loss": 0.5056, "step": 1610 }, { "epoch": 0.380997177798683, "grad_norm": 94.89520708968821, "learning_rate": 7.786805584121143e-07, "loss": 0.5017, "step": 1620 }, { "epoch": 0.38334901222953904, "grad_norm": 111.60483541208626, "learning_rate": 7.75262458233976e-07, "loss": 0.4888, "step": 1630 }, { "epoch": 0.3857008466603951, "grad_norm": 52.130149242500984, "learning_rate": 7.718257990649766e-07, "loss": 0.5116, "step": 1640 }, { "epoch": 0.3880526810912512, "grad_norm": 132.947730554617, "learning_rate": 7.683708126146146e-07, "loss": 0.4888, "step": 1650 }, { "epoch": 0.39040451552210725, "grad_norm": 119.19032436133351, "learning_rate": 7.648977318280667e-07, "loss": 0.5562, "step": 1660 }, { "epoch": 0.3927563499529633, "grad_norm": 79.62177704099146, "learning_rate": 7.614067908704822e-07, "loss": 0.4979, "step": 1670 }, { "epoch": 0.3951081843838194, "grad_norm": 276.43996320587473, "learning_rate": 7.578982251111941e-07, "loss": 0.5253, "step": 1680 }, { "epoch": 0.39746001881467546, "grad_norm": 33.12487289775825, "learning_rate": 7.543722711078515e-07, "loss": 0.4725, "step": 1690 }, { "epoch": 0.39981185324553153, "grad_norm": 196.6122063462283, "learning_rate": 7.508291665904684e-07, "loss": 0.4929, "step": 1700 }, { "epoch": 0.4021636876763876, "grad_norm": 65.99306854525632, "learning_rate": 7.472691504453963e-07, "loss": 0.5353, "step": 1710 }, { "epoch": 0.4045155221072437, "grad_norm": 419.0366078449193, "learning_rate": 7.436924626992177e-07, "loss": 0.5149, "step": 1720 }, { "epoch": 0.40686735653809974, "grad_norm": 63.6596547753076, "learning_rate": 7.400993445025623e-07, "loss": 0.5043, "step": 1730 }, { "epoch": 0.40921919096895576, "grad_norm": 94.23732036703356, "learning_rate": 7.364900381138488e-07, "loss": 0.5123, "step": 1740 }, { "epoch": 0.41157102539981183, "grad_norm": 125.17343539075726, "learning_rate": 7.3286478688295e-07, "loss": 0.5495, "step": 1750 }, { "epoch": 0.4139228598306679, "grad_norm": 490.21785388787504, "learning_rate": 7.292238352347865e-07, "loss": 0.5369, "step": 1760 }, { "epoch": 0.41627469426152397, "grad_norm": 35.884085037680386, "learning_rate": 7.255674286528462e-07, "loss": 0.5539, "step": 1770 }, { "epoch": 0.41862652869238004, "grad_norm": 106.9642458702286, "learning_rate": 7.218958136626336e-07, "loss": 0.441, "step": 1780 }, { "epoch": 0.4209783631232361, "grad_norm": 71.82566114451349, "learning_rate": 7.182092378150479e-07, "loss": 0.5205, "step": 1790 }, { "epoch": 0.4233301975540922, "grad_norm": 779.4365683391414, "learning_rate": 7.145079496696924e-07, "loss": 0.4959, "step": 1800 }, { "epoch": 0.42568203198494825, "grad_norm": 44.667117126603884, "learning_rate": 7.107921987781162e-07, "loss": 0.4753, "step": 1810 }, { "epoch": 0.4280338664158043, "grad_norm": 110.85747495169443, "learning_rate": 7.070622356669887e-07, "loss": 0.4958, "step": 1820 }, { "epoch": 0.4303857008466604, "grad_norm": 113.10527215049324, "learning_rate": 7.03318311821208e-07, "loss": 0.454, "step": 1830 }, { "epoch": 0.43273753527751646, "grad_norm": 17.84232109387917, "learning_rate": 6.995606796669454e-07, "loss": 0.5019, "step": 1840 }, { "epoch": 0.43508936970837253, "grad_norm": 110.74344931358036, "learning_rate": 6.957895925546262e-07, "loss": 0.4719, "step": 1850 }, { "epoch": 0.4374412041392286, "grad_norm": 41.489250407979306, "learning_rate": 6.920053047418475e-07, "loss": 0.5075, "step": 1860 }, { "epoch": 0.43979303857008467, "grad_norm": 30.1685856705556, "learning_rate": 6.88208071376236e-07, "loss": 0.4843, "step": 1870 }, { "epoch": 0.44214487300094074, "grad_norm": 145.26880468994483, "learning_rate": 6.843981484782452e-07, "loss": 0.4794, "step": 1880 }, { "epoch": 0.4444967074317968, "grad_norm": 108.30784248543398, "learning_rate": 6.80575792923893e-07, "loss": 0.4918, "step": 1890 }, { "epoch": 0.4468485418626529, "grad_norm": 272.0716196572391, "learning_rate": 6.767412624274434e-07, "loss": 0.4926, "step": 1900 }, { "epoch": 0.44920037629350895, "grad_norm": 53.98848770173013, "learning_rate": 6.728948155240303e-07, "loss": 0.5, "step": 1910 }, { "epoch": 0.451552210724365, "grad_norm": 90.77607158800988, "learning_rate": 6.690367115522257e-07, "loss": 0.4901, "step": 1920 }, { "epoch": 0.4539040451552211, "grad_norm": 42.55048877269962, "learning_rate": 6.651672106365554e-07, "loss": 0.5189, "step": 1930 }, { "epoch": 0.45625587958607716, "grad_norm": 44.97567866783233, "learning_rate": 6.612865736699598e-07, "loss": 0.5033, "step": 1940 }, { "epoch": 0.45860771401693323, "grad_norm": 67.49282430103631, "learning_rate": 6.573950622962039e-07, "loss": 0.5276, "step": 1950 }, { "epoch": 0.4609595484477893, "grad_norm": 59.17058897377058, "learning_rate": 6.534929388922374e-07, "loss": 0.4735, "step": 1960 }, { "epoch": 0.4633113828786453, "grad_norm": 111.86181053840893, "learning_rate": 6.495804665505029e-07, "loss": 0.4788, "step": 1970 }, { "epoch": 0.4656632173095014, "grad_norm": 189.94498833584603, "learning_rate": 6.456579090611987e-07, "loss": 0.5144, "step": 1980 }, { "epoch": 0.46801505174035746, "grad_norm": 122.35102212459157, "learning_rate": 6.417255308944928e-07, "loss": 0.5283, "step": 1990 }, { "epoch": 0.4703668861712135, "grad_norm": 186.0617213529026, "learning_rate": 6.37783597182692e-07, "loss": 0.4672, "step": 2000 }, { "epoch": 0.4727187206020696, "grad_norm": 115.36410352920348, "learning_rate": 6.338323737023651e-07, "loss": 0.4965, "step": 2010 }, { "epoch": 0.47507055503292567, "grad_norm": 85.9333267678821, "learning_rate": 6.298721268564243e-07, "loss": 0.4895, "step": 2020 }, { "epoch": 0.47742238946378174, "grad_norm": 211.95325107965013, "learning_rate": 6.259031236561632e-07, "loss": 0.5538, "step": 2030 }, { "epoch": 0.4797742238946378, "grad_norm": 91.27430764319159, "learning_rate": 6.219256317032537e-07, "loss": 0.4437, "step": 2040 }, { "epoch": 0.4821260583254939, "grad_norm": 298.2235290567097, "learning_rate": 6.179399191717046e-07, "loss": 0.4904, "step": 2050 }, { "epoch": 0.48447789275634995, "grad_norm": 36.409310791398326, "learning_rate": 6.139462547897793e-07, "loss": 0.5011, "step": 2060 }, { "epoch": 0.486829727187206, "grad_norm": 61.35140792241313, "learning_rate": 6.099449078218781e-07, "loss": 0.4999, "step": 2070 }, { "epoch": 0.4891815616180621, "grad_norm": 192.0635061382374, "learning_rate": 6.059361480503839e-07, "loss": 0.4978, "step": 2080 }, { "epoch": 0.49153339604891816, "grad_norm": 1602.6959579230695, "learning_rate": 6.019202457574717e-07, "loss": 0.5296, "step": 2090 }, { "epoch": 0.49388523047977423, "grad_norm": 46.88396962107588, "learning_rate": 5.97897471706886e-07, "loss": 0.4666, "step": 2100 }, { "epoch": 0.4962370649106303, "grad_norm": 42.5404300683936, "learning_rate": 5.938680971256855e-07, "loss": 0.4917, "step": 2110 }, { "epoch": 0.49858889934148637, "grad_norm": 168.45591180658585, "learning_rate": 5.898323936859554e-07, "loss": 0.4982, "step": 2120 }, { "epoch": 0.5009407337723424, "grad_norm": 98.29895214863133, "learning_rate": 5.857906334864908e-07, "loss": 0.4613, "step": 2130 }, { "epoch": 0.5032925682031985, "grad_norm": 84.14383238818485, "learning_rate": 5.817430890344514e-07, "loss": 0.4573, "step": 2140 }, { "epoch": 0.5056444026340545, "grad_norm": 63.91190561893289, "learning_rate": 5.776900332269874e-07, "loss": 0.4658, "step": 2150 }, { "epoch": 0.5079962370649106, "grad_norm": 105.97500836643457, "learning_rate": 5.73631739332841e-07, "loss": 0.4943, "step": 2160 }, { "epoch": 0.5103480714957667, "grad_norm": 84.24147849783279, "learning_rate": 5.695684809739212e-07, "loss": 0.4429, "step": 2170 }, { "epoch": 0.5126999059266227, "grad_norm": 189.39210679587683, "learning_rate": 5.655005321068556e-07, "loss": 0.4739, "step": 2180 }, { "epoch": 0.5150517403574788, "grad_norm": 154.99555922544172, "learning_rate": 5.614281670045191e-07, "loss": 0.4914, "step": 2190 }, { "epoch": 0.5174035747883349, "grad_norm": 54.60327223607897, "learning_rate": 5.573516602375427e-07, "loss": 0.4801, "step": 2200 }, { "epoch": 0.519755409219191, "grad_norm": 83.19875850848948, "learning_rate": 5.532712866557994e-07, "loss": 0.5323, "step": 2210 }, { "epoch": 0.522107243650047, "grad_norm": 42.60436730500025, "learning_rate": 5.491873213698749e-07, "loss": 0.4573, "step": 2220 }, { "epoch": 0.5244590780809031, "grad_norm": 65.42258392696083, "learning_rate": 5.451000397325176e-07, "loss": 0.4814, "step": 2230 }, { "epoch": 0.5268109125117592, "grad_norm": 115.80636504665334, "learning_rate": 5.410097173200738e-07, "loss": 0.4587, "step": 2240 }, { "epoch": 0.5291627469426152, "grad_norm": 69.60117550551328, "learning_rate": 5.36916629913908e-07, "loss": 0.5132, "step": 2250 }, { "epoch": 0.5315145813734713, "grad_norm": 66.375344126355, "learning_rate": 5.32821053481808e-07, "loss": 0.4671, "step": 2260 }, { "epoch": 0.5338664158043274, "grad_norm": 35.28013711011858, "learning_rate": 5.287232641593799e-07, "loss": 0.4786, "step": 2270 }, { "epoch": 0.5362182502351834, "grad_norm": 786.6344541862824, "learning_rate": 5.24623538231428e-07, "loss": 0.446, "step": 2280 }, { "epoch": 0.5385700846660395, "grad_norm": 123.10978108283794, "learning_rate": 5.205221521133293e-07, "loss": 0.4589, "step": 2290 }, { "epoch": 0.5409219190968956, "grad_norm": 219.42520028684848, "learning_rate": 5.164193823323949e-07, "loss": 0.5345, "step": 2300 }, { "epoch": 0.5432737535277516, "grad_norm": 42.749936673742305, "learning_rate": 5.123155055092266e-07, "loss": 0.5589, "step": 2310 }, { "epoch": 0.5456255879586077, "grad_norm": 132.15751551013375, "learning_rate": 5.082107983390663e-07, "loss": 0.4669, "step": 2320 }, { "epoch": 0.5479774223894638, "grad_norm": 102.8246996509365, "learning_rate": 5.041055375731404e-07, "loss": 0.4539, "step": 2330 }, { "epoch": 0.5503292568203199, "grad_norm": 38.44411987577502, "learning_rate": 5e-07, "loss": 0.5467, "step": 2340 }, { "epoch": 0.5526810912511759, "grad_norm": 74.77277783685864, "learning_rate": 4.958944624268596e-07, "loss": 0.5021, "step": 2350 }, { "epoch": 0.555032925682032, "grad_norm": 38.676119419707206, "learning_rate": 4.917892016609336e-07, "loss": 0.4276, "step": 2360 }, { "epoch": 0.5573847601128881, "grad_norm": 108.70553186871935, "learning_rate": 4.876844944907734e-07, "loss": 0.4719, "step": 2370 }, { "epoch": 0.5597365945437441, "grad_norm": 295.3911082440077, "learning_rate": 4.835806176676051e-07, "loss": 0.4445, "step": 2380 }, { "epoch": 0.5620884289746002, "grad_norm": 49.65291441301108, "learning_rate": 4.794778478866707e-07, "loss": 0.4846, "step": 2390 }, { "epoch": 0.5644402634054563, "grad_norm": 60.72248586512295, "learning_rate": 4.753764617685719e-07, "loss": 0.4897, "step": 2400 }, { "epoch": 0.5667920978363123, "grad_norm": 64.47512708646681, "learning_rate": 4.7127673584062015e-07, "loss": 0.4741, "step": 2410 }, { "epoch": 0.5691439322671684, "grad_norm": 47.47422172419129, "learning_rate": 4.671789465181919e-07, "loss": 0.4669, "step": 2420 }, { "epoch": 0.5714957666980245, "grad_norm": 23.291621066236825, "learning_rate": 4.6308337008609215e-07, "loss": 0.4387, "step": 2430 }, { "epoch": 0.5738476011288806, "grad_norm": 38.681058186421566, "learning_rate": 4.5899028267992613e-07, "loss": 0.478, "step": 2440 }, { "epoch": 0.5761994355597366, "grad_norm": 49.47985557527302, "learning_rate": 4.548999602674824e-07, "loss": 0.4768, "step": 2450 }, { "epoch": 0.5785512699905927, "grad_norm": 61.860260636343476, "learning_rate": 4.5081267863012504e-07, "loss": 0.4633, "step": 2460 }, { "epoch": 0.5809031044214488, "grad_norm": 71.87446520905469, "learning_rate": 4.4672871334420054e-07, "loss": 0.4902, "step": 2470 }, { "epoch": 0.5832549388523048, "grad_norm": 59.913860308252595, "learning_rate": 4.4264833976245736e-07, "loss": 0.4773, "step": 2480 }, { "epoch": 0.5856067732831609, "grad_norm": 66.42267107533596, "learning_rate": 4.3857183299548084e-07, "loss": 0.4871, "step": 2490 }, { "epoch": 0.587958607714017, "grad_norm": 31.108180028188862, "learning_rate": 4.344994678931445e-07, "loss": 0.4645, "step": 2500 }, { "epoch": 0.590310442144873, "grad_norm": 28.34707994823098, "learning_rate": 4.304315190260787e-07, "loss": 0.4702, "step": 2510 }, { "epoch": 0.5926622765757291, "grad_norm": 171.3392721123324, "learning_rate": 4.2636826066715895e-07, "loss": 0.4394, "step": 2520 }, { "epoch": 0.5950141110065852, "grad_norm": 52.496086063752934, "learning_rate": 4.2230996677301265e-07, "loss": 0.468, "step": 2530 }, { "epoch": 0.5973659454374413, "grad_norm": 152.40165257833203, "learning_rate": 4.182569109655488e-07, "loss": 0.4449, "step": 2540 }, { "epoch": 0.5997177798682972, "grad_norm": 49.22527606031466, "learning_rate": 4.142093665135092e-07, "loss": 0.4621, "step": 2550 }, { "epoch": 0.6020696142991533, "grad_norm": 204.6300032968629, "learning_rate": 4.101676063140447e-07, "loss": 0.4697, "step": 2560 }, { "epoch": 0.6044214487300094, "grad_norm": 49.04313064544541, "learning_rate": 4.0613190287431457e-07, "loss": 0.4672, "step": 2570 }, { "epoch": 0.6067732831608654, "grad_norm": 75.74845619142411, "learning_rate": 4.0210252829311384e-07, "loss": 0.4547, "step": 2580 }, { "epoch": 0.6091251175917215, "grad_norm": 41.52522300616638, "learning_rate": 3.980797542425284e-07, "loss": 0.4841, "step": 2590 }, { "epoch": 0.6114769520225776, "grad_norm": 24.660847040050093, "learning_rate": 3.9406385194961617e-07, "loss": 0.505, "step": 2600 }, { "epoch": 0.6138287864534336, "grad_norm": 32.07608549927192, "learning_rate": 3.9005509217812195e-07, "loss": 0.4588, "step": 2610 }, { "epoch": 0.6161806208842897, "grad_norm": 39.70294250231196, "learning_rate": 3.8605374521022074e-07, "loss": 0.4559, "step": 2620 }, { "epoch": 0.6185324553151458, "grad_norm": 77.97272809649765, "learning_rate": 3.8206008082829546e-07, "loss": 0.4406, "step": 2630 }, { "epoch": 0.6208842897460018, "grad_norm": 45.67613106527001, "learning_rate": 3.7807436829674625e-07, "loss": 0.4385, "step": 2640 }, { "epoch": 0.6232361241768579, "grad_norm": 76.59247815028435, "learning_rate": 3.740968763438369e-07, "loss": 0.4869, "step": 2650 }, { "epoch": 0.625587958607714, "grad_norm": 39.68012913221431, "learning_rate": 3.7012787314357564e-07, "loss": 0.4721, "step": 2660 }, { "epoch": 0.6279397930385701, "grad_norm": 141.25589970566216, "learning_rate": 3.6616762629763485e-07, "loss": 0.5125, "step": 2670 }, { "epoch": 0.6302916274694261, "grad_norm": 23.55149435862232, "learning_rate": 3.6221640281730807e-07, "loss": 0.4115, "step": 2680 }, { "epoch": 0.6326434619002822, "grad_norm": 143.42525243309444, "learning_rate": 3.5827446910550706e-07, "loss": 0.4668, "step": 2690 }, { "epoch": 0.6349952963311383, "grad_norm": 77.81781401332093, "learning_rate": 3.543420909388013e-07, "loss": 0.4647, "step": 2700 }, { "epoch": 0.6373471307619943, "grad_norm": 368.31014969438706, "learning_rate": 3.5041953344949713e-07, "loss": 0.5143, "step": 2710 }, { "epoch": 0.6396989651928504, "grad_norm": 191.99282418445105, "learning_rate": 3.4650706110776263e-07, "loss": 0.4714, "step": 2720 }, { "epoch": 0.6420507996237065, "grad_norm": 172.6805749126617, "learning_rate": 3.4260493770379594e-07, "loss": 0.4876, "step": 2730 }, { "epoch": 0.6444026340545626, "grad_norm": 61.17327984316229, "learning_rate": 3.387134263300403e-07, "loss": 0.4178, "step": 2740 }, { "epoch": 0.6467544684854186, "grad_norm": 114.17761831777952, "learning_rate": 3.3483278936344473e-07, "loss": 0.4889, "step": 2750 }, { "epoch": 0.6491063029162747, "grad_norm": 29.259182763118652, "learning_rate": 3.3096328844777445e-07, "loss": 0.4508, "step": 2760 }, { "epoch": 0.6514581373471308, "grad_norm": 119.4854028220211, "learning_rate": 3.2710518447596975e-07, "loss": 0.4392, "step": 2770 }, { "epoch": 0.6538099717779868, "grad_norm": 25.936405577527868, "learning_rate": 3.232587375725566e-07, "loss": 0.4725, "step": 2780 }, { "epoch": 0.6561618062088429, "grad_norm": 67.32516092680038, "learning_rate": 3.1942420707610713e-07, "loss": 0.4363, "step": 2790 }, { "epoch": 0.658513640639699, "grad_norm": 49.096928986568244, "learning_rate": 3.156018515217549e-07, "loss": 0.4891, "step": 2800 }, { "epoch": 0.660865475070555, "grad_norm": 48.270407025181726, "learning_rate": 3.11791928623764e-07, "loss": 0.4846, "step": 2810 }, { "epoch": 0.6632173095014111, "grad_norm": 33.146338229003334, "learning_rate": 3.079946952581526e-07, "loss": 0.4372, "step": 2820 }, { "epoch": 0.6655691439322672, "grad_norm": 65.86339846284933, "learning_rate": 3.042104074453739e-07, "loss": 0.4938, "step": 2830 }, { "epoch": 0.6679209783631233, "grad_norm": 81.43044910317796, "learning_rate": 3.0043932033305455e-07, "loss": 0.5127, "step": 2840 }, { "epoch": 0.6702728127939793, "grad_norm": 28.25621020100993, "learning_rate": 2.9668168817879205e-07, "loss": 0.5283, "step": 2850 }, { "epoch": 0.6726246472248354, "grad_norm": 70.5644459258473, "learning_rate": 2.9293776433301144e-07, "loss": 0.4676, "step": 2860 }, { "epoch": 0.6749764816556915, "grad_norm": 833.1173554079337, "learning_rate": 2.8920780122188393e-07, "loss": 0.4238, "step": 2870 }, { "epoch": 0.6773283160865475, "grad_norm": 50.15505341992137, "learning_rate": 2.854920503303076e-07, "loss": 0.4317, "step": 2880 }, { "epoch": 0.6796801505174036, "grad_norm": 53.62160731503517, "learning_rate": 2.8179076218495213e-07, "loss": 0.5074, "step": 2890 }, { "epoch": 0.6820319849482597, "grad_norm": 82.67486758357786, "learning_rate": 2.7810418633736637e-07, "loss": 0.451, "step": 2900 }, { "epoch": 0.6843838193791157, "grad_norm": 70.65167543458996, "learning_rate": 2.744325713471536e-07, "loss": 0.4832, "step": 2910 }, { "epoch": 0.6867356538099718, "grad_norm": 70.82701033248681, "learning_rate": 2.707761647652135e-07, "loss": 0.4679, "step": 2920 }, { "epoch": 0.6890874882408279, "grad_norm": 119.20686302974886, "learning_rate": 2.6713521311705e-07, "loss": 0.4545, "step": 2930 }, { "epoch": 0.691439322671684, "grad_norm": 46.64006057310895, "learning_rate": 2.635099618861513e-07, "loss": 0.5192, "step": 2940 }, { "epoch": 0.69379115710254, "grad_norm": 62.9139122700984, "learning_rate": 2.5990065549743766e-07, "loss": 0.4554, "step": 2950 }, { "epoch": 0.6961429915333961, "grad_norm": 134.31949058487535, "learning_rate": 2.5630753730078236e-07, "loss": 0.5069, "step": 2960 }, { "epoch": 0.6984948259642522, "grad_norm": 91.61539909081964, "learning_rate": 2.527308495546038e-07, "loss": 0.4289, "step": 2970 }, { "epoch": 0.7008466603951082, "grad_norm": 104.86257682935386, "learning_rate": 2.4917083340953175e-07, "loss": 0.433, "step": 2980 }, { "epoch": 0.7031984948259643, "grad_norm": 21.474222567958595, "learning_rate": 2.456277288921485e-07, "loss": 0.4365, "step": 2990 }, { "epoch": 0.7055503292568204, "grad_norm": 324.3124016674876, "learning_rate": 2.4210177488880587e-07, "loss": 0.4179, "step": 3000 }, { "epoch": 0.7079021636876763, "grad_norm": 39.24490171045256, "learning_rate": 2.3859320912951797e-07, "loss": 0.4258, "step": 3010 }, { "epoch": 0.7102539981185324, "grad_norm": 26.55750023296075, "learning_rate": 2.3510226817193319e-07, "loss": 0.4575, "step": 3020 }, { "epoch": 0.7126058325493885, "grad_norm": 76.18210272558196, "learning_rate": 2.3162918738538539e-07, "loss": 0.4474, "step": 3030 }, { "epoch": 0.7149576669802445, "grad_norm": 163.82835782749893, "learning_rate": 2.281742009350235e-07, "loss": 0.5178, "step": 3040 }, { "epoch": 0.7173095014111006, "grad_norm": 71.20621946298115, "learning_rate": 2.247375417660241e-07, "loss": 0.4542, "step": 3050 }, { "epoch": 0.7196613358419567, "grad_norm": 29.44780246695801, "learning_rate": 2.2131944158788545e-07, "loss": 0.4696, "step": 3060 }, { "epoch": 0.7220131702728128, "grad_norm": 28.85017992785853, "learning_rate": 2.1792013085880539e-07, "loss": 0.4262, "step": 3070 }, { "epoch": 0.7243650047036688, "grad_norm": 161.1358114916103, "learning_rate": 2.1453983877014224e-07, "loss": 0.4506, "step": 3080 }, { "epoch": 0.7267168391345249, "grad_norm": 48.21294290482945, "learning_rate": 2.1117879323096283e-07, "loss": 0.4762, "step": 3090 }, { "epoch": 0.729068673565381, "grad_norm": 92.39652488615809, "learning_rate": 2.0783722085267592e-07, "loss": 0.4569, "step": 3100 }, { "epoch": 0.731420507996237, "grad_norm": 72.21113504991627, "learning_rate": 2.0451534693375344e-07, "loss": 0.4738, "step": 3110 }, { "epoch": 0.7337723424270931, "grad_norm": 92.26016205643168, "learning_rate": 2.0121339544454035e-07, "loss": 0.4435, "step": 3120 }, { "epoch": 0.7361241768579492, "grad_norm": 68.48228287299885, "learning_rate": 1.9793158901215346e-07, "loss": 0.4313, "step": 3130 }, { "epoch": 0.7384760112888052, "grad_norm": 164.80360664767645, "learning_rate": 1.9467014890547223e-07, "loss": 0.4148, "step": 3140 }, { "epoch": 0.7408278457196613, "grad_norm": 66.07514802799065, "learning_rate": 1.9142929502021904e-07, "loss": 0.4557, "step": 3150 }, { "epoch": 0.7431796801505174, "grad_norm": 54.18230405158542, "learning_rate": 1.8820924586413373e-07, "loss": 0.4389, "step": 3160 }, { "epoch": 0.7455315145813735, "grad_norm": 54.933584541808706, "learning_rate": 1.8501021854224115e-07, "loss": 0.4405, "step": 3170 }, { "epoch": 0.7478833490122295, "grad_norm": 73.88687819153088, "learning_rate": 1.8183242874221365e-07, "loss": 0.444, "step": 3180 }, { "epoch": 0.7502351834430856, "grad_norm": 155.9296620837369, "learning_rate": 1.786760907198281e-07, "loss": 0.4456, "step": 3190 }, { "epoch": 0.7525870178739417, "grad_norm": 98.13292721707111, "learning_rate": 1.7554141728452038e-07, "loss": 0.4804, "step": 3200 }, { "epoch": 0.7549388523047977, "grad_norm": 20.87180402977683, "learning_rate": 1.7242861978503782e-07, "loss": 0.3997, "step": 3210 }, { "epoch": 0.7572906867356538, "grad_norm": 139.8758062253345, "learning_rate": 1.6933790809518839e-07, "loss": 0.4336, "step": 3220 }, { "epoch": 0.7596425211665099, "grad_norm": 67.67816700607545, "learning_rate": 1.6626949059969098e-07, "loss": 0.4371, "step": 3230 }, { "epoch": 0.761994355597366, "grad_norm": 31.044524767135282, "learning_rate": 1.632235741801255e-07, "loss": 0.4678, "step": 3240 }, { "epoch": 0.764346190028222, "grad_norm": 31.59032271901563, "learning_rate": 1.6020036420098455e-07, "loss": 0.4474, "step": 3250 }, { "epoch": 0.7666980244590781, "grad_norm": 56.09519646865281, "learning_rate": 1.5720006449582635e-07, "loss": 0.4464, "step": 3260 }, { "epoch": 0.7690498588899342, "grad_norm": 60.68844089135062, "learning_rate": 1.5422287735353257e-07, "loss": 0.4525, "step": 3270 }, { "epoch": 0.7714016933207902, "grad_norm": 44.61096552113006, "learning_rate": 1.5126900350466886e-07, "loss": 0.4514, "step": 3280 }, { "epoch": 0.7737535277516463, "grad_norm": 79.00286386953672, "learning_rate": 1.4833864210795132e-07, "loss": 0.4546, "step": 3290 }, { "epoch": 0.7761053621825024, "grad_norm": 96.77266999382469, "learning_rate": 1.4543199073681855e-07, "loss": 0.4188, "step": 3300 }, { "epoch": 0.7784571966133584, "grad_norm": 62.93527984098046, "learning_rate": 1.4254924536611046e-07, "loss": 0.4931, "step": 3310 }, { "epoch": 0.7808090310442145, "grad_norm": 78.89136393447265, "learning_rate": 1.396906003588557e-07, "loss": 0.478, "step": 3320 }, { "epoch": 0.7831608654750706, "grad_norm": 123.25424347020814, "learning_rate": 1.368562484531664e-07, "loss": 0.4439, "step": 3330 }, { "epoch": 0.7855126999059266, "grad_norm": 72.17648953992217, "learning_rate": 1.3404638074924356e-07, "loss": 0.4661, "step": 3340 }, { "epoch": 0.7878645343367827, "grad_norm": 29.181609218156055, "learning_rate": 1.3126118669649255e-07, "loss": 0.4775, "step": 3350 }, { "epoch": 0.7902163687676388, "grad_norm": 26.80987859871448, "learning_rate": 1.2850085408075e-07, "loss": 0.432, "step": 3360 }, { "epoch": 0.7925682031984949, "grad_norm": 42.77599635619014, "learning_rate": 1.2576556901162234e-07, "loss": 0.4705, "step": 3370 }, { "epoch": 0.7949200376293509, "grad_norm": 45.81217647451442, "learning_rate": 1.2305551590993806e-07, "loss": 0.47, "step": 3380 }, { "epoch": 0.797271872060207, "grad_norm": 101.96913312576068, "learning_rate": 1.2037087749531328e-07, "loss": 0.4229, "step": 3390 }, { "epoch": 0.7996237064910631, "grad_norm": 215.88495935464798, "learning_rate": 1.177118347738329e-07, "loss": 0.4447, "step": 3400 }, { "epoch": 0.8019755409219191, "grad_norm": 72.55346736718074, "learning_rate": 1.1507856702584573e-07, "loss": 0.4563, "step": 3410 }, { "epoch": 0.8043273753527752, "grad_norm": 267.60170600595404, "learning_rate": 1.1247125179387734e-07, "loss": 0.4835, "step": 3420 }, { "epoch": 0.8066792097836313, "grad_norm": 38.91519432313084, "learning_rate": 1.098900648706601e-07, "loss": 0.445, "step": 3430 }, { "epoch": 0.8090310442144873, "grad_norm": 107.08048276724904, "learning_rate": 1.0733518028727973e-07, "loss": 0.437, "step": 3440 }, { "epoch": 0.8113828786453434, "grad_norm": 39.41515052926627, "learning_rate": 1.048067703014423e-07, "loss": 0.4016, "step": 3450 }, { "epoch": 0.8137347130761995, "grad_norm": 64.66608261177336, "learning_rate": 1.0230500538586012e-07, "loss": 0.4537, "step": 3460 }, { "epoch": 0.8160865475070554, "grad_norm": 61.054737141139334, "learning_rate": 9.983005421675761e-08, "loss": 0.4124, "step": 3470 }, { "epoch": 0.8184383819379115, "grad_norm": 59.104871236625186, "learning_rate": 9.738208366249895e-08, "loss": 0.447, "step": 3480 }, { "epoch": 0.8207902163687676, "grad_norm": 110.57928796201509, "learning_rate": 9.496125877233736e-08, "loss": 0.461, "step": 3490 }, { "epoch": 0.8231420507996237, "grad_norm": 234.52785886205078, "learning_rate": 9.256774276528655e-08, "loss": 0.4856, "step": 3500 }, { "epoch": 0.8254938852304797, "grad_norm": 63.124590798290406, "learning_rate": 9.020169701911695e-08, "loss": 0.4501, "step": 3510 }, { "epoch": 0.8278457196613358, "grad_norm": 146.5239433816864, "learning_rate": 8.786328105947405e-08, "loss": 0.4902, "step": 3520 }, { "epoch": 0.8301975540921919, "grad_norm": 34.913788680016566, "learning_rate": 8.555265254912337e-08, "loss": 0.4275, "step": 3530 }, { "epoch": 0.8325493885230479, "grad_norm": 79.65521261255103, "learning_rate": 8.326996727732055e-08, "loss": 0.4531, "step": 3540 }, { "epoch": 0.834901222953904, "grad_norm": 47.72263945629518, "learning_rate": 8.101537914930684e-08, "loss": 0.4062, "step": 3550 }, { "epoch": 0.8372530573847601, "grad_norm": 94.48797265907596, "learning_rate": 7.878904017593302e-08, "loss": 0.4462, "step": 3560 }, { "epoch": 0.8396048918156162, "grad_norm": 89.24680993944979, "learning_rate": 7.659110046341016e-08, "loss": 0.4334, "step": 3570 }, { "epoch": 0.8419567262464722, "grad_norm": 31.758114332081302, "learning_rate": 7.442170820318922e-08, "loss": 0.423, "step": 3580 }, { "epoch": 0.8443085606773283, "grad_norm": 108.36139891290028, "learning_rate": 7.228100966196916e-08, "loss": 0.46, "step": 3590 }, { "epoch": 0.8466603951081844, "grad_norm": 45.42121511612232, "learning_rate": 7.016914917183541e-08, "loss": 0.4393, "step": 3600 }, { "epoch": 0.8490122295390404, "grad_norm": 38.06302239041297, "learning_rate": 6.808626912052878e-08, "loss": 0.4123, "step": 3610 }, { "epoch": 0.8513640639698965, "grad_norm": 144.0496907397513, "learning_rate": 6.603250994184506e-08, "loss": 0.4226, "step": 3620 }, { "epoch": 0.8537158984007526, "grad_norm": 57.78752403702503, "learning_rate": 6.40080101061664e-08, "loss": 0.4378, "step": 3630 }, { "epoch": 0.8560677328316086, "grad_norm": 37.60367979124333, "learning_rate": 6.201290611112564e-08, "loss": 0.4561, "step": 3640 }, { "epoch": 0.8584195672624647, "grad_norm": 53.18398073653025, "learning_rate": 6.004733247240317e-08, "loss": 0.4678, "step": 3650 }, { "epoch": 0.8607714016933208, "grad_norm": 133.8658799415832, "learning_rate": 5.8111421714657105e-08, "loss": 0.4677, "step": 3660 }, { "epoch": 0.8631232361241769, "grad_norm": 53.25914607379931, "learning_rate": 5.620530436258841e-08, "loss": 0.5194, "step": 3670 }, { "epoch": 0.8654750705550329, "grad_norm": 70.42760542084339, "learning_rate": 5.4329108932140546e-08, "loss": 0.427, "step": 3680 }, { "epoch": 0.867826904985889, "grad_norm": 64.38658865994383, "learning_rate": 5.2482961921834604e-08, "loss": 0.4482, "step": 3690 }, { "epoch": 0.8701787394167451, "grad_norm": 42.99630254744219, "learning_rate": 5.066698780424006e-08, "loss": 0.4426, "step": 3700 }, { "epoch": 0.8725305738476011, "grad_norm": 51.11741118831597, "learning_rate": 4.888130901758292e-08, "loss": 0.434, "step": 3710 }, { "epoch": 0.8748824082784572, "grad_norm": 53.96851327406331, "learning_rate": 4.71260459574902e-08, "loss": 0.3842, "step": 3720 }, { "epoch": 0.8772342427093133, "grad_norm": 56.59443774149579, "learning_rate": 4.5401316968873004e-08, "loss": 0.4481, "step": 3730 }, { "epoch": 0.8795860771401693, "grad_norm": 83.85863914669255, "learning_rate": 4.370723833794665e-08, "loss": 0.4266, "step": 3740 }, { "epoch": 0.8819379115710254, "grad_norm": 39.06100931375012, "learning_rate": 4.2043924284390854e-08, "loss": 0.4238, "step": 3750 }, { "epoch": 0.8842897460018815, "grad_norm": 107.61806818011783, "learning_rate": 4.041148695364882e-08, "loss": 0.4485, "step": 3760 }, { "epoch": 0.8866415804327376, "grad_norm": 33.85213758711094, "learning_rate": 3.881003640936548e-08, "loss": 0.446, "step": 3770 }, { "epoch": 0.8889934148635936, "grad_norm": 61.187373812156466, "learning_rate": 3.723968062596711e-08, "loss": 0.4344, "step": 3780 }, { "epoch": 0.8913452492944497, "grad_norm": 736.9312025906532, "learning_rate": 3.570052548138147e-08, "loss": 0.4541, "step": 3790 }, { "epoch": 0.8936970837253058, "grad_norm": 119.33344609512177, "learning_rate": 3.4192674749898785e-08, "loss": 0.4908, "step": 3800 }, { "epoch": 0.8960489181561618, "grad_norm": 25.5797619702394, "learning_rate": 3.2716230095175435e-08, "loss": 0.4004, "step": 3810 }, { "epoch": 0.8984007525870179, "grad_norm": 49.359917397601, "learning_rate": 3.127129106337917e-08, "loss": 0.4518, "step": 3820 }, { "epoch": 0.900752587017874, "grad_norm": 25.07989197247865, "learning_rate": 2.985795507647754e-08, "loss": 0.4131, "step": 3830 }, { "epoch": 0.90310442144873, "grad_norm": 64.61035042248321, "learning_rate": 2.8476317425669527e-08, "loss": 0.447, "step": 3840 }, { "epoch": 0.9054562558795861, "grad_norm": 93.42176266818919, "learning_rate": 2.7126471264960593e-08, "loss": 0.4329, "step": 3850 }, { "epoch": 0.9078080903104422, "grad_norm": 89.24070781803138, "learning_rate": 2.580850760488196e-08, "loss": 0.4753, "step": 3860 }, { "epoch": 0.9101599247412983, "grad_norm": 26.373133567746557, "learning_rate": 2.4522515306354517e-08, "loss": 0.4606, "step": 3870 }, { "epoch": 0.9125117591721543, "grad_norm": 473.7854777781819, "learning_rate": 2.3268581074697536e-08, "loss": 0.5009, "step": 3880 }, { "epoch": 0.9148635936030104, "grad_norm": 32.436085698187235, "learning_rate": 2.204678945378269e-08, "loss": 0.4162, "step": 3890 }, { "epoch": 0.9172154280338665, "grad_norm": 170.05001192780844, "learning_rate": 2.0857222820333808e-08, "loss": 0.4458, "step": 3900 }, { "epoch": 0.9195672624647225, "grad_norm": 48.18623345953366, "learning_rate": 1.9699961378373097e-08, "loss": 0.4123, "step": 3910 }, { "epoch": 0.9219190968955786, "grad_norm": 1358.9466832068094, "learning_rate": 1.8575083153813175e-08, "loss": 0.4355, "step": 3920 }, { "epoch": 0.9242709313264346, "grad_norm": 113.07515347161218, "learning_rate": 1.7482663989196456e-08, "loss": 0.4339, "step": 3930 }, { "epoch": 0.9266227657572906, "grad_norm": 70.74972682616625, "learning_rate": 1.64227775385819e-08, "loss": 0.4448, "step": 3940 }, { "epoch": 0.9289746001881467, "grad_norm": 42.63045190997477, "learning_rate": 1.539549526257866e-08, "loss": 0.446, "step": 3950 }, { "epoch": 0.9313264346190028, "grad_norm": 297.25978639344856, "learning_rate": 1.4400886423528103e-08, "loss": 0.422, "step": 3960 }, { "epoch": 0.9336782690498588, "grad_norm": 67.23940820577212, "learning_rate": 1.3439018080834142e-08, "loss": 0.4835, "step": 3970 }, { "epoch": 0.9360301034807149, "grad_norm": 77.9763692203697, "learning_rate": 1.2509955086441758e-08, "loss": 0.4637, "step": 3980 }, { "epoch": 0.938381937911571, "grad_norm": 118.62750518663206, "learning_rate": 1.1613760080464385e-08, "loss": 0.44, "step": 3990 }, { "epoch": 0.940733772342427, "grad_norm": 34.90355156969723, "learning_rate": 1.0750493486960666e-08, "loss": 0.4278, "step": 4000 }, { "epoch": 0.9430856067732831, "grad_norm": 25.802830924781176, "learning_rate": 9.920213509860498e-09, "loss": 0.4353, "step": 4010 }, { "epoch": 0.9454374412041392, "grad_norm": 44.67589158879238, "learning_rate": 9.122976129040782e-09, "loss": 0.4026, "step": 4020 }, { "epoch": 0.9477892756349953, "grad_norm": 31.063785760081664, "learning_rate": 8.358835096550886e-09, "loss": 0.4056, "step": 4030 }, { "epoch": 0.9501411100658513, "grad_norm": 29.538919263142972, "learning_rate": 7.627841932988765e-09, "loss": 0.392, "step": 4040 }, { "epoch": 0.9524929444967074, "grad_norm": 48.153765650560715, "learning_rate": 6.9300459240271835e-09, "loss": 0.434, "step": 4050 }, { "epoch": 0.9548447789275635, "grad_norm": 97.9376546668311, "learning_rate": 6.265494117090764e-09, "loss": 0.4637, "step": 4060 }, { "epoch": 0.9571966133584195, "grad_norm": 46.369837122592855, "learning_rate": 5.634231318183913e-09, "loss": 0.4483, "step": 4070 }, { "epoch": 0.9595484477892756, "grad_norm": 34.112979889172955, "learning_rate": 5.036300088869794e-09, "loss": 0.3958, "step": 4080 }, { "epoch": 0.9619002822201317, "grad_norm": 67.68046313703896, "learning_rate": 4.471740743400843e-09, "loss": 0.3998, "step": 4090 }, { "epoch": 0.9642521166509878, "grad_norm": 64.94301439710068, "learning_rate": 3.94059134600061e-09, "loss": 0.5025, "step": 4100 }, { "epoch": 0.9666039510818438, "grad_norm": 72.14615132171306, "learning_rate": 3.4428877082972597e-09, "loss": 0.4421, "step": 4110 }, { "epoch": 0.9689557855126999, "grad_norm": 24.113378758695696, "learning_rate": 2.9786633869091083e-09, "loss": 0.4162, "step": 4120 }, { "epoch": 0.971307619943556, "grad_norm": 241.97699208841857, "learning_rate": 2.5479496811823264e-09, "loss": 0.4298, "step": 4130 }, { "epoch": 0.973659454374412, "grad_norm": 133.43074381257875, "learning_rate": 2.1507756310802926e-09, "loss": 0.4428, "step": 4140 }, { "epoch": 0.9760112888052681, "grad_norm": 26.108621458874143, "learning_rate": 1.7871680152258816e-09, "loss": 0.478, "step": 4150 }, { "epoch": 0.9783631232361242, "grad_norm": 48.05086153194788, "learning_rate": 1.4571513490960197e-09, "loss": 0.4364, "step": 4160 }, { "epoch": 0.9807149576669802, "grad_norm": 135.6321121517264, "learning_rate": 1.1607478833685624e-09, "loss": 0.4654, "step": 4170 }, { "epoch": 0.9830667920978363, "grad_norm": 43.352481280379564, "learning_rate": 8.97977602422162e-10, "loss": 0.4522, "step": 4180 }, { "epoch": 0.9854186265286924, "grad_norm": 400.18828509454596, "learning_rate": 6.688582229890106e-10, "loss": 0.4308, "step": 4190 }, { "epoch": 0.9877704609595485, "grad_norm": 118.9697384571146, "learning_rate": 4.734051929601857e-10, "loss": 0.4335, "step": 4200 }, { "epoch": 0.9901222953904045, "grad_norm": 620.0407392361634, "learning_rate": 3.116316903440941e-10, "loss": 0.4353, "step": 4210 }, { "epoch": 0.9924741298212606, "grad_norm": 93.82732661926246, "learning_rate": 1.8354862237812685e-10, "loss": 0.4672, "step": 4220 }, { "epoch": 0.9948259642521167, "grad_norm": 39.8947952442234, "learning_rate": 8.916462479297005e-11, "loss": 0.4519, "step": 4230 }, { "epoch": 0.9971777986829727, "grad_norm": 220.48761793592806, "learning_rate": 2.8486061230736934e-11, "loss": 0.4331, "step": 4240 }, { "epoch": 0.9995296331138288, "grad_norm": 32.77645356865427, "learning_rate": 1.5170228155891861e-12, "loss": 0.4223, "step": 4250 }, { "epoch": 1.0, "step": 4252, "total_flos": 4.149639304655667e+17, "train_loss": 0.5222998527373667, "train_runtime": 59750.607, "train_samples_per_second": 1.139, "train_steps_per_second": 0.071 } ], "logging_steps": 10, "max_steps": 4252, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.149639304655667e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }