CocoRoF's picture
Training in progress, step 12495, checkpoint
4d8e572 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999862446870104,
"eval_steps": 2500,
"global_step": 12495,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008003091193973672,
"grad_norm": 75.25,
"learning_rate": 9.999984368969842e-07,
"loss": 135.0496,
"step": 10
},
{
"epoch": 0.0016006182387947345,
"grad_norm": 77.9375,
"learning_rate": 9.999968737939682e-07,
"loss": 134.7034,
"step": 20
},
{
"epoch": 0.0024009273581921016,
"grad_norm": 78.8125,
"learning_rate": 9.999953106909524e-07,
"loss": 134.1727,
"step": 30
},
{
"epoch": 0.003201236477589469,
"grad_norm": 74.125,
"learning_rate": 9.999937475879366e-07,
"loss": 134.3995,
"step": 40
},
{
"epoch": 0.004001545596986836,
"grad_norm": 77.8125,
"learning_rate": 9.999921844849208e-07,
"loss": 133.854,
"step": 50
},
{
"epoch": 0.004801854716384203,
"grad_norm": 73.1875,
"learning_rate": 9.999906213819048e-07,
"loss": 134.2893,
"step": 60
},
{
"epoch": 0.005602163835781571,
"grad_norm": 74.0625,
"learning_rate": 9.99989058278889e-07,
"loss": 134.4156,
"step": 70
},
{
"epoch": 0.006402472955178938,
"grad_norm": 72.5625,
"learning_rate": 9.999874951758733e-07,
"loss": 132.5738,
"step": 80
},
{
"epoch": 0.007202782074576305,
"grad_norm": 80.8125,
"learning_rate": 9.999859320728575e-07,
"loss": 134.4381,
"step": 90
},
{
"epoch": 0.008003091193973673,
"grad_norm": 78.875,
"learning_rate": 9.999843689698417e-07,
"loss": 133.9594,
"step": 100
},
{
"epoch": 0.00880340031337104,
"grad_norm": 71.125,
"learning_rate": 9.999828058668257e-07,
"loss": 133.9086,
"step": 110
},
{
"epoch": 0.009603709432768406,
"grad_norm": 72.0625,
"learning_rate": 9.9998124276381e-07,
"loss": 133.434,
"step": 120
},
{
"epoch": 0.010404018552165774,
"grad_norm": 79.25,
"learning_rate": 9.999796796607942e-07,
"loss": 133.4989,
"step": 130
},
{
"epoch": 0.011204327671563142,
"grad_norm": 79.1875,
"learning_rate": 9.999781165577784e-07,
"loss": 134.517,
"step": 140
},
{
"epoch": 0.012004636790960508,
"grad_norm": 71.9375,
"learning_rate": 9.999765534547624e-07,
"loss": 133.5077,
"step": 150
},
{
"epoch": 0.012804945910357876,
"grad_norm": 74.625,
"learning_rate": 9.999749903517466e-07,
"loss": 133.3733,
"step": 160
},
{
"epoch": 0.013605255029755244,
"grad_norm": 78.6875,
"learning_rate": 9.999734272487308e-07,
"loss": 132.901,
"step": 170
},
{
"epoch": 0.01440556414915261,
"grad_norm": 76.625,
"learning_rate": 9.999718641457148e-07,
"loss": 135.7891,
"step": 180
},
{
"epoch": 0.015205873268549977,
"grad_norm": 72.6875,
"learning_rate": 9.99970301042699e-07,
"loss": 134.1953,
"step": 190
},
{
"epoch": 0.016006182387947345,
"grad_norm": 72.1875,
"learning_rate": 9.999687379396833e-07,
"loss": 132.8302,
"step": 200
},
{
"epoch": 0.01680649150734471,
"grad_norm": 76.375,
"learning_rate": 9.999671748366673e-07,
"loss": 133.097,
"step": 210
},
{
"epoch": 0.01760680062674208,
"grad_norm": 81.25,
"learning_rate": 9.999656117336515e-07,
"loss": 133.4282,
"step": 220
},
{
"epoch": 0.018407109746139447,
"grad_norm": 85.125,
"learning_rate": 9.999640486306357e-07,
"loss": 134.319,
"step": 230
},
{
"epoch": 0.019207418865536813,
"grad_norm": 79.125,
"learning_rate": 9.9996248552762e-07,
"loss": 132.7719,
"step": 240
},
{
"epoch": 0.020007727984934182,
"grad_norm": 78.5,
"learning_rate": 9.999609224246041e-07,
"loss": 133.3401,
"step": 250
},
{
"epoch": 0.02080803710433155,
"grad_norm": 82.625,
"learning_rate": 9.999593593215884e-07,
"loss": 134.8742,
"step": 260
},
{
"epoch": 0.021608346223728914,
"grad_norm": 76.875,
"learning_rate": 9.999577962185724e-07,
"loss": 135.1933,
"step": 270
},
{
"epoch": 0.022408655343126284,
"grad_norm": 76.3125,
"learning_rate": 9.999562331155566e-07,
"loss": 134.0234,
"step": 280
},
{
"epoch": 0.02320896446252365,
"grad_norm": 74.5,
"learning_rate": 9.999546700125408e-07,
"loss": 133.5756,
"step": 290
},
{
"epoch": 0.024009273581921016,
"grad_norm": 81.5625,
"learning_rate": 9.99953106909525e-07,
"loss": 134.8582,
"step": 300
},
{
"epoch": 0.024809582701318385,
"grad_norm": 73.25,
"learning_rate": 9.99951543806509e-07,
"loss": 132.5909,
"step": 310
},
{
"epoch": 0.02560989182071575,
"grad_norm": 71.3125,
"learning_rate": 9.999499807034932e-07,
"loss": 133.9742,
"step": 320
},
{
"epoch": 0.026410200940113118,
"grad_norm": 71.3125,
"learning_rate": 9.999484176004775e-07,
"loss": 134.9034,
"step": 330
},
{
"epoch": 0.027210510059510487,
"grad_norm": 73.5,
"learning_rate": 9.999468544974615e-07,
"loss": 132.86,
"step": 340
},
{
"epoch": 0.028010819178907853,
"grad_norm": 73.4375,
"learning_rate": 9.999452913944457e-07,
"loss": 134.602,
"step": 350
},
{
"epoch": 0.02881112829830522,
"grad_norm": 76.6875,
"learning_rate": 9.9994372829143e-07,
"loss": 132.606,
"step": 360
},
{
"epoch": 0.02961143741770259,
"grad_norm": 75.9375,
"learning_rate": 9.99942165188414e-07,
"loss": 134.4456,
"step": 370
},
{
"epoch": 0.030411746537099955,
"grad_norm": 73.8125,
"learning_rate": 9.999406020853981e-07,
"loss": 132.9977,
"step": 380
},
{
"epoch": 0.03121205565649732,
"grad_norm": 79.375,
"learning_rate": 9.999390389823824e-07,
"loss": 133.731,
"step": 390
},
{
"epoch": 0.03201236477589469,
"grad_norm": 77.4375,
"learning_rate": 9.999374758793666e-07,
"loss": 133.9014,
"step": 400
},
{
"epoch": 0.032812673895292056,
"grad_norm": 70.8125,
"learning_rate": 9.999359127763508e-07,
"loss": 135.4049,
"step": 410
},
{
"epoch": 0.03361298301468942,
"grad_norm": 75.125,
"learning_rate": 9.999343496733348e-07,
"loss": 135.3231,
"step": 420
},
{
"epoch": 0.03441329213408679,
"grad_norm": 79.6875,
"learning_rate": 9.99932786570319e-07,
"loss": 132.8583,
"step": 430
},
{
"epoch": 0.03521360125348416,
"grad_norm": 84.0625,
"learning_rate": 9.999312234673032e-07,
"loss": 133.4561,
"step": 440
},
{
"epoch": 0.03601391037288153,
"grad_norm": 75.125,
"learning_rate": 9.999296603642874e-07,
"loss": 133.2269,
"step": 450
},
{
"epoch": 0.03681421949227889,
"grad_norm": 71.4375,
"learning_rate": 9.999280972612715e-07,
"loss": 134.1606,
"step": 460
},
{
"epoch": 0.03761452861167626,
"grad_norm": 72.9375,
"learning_rate": 9.999265341582557e-07,
"loss": 134.7773,
"step": 470
},
{
"epoch": 0.038414837731073626,
"grad_norm": 76.875,
"learning_rate": 9.999249710552399e-07,
"loss": 134.0885,
"step": 480
},
{
"epoch": 0.03921514685047099,
"grad_norm": 76.8125,
"learning_rate": 9.999234079522241e-07,
"loss": 134.5257,
"step": 490
},
{
"epoch": 0.040015455969868365,
"grad_norm": 70.25,
"learning_rate": 9.999218448492081e-07,
"loss": 134.4442,
"step": 500
},
{
"epoch": 0.04081576508926573,
"grad_norm": 79.875,
"learning_rate": 9.999202817461923e-07,
"loss": 134.2612,
"step": 510
},
{
"epoch": 0.0416160742086631,
"grad_norm": 75.375,
"learning_rate": 9.999187186431766e-07,
"loss": 135.1053,
"step": 520
},
{
"epoch": 0.04241638332806046,
"grad_norm": 75.4375,
"learning_rate": 9.999171555401606e-07,
"loss": 133.3491,
"step": 530
},
{
"epoch": 0.04321669244745783,
"grad_norm": 81.4375,
"learning_rate": 9.999155924371448e-07,
"loss": 133.6457,
"step": 540
},
{
"epoch": 0.044017001566855195,
"grad_norm": 76.5625,
"learning_rate": 9.99914029334129e-07,
"loss": 135.0537,
"step": 550
},
{
"epoch": 0.04481731068625257,
"grad_norm": 77.375,
"learning_rate": 9.999124662311132e-07,
"loss": 134.9535,
"step": 560
},
{
"epoch": 0.045617619805649934,
"grad_norm": 77.75,
"learning_rate": 9.999109031280974e-07,
"loss": 134.2294,
"step": 570
},
{
"epoch": 0.0464179289250473,
"grad_norm": 82.5,
"learning_rate": 9.999093400250814e-07,
"loss": 133.7713,
"step": 580
},
{
"epoch": 0.047218238044444666,
"grad_norm": 79.125,
"learning_rate": 9.999077769220657e-07,
"loss": 133.3779,
"step": 590
},
{
"epoch": 0.04801854716384203,
"grad_norm": 78.4375,
"learning_rate": 9.999062138190499e-07,
"loss": 133.576,
"step": 600
},
{
"epoch": 0.0488188562832394,
"grad_norm": 76.1875,
"learning_rate": 9.99904650716034e-07,
"loss": 136.3208,
"step": 610
},
{
"epoch": 0.04961916540263677,
"grad_norm": 76.3125,
"learning_rate": 9.99903087613018e-07,
"loss": 132.9463,
"step": 620
},
{
"epoch": 0.05041947452203414,
"grad_norm": 74.0,
"learning_rate": 9.999015245100023e-07,
"loss": 133.6666,
"step": 630
},
{
"epoch": 0.0512197836414315,
"grad_norm": 73.75,
"learning_rate": 9.998999614069865e-07,
"loss": 134.8779,
"step": 640
},
{
"epoch": 0.05202009276082887,
"grad_norm": 77.3125,
"learning_rate": 9.998983983039708e-07,
"loss": 133.0986,
"step": 650
},
{
"epoch": 0.052820401880226235,
"grad_norm": 78.5625,
"learning_rate": 9.998968352009548e-07,
"loss": 135.0677,
"step": 660
},
{
"epoch": 0.0536207109996236,
"grad_norm": 78.1875,
"learning_rate": 9.99895272097939e-07,
"loss": 133.2067,
"step": 670
},
{
"epoch": 0.054421020119020974,
"grad_norm": 70.0625,
"learning_rate": 9.998937089949232e-07,
"loss": 133.9511,
"step": 680
},
{
"epoch": 0.05522132923841834,
"grad_norm": 80.375,
"learning_rate": 9.998921458919072e-07,
"loss": 133.9503,
"step": 690
},
{
"epoch": 0.056021638357815706,
"grad_norm": 74.6875,
"learning_rate": 9.998905827888914e-07,
"loss": 134.0494,
"step": 700
},
{
"epoch": 0.05682194747721307,
"grad_norm": 77.25,
"learning_rate": 9.998890196858756e-07,
"loss": 134.0122,
"step": 710
},
{
"epoch": 0.05762225659661044,
"grad_norm": 80.875,
"learning_rate": 9.998874565828599e-07,
"loss": 134.0668,
"step": 720
},
{
"epoch": 0.05842256571600781,
"grad_norm": 84.125,
"learning_rate": 9.99885893479844e-07,
"loss": 134.0675,
"step": 730
},
{
"epoch": 0.05922287483540518,
"grad_norm": 72.375,
"learning_rate": 9.99884330376828e-07,
"loss": 133.1081,
"step": 740
},
{
"epoch": 0.06002318395480254,
"grad_norm": 72.5625,
"learning_rate": 9.998827672738123e-07,
"loss": 134.6276,
"step": 750
},
{
"epoch": 0.06082349307419991,
"grad_norm": 78.3125,
"learning_rate": 9.998812041707965e-07,
"loss": 134.8289,
"step": 760
},
{
"epoch": 0.061623802193597275,
"grad_norm": 78.125,
"learning_rate": 9.998796410677807e-07,
"loss": 134.8663,
"step": 770
},
{
"epoch": 0.06242411131299464,
"grad_norm": 71.75,
"learning_rate": 9.998780779647647e-07,
"loss": 133.6727,
"step": 780
},
{
"epoch": 0.06322442043239201,
"grad_norm": 74.4375,
"learning_rate": 9.99876514861749e-07,
"loss": 133.9329,
"step": 790
},
{
"epoch": 0.06402472955178938,
"grad_norm": 74.625,
"learning_rate": 9.998749517587332e-07,
"loss": 133.3252,
"step": 800
},
{
"epoch": 0.06482503867118675,
"grad_norm": 71.625,
"learning_rate": 9.998733886557172e-07,
"loss": 134.0405,
"step": 810
},
{
"epoch": 0.06562534779058411,
"grad_norm": 80.625,
"learning_rate": 9.998718255527014e-07,
"loss": 133.8837,
"step": 820
},
{
"epoch": 0.06642565690998148,
"grad_norm": 76.5625,
"learning_rate": 9.998702624496856e-07,
"loss": 134.2091,
"step": 830
},
{
"epoch": 0.06722596602937884,
"grad_norm": 71.75,
"learning_rate": 9.998686993466698e-07,
"loss": 133.8055,
"step": 840
},
{
"epoch": 0.06802627514877621,
"grad_norm": 78.9375,
"learning_rate": 9.998671362436539e-07,
"loss": 132.6678,
"step": 850
},
{
"epoch": 0.06882658426817358,
"grad_norm": 78.8125,
"learning_rate": 9.99865573140638e-07,
"loss": 134.7295,
"step": 860
},
{
"epoch": 0.06962689338757094,
"grad_norm": 77.375,
"learning_rate": 9.998640100376223e-07,
"loss": 134.0342,
"step": 870
},
{
"epoch": 0.07042720250696832,
"grad_norm": 76.0625,
"learning_rate": 9.998624469346065e-07,
"loss": 132.651,
"step": 880
},
{
"epoch": 0.07122751162636569,
"grad_norm": 75.0625,
"learning_rate": 9.998608838315905e-07,
"loss": 133.6062,
"step": 890
},
{
"epoch": 0.07202782074576305,
"grad_norm": 73.5,
"learning_rate": 9.998593207285747e-07,
"loss": 132.7781,
"step": 900
},
{
"epoch": 0.07282812986516042,
"grad_norm": 82.4375,
"learning_rate": 9.99857757625559e-07,
"loss": 134.2361,
"step": 910
},
{
"epoch": 0.07362843898455779,
"grad_norm": 73.0625,
"learning_rate": 9.998561945225432e-07,
"loss": 133.2907,
"step": 920
},
{
"epoch": 0.07442874810395515,
"grad_norm": 80.6875,
"learning_rate": 9.998546314195274e-07,
"loss": 134.2265,
"step": 930
},
{
"epoch": 0.07522905722335252,
"grad_norm": 80.25,
"learning_rate": 9.998530683165114e-07,
"loss": 133.9985,
"step": 940
},
{
"epoch": 0.07602936634274989,
"grad_norm": 74.125,
"learning_rate": 9.998515052134956e-07,
"loss": 134.877,
"step": 950
},
{
"epoch": 0.07682967546214725,
"grad_norm": 75.8125,
"learning_rate": 9.998499421104798e-07,
"loss": 133.8007,
"step": 960
},
{
"epoch": 0.07762998458154462,
"grad_norm": 80.0625,
"learning_rate": 9.998483790074638e-07,
"loss": 134.6694,
"step": 970
},
{
"epoch": 0.07843029370094198,
"grad_norm": 76.125,
"learning_rate": 9.99846815904448e-07,
"loss": 133.3736,
"step": 980
},
{
"epoch": 0.07923060282033935,
"grad_norm": 81.9375,
"learning_rate": 9.998452528014323e-07,
"loss": 134.208,
"step": 990
},
{
"epoch": 0.08003091193973673,
"grad_norm": 75.4375,
"learning_rate": 9.998436896984163e-07,
"loss": 133.7086,
"step": 1000
},
{
"epoch": 0.0808312210591341,
"grad_norm": 79.4375,
"learning_rate": 9.998421265954005e-07,
"loss": 133.6408,
"step": 1010
},
{
"epoch": 0.08163153017853146,
"grad_norm": 77.8125,
"learning_rate": 9.998405634923847e-07,
"loss": 132.0228,
"step": 1020
},
{
"epoch": 0.08243183929792883,
"grad_norm": 79.75,
"learning_rate": 9.99839000389369e-07,
"loss": 133.7949,
"step": 1030
},
{
"epoch": 0.0832321484173262,
"grad_norm": 77.5,
"learning_rate": 9.998374372863532e-07,
"loss": 132.4473,
"step": 1040
},
{
"epoch": 0.08403245753672356,
"grad_norm": 78.625,
"learning_rate": 9.998358741833372e-07,
"loss": 133.509,
"step": 1050
},
{
"epoch": 0.08483276665612093,
"grad_norm": 74.625,
"learning_rate": 9.998343110803214e-07,
"loss": 134.2043,
"step": 1060
},
{
"epoch": 0.08563307577551829,
"grad_norm": 79.5,
"learning_rate": 9.998327479773056e-07,
"loss": 134.1369,
"step": 1070
},
{
"epoch": 0.08643338489491566,
"grad_norm": 81.3125,
"learning_rate": 9.998311848742898e-07,
"loss": 135.2469,
"step": 1080
},
{
"epoch": 0.08723369401431302,
"grad_norm": 76.8125,
"learning_rate": 9.99829621771274e-07,
"loss": 134.3792,
"step": 1090
},
{
"epoch": 0.08803400313371039,
"grad_norm": 83.0625,
"learning_rate": 9.99828058668258e-07,
"loss": 134.3031,
"step": 1100
},
{
"epoch": 0.08883431225310777,
"grad_norm": 77.5625,
"learning_rate": 9.998264955652423e-07,
"loss": 133.6508,
"step": 1110
},
{
"epoch": 0.08963462137250514,
"grad_norm": 72.5625,
"learning_rate": 9.998249324622265e-07,
"loss": 133.622,
"step": 1120
},
{
"epoch": 0.0904349304919025,
"grad_norm": 72.4375,
"learning_rate": 9.998233693592105e-07,
"loss": 133.585,
"step": 1130
},
{
"epoch": 0.09123523961129987,
"grad_norm": 83.8125,
"learning_rate": 9.998218062561947e-07,
"loss": 134.5366,
"step": 1140
},
{
"epoch": 0.09203554873069723,
"grad_norm": 75.6875,
"learning_rate": 9.99820243153179e-07,
"loss": 132.9767,
"step": 1150
},
{
"epoch": 0.0928358578500946,
"grad_norm": 75.6875,
"learning_rate": 9.99818680050163e-07,
"loss": 135.017,
"step": 1160
},
{
"epoch": 0.09363616696949197,
"grad_norm": 78.8125,
"learning_rate": 9.998171169471471e-07,
"loss": 133.7126,
"step": 1170
},
{
"epoch": 0.09443647608888933,
"grad_norm": 77.125,
"learning_rate": 9.998155538441314e-07,
"loss": 133.368,
"step": 1180
},
{
"epoch": 0.0952367852082867,
"grad_norm": 80.5625,
"learning_rate": 9.998139907411156e-07,
"loss": 134.3654,
"step": 1190
},
{
"epoch": 0.09603709432768406,
"grad_norm": 77.25,
"learning_rate": 9.998124276380998e-07,
"loss": 133.1469,
"step": 1200
},
{
"epoch": 0.09683740344708143,
"grad_norm": 71.0625,
"learning_rate": 9.998108645350838e-07,
"loss": 134.3933,
"step": 1210
},
{
"epoch": 0.0976377125664788,
"grad_norm": 71.125,
"learning_rate": 9.99809301432068e-07,
"loss": 132.187,
"step": 1220
},
{
"epoch": 0.09843802168587618,
"grad_norm": 73.0625,
"learning_rate": 9.998077383290522e-07,
"loss": 134.8376,
"step": 1230
},
{
"epoch": 0.09923833080527354,
"grad_norm": 72.6875,
"learning_rate": 9.998061752260365e-07,
"loss": 133.2062,
"step": 1240
},
{
"epoch": 0.10003863992467091,
"grad_norm": 82.9375,
"learning_rate": 9.998046121230207e-07,
"loss": 134.4847,
"step": 1250
},
{
"epoch": 0.10083894904406827,
"grad_norm": 80.5,
"learning_rate": 9.998030490200047e-07,
"loss": 133.9005,
"step": 1260
},
{
"epoch": 0.10163925816346564,
"grad_norm": 75.3125,
"learning_rate": 9.99801485916989e-07,
"loss": 132.473,
"step": 1270
},
{
"epoch": 0.102439567282863,
"grad_norm": 72.875,
"learning_rate": 9.997999228139731e-07,
"loss": 133.0457,
"step": 1280
},
{
"epoch": 0.10323987640226037,
"grad_norm": 75.875,
"learning_rate": 9.997983597109571e-07,
"loss": 132.4673,
"step": 1290
},
{
"epoch": 0.10404018552165774,
"grad_norm": 78.5625,
"learning_rate": 9.997967966079413e-07,
"loss": 133.7126,
"step": 1300
},
{
"epoch": 0.1048404946410551,
"grad_norm": 78.25,
"learning_rate": 9.997952335049256e-07,
"loss": 133.5371,
"step": 1310
},
{
"epoch": 0.10564080376045247,
"grad_norm": 76.1875,
"learning_rate": 9.997936704019096e-07,
"loss": 132.6477,
"step": 1320
},
{
"epoch": 0.10644111287984984,
"grad_norm": 73.875,
"learning_rate": 9.997921072988938e-07,
"loss": 131.6196,
"step": 1330
},
{
"epoch": 0.1072414219992472,
"grad_norm": 83.875,
"learning_rate": 9.99790544195878e-07,
"loss": 133.8508,
"step": 1340
},
{
"epoch": 0.10804173111864458,
"grad_norm": 72.25,
"learning_rate": 9.997889810928622e-07,
"loss": 133.3958,
"step": 1350
},
{
"epoch": 0.10884204023804195,
"grad_norm": 77.875,
"learning_rate": 9.997874179898464e-07,
"loss": 134.013,
"step": 1360
},
{
"epoch": 0.10964234935743931,
"grad_norm": 67.6875,
"learning_rate": 9.997858548868304e-07,
"loss": 133.0476,
"step": 1370
},
{
"epoch": 0.11044265847683668,
"grad_norm": 78.0,
"learning_rate": 9.997842917838147e-07,
"loss": 134.0779,
"step": 1380
},
{
"epoch": 0.11124296759623405,
"grad_norm": 75.25,
"learning_rate": 9.997827286807989e-07,
"loss": 135.0265,
"step": 1390
},
{
"epoch": 0.11204327671563141,
"grad_norm": 84.4375,
"learning_rate": 9.99781165577783e-07,
"loss": 132.7369,
"step": 1400
},
{
"epoch": 0.11284358583502878,
"grad_norm": 76.875,
"learning_rate": 9.997796024747673e-07,
"loss": 132.9652,
"step": 1410
},
{
"epoch": 0.11364389495442614,
"grad_norm": 75.8125,
"learning_rate": 9.997780393717513e-07,
"loss": 132.9404,
"step": 1420
},
{
"epoch": 0.11444420407382351,
"grad_norm": 81.5625,
"learning_rate": 9.997764762687355e-07,
"loss": 133.6693,
"step": 1430
},
{
"epoch": 0.11524451319322088,
"grad_norm": 79.0625,
"learning_rate": 9.997749131657198e-07,
"loss": 133.2776,
"step": 1440
},
{
"epoch": 0.11604482231261824,
"grad_norm": 72.6875,
"learning_rate": 9.997733500627038e-07,
"loss": 133.9213,
"step": 1450
},
{
"epoch": 0.11684513143201562,
"grad_norm": 77.0625,
"learning_rate": 9.99771786959688e-07,
"loss": 133.5122,
"step": 1460
},
{
"epoch": 0.11764544055141299,
"grad_norm": 78.125,
"learning_rate": 9.997702238566722e-07,
"loss": 134.5584,
"step": 1470
},
{
"epoch": 0.11844574967081035,
"grad_norm": 78.0,
"learning_rate": 9.997686607536562e-07,
"loss": 132.1062,
"step": 1480
},
{
"epoch": 0.11924605879020772,
"grad_norm": 81.4375,
"learning_rate": 9.997670976506404e-07,
"loss": 134.1707,
"step": 1490
},
{
"epoch": 0.12004636790960509,
"grad_norm": 80.375,
"learning_rate": 9.997655345476247e-07,
"loss": 134.8848,
"step": 1500
},
{
"epoch": 0.12084667702900245,
"grad_norm": 76.0625,
"learning_rate": 9.997639714446089e-07,
"loss": 131.9225,
"step": 1510
},
{
"epoch": 0.12164698614839982,
"grad_norm": 73.0,
"learning_rate": 9.997624083415929e-07,
"loss": 133.7994,
"step": 1520
},
{
"epoch": 0.12244729526779718,
"grad_norm": 78.3125,
"learning_rate": 9.99760845238577e-07,
"loss": 133.7843,
"step": 1530
},
{
"epoch": 0.12324760438719455,
"grad_norm": 69.4375,
"learning_rate": 9.997592821355613e-07,
"loss": 131.5354,
"step": 1540
},
{
"epoch": 0.12404791350659192,
"grad_norm": 75.0,
"learning_rate": 9.997577190325455e-07,
"loss": 133.611,
"step": 1550
},
{
"epoch": 0.12484822262598928,
"grad_norm": 75.625,
"learning_rate": 9.997561559295297e-07,
"loss": 132.3119,
"step": 1560
},
{
"epoch": 0.12564853174538665,
"grad_norm": 81.25,
"learning_rate": 9.997545928265138e-07,
"loss": 132.667,
"step": 1570
},
{
"epoch": 0.12644884086478403,
"grad_norm": 80.8125,
"learning_rate": 9.99753029723498e-07,
"loss": 133.3723,
"step": 1580
},
{
"epoch": 0.12724914998418138,
"grad_norm": 75.9375,
"learning_rate": 9.997514666204822e-07,
"loss": 131.5222,
"step": 1590
},
{
"epoch": 0.12804945910357876,
"grad_norm": 74.6875,
"learning_rate": 9.997499035174664e-07,
"loss": 133.6289,
"step": 1600
},
{
"epoch": 0.1288497682229761,
"grad_norm": 79.9375,
"learning_rate": 9.997483404144504e-07,
"loss": 133.2038,
"step": 1610
},
{
"epoch": 0.1296500773423735,
"grad_norm": 76.8125,
"learning_rate": 9.997467773114346e-07,
"loss": 133.7517,
"step": 1620
},
{
"epoch": 0.13045038646177085,
"grad_norm": 78.125,
"learning_rate": 9.997452142084189e-07,
"loss": 133.22,
"step": 1630
},
{
"epoch": 0.13125069558116823,
"grad_norm": 78.75,
"learning_rate": 9.997436511054029e-07,
"loss": 133.4411,
"step": 1640
},
{
"epoch": 0.1320510047005656,
"grad_norm": 75.125,
"learning_rate": 9.99742088002387e-07,
"loss": 132.7751,
"step": 1650
},
{
"epoch": 0.13285131381996296,
"grad_norm": 79.4375,
"learning_rate": 9.997405248993713e-07,
"loss": 133.3775,
"step": 1660
},
{
"epoch": 0.13365162293936034,
"grad_norm": 82.0,
"learning_rate": 9.997389617963555e-07,
"loss": 133.9474,
"step": 1670
},
{
"epoch": 0.1344519320587577,
"grad_norm": 76.4375,
"learning_rate": 9.997373986933395e-07,
"loss": 134.6912,
"step": 1680
},
{
"epoch": 0.13525224117815507,
"grad_norm": 75.875,
"learning_rate": 9.997358355903237e-07,
"loss": 135.1857,
"step": 1690
},
{
"epoch": 0.13605255029755242,
"grad_norm": 82.5625,
"learning_rate": 9.99734272487308e-07,
"loss": 135.5683,
"step": 1700
},
{
"epoch": 0.1368528594169498,
"grad_norm": 74.25,
"learning_rate": 9.997327093842922e-07,
"loss": 133.0515,
"step": 1710
},
{
"epoch": 0.13765316853634715,
"grad_norm": 75.6875,
"learning_rate": 9.997311462812764e-07,
"loss": 134.285,
"step": 1720
},
{
"epoch": 0.13845347765574453,
"grad_norm": 78.0,
"learning_rate": 9.997295831782604e-07,
"loss": 132.9074,
"step": 1730
},
{
"epoch": 0.13925378677514189,
"grad_norm": 77.0,
"learning_rate": 9.997280200752446e-07,
"loss": 133.4366,
"step": 1740
},
{
"epoch": 0.14005409589453927,
"grad_norm": 75.5625,
"learning_rate": 9.997264569722288e-07,
"loss": 133.7744,
"step": 1750
},
{
"epoch": 0.14085440501393665,
"grad_norm": 77.1875,
"learning_rate": 9.997248938692128e-07,
"loss": 133.4301,
"step": 1760
},
{
"epoch": 0.141654714133334,
"grad_norm": 76.5625,
"learning_rate": 9.99723330766197e-07,
"loss": 133.3505,
"step": 1770
},
{
"epoch": 0.14245502325273138,
"grad_norm": 81.25,
"learning_rate": 9.997217676631813e-07,
"loss": 133.3679,
"step": 1780
},
{
"epoch": 0.14325533237212873,
"grad_norm": 80.75,
"learning_rate": 9.997202045601655e-07,
"loss": 131.0644,
"step": 1790
},
{
"epoch": 0.1440556414915261,
"grad_norm": 78.6875,
"learning_rate": 9.997186414571495e-07,
"loss": 133.6262,
"step": 1800
},
{
"epoch": 0.14485595061092346,
"grad_norm": 71.6875,
"learning_rate": 9.997170783541337e-07,
"loss": 133.8133,
"step": 1810
},
{
"epoch": 0.14565625973032084,
"grad_norm": 73.875,
"learning_rate": 9.99715515251118e-07,
"loss": 133.2261,
"step": 1820
},
{
"epoch": 0.1464565688497182,
"grad_norm": 76.375,
"learning_rate": 9.997139521481022e-07,
"loss": 134.2764,
"step": 1830
},
{
"epoch": 0.14725687796911557,
"grad_norm": 80.0,
"learning_rate": 9.997123890450862e-07,
"loss": 133.0369,
"step": 1840
},
{
"epoch": 0.14805718708851293,
"grad_norm": 79.25,
"learning_rate": 9.997108259420704e-07,
"loss": 132.5432,
"step": 1850
},
{
"epoch": 0.1488574962079103,
"grad_norm": 76.4375,
"learning_rate": 9.997092628390546e-07,
"loss": 132.448,
"step": 1860
},
{
"epoch": 0.14965780532730769,
"grad_norm": 74.3125,
"learning_rate": 9.997076997360388e-07,
"loss": 134.039,
"step": 1870
},
{
"epoch": 0.15045811444670504,
"grad_norm": 76.625,
"learning_rate": 9.99706136633023e-07,
"loss": 133.9122,
"step": 1880
},
{
"epoch": 0.15125842356610242,
"grad_norm": 73.9375,
"learning_rate": 9.99704573530007e-07,
"loss": 133.0405,
"step": 1890
},
{
"epoch": 0.15205873268549977,
"grad_norm": 76.25,
"learning_rate": 9.997030104269913e-07,
"loss": 133.0878,
"step": 1900
},
{
"epoch": 0.15285904180489715,
"grad_norm": 74.0625,
"learning_rate": 9.997014473239755e-07,
"loss": 132.5003,
"step": 1910
},
{
"epoch": 0.1536593509242945,
"grad_norm": 76.8125,
"learning_rate": 9.996998842209595e-07,
"loss": 134.9118,
"step": 1920
},
{
"epoch": 0.15445966004369188,
"grad_norm": 86.4375,
"learning_rate": 9.996983211179437e-07,
"loss": 134.4519,
"step": 1930
},
{
"epoch": 0.15525996916308923,
"grad_norm": 76.8125,
"learning_rate": 9.99696758014928e-07,
"loss": 133.9741,
"step": 1940
},
{
"epoch": 0.15606027828248661,
"grad_norm": 76.1875,
"learning_rate": 9.996951949119121e-07,
"loss": 133.7216,
"step": 1950
},
{
"epoch": 0.15686058740188397,
"grad_norm": 72.125,
"learning_rate": 9.996936318088962e-07,
"loss": 132.1554,
"step": 1960
},
{
"epoch": 0.15766089652128135,
"grad_norm": 79.375,
"learning_rate": 9.996920687058804e-07,
"loss": 133.7076,
"step": 1970
},
{
"epoch": 0.1584612056406787,
"grad_norm": 79.375,
"learning_rate": 9.996905056028646e-07,
"loss": 133.1065,
"step": 1980
},
{
"epoch": 0.15926151476007608,
"grad_norm": 78.625,
"learning_rate": 9.996889424998486e-07,
"loss": 133.6056,
"step": 1990
},
{
"epoch": 0.16006182387947346,
"grad_norm": 79.9375,
"learning_rate": 9.996873793968328e-07,
"loss": 133.6693,
"step": 2000
},
{
"epoch": 0.1608621329988708,
"grad_norm": 73.5625,
"learning_rate": 9.99685816293817e-07,
"loss": 133.3085,
"step": 2010
},
{
"epoch": 0.1616624421182682,
"grad_norm": 72.625,
"learning_rate": 9.996842531908013e-07,
"loss": 132.2977,
"step": 2020
},
{
"epoch": 0.16246275123766554,
"grad_norm": 73.6875,
"learning_rate": 9.996826900877855e-07,
"loss": 133.0748,
"step": 2030
},
{
"epoch": 0.16326306035706292,
"grad_norm": 78.5625,
"learning_rate": 9.996811269847697e-07,
"loss": 134.3872,
"step": 2040
},
{
"epoch": 0.16406336947646027,
"grad_norm": 75.5625,
"learning_rate": 9.996795638817537e-07,
"loss": 130.7821,
"step": 2050
},
{
"epoch": 0.16486367859585765,
"grad_norm": 72.4375,
"learning_rate": 9.99678000778738e-07,
"loss": 132.3199,
"step": 2060
},
{
"epoch": 0.165663987715255,
"grad_norm": 77.1875,
"learning_rate": 9.996764376757221e-07,
"loss": 132.7069,
"step": 2070
},
{
"epoch": 0.1664642968346524,
"grad_norm": 78.75,
"learning_rate": 9.996748745727061e-07,
"loss": 134.6731,
"step": 2080
},
{
"epoch": 0.16726460595404974,
"grad_norm": 77.8125,
"learning_rate": 9.996733114696904e-07,
"loss": 132.4518,
"step": 2090
},
{
"epoch": 0.16806491507344712,
"grad_norm": 80.0625,
"learning_rate": 9.996717483666746e-07,
"loss": 131.4908,
"step": 2100
},
{
"epoch": 0.1688652241928445,
"grad_norm": 79.4375,
"learning_rate": 9.996701852636586e-07,
"loss": 132.5609,
"step": 2110
},
{
"epoch": 0.16966553331224185,
"grad_norm": 75.75,
"learning_rate": 9.996686221606428e-07,
"loss": 133.5363,
"step": 2120
},
{
"epoch": 0.17046584243163923,
"grad_norm": 74.8125,
"learning_rate": 9.99667059057627e-07,
"loss": 133.372,
"step": 2130
},
{
"epoch": 0.17126615155103658,
"grad_norm": 78.125,
"learning_rate": 9.996654959546112e-07,
"loss": 134.09,
"step": 2140
},
{
"epoch": 0.17206646067043396,
"grad_norm": 73.6875,
"learning_rate": 9.996639328515952e-07,
"loss": 134.2827,
"step": 2150
},
{
"epoch": 0.17286676978983131,
"grad_norm": 75.6875,
"learning_rate": 9.996623697485795e-07,
"loss": 133.4792,
"step": 2160
},
{
"epoch": 0.1736670789092287,
"grad_norm": 80.8125,
"learning_rate": 9.996608066455637e-07,
"loss": 133.7547,
"step": 2170
},
{
"epoch": 0.17446738802862605,
"grad_norm": 75.5625,
"learning_rate": 9.99659243542548e-07,
"loss": 132.9351,
"step": 2180
},
{
"epoch": 0.17526769714802343,
"grad_norm": 72.125,
"learning_rate": 9.996576804395321e-07,
"loss": 131.2331,
"step": 2190
},
{
"epoch": 0.17606800626742078,
"grad_norm": 79.4375,
"learning_rate": 9.996561173365161e-07,
"loss": 134.6388,
"step": 2200
},
{
"epoch": 0.17686831538681816,
"grad_norm": 77.125,
"learning_rate": 9.996545542335003e-07,
"loss": 133.2961,
"step": 2210
},
{
"epoch": 0.17766862450621554,
"grad_norm": 75.875,
"learning_rate": 9.996529911304846e-07,
"loss": 133.8811,
"step": 2220
},
{
"epoch": 0.1784689336256129,
"grad_norm": 73.5625,
"learning_rate": 9.996514280274688e-07,
"loss": 133.1757,
"step": 2230
},
{
"epoch": 0.17926924274501027,
"grad_norm": 78.875,
"learning_rate": 9.996498649244528e-07,
"loss": 132.6364,
"step": 2240
},
{
"epoch": 0.18006955186440762,
"grad_norm": 75.0,
"learning_rate": 9.99648301821437e-07,
"loss": 133.5787,
"step": 2250
},
{
"epoch": 0.180869860983805,
"grad_norm": 73.75,
"learning_rate": 9.996467387184212e-07,
"loss": 133.0472,
"step": 2260
},
{
"epoch": 0.18167017010320236,
"grad_norm": 73.5,
"learning_rate": 9.996451756154052e-07,
"loss": 132.8575,
"step": 2270
},
{
"epoch": 0.18247047922259974,
"grad_norm": 81.75,
"learning_rate": 9.996436125123894e-07,
"loss": 132.7385,
"step": 2280
},
{
"epoch": 0.1832707883419971,
"grad_norm": 71.5,
"learning_rate": 9.996420494093737e-07,
"loss": 132.8017,
"step": 2290
},
{
"epoch": 0.18407109746139447,
"grad_norm": 73.75,
"learning_rate": 9.996404863063579e-07,
"loss": 134.2902,
"step": 2300
},
{
"epoch": 0.18487140658079182,
"grad_norm": 72.375,
"learning_rate": 9.996389232033419e-07,
"loss": 133.3126,
"step": 2310
},
{
"epoch": 0.1856717157001892,
"grad_norm": 72.875,
"learning_rate": 9.99637360100326e-07,
"loss": 133.4326,
"step": 2320
},
{
"epoch": 0.18647202481958655,
"grad_norm": 79.875,
"learning_rate": 9.996357969973103e-07,
"loss": 135.3909,
"step": 2330
},
{
"epoch": 0.18727233393898393,
"grad_norm": 77.0625,
"learning_rate": 9.996342338942945e-07,
"loss": 133.5528,
"step": 2340
},
{
"epoch": 0.1880726430583813,
"grad_norm": 77.5,
"learning_rate": 9.996326707912788e-07,
"loss": 133.1468,
"step": 2350
},
{
"epoch": 0.18887295217777866,
"grad_norm": 79.0,
"learning_rate": 9.996311076882628e-07,
"loss": 132.3221,
"step": 2360
},
{
"epoch": 0.18967326129717604,
"grad_norm": 78.625,
"learning_rate": 9.99629544585247e-07,
"loss": 134.5704,
"step": 2370
},
{
"epoch": 0.1904735704165734,
"grad_norm": 75.0,
"learning_rate": 9.996279814822312e-07,
"loss": 133.3002,
"step": 2380
},
{
"epoch": 0.19127387953597078,
"grad_norm": 81.8125,
"learning_rate": 9.996264183792154e-07,
"loss": 133.6831,
"step": 2390
},
{
"epoch": 0.19207418865536813,
"grad_norm": 74.375,
"learning_rate": 9.996248552761994e-07,
"loss": 132.5035,
"step": 2400
},
{
"epoch": 0.1928744977747655,
"grad_norm": 83.8125,
"learning_rate": 9.996232921731836e-07,
"loss": 132.3072,
"step": 2410
},
{
"epoch": 0.19367480689416286,
"grad_norm": 76.6875,
"learning_rate": 9.996217290701679e-07,
"loss": 133.4728,
"step": 2420
},
{
"epoch": 0.19447511601356024,
"grad_norm": 80.25,
"learning_rate": 9.996201659671519e-07,
"loss": 132.2028,
"step": 2430
},
{
"epoch": 0.1952754251329576,
"grad_norm": 72.5625,
"learning_rate": 9.99618602864136e-07,
"loss": 133.2437,
"step": 2440
},
{
"epoch": 0.19607573425235497,
"grad_norm": 77.125,
"learning_rate": 9.996170397611203e-07,
"loss": 134.2502,
"step": 2450
},
{
"epoch": 0.19687604337175235,
"grad_norm": 78.0,
"learning_rate": 9.996154766581043e-07,
"loss": 134.4984,
"step": 2460
},
{
"epoch": 0.1976763524911497,
"grad_norm": 80.3125,
"learning_rate": 9.996139135550885e-07,
"loss": 132.5038,
"step": 2470
},
{
"epoch": 0.19847666161054708,
"grad_norm": 72.5,
"learning_rate": 9.996123504520728e-07,
"loss": 130.4979,
"step": 2480
},
{
"epoch": 0.19927697072994444,
"grad_norm": 76.3125,
"learning_rate": 9.99610787349057e-07,
"loss": 132.6546,
"step": 2490
},
{
"epoch": 0.20007727984934182,
"grad_norm": 74.5,
"learning_rate": 9.996092242460412e-07,
"loss": 132.9037,
"step": 2500
},
{
"epoch": 0.20007727984934182,
"eval_loss": 2.079555034637451,
"eval_runtime": 423.7834,
"eval_samples_per_second": 1548.333,
"eval_steps_per_second": 48.386,
"step": 2500
},
{
"epoch": 0.20087758896873917,
"grad_norm": 71.4375,
"learning_rate": 9.996076611430254e-07,
"loss": 132.1891,
"step": 2510
},
{
"epoch": 0.20167789808813655,
"grad_norm": 76.6875,
"learning_rate": 9.996060980400094e-07,
"loss": 133.4134,
"step": 2520
},
{
"epoch": 0.2024782072075339,
"grad_norm": 78.3125,
"learning_rate": 9.996045349369936e-07,
"loss": 134.7546,
"step": 2530
},
{
"epoch": 0.20327851632693128,
"grad_norm": 74.1875,
"learning_rate": 9.996029718339778e-07,
"loss": 132.944,
"step": 2540
},
{
"epoch": 0.20407882544632863,
"grad_norm": 73.125,
"learning_rate": 9.99601408730962e-07,
"loss": 132.8733,
"step": 2550
},
{
"epoch": 0.204879134565726,
"grad_norm": 77.1875,
"learning_rate": 9.99599845627946e-07,
"loss": 134.1901,
"step": 2560
},
{
"epoch": 0.2056794436851234,
"grad_norm": 79.9375,
"learning_rate": 9.995982825249303e-07,
"loss": 132.7988,
"step": 2570
},
{
"epoch": 0.20647975280452074,
"grad_norm": 76.75,
"learning_rate": 9.995967194219145e-07,
"loss": 133.0693,
"step": 2580
},
{
"epoch": 0.20728006192391812,
"grad_norm": 72.375,
"learning_rate": 9.995951563188985e-07,
"loss": 132.1166,
"step": 2590
},
{
"epoch": 0.20808037104331548,
"grad_norm": 80.0,
"learning_rate": 9.995935932158827e-07,
"loss": 133.8413,
"step": 2600
},
{
"epoch": 0.20888068016271286,
"grad_norm": 75.9375,
"learning_rate": 9.99592030112867e-07,
"loss": 134.1737,
"step": 2610
},
{
"epoch": 0.2096809892821102,
"grad_norm": 82.9375,
"learning_rate": 9.99590467009851e-07,
"loss": 131.8263,
"step": 2620
},
{
"epoch": 0.2104812984015076,
"grad_norm": 73.375,
"learning_rate": 9.995889039068352e-07,
"loss": 134.3057,
"step": 2630
},
{
"epoch": 0.21128160752090494,
"grad_norm": 79.9375,
"learning_rate": 9.995873408038194e-07,
"loss": 131.4325,
"step": 2640
},
{
"epoch": 0.21208191664030232,
"grad_norm": 71.9375,
"learning_rate": 9.995857777008036e-07,
"loss": 133.1537,
"step": 2650
},
{
"epoch": 0.21288222575969967,
"grad_norm": 74.875,
"learning_rate": 9.995842145977878e-07,
"loss": 131.8966,
"step": 2660
},
{
"epoch": 0.21368253487909705,
"grad_norm": 82.875,
"learning_rate": 9.995826514947718e-07,
"loss": 132.7061,
"step": 2670
},
{
"epoch": 0.2144828439984944,
"grad_norm": 75.4375,
"learning_rate": 9.99581088391756e-07,
"loss": 133.5048,
"step": 2680
},
{
"epoch": 0.21528315311789178,
"grad_norm": 78.875,
"learning_rate": 9.995795252887403e-07,
"loss": 133.0009,
"step": 2690
},
{
"epoch": 0.21608346223728916,
"grad_norm": 76.875,
"learning_rate": 9.995779621857245e-07,
"loss": 132.829,
"step": 2700
},
{
"epoch": 0.21688377135668652,
"grad_norm": 75.4375,
"learning_rate": 9.995763990827087e-07,
"loss": 131.0754,
"step": 2710
},
{
"epoch": 0.2176840804760839,
"grad_norm": 75.125,
"learning_rate": 9.995748359796927e-07,
"loss": 133.1994,
"step": 2720
},
{
"epoch": 0.21848438959548125,
"grad_norm": 77.25,
"learning_rate": 9.99573272876677e-07,
"loss": 133.7851,
"step": 2730
},
{
"epoch": 0.21928469871487863,
"grad_norm": 74.5,
"learning_rate": 9.995717097736612e-07,
"loss": 134.2786,
"step": 2740
},
{
"epoch": 0.22008500783427598,
"grad_norm": 78.1875,
"learning_rate": 9.995701466706452e-07,
"loss": 132.9056,
"step": 2750
},
{
"epoch": 0.22088531695367336,
"grad_norm": 73.0,
"learning_rate": 9.995685835676294e-07,
"loss": 133.8631,
"step": 2760
},
{
"epoch": 0.2216856260730707,
"grad_norm": 75.0,
"learning_rate": 9.995670204646136e-07,
"loss": 133.2654,
"step": 2770
},
{
"epoch": 0.2224859351924681,
"grad_norm": 72.25,
"learning_rate": 9.995654573615976e-07,
"loss": 133.3257,
"step": 2780
},
{
"epoch": 0.22328624431186544,
"grad_norm": 74.625,
"learning_rate": 9.995638942585818e-07,
"loss": 134.212,
"step": 2790
},
{
"epoch": 0.22408655343126282,
"grad_norm": 81.625,
"learning_rate": 9.99562331155566e-07,
"loss": 131.8705,
"step": 2800
},
{
"epoch": 0.2248868625506602,
"grad_norm": 74.5625,
"learning_rate": 9.995607680525503e-07,
"loss": 132.6001,
"step": 2810
},
{
"epoch": 0.22568717167005756,
"grad_norm": 81.6875,
"learning_rate": 9.995592049495345e-07,
"loss": 132.6342,
"step": 2820
},
{
"epoch": 0.22648748078945494,
"grad_norm": 74.75,
"learning_rate": 9.995576418465185e-07,
"loss": 133.3588,
"step": 2830
},
{
"epoch": 0.2272877899088523,
"grad_norm": 76.625,
"learning_rate": 9.995560787435027e-07,
"loss": 133.0776,
"step": 2840
},
{
"epoch": 0.22808809902824967,
"grad_norm": 75.375,
"learning_rate": 9.99554515640487e-07,
"loss": 132.3477,
"step": 2850
},
{
"epoch": 0.22888840814764702,
"grad_norm": 78.6875,
"learning_rate": 9.995529525374711e-07,
"loss": 133.229,
"step": 2860
},
{
"epoch": 0.2296887172670444,
"grad_norm": 74.1875,
"learning_rate": 9.995513894344551e-07,
"loss": 134.0937,
"step": 2870
},
{
"epoch": 0.23048902638644175,
"grad_norm": 74.625,
"learning_rate": 9.995498263314394e-07,
"loss": 133.6065,
"step": 2880
},
{
"epoch": 0.23128933550583913,
"grad_norm": 76.125,
"learning_rate": 9.995482632284236e-07,
"loss": 134.7819,
"step": 2890
},
{
"epoch": 0.23208964462523649,
"grad_norm": 70.0,
"learning_rate": 9.995467001254078e-07,
"loss": 131.2454,
"step": 2900
},
{
"epoch": 0.23288995374463387,
"grad_norm": 79.0625,
"learning_rate": 9.995451370223918e-07,
"loss": 132.6385,
"step": 2910
},
{
"epoch": 0.23369026286403125,
"grad_norm": 76.4375,
"learning_rate": 9.99543573919376e-07,
"loss": 132.4604,
"step": 2920
},
{
"epoch": 0.2344905719834286,
"grad_norm": 69.75,
"learning_rate": 9.995420108163602e-07,
"loss": 133.4625,
"step": 2930
},
{
"epoch": 0.23529088110282598,
"grad_norm": 73.75,
"learning_rate": 9.995404477133443e-07,
"loss": 133.4203,
"step": 2940
},
{
"epoch": 0.23609119022222333,
"grad_norm": 77.5,
"learning_rate": 9.995388846103285e-07,
"loss": 133.9867,
"step": 2950
},
{
"epoch": 0.2368914993416207,
"grad_norm": 73.0,
"learning_rate": 9.995373215073127e-07,
"loss": 134.0537,
"step": 2960
},
{
"epoch": 0.23769180846101806,
"grad_norm": 76.6875,
"learning_rate": 9.99535758404297e-07,
"loss": 133.4288,
"step": 2970
},
{
"epoch": 0.23849211758041544,
"grad_norm": 81.625,
"learning_rate": 9.995341953012811e-07,
"loss": 131.8527,
"step": 2980
},
{
"epoch": 0.2392924266998128,
"grad_norm": 75.8125,
"learning_rate": 9.995326321982651e-07,
"loss": 133.3051,
"step": 2990
},
{
"epoch": 0.24009273581921017,
"grad_norm": 77.4375,
"learning_rate": 9.995310690952493e-07,
"loss": 133.7738,
"step": 3000
},
{
"epoch": 0.24089304493860753,
"grad_norm": 77.5625,
"learning_rate": 9.995295059922336e-07,
"loss": 134.9874,
"step": 3010
},
{
"epoch": 0.2416933540580049,
"grad_norm": 74.0,
"learning_rate": 9.995279428892178e-07,
"loss": 133.7144,
"step": 3020
},
{
"epoch": 0.24249366317740229,
"grad_norm": 82.375,
"learning_rate": 9.995263797862018e-07,
"loss": 133.7562,
"step": 3030
},
{
"epoch": 0.24329397229679964,
"grad_norm": 79.0,
"learning_rate": 9.99524816683186e-07,
"loss": 133.503,
"step": 3040
},
{
"epoch": 0.24409428141619702,
"grad_norm": 76.375,
"learning_rate": 9.995232535801702e-07,
"loss": 134.0484,
"step": 3050
},
{
"epoch": 0.24489459053559437,
"grad_norm": 73.9375,
"learning_rate": 9.995216904771542e-07,
"loss": 132.5795,
"step": 3060
},
{
"epoch": 0.24569489965499175,
"grad_norm": 75.625,
"learning_rate": 9.995201273741385e-07,
"loss": 131.5031,
"step": 3070
},
{
"epoch": 0.2464952087743891,
"grad_norm": 74.875,
"learning_rate": 9.995185642711227e-07,
"loss": 132.9786,
"step": 3080
},
{
"epoch": 0.24729551789378648,
"grad_norm": 76.9375,
"learning_rate": 9.995170011681069e-07,
"loss": 132.8848,
"step": 3090
},
{
"epoch": 0.24809582701318383,
"grad_norm": 77.1875,
"learning_rate": 9.99515438065091e-07,
"loss": 133.2721,
"step": 3100
},
{
"epoch": 0.2488961361325812,
"grad_norm": 83.8125,
"learning_rate": 9.995138749620751e-07,
"loss": 132.621,
"step": 3110
},
{
"epoch": 0.24969644525197857,
"grad_norm": 77.5,
"learning_rate": 9.995123118590593e-07,
"loss": 133.5524,
"step": 3120
},
{
"epoch": 0.2504967543713759,
"grad_norm": 72.4375,
"learning_rate": 9.995107487560436e-07,
"loss": 133.8227,
"step": 3130
},
{
"epoch": 0.2512970634907733,
"grad_norm": 75.4375,
"learning_rate": 9.995091856530278e-07,
"loss": 133.7567,
"step": 3140
},
{
"epoch": 0.2520973726101707,
"grad_norm": 76.0,
"learning_rate": 9.995076225500118e-07,
"loss": 133.4744,
"step": 3150
},
{
"epoch": 0.25289768172956806,
"grad_norm": 77.6875,
"learning_rate": 9.99506059446996e-07,
"loss": 131.4135,
"step": 3160
},
{
"epoch": 0.25369799084896544,
"grad_norm": 77.4375,
"learning_rate": 9.995044963439802e-07,
"loss": 131.6956,
"step": 3170
},
{
"epoch": 0.25449829996836276,
"grad_norm": 74.625,
"learning_rate": 9.995029332409644e-07,
"loss": 133.5626,
"step": 3180
},
{
"epoch": 0.25529860908776014,
"grad_norm": 75.875,
"learning_rate": 9.995013701379484e-07,
"loss": 131.9732,
"step": 3190
},
{
"epoch": 0.2560989182071575,
"grad_norm": 73.0,
"learning_rate": 9.994998070349327e-07,
"loss": 132.5818,
"step": 3200
},
{
"epoch": 0.2568992273265549,
"grad_norm": 75.0625,
"learning_rate": 9.994982439319169e-07,
"loss": 133.6618,
"step": 3210
},
{
"epoch": 0.2576995364459522,
"grad_norm": 75.8125,
"learning_rate": 9.994966808289009e-07,
"loss": 132.3088,
"step": 3220
},
{
"epoch": 0.2584998455653496,
"grad_norm": 78.5,
"learning_rate": 9.99495117725885e-07,
"loss": 131.4488,
"step": 3230
},
{
"epoch": 0.259300154684747,
"grad_norm": 73.0,
"learning_rate": 9.994935546228693e-07,
"loss": 133.2048,
"step": 3240
},
{
"epoch": 0.26010046380414437,
"grad_norm": 74.3125,
"learning_rate": 9.994919915198535e-07,
"loss": 133.9069,
"step": 3250
},
{
"epoch": 0.2609007729235417,
"grad_norm": 73.8125,
"learning_rate": 9.994904284168375e-07,
"loss": 132.9612,
"step": 3260
},
{
"epoch": 0.26170108204293907,
"grad_norm": 75.5625,
"learning_rate": 9.994888653138218e-07,
"loss": 133.8464,
"step": 3270
},
{
"epoch": 0.26250139116233645,
"grad_norm": 75.625,
"learning_rate": 9.99487302210806e-07,
"loss": 132.2074,
"step": 3280
},
{
"epoch": 0.26330170028173383,
"grad_norm": 73.125,
"learning_rate": 9.994857391077902e-07,
"loss": 134.991,
"step": 3290
},
{
"epoch": 0.2641020094011312,
"grad_norm": 77.375,
"learning_rate": 9.994841760047742e-07,
"loss": 132.3946,
"step": 3300
},
{
"epoch": 0.26490231852052853,
"grad_norm": 77.8125,
"learning_rate": 9.994826129017584e-07,
"loss": 134.2822,
"step": 3310
},
{
"epoch": 0.2657026276399259,
"grad_norm": 76.5625,
"learning_rate": 9.994810497987426e-07,
"loss": 134.2205,
"step": 3320
},
{
"epoch": 0.2665029367593233,
"grad_norm": 80.75,
"learning_rate": 9.994794866957269e-07,
"loss": 133.8495,
"step": 3330
},
{
"epoch": 0.2673032458787207,
"grad_norm": 76.125,
"learning_rate": 9.99477923592711e-07,
"loss": 133.0968,
"step": 3340
},
{
"epoch": 0.268103554998118,
"grad_norm": 76.0625,
"learning_rate": 9.99476360489695e-07,
"loss": 132.9434,
"step": 3350
},
{
"epoch": 0.2689038641175154,
"grad_norm": 77.6875,
"learning_rate": 9.994747973866793e-07,
"loss": 131.3061,
"step": 3360
},
{
"epoch": 0.26970417323691276,
"grad_norm": 78.5625,
"learning_rate": 9.994732342836635e-07,
"loss": 133.879,
"step": 3370
},
{
"epoch": 0.27050448235631014,
"grad_norm": 77.0625,
"learning_rate": 9.994716711806475e-07,
"loss": 132.1836,
"step": 3380
},
{
"epoch": 0.2713047914757075,
"grad_norm": 78.625,
"learning_rate": 9.994701080776317e-07,
"loss": 133.099,
"step": 3390
},
{
"epoch": 0.27210510059510484,
"grad_norm": 70.3125,
"learning_rate": 9.99468544974616e-07,
"loss": 131.4613,
"step": 3400
},
{
"epoch": 0.2729054097145022,
"grad_norm": 79.3125,
"learning_rate": 9.994669818716e-07,
"loss": 133.228,
"step": 3410
},
{
"epoch": 0.2737057188338996,
"grad_norm": 74.875,
"learning_rate": 9.994654187685842e-07,
"loss": 133.2147,
"step": 3420
},
{
"epoch": 0.274506027953297,
"grad_norm": 80.75,
"learning_rate": 9.994638556655684e-07,
"loss": 132.7644,
"step": 3430
},
{
"epoch": 0.2753063370726943,
"grad_norm": 77.25,
"learning_rate": 9.994622925625526e-07,
"loss": 132.9558,
"step": 3440
},
{
"epoch": 0.2761066461920917,
"grad_norm": 77.8125,
"learning_rate": 9.994607294595368e-07,
"loss": 131.8311,
"step": 3450
},
{
"epoch": 0.27690695531148907,
"grad_norm": 74.8125,
"learning_rate": 9.994591663565208e-07,
"loss": 133.9894,
"step": 3460
},
{
"epoch": 0.27770726443088645,
"grad_norm": 73.5,
"learning_rate": 9.99457603253505e-07,
"loss": 133.7439,
"step": 3470
},
{
"epoch": 0.27850757355028377,
"grad_norm": 75.1875,
"learning_rate": 9.994560401504893e-07,
"loss": 133.1463,
"step": 3480
},
{
"epoch": 0.27930788266968115,
"grad_norm": 73.0625,
"learning_rate": 9.994544770474735e-07,
"loss": 133.7504,
"step": 3490
},
{
"epoch": 0.28010819178907853,
"grad_norm": 80.75,
"learning_rate": 9.994529139444577e-07,
"loss": 133.6309,
"step": 3500
},
{
"epoch": 0.2809085009084759,
"grad_norm": 72.8125,
"learning_rate": 9.994513508414417e-07,
"loss": 133.597,
"step": 3510
},
{
"epoch": 0.2817088100278733,
"grad_norm": 84.75,
"learning_rate": 9.99449787738426e-07,
"loss": 133.3475,
"step": 3520
},
{
"epoch": 0.2825091191472706,
"grad_norm": 81.3125,
"learning_rate": 9.994482246354102e-07,
"loss": 131.89,
"step": 3530
},
{
"epoch": 0.283309428266668,
"grad_norm": 76.25,
"learning_rate": 9.994466615323942e-07,
"loss": 134.1571,
"step": 3540
},
{
"epoch": 0.2841097373860654,
"grad_norm": 78.125,
"learning_rate": 9.994450984293784e-07,
"loss": 132.8667,
"step": 3550
},
{
"epoch": 0.28491004650546276,
"grad_norm": 77.3125,
"learning_rate": 9.994435353263626e-07,
"loss": 132.635,
"step": 3560
},
{
"epoch": 0.2857103556248601,
"grad_norm": 76.9375,
"learning_rate": 9.994419722233466e-07,
"loss": 132.5345,
"step": 3570
},
{
"epoch": 0.28651066474425746,
"grad_norm": 79.6875,
"learning_rate": 9.994404091203308e-07,
"loss": 133.5979,
"step": 3580
},
{
"epoch": 0.28731097386365484,
"grad_norm": 74.5625,
"learning_rate": 9.99438846017315e-07,
"loss": 133.0856,
"step": 3590
},
{
"epoch": 0.2881112829830522,
"grad_norm": 81.75,
"learning_rate": 9.994372829142993e-07,
"loss": 133.0596,
"step": 3600
},
{
"epoch": 0.28891159210244954,
"grad_norm": 79.25,
"learning_rate": 9.994357198112835e-07,
"loss": 133.8565,
"step": 3610
},
{
"epoch": 0.2897119012218469,
"grad_norm": 78.1875,
"learning_rate": 9.994341567082675e-07,
"loss": 133.0675,
"step": 3620
},
{
"epoch": 0.2905122103412443,
"grad_norm": 78.8125,
"learning_rate": 9.994325936052517e-07,
"loss": 132.4503,
"step": 3630
},
{
"epoch": 0.2913125194606417,
"grad_norm": 74.0,
"learning_rate": 9.99431030502236e-07,
"loss": 131.9785,
"step": 3640
},
{
"epoch": 0.29211282858003906,
"grad_norm": 81.25,
"learning_rate": 9.994294673992201e-07,
"loss": 131.6815,
"step": 3650
},
{
"epoch": 0.2929131376994364,
"grad_norm": 78.125,
"learning_rate": 9.994279042962044e-07,
"loss": 133.6587,
"step": 3660
},
{
"epoch": 0.29371344681883377,
"grad_norm": 82.8125,
"learning_rate": 9.994263411931884e-07,
"loss": 132.4716,
"step": 3670
},
{
"epoch": 0.29451375593823115,
"grad_norm": 77.875,
"learning_rate": 9.994247780901726e-07,
"loss": 134.3209,
"step": 3680
},
{
"epoch": 0.2953140650576285,
"grad_norm": 73.8125,
"learning_rate": 9.994232149871568e-07,
"loss": 132.6673,
"step": 3690
},
{
"epoch": 0.29611437417702585,
"grad_norm": 81.0,
"learning_rate": 9.994216518841408e-07,
"loss": 132.5012,
"step": 3700
},
{
"epoch": 0.29691468329642323,
"grad_norm": 71.125,
"learning_rate": 9.99420088781125e-07,
"loss": 133.0868,
"step": 3710
},
{
"epoch": 0.2977149924158206,
"grad_norm": 85.5625,
"learning_rate": 9.994185256781093e-07,
"loss": 133.206,
"step": 3720
},
{
"epoch": 0.298515301535218,
"grad_norm": 78.9375,
"learning_rate": 9.994169625750933e-07,
"loss": 132.3186,
"step": 3730
},
{
"epoch": 0.29931561065461537,
"grad_norm": 82.1875,
"learning_rate": 9.994153994720775e-07,
"loss": 132.2575,
"step": 3740
},
{
"epoch": 0.3001159197740127,
"grad_norm": 83.5625,
"learning_rate": 9.994138363690617e-07,
"loss": 133.2065,
"step": 3750
},
{
"epoch": 0.3009162288934101,
"grad_norm": 72.3125,
"learning_rate": 9.99412273266046e-07,
"loss": 133.6776,
"step": 3760
},
{
"epoch": 0.30171653801280746,
"grad_norm": 77.6875,
"learning_rate": 9.9941071016303e-07,
"loss": 132.2097,
"step": 3770
},
{
"epoch": 0.30251684713220484,
"grad_norm": 74.875,
"learning_rate": 9.994091470600141e-07,
"loss": 130.3959,
"step": 3780
},
{
"epoch": 0.30331715625160216,
"grad_norm": 74.375,
"learning_rate": 9.994075839569984e-07,
"loss": 131.0167,
"step": 3790
},
{
"epoch": 0.30411746537099954,
"grad_norm": 76.375,
"learning_rate": 9.994060208539826e-07,
"loss": 132.5872,
"step": 3800
},
{
"epoch": 0.3049177744903969,
"grad_norm": 75.6875,
"learning_rate": 9.994044577509668e-07,
"loss": 133.6776,
"step": 3810
},
{
"epoch": 0.3057180836097943,
"grad_norm": 75.125,
"learning_rate": 9.99402894647951e-07,
"loss": 133.0905,
"step": 3820
},
{
"epoch": 0.3065183927291916,
"grad_norm": 77.4375,
"learning_rate": 9.99401331544935e-07,
"loss": 133.0921,
"step": 3830
},
{
"epoch": 0.307318701848589,
"grad_norm": 80.9375,
"learning_rate": 9.993997684419192e-07,
"loss": 133.8795,
"step": 3840
},
{
"epoch": 0.3081190109679864,
"grad_norm": 73.75,
"learning_rate": 9.993982053389035e-07,
"loss": 133.65,
"step": 3850
},
{
"epoch": 0.30891932008738376,
"grad_norm": 76.6875,
"learning_rate": 9.993966422358875e-07,
"loss": 133.7787,
"step": 3860
},
{
"epoch": 0.30971962920678114,
"grad_norm": 76.6875,
"learning_rate": 9.993950791328717e-07,
"loss": 132.1005,
"step": 3870
},
{
"epoch": 0.31051993832617847,
"grad_norm": 70.75,
"learning_rate": 9.99393516029856e-07,
"loss": 131.8651,
"step": 3880
},
{
"epoch": 0.31132024744557585,
"grad_norm": 82.3125,
"learning_rate": 9.9939195292684e-07,
"loss": 133.0827,
"step": 3890
},
{
"epoch": 0.31212055656497323,
"grad_norm": 76.75,
"learning_rate": 9.993903898238241e-07,
"loss": 132.4445,
"step": 3900
},
{
"epoch": 0.3129208656843706,
"grad_norm": 83.1875,
"learning_rate": 9.993888267208083e-07,
"loss": 134.2083,
"step": 3910
},
{
"epoch": 0.31372117480376793,
"grad_norm": 78.9375,
"learning_rate": 9.993872636177926e-07,
"loss": 132.1415,
"step": 3920
},
{
"epoch": 0.3145214839231653,
"grad_norm": 77.625,
"learning_rate": 9.993857005147766e-07,
"loss": 133.3125,
"step": 3930
},
{
"epoch": 0.3153217930425627,
"grad_norm": 77.125,
"learning_rate": 9.993841374117608e-07,
"loss": 133.077,
"step": 3940
},
{
"epoch": 0.3161221021619601,
"grad_norm": 72.4375,
"learning_rate": 9.99382574308745e-07,
"loss": 132.9227,
"step": 3950
},
{
"epoch": 0.3169224112813574,
"grad_norm": 76.0625,
"learning_rate": 9.993810112057292e-07,
"loss": 131.9877,
"step": 3960
},
{
"epoch": 0.3177227204007548,
"grad_norm": 75.6875,
"learning_rate": 9.993794481027134e-07,
"loss": 134.1346,
"step": 3970
},
{
"epoch": 0.31852302952015216,
"grad_norm": 72.9375,
"learning_rate": 9.993778849996974e-07,
"loss": 133.5554,
"step": 3980
},
{
"epoch": 0.31932333863954954,
"grad_norm": 80.1875,
"learning_rate": 9.993763218966817e-07,
"loss": 132.0662,
"step": 3990
},
{
"epoch": 0.3201236477589469,
"grad_norm": 75.8125,
"learning_rate": 9.993747587936659e-07,
"loss": 132.8713,
"step": 4000
},
{
"epoch": 0.32092395687834424,
"grad_norm": 79.625,
"learning_rate": 9.9937319569065e-07,
"loss": 132.2075,
"step": 4010
},
{
"epoch": 0.3217242659977416,
"grad_norm": 79.25,
"learning_rate": 9.993716325876341e-07,
"loss": 133.7796,
"step": 4020
},
{
"epoch": 0.322524575117139,
"grad_norm": 73.0,
"learning_rate": 9.993700694846183e-07,
"loss": 132.6325,
"step": 4030
},
{
"epoch": 0.3233248842365364,
"grad_norm": 78.9375,
"learning_rate": 9.993685063816025e-07,
"loss": 132.7415,
"step": 4040
},
{
"epoch": 0.3241251933559337,
"grad_norm": 74.5625,
"learning_rate": 9.993669432785866e-07,
"loss": 132.9165,
"step": 4050
},
{
"epoch": 0.3249255024753311,
"grad_norm": 70.0,
"learning_rate": 9.993653801755708e-07,
"loss": 133.0353,
"step": 4060
},
{
"epoch": 0.32572581159472846,
"grad_norm": 85.625,
"learning_rate": 9.99363817072555e-07,
"loss": 131.7386,
"step": 4070
},
{
"epoch": 0.32652612071412584,
"grad_norm": 76.6875,
"learning_rate": 9.993622539695392e-07,
"loss": 132.1626,
"step": 4080
},
{
"epoch": 0.3273264298335232,
"grad_norm": 72.0625,
"learning_rate": 9.993606908665232e-07,
"loss": 132.8323,
"step": 4090
},
{
"epoch": 0.32812673895292055,
"grad_norm": 75.3125,
"learning_rate": 9.993591277635074e-07,
"loss": 132.1428,
"step": 4100
},
{
"epoch": 0.32892704807231793,
"grad_norm": 82.1875,
"learning_rate": 9.993575646604916e-07,
"loss": 133.5749,
"step": 4110
},
{
"epoch": 0.3297273571917153,
"grad_norm": 75.0,
"learning_rate": 9.993560015574759e-07,
"loss": 134.0208,
"step": 4120
},
{
"epoch": 0.3305276663111127,
"grad_norm": 78.875,
"learning_rate": 9.9935443845446e-07,
"loss": 133.0864,
"step": 4130
},
{
"epoch": 0.33132797543051,
"grad_norm": 81.3125,
"learning_rate": 9.99352875351444e-07,
"loss": 132.96,
"step": 4140
},
{
"epoch": 0.3321282845499074,
"grad_norm": 74.75,
"learning_rate": 9.993513122484283e-07,
"loss": 132.6293,
"step": 4150
},
{
"epoch": 0.3329285936693048,
"grad_norm": 75.9375,
"learning_rate": 9.993497491454125e-07,
"loss": 134.6045,
"step": 4160
},
{
"epoch": 0.33372890278870215,
"grad_norm": 78.8125,
"learning_rate": 9.993481860423965e-07,
"loss": 133.819,
"step": 4170
},
{
"epoch": 0.3345292119080995,
"grad_norm": 76.4375,
"learning_rate": 9.993466229393808e-07,
"loss": 130.7153,
"step": 4180
},
{
"epoch": 0.33532952102749686,
"grad_norm": 76.0,
"learning_rate": 9.99345059836365e-07,
"loss": 132.3732,
"step": 4190
},
{
"epoch": 0.33612983014689424,
"grad_norm": 77.1875,
"learning_rate": 9.993434967333492e-07,
"loss": 131.4807,
"step": 4200
},
{
"epoch": 0.3369301392662916,
"grad_norm": 77.3125,
"learning_rate": 9.993419336303332e-07,
"loss": 132.6123,
"step": 4210
},
{
"epoch": 0.337730448385689,
"grad_norm": 76.125,
"learning_rate": 9.993403705273174e-07,
"loss": 131.4161,
"step": 4220
},
{
"epoch": 0.3385307575050863,
"grad_norm": 74.6875,
"learning_rate": 9.993388074243016e-07,
"loss": 132.329,
"step": 4230
},
{
"epoch": 0.3393310666244837,
"grad_norm": 114.5,
"learning_rate": 9.993372443212856e-07,
"loss": 132.9315,
"step": 4240
},
{
"epoch": 0.3401313757438811,
"grad_norm": 83.125,
"learning_rate": 9.993356812182699e-07,
"loss": 133.2091,
"step": 4250
},
{
"epoch": 0.34093168486327846,
"grad_norm": 77.4375,
"learning_rate": 9.99334118115254e-07,
"loss": 133.3165,
"step": 4260
},
{
"epoch": 0.3417319939826758,
"grad_norm": 73.375,
"learning_rate": 9.993325550122383e-07,
"loss": 132.3926,
"step": 4270
},
{
"epoch": 0.34253230310207317,
"grad_norm": 81.8125,
"learning_rate": 9.993309919092225e-07,
"loss": 132.2934,
"step": 4280
},
{
"epoch": 0.34333261222147055,
"grad_norm": 75.0,
"learning_rate": 9.993294288062067e-07,
"loss": 131.8824,
"step": 4290
},
{
"epoch": 0.3441329213408679,
"grad_norm": 78.625,
"learning_rate": 9.993278657031907e-07,
"loss": 133.3519,
"step": 4300
},
{
"epoch": 0.34493323046026525,
"grad_norm": 78.8125,
"learning_rate": 9.99326302600175e-07,
"loss": 132.0518,
"step": 4310
},
{
"epoch": 0.34573353957966263,
"grad_norm": 82.25,
"learning_rate": 9.993247394971592e-07,
"loss": 133.0713,
"step": 4320
},
{
"epoch": 0.34653384869906,
"grad_norm": 71.6875,
"learning_rate": 9.993231763941432e-07,
"loss": 133.3473,
"step": 4330
},
{
"epoch": 0.3473341578184574,
"grad_norm": 69.3125,
"learning_rate": 9.993216132911274e-07,
"loss": 132.6307,
"step": 4340
},
{
"epoch": 0.34813446693785477,
"grad_norm": 71.125,
"learning_rate": 9.993200501881116e-07,
"loss": 133.2306,
"step": 4350
},
{
"epoch": 0.3489347760572521,
"grad_norm": 77.125,
"learning_rate": 9.993184870850958e-07,
"loss": 132.2587,
"step": 4360
},
{
"epoch": 0.3497350851766495,
"grad_norm": 72.4375,
"learning_rate": 9.993169239820798e-07,
"loss": 132.2048,
"step": 4370
},
{
"epoch": 0.35053539429604685,
"grad_norm": 74.6875,
"learning_rate": 9.99315360879064e-07,
"loss": 132.5754,
"step": 4380
},
{
"epoch": 0.35133570341544423,
"grad_norm": 80.75,
"learning_rate": 9.993137977760483e-07,
"loss": 133.0396,
"step": 4390
},
{
"epoch": 0.35213601253484156,
"grad_norm": 75.625,
"learning_rate": 9.993122346730323e-07,
"loss": 132.3304,
"step": 4400
},
{
"epoch": 0.35293632165423894,
"grad_norm": 74.9375,
"learning_rate": 9.993106715700165e-07,
"loss": 133.6866,
"step": 4410
},
{
"epoch": 0.3537366307736363,
"grad_norm": 76.5,
"learning_rate": 9.993091084670007e-07,
"loss": 131.9718,
"step": 4420
},
{
"epoch": 0.3545369398930337,
"grad_norm": 73.9375,
"learning_rate": 9.99307545363985e-07,
"loss": 132.9161,
"step": 4430
},
{
"epoch": 0.3553372490124311,
"grad_norm": 76.3125,
"learning_rate": 9.993059822609692e-07,
"loss": 133.0902,
"step": 4440
},
{
"epoch": 0.3561375581318284,
"grad_norm": 77.0625,
"learning_rate": 9.993044191579532e-07,
"loss": 132.3219,
"step": 4450
},
{
"epoch": 0.3569378672512258,
"grad_norm": 82.375,
"learning_rate": 9.993028560549374e-07,
"loss": 132.6951,
"step": 4460
},
{
"epoch": 0.35773817637062316,
"grad_norm": 79.375,
"learning_rate": 9.993012929519216e-07,
"loss": 133.4258,
"step": 4470
},
{
"epoch": 0.35853848549002054,
"grad_norm": 77.5625,
"learning_rate": 9.992997298489058e-07,
"loss": 132.2778,
"step": 4480
},
{
"epoch": 0.35933879460941787,
"grad_norm": 76.8125,
"learning_rate": 9.992981667458898e-07,
"loss": 132.1174,
"step": 4490
},
{
"epoch": 0.36013910372881525,
"grad_norm": 77.9375,
"learning_rate": 9.99296603642874e-07,
"loss": 131.4624,
"step": 4500
},
{
"epoch": 0.3609394128482126,
"grad_norm": 78.4375,
"learning_rate": 9.992950405398583e-07,
"loss": 132.2248,
"step": 4510
},
{
"epoch": 0.36173972196761,
"grad_norm": 77.75,
"learning_rate": 9.992934774368423e-07,
"loss": 132.4694,
"step": 4520
},
{
"epoch": 0.36254003108700733,
"grad_norm": 75.75,
"learning_rate": 9.992919143338265e-07,
"loss": 133.191,
"step": 4530
},
{
"epoch": 0.3633403402064047,
"grad_norm": 78.625,
"learning_rate": 9.992903512308107e-07,
"loss": 132.9599,
"step": 4540
},
{
"epoch": 0.3641406493258021,
"grad_norm": 72.25,
"learning_rate": 9.99288788127795e-07,
"loss": 133.3042,
"step": 4550
},
{
"epoch": 0.36494095844519947,
"grad_norm": 75.25,
"learning_rate": 9.99287225024779e-07,
"loss": 133.097,
"step": 4560
},
{
"epoch": 0.36574126756459685,
"grad_norm": 76.5,
"learning_rate": 9.992856619217632e-07,
"loss": 131.8935,
"step": 4570
},
{
"epoch": 0.3665415766839942,
"grad_norm": 77.3125,
"learning_rate": 9.992840988187474e-07,
"loss": 132.2206,
"step": 4580
},
{
"epoch": 0.36734188580339155,
"grad_norm": 78.25,
"learning_rate": 9.992825357157316e-07,
"loss": 132.6171,
"step": 4590
},
{
"epoch": 0.36814219492278893,
"grad_norm": 78.25,
"learning_rate": 9.992809726127158e-07,
"loss": 132.2898,
"step": 4600
},
{
"epoch": 0.3689425040421863,
"grad_norm": 78.5,
"learning_rate": 9.992794095096998e-07,
"loss": 133.2866,
"step": 4610
},
{
"epoch": 0.36974281316158364,
"grad_norm": 72.875,
"learning_rate": 9.99277846406684e-07,
"loss": 132.4634,
"step": 4620
},
{
"epoch": 0.370543122280981,
"grad_norm": 77.4375,
"learning_rate": 9.992762833036682e-07,
"loss": 132.7402,
"step": 4630
},
{
"epoch": 0.3713434314003784,
"grad_norm": 71.6875,
"learning_rate": 9.992747202006525e-07,
"loss": 132.4131,
"step": 4640
},
{
"epoch": 0.3721437405197758,
"grad_norm": 80.5,
"learning_rate": 9.992731570976365e-07,
"loss": 132.4793,
"step": 4650
},
{
"epoch": 0.3729440496391731,
"grad_norm": 76.4375,
"learning_rate": 9.992715939946207e-07,
"loss": 133.3043,
"step": 4660
},
{
"epoch": 0.3737443587585705,
"grad_norm": 76.1875,
"learning_rate": 9.99270030891605e-07,
"loss": 132.1431,
"step": 4670
},
{
"epoch": 0.37454466787796786,
"grad_norm": 81.4375,
"learning_rate": 9.99268467788589e-07,
"loss": 130.6737,
"step": 4680
},
{
"epoch": 0.37534497699736524,
"grad_norm": 77.1875,
"learning_rate": 9.992669046855731e-07,
"loss": 134.3274,
"step": 4690
},
{
"epoch": 0.3761452861167626,
"grad_norm": 74.8125,
"learning_rate": 9.992653415825574e-07,
"loss": 132.4399,
"step": 4700
},
{
"epoch": 0.37694559523615995,
"grad_norm": 72.6875,
"learning_rate": 9.992637784795414e-07,
"loss": 133.9349,
"step": 4710
},
{
"epoch": 0.3777459043555573,
"grad_norm": 76.3125,
"learning_rate": 9.992622153765256e-07,
"loss": 134.1812,
"step": 4720
},
{
"epoch": 0.3785462134749547,
"grad_norm": 74.5,
"learning_rate": 9.992606522735098e-07,
"loss": 131.5583,
"step": 4730
},
{
"epoch": 0.3793465225943521,
"grad_norm": 78.625,
"learning_rate": 9.99259089170494e-07,
"loss": 132.3389,
"step": 4740
},
{
"epoch": 0.3801468317137494,
"grad_norm": 77.0,
"learning_rate": 9.992575260674782e-07,
"loss": 133.7076,
"step": 4750
},
{
"epoch": 0.3809471408331468,
"grad_norm": 79.6875,
"learning_rate": 9.992559629644625e-07,
"loss": 133.8963,
"step": 4760
},
{
"epoch": 0.38174744995254417,
"grad_norm": 75.875,
"learning_rate": 9.992543998614465e-07,
"loss": 132.6479,
"step": 4770
},
{
"epoch": 0.38254775907194155,
"grad_norm": 78.0625,
"learning_rate": 9.992528367584307e-07,
"loss": 133.2928,
"step": 4780
},
{
"epoch": 0.38334806819133893,
"grad_norm": 77.6875,
"learning_rate": 9.992512736554149e-07,
"loss": 130.3455,
"step": 4790
},
{
"epoch": 0.38414837731073626,
"grad_norm": 78.5,
"learning_rate": 9.992497105523991e-07,
"loss": 131.2587,
"step": 4800
},
{
"epoch": 0.38494868643013364,
"grad_norm": 73.5,
"learning_rate": 9.992481474493831e-07,
"loss": 131.3443,
"step": 4810
},
{
"epoch": 0.385748995549531,
"grad_norm": 77.8125,
"learning_rate": 9.992465843463673e-07,
"loss": 133.4118,
"step": 4820
},
{
"epoch": 0.3865493046689284,
"grad_norm": 79.1875,
"learning_rate": 9.992450212433516e-07,
"loss": 131.2462,
"step": 4830
},
{
"epoch": 0.3873496137883257,
"grad_norm": 84.5,
"learning_rate": 9.992434581403356e-07,
"loss": 131.1748,
"step": 4840
},
{
"epoch": 0.3881499229077231,
"grad_norm": 75.125,
"learning_rate": 9.992418950373198e-07,
"loss": 132.5554,
"step": 4850
},
{
"epoch": 0.3889502320271205,
"grad_norm": 72.875,
"learning_rate": 9.99240331934304e-07,
"loss": 132.8411,
"step": 4860
},
{
"epoch": 0.38975054114651786,
"grad_norm": 77.8125,
"learning_rate": 9.99238768831288e-07,
"loss": 131.7171,
"step": 4870
},
{
"epoch": 0.3905508502659152,
"grad_norm": 74.625,
"learning_rate": 9.992372057282722e-07,
"loss": 132.1889,
"step": 4880
},
{
"epoch": 0.39135115938531256,
"grad_norm": 80.4375,
"learning_rate": 9.992356426252564e-07,
"loss": 131.4297,
"step": 4890
},
{
"epoch": 0.39215146850470994,
"grad_norm": 81.8125,
"learning_rate": 9.992340795222407e-07,
"loss": 132.5631,
"step": 4900
},
{
"epoch": 0.3929517776241073,
"grad_norm": 73.5,
"learning_rate": 9.992325164192249e-07,
"loss": 133.1258,
"step": 4910
},
{
"epoch": 0.3937520867435047,
"grad_norm": 74.0,
"learning_rate": 9.99230953316209e-07,
"loss": 132.7684,
"step": 4920
},
{
"epoch": 0.39455239586290203,
"grad_norm": 76.0,
"learning_rate": 9.99229390213193e-07,
"loss": 133.6151,
"step": 4930
},
{
"epoch": 0.3953527049822994,
"grad_norm": 85.1875,
"learning_rate": 9.992278271101773e-07,
"loss": 132.0966,
"step": 4940
},
{
"epoch": 0.3961530141016968,
"grad_norm": 76.75,
"learning_rate": 9.992262640071615e-07,
"loss": 133.1567,
"step": 4950
},
{
"epoch": 0.39695332322109417,
"grad_norm": 74.375,
"learning_rate": 9.992247009041458e-07,
"loss": 132.1937,
"step": 4960
},
{
"epoch": 0.3977536323404915,
"grad_norm": 79.75,
"learning_rate": 9.992231378011298e-07,
"loss": 131.0156,
"step": 4970
},
{
"epoch": 0.39855394145988887,
"grad_norm": 79.3125,
"learning_rate": 9.99221574698114e-07,
"loss": 132.8796,
"step": 4980
},
{
"epoch": 0.39935425057928625,
"grad_norm": 73.0,
"learning_rate": 9.992200115950982e-07,
"loss": 132.0237,
"step": 4990
},
{
"epoch": 0.40015455969868363,
"grad_norm": 75.625,
"learning_rate": 9.992184484920822e-07,
"loss": 133.5153,
"step": 5000
},
{
"epoch": 0.40015455969868363,
"eval_loss": 2.0742883682250977,
"eval_runtime": 418.6562,
"eval_samples_per_second": 1567.295,
"eval_steps_per_second": 48.978,
"step": 5000
},
{
"epoch": 0.40095486881808096,
"grad_norm": 72.4375,
"learning_rate": 9.992168853890664e-07,
"loss": 133.3489,
"step": 5010
},
{
"epoch": 0.40175517793747834,
"grad_norm": 74.5625,
"learning_rate": 9.992153222860506e-07,
"loss": 132.38,
"step": 5020
},
{
"epoch": 0.4025554870568757,
"grad_norm": 81.25,
"learning_rate": 9.992137591830347e-07,
"loss": 133.7667,
"step": 5030
},
{
"epoch": 0.4033557961762731,
"grad_norm": 74.5625,
"learning_rate": 9.992121960800189e-07,
"loss": 133.8293,
"step": 5040
},
{
"epoch": 0.4041561052956705,
"grad_norm": 71.25,
"learning_rate": 9.99210632977003e-07,
"loss": 133.3581,
"step": 5050
},
{
"epoch": 0.4049564144150678,
"grad_norm": 79.5625,
"learning_rate": 9.992090698739873e-07,
"loss": 132.9888,
"step": 5060
},
{
"epoch": 0.4057567235344652,
"grad_norm": 73.9375,
"learning_rate": 9.992075067709715e-07,
"loss": 132.4104,
"step": 5070
},
{
"epoch": 0.40655703265386256,
"grad_norm": 72.5,
"learning_rate": 9.992059436679555e-07,
"loss": 132.3801,
"step": 5080
},
{
"epoch": 0.40735734177325994,
"grad_norm": 76.3125,
"learning_rate": 9.992043805649397e-07,
"loss": 132.4045,
"step": 5090
},
{
"epoch": 0.40815765089265726,
"grad_norm": 76.5625,
"learning_rate": 9.99202817461924e-07,
"loss": 132.4824,
"step": 5100
},
{
"epoch": 0.40895796001205464,
"grad_norm": 74.125,
"learning_rate": 9.992012543589082e-07,
"loss": 132.0051,
"step": 5110
},
{
"epoch": 0.409758269131452,
"grad_norm": 72.9375,
"learning_rate": 9.991996912558924e-07,
"loss": 133.8627,
"step": 5120
},
{
"epoch": 0.4105585782508494,
"grad_norm": 78.9375,
"learning_rate": 9.991981281528764e-07,
"loss": 132.2818,
"step": 5130
},
{
"epoch": 0.4113588873702468,
"grad_norm": 75.375,
"learning_rate": 9.991965650498606e-07,
"loss": 131.2578,
"step": 5140
},
{
"epoch": 0.4121591964896441,
"grad_norm": 77.125,
"learning_rate": 9.991950019468448e-07,
"loss": 133.7901,
"step": 5150
},
{
"epoch": 0.4129595056090415,
"grad_norm": 80.9375,
"learning_rate": 9.991934388438289e-07,
"loss": 132.6137,
"step": 5160
},
{
"epoch": 0.41375981472843887,
"grad_norm": 81.875,
"learning_rate": 9.99191875740813e-07,
"loss": 132.9288,
"step": 5170
},
{
"epoch": 0.41456012384783625,
"grad_norm": 78.6875,
"learning_rate": 9.991903126377973e-07,
"loss": 132.9595,
"step": 5180
},
{
"epoch": 0.4153604329672336,
"grad_norm": 76.9375,
"learning_rate": 9.991887495347813e-07,
"loss": 132.3299,
"step": 5190
},
{
"epoch": 0.41616074208663095,
"grad_norm": 79.0,
"learning_rate": 9.991871864317655e-07,
"loss": 132.1807,
"step": 5200
},
{
"epoch": 0.41696105120602833,
"grad_norm": 75.4375,
"learning_rate": 9.991856233287497e-07,
"loss": 133.9722,
"step": 5210
},
{
"epoch": 0.4177613603254257,
"grad_norm": 73.4375,
"learning_rate": 9.99184060225734e-07,
"loss": 132.654,
"step": 5220
},
{
"epoch": 0.41856166944482304,
"grad_norm": 74.1875,
"learning_rate": 9.991824971227182e-07,
"loss": 133.0139,
"step": 5230
},
{
"epoch": 0.4193619785642204,
"grad_norm": 79.25,
"learning_rate": 9.991809340197022e-07,
"loss": 131.9049,
"step": 5240
},
{
"epoch": 0.4201622876836178,
"grad_norm": 75.8125,
"learning_rate": 9.991793709166864e-07,
"loss": 131.9633,
"step": 5250
},
{
"epoch": 0.4209625968030152,
"grad_norm": 74.25,
"learning_rate": 9.991778078136706e-07,
"loss": 132.2361,
"step": 5260
},
{
"epoch": 0.42176290592241256,
"grad_norm": 72.8125,
"learning_rate": 9.991762447106548e-07,
"loss": 131.267,
"step": 5270
},
{
"epoch": 0.4225632150418099,
"grad_norm": 77.5625,
"learning_rate": 9.991746816076388e-07,
"loss": 132.4257,
"step": 5280
},
{
"epoch": 0.42336352416120726,
"grad_norm": 79.3125,
"learning_rate": 9.99173118504623e-07,
"loss": 132.785,
"step": 5290
},
{
"epoch": 0.42416383328060464,
"grad_norm": 76.9375,
"learning_rate": 9.991715554016073e-07,
"loss": 131.3379,
"step": 5300
},
{
"epoch": 0.424964142400002,
"grad_norm": 75.25,
"learning_rate": 9.991699922985915e-07,
"loss": 133.1422,
"step": 5310
},
{
"epoch": 0.42576445151939935,
"grad_norm": 72.5625,
"learning_rate": 9.991684291955755e-07,
"loss": 132.1037,
"step": 5320
},
{
"epoch": 0.4265647606387967,
"grad_norm": 79.0,
"learning_rate": 9.991668660925597e-07,
"loss": 132.1573,
"step": 5330
},
{
"epoch": 0.4273650697581941,
"grad_norm": 75.0625,
"learning_rate": 9.99165302989544e-07,
"loss": 132.0645,
"step": 5340
},
{
"epoch": 0.4281653788775915,
"grad_norm": 72.3125,
"learning_rate": 9.99163739886528e-07,
"loss": 132.8727,
"step": 5350
},
{
"epoch": 0.4289656879969888,
"grad_norm": 77.4375,
"learning_rate": 9.991621767835122e-07,
"loss": 133.5671,
"step": 5360
},
{
"epoch": 0.4297659971163862,
"grad_norm": 74.3125,
"learning_rate": 9.991606136804964e-07,
"loss": 132.3078,
"step": 5370
},
{
"epoch": 0.43056630623578357,
"grad_norm": 81.375,
"learning_rate": 9.991590505774806e-07,
"loss": 133.0639,
"step": 5380
},
{
"epoch": 0.43136661535518095,
"grad_norm": 74.0625,
"learning_rate": 9.991574874744648e-07,
"loss": 131.2669,
"step": 5390
},
{
"epoch": 0.43216692447457833,
"grad_norm": 79.375,
"learning_rate": 9.991559243714488e-07,
"loss": 131.9368,
"step": 5400
},
{
"epoch": 0.43296723359397565,
"grad_norm": 77.9375,
"learning_rate": 9.99154361268433e-07,
"loss": 132.4038,
"step": 5410
},
{
"epoch": 0.43376754271337303,
"grad_norm": 83.375,
"learning_rate": 9.991527981654173e-07,
"loss": 132.8988,
"step": 5420
},
{
"epoch": 0.4345678518327704,
"grad_norm": 73.6875,
"learning_rate": 9.991512350624015e-07,
"loss": 133.3637,
"step": 5430
},
{
"epoch": 0.4353681609521678,
"grad_norm": 74.5625,
"learning_rate": 9.991496719593855e-07,
"loss": 133.3608,
"step": 5440
},
{
"epoch": 0.4361684700715651,
"grad_norm": 71.0,
"learning_rate": 9.991481088563697e-07,
"loss": 131.265,
"step": 5450
},
{
"epoch": 0.4369687791909625,
"grad_norm": 73.4375,
"learning_rate": 9.99146545753354e-07,
"loss": 132.0777,
"step": 5460
},
{
"epoch": 0.4377690883103599,
"grad_norm": 75.125,
"learning_rate": 9.99144982650338e-07,
"loss": 132.4698,
"step": 5470
},
{
"epoch": 0.43856939742975726,
"grad_norm": 73.875,
"learning_rate": 9.991434195473221e-07,
"loss": 131.6167,
"step": 5480
},
{
"epoch": 0.43936970654915464,
"grad_norm": 76.9375,
"learning_rate": 9.991418564443064e-07,
"loss": 131.9013,
"step": 5490
},
{
"epoch": 0.44017001566855196,
"grad_norm": 73.4375,
"learning_rate": 9.991402933412906e-07,
"loss": 131.2487,
"step": 5500
},
{
"epoch": 0.44097032478794934,
"grad_norm": 83.4375,
"learning_rate": 9.991387302382746e-07,
"loss": 133.703,
"step": 5510
},
{
"epoch": 0.4417706339073467,
"grad_norm": 78.375,
"learning_rate": 9.991371671352588e-07,
"loss": 133.2442,
"step": 5520
},
{
"epoch": 0.4425709430267441,
"grad_norm": 71.375,
"learning_rate": 9.99135604032243e-07,
"loss": 131.6987,
"step": 5530
},
{
"epoch": 0.4433712521461414,
"grad_norm": 76.1875,
"learning_rate": 9.991340409292272e-07,
"loss": 131.8815,
"step": 5540
},
{
"epoch": 0.4441715612655388,
"grad_norm": 80.8125,
"learning_rate": 9.991324778262112e-07,
"loss": 132.4904,
"step": 5550
},
{
"epoch": 0.4449718703849362,
"grad_norm": 75.1875,
"learning_rate": 9.991309147231955e-07,
"loss": 130.7195,
"step": 5560
},
{
"epoch": 0.44577217950433357,
"grad_norm": 74.5625,
"learning_rate": 9.991293516201797e-07,
"loss": 132.3294,
"step": 5570
},
{
"epoch": 0.4465724886237309,
"grad_norm": 77.5,
"learning_rate": 9.99127788517164e-07,
"loss": 132.4166,
"step": 5580
},
{
"epoch": 0.44737279774312827,
"grad_norm": 74.375,
"learning_rate": 9.991262254141481e-07,
"loss": 132.624,
"step": 5590
},
{
"epoch": 0.44817310686252565,
"grad_norm": 82.3125,
"learning_rate": 9.991246623111321e-07,
"loss": 131.5993,
"step": 5600
},
{
"epoch": 0.44897341598192303,
"grad_norm": 70.5,
"learning_rate": 9.991230992081163e-07,
"loss": 130.5851,
"step": 5610
},
{
"epoch": 0.4497737251013204,
"grad_norm": 78.9375,
"learning_rate": 9.991215361051006e-07,
"loss": 131.7671,
"step": 5620
},
{
"epoch": 0.45057403422071773,
"grad_norm": 78.9375,
"learning_rate": 9.991199730020846e-07,
"loss": 132.6647,
"step": 5630
},
{
"epoch": 0.4513743433401151,
"grad_norm": 77.5,
"learning_rate": 9.991184098990688e-07,
"loss": 131.5212,
"step": 5640
},
{
"epoch": 0.4521746524595125,
"grad_norm": 80.25,
"learning_rate": 9.99116846796053e-07,
"loss": 131.0642,
"step": 5650
},
{
"epoch": 0.4529749615789099,
"grad_norm": 80.875,
"learning_rate": 9.991152836930372e-07,
"loss": 131.6869,
"step": 5660
},
{
"epoch": 0.4537752706983072,
"grad_norm": 73.125,
"learning_rate": 9.991137205900212e-07,
"loss": 132.0529,
"step": 5670
},
{
"epoch": 0.4545755798177046,
"grad_norm": 77.8125,
"learning_rate": 9.991121574870055e-07,
"loss": 133.4353,
"step": 5680
},
{
"epoch": 0.45537588893710196,
"grad_norm": 78.9375,
"learning_rate": 9.991105943839897e-07,
"loss": 131.1083,
"step": 5690
},
{
"epoch": 0.45617619805649934,
"grad_norm": 81.6875,
"learning_rate": 9.991090312809739e-07,
"loss": 132.5374,
"step": 5700
},
{
"epoch": 0.45697650717589666,
"grad_norm": 75.8125,
"learning_rate": 9.99107468177958e-07,
"loss": 131.5541,
"step": 5710
},
{
"epoch": 0.45777681629529404,
"grad_norm": 77.875,
"learning_rate": 9.991059050749421e-07,
"loss": 133.4234,
"step": 5720
},
{
"epoch": 0.4585771254146914,
"grad_norm": 82.6875,
"learning_rate": 9.991043419719263e-07,
"loss": 132.8354,
"step": 5730
},
{
"epoch": 0.4593774345340888,
"grad_norm": 69.4375,
"learning_rate": 9.991027788689105e-07,
"loss": 130.9775,
"step": 5740
},
{
"epoch": 0.4601777436534862,
"grad_norm": 73.875,
"learning_rate": 9.991012157658948e-07,
"loss": 133.6315,
"step": 5750
},
{
"epoch": 0.4609780527728835,
"grad_norm": 77.125,
"learning_rate": 9.990996526628788e-07,
"loss": 133.1302,
"step": 5760
},
{
"epoch": 0.4617783618922809,
"grad_norm": 75.875,
"learning_rate": 9.99098089559863e-07,
"loss": 132.4343,
"step": 5770
},
{
"epoch": 0.46257867101167827,
"grad_norm": 74.8125,
"learning_rate": 9.990965264568472e-07,
"loss": 132.1773,
"step": 5780
},
{
"epoch": 0.46337898013107565,
"grad_norm": 74.5625,
"learning_rate": 9.990949633538312e-07,
"loss": 131.587,
"step": 5790
},
{
"epoch": 0.46417928925047297,
"grad_norm": 73.6875,
"learning_rate": 9.990934002508154e-07,
"loss": 132.3335,
"step": 5800
},
{
"epoch": 0.46497959836987035,
"grad_norm": 78.375,
"learning_rate": 9.990918371477997e-07,
"loss": 131.2027,
"step": 5810
},
{
"epoch": 0.46577990748926773,
"grad_norm": 74.875,
"learning_rate": 9.990902740447837e-07,
"loss": 132.1983,
"step": 5820
},
{
"epoch": 0.4665802166086651,
"grad_norm": 76.8125,
"learning_rate": 9.990887109417679e-07,
"loss": 130.0307,
"step": 5830
},
{
"epoch": 0.4673805257280625,
"grad_norm": 74.375,
"learning_rate": 9.99087147838752e-07,
"loss": 132.1818,
"step": 5840
},
{
"epoch": 0.4681808348474598,
"grad_norm": 80.25,
"learning_rate": 9.990855847357363e-07,
"loss": 131.9706,
"step": 5850
},
{
"epoch": 0.4689811439668572,
"grad_norm": 78.3125,
"learning_rate": 9.990840216327205e-07,
"loss": 132.3331,
"step": 5860
},
{
"epoch": 0.4697814530862546,
"grad_norm": 75.25,
"learning_rate": 9.990824585297045e-07,
"loss": 132.2483,
"step": 5870
},
{
"epoch": 0.47058176220565195,
"grad_norm": 73.125,
"learning_rate": 9.990808954266888e-07,
"loss": 132.6989,
"step": 5880
},
{
"epoch": 0.4713820713250493,
"grad_norm": 78.625,
"learning_rate": 9.99079332323673e-07,
"loss": 133.315,
"step": 5890
},
{
"epoch": 0.47218238044444666,
"grad_norm": 73.8125,
"learning_rate": 9.990777692206572e-07,
"loss": 131.2918,
"step": 5900
},
{
"epoch": 0.47298268956384404,
"grad_norm": 82.0625,
"learning_rate": 9.990762061176414e-07,
"loss": 132.8047,
"step": 5910
},
{
"epoch": 0.4737829986832414,
"grad_norm": 83.4375,
"learning_rate": 9.990746430146254e-07,
"loss": 133.7549,
"step": 5920
},
{
"epoch": 0.47458330780263874,
"grad_norm": 71.9375,
"learning_rate": 9.990730799116096e-07,
"loss": 132.3743,
"step": 5930
},
{
"epoch": 0.4753836169220361,
"grad_norm": 82.25,
"learning_rate": 9.990715168085939e-07,
"loss": 130.8257,
"step": 5940
},
{
"epoch": 0.4761839260414335,
"grad_norm": 75.8125,
"learning_rate": 9.990699537055779e-07,
"loss": 130.2371,
"step": 5950
},
{
"epoch": 0.4769842351608309,
"grad_norm": 78.1875,
"learning_rate": 9.99068390602562e-07,
"loss": 131.3828,
"step": 5960
},
{
"epoch": 0.47778454428022826,
"grad_norm": 75.9375,
"learning_rate": 9.990668274995463e-07,
"loss": 132.8423,
"step": 5970
},
{
"epoch": 0.4785848533996256,
"grad_norm": 77.3125,
"learning_rate": 9.990652643965303e-07,
"loss": 133.5517,
"step": 5980
},
{
"epoch": 0.47938516251902297,
"grad_norm": 72.5,
"learning_rate": 9.990637012935145e-07,
"loss": 132.8932,
"step": 5990
},
{
"epoch": 0.48018547163842035,
"grad_norm": 74.25,
"learning_rate": 9.990621381904987e-07,
"loss": 133.124,
"step": 6000
},
{
"epoch": 0.4809857807578177,
"grad_norm": 77.0625,
"learning_rate": 9.99060575087483e-07,
"loss": 132.1078,
"step": 6010
},
{
"epoch": 0.48178608987721505,
"grad_norm": 76.0625,
"learning_rate": 9.99059011984467e-07,
"loss": 132.8352,
"step": 6020
},
{
"epoch": 0.48258639899661243,
"grad_norm": 79.4375,
"learning_rate": 9.990574488814512e-07,
"loss": 132.3257,
"step": 6030
},
{
"epoch": 0.4833867081160098,
"grad_norm": 74.375,
"learning_rate": 9.990558857784354e-07,
"loss": 132.8362,
"step": 6040
},
{
"epoch": 0.4841870172354072,
"grad_norm": 74.375,
"learning_rate": 9.990543226754196e-07,
"loss": 131.3189,
"step": 6050
},
{
"epoch": 0.48498732635480457,
"grad_norm": 72.0625,
"learning_rate": 9.990527595724038e-07,
"loss": 131.8034,
"step": 6060
},
{
"epoch": 0.4857876354742019,
"grad_norm": 80.5625,
"learning_rate": 9.99051196469388e-07,
"loss": 131.7185,
"step": 6070
},
{
"epoch": 0.4865879445935993,
"grad_norm": 77.5625,
"learning_rate": 9.99049633366372e-07,
"loss": 133.0989,
"step": 6080
},
{
"epoch": 0.48738825371299666,
"grad_norm": 74.4375,
"learning_rate": 9.990480702633563e-07,
"loss": 131.1641,
"step": 6090
},
{
"epoch": 0.48818856283239404,
"grad_norm": 77.625,
"learning_rate": 9.990465071603405e-07,
"loss": 131.6567,
"step": 6100
},
{
"epoch": 0.48898887195179136,
"grad_norm": 80.25,
"learning_rate": 9.990449440573245e-07,
"loss": 132.3284,
"step": 6110
},
{
"epoch": 0.48978918107118874,
"grad_norm": 72.9375,
"learning_rate": 9.990433809543087e-07,
"loss": 131.856,
"step": 6120
},
{
"epoch": 0.4905894901905861,
"grad_norm": 72.6875,
"learning_rate": 9.99041817851293e-07,
"loss": 132.8506,
"step": 6130
},
{
"epoch": 0.4913897993099835,
"grad_norm": 76.0625,
"learning_rate": 9.99040254748277e-07,
"loss": 132.9671,
"step": 6140
},
{
"epoch": 0.4921901084293808,
"grad_norm": 81.875,
"learning_rate": 9.990386916452612e-07,
"loss": 133.4054,
"step": 6150
},
{
"epoch": 0.4929904175487782,
"grad_norm": 73.1875,
"learning_rate": 9.990371285422454e-07,
"loss": 132.2609,
"step": 6160
},
{
"epoch": 0.4937907266681756,
"grad_norm": 73.9375,
"learning_rate": 9.990355654392296e-07,
"loss": 133.1529,
"step": 6170
},
{
"epoch": 0.49459103578757296,
"grad_norm": 74.125,
"learning_rate": 9.990340023362136e-07,
"loss": 133.8595,
"step": 6180
},
{
"epoch": 0.49539134490697034,
"grad_norm": 72.0625,
"learning_rate": 9.990324392331978e-07,
"loss": 132.0925,
"step": 6190
},
{
"epoch": 0.49619165402636767,
"grad_norm": 72.5,
"learning_rate": 9.99030876130182e-07,
"loss": 133.2302,
"step": 6200
},
{
"epoch": 0.49699196314576505,
"grad_norm": 71.9375,
"learning_rate": 9.990293130271663e-07,
"loss": 132.7785,
"step": 6210
},
{
"epoch": 0.4977922722651624,
"grad_norm": 78.0625,
"learning_rate": 9.990277499241505e-07,
"loss": 133.0601,
"step": 6220
},
{
"epoch": 0.4985925813845598,
"grad_norm": 74.6875,
"learning_rate": 9.990261868211345e-07,
"loss": 132.1573,
"step": 6230
},
{
"epoch": 0.49939289050395713,
"grad_norm": 80.3125,
"learning_rate": 9.990246237181187e-07,
"loss": 132.7871,
"step": 6240
},
{
"epoch": 0.5001931996233545,
"grad_norm": 74.875,
"learning_rate": 9.99023060615103e-07,
"loss": 132.2812,
"step": 6250
},
{
"epoch": 0.5009935087427518,
"grad_norm": 80.875,
"learning_rate": 9.990214975120871e-07,
"loss": 130.9305,
"step": 6260
},
{
"epoch": 0.5017938178621493,
"grad_norm": 72.4375,
"learning_rate": 9.990199344090712e-07,
"loss": 133.3334,
"step": 6270
},
{
"epoch": 0.5025941269815466,
"grad_norm": 71.875,
"learning_rate": 9.990183713060554e-07,
"loss": 132.1767,
"step": 6280
},
{
"epoch": 0.503394436100944,
"grad_norm": 79.375,
"learning_rate": 9.990168082030396e-07,
"loss": 133.0485,
"step": 6290
},
{
"epoch": 0.5041947452203414,
"grad_norm": 72.8125,
"learning_rate": 9.990152451000236e-07,
"loss": 132.0657,
"step": 6300
},
{
"epoch": 0.5049950543397387,
"grad_norm": 82.5,
"learning_rate": 9.990136819970078e-07,
"loss": 130.4747,
"step": 6310
},
{
"epoch": 0.5057953634591361,
"grad_norm": 77.5,
"learning_rate": 9.99012118893992e-07,
"loss": 133.164,
"step": 6320
},
{
"epoch": 0.5065956725785334,
"grad_norm": 72.5625,
"learning_rate": 9.990105557909763e-07,
"loss": 131.2053,
"step": 6330
},
{
"epoch": 0.5073959816979309,
"grad_norm": 74.5,
"learning_rate": 9.990089926879603e-07,
"loss": 132.6195,
"step": 6340
},
{
"epoch": 0.5081962908173282,
"grad_norm": 78.5625,
"learning_rate": 9.990074295849445e-07,
"loss": 131.7617,
"step": 6350
},
{
"epoch": 0.5089965999367255,
"grad_norm": 73.5,
"learning_rate": 9.990058664819287e-07,
"loss": 131.7562,
"step": 6360
},
{
"epoch": 0.509796909056123,
"grad_norm": 79.4375,
"learning_rate": 9.99004303378913e-07,
"loss": 133.1088,
"step": 6370
},
{
"epoch": 0.5105972181755203,
"grad_norm": 79.3125,
"learning_rate": 9.990027402758971e-07,
"loss": 133.757,
"step": 6380
},
{
"epoch": 0.5113975272949176,
"grad_norm": 74.0625,
"learning_rate": 9.990011771728811e-07,
"loss": 133.9952,
"step": 6390
},
{
"epoch": 0.512197836414315,
"grad_norm": 72.5625,
"learning_rate": 9.989996140698654e-07,
"loss": 131.275,
"step": 6400
},
{
"epoch": 0.5129981455337124,
"grad_norm": 78.4375,
"learning_rate": 9.989980509668496e-07,
"loss": 131.3286,
"step": 6410
},
{
"epoch": 0.5137984546531098,
"grad_norm": 73.6875,
"learning_rate": 9.989964878638338e-07,
"loss": 131.1604,
"step": 6420
},
{
"epoch": 0.5145987637725071,
"grad_norm": 77.0,
"learning_rate": 9.989949247608178e-07,
"loss": 131.0352,
"step": 6430
},
{
"epoch": 0.5153990728919045,
"grad_norm": 77.8125,
"learning_rate": 9.98993361657802e-07,
"loss": 132.0469,
"step": 6440
},
{
"epoch": 0.5161993820113019,
"grad_norm": 76.1875,
"learning_rate": 9.989917985547862e-07,
"loss": 133.7251,
"step": 6450
},
{
"epoch": 0.5169996911306992,
"grad_norm": 70.6875,
"learning_rate": 9.989902354517702e-07,
"loss": 131.82,
"step": 6460
},
{
"epoch": 0.5178000002500966,
"grad_norm": 76.8125,
"learning_rate": 9.989886723487545e-07,
"loss": 131.712,
"step": 6470
},
{
"epoch": 0.518600309369494,
"grad_norm": 77.9375,
"learning_rate": 9.989871092457387e-07,
"loss": 132.1365,
"step": 6480
},
{
"epoch": 0.5194006184888913,
"grad_norm": 74.0,
"learning_rate": 9.989855461427227e-07,
"loss": 131.9362,
"step": 6490
},
{
"epoch": 0.5202009276082887,
"grad_norm": 77.3125,
"learning_rate": 9.98983983039707e-07,
"loss": 133.708,
"step": 6500
},
{
"epoch": 0.5210012367276861,
"grad_norm": 77.75,
"learning_rate": 9.989824199366911e-07,
"loss": 132.9561,
"step": 6510
},
{
"epoch": 0.5218015458470834,
"grad_norm": 76.8125,
"learning_rate": 9.989808568336753e-07,
"loss": 133.186,
"step": 6520
},
{
"epoch": 0.5226018549664808,
"grad_norm": 79.125,
"learning_rate": 9.989792937306596e-07,
"loss": 131.5064,
"step": 6530
},
{
"epoch": 0.5234021640858781,
"grad_norm": 74.5,
"learning_rate": 9.989777306276438e-07,
"loss": 131.7443,
"step": 6540
},
{
"epoch": 0.5242024732052756,
"grad_norm": 75.9375,
"learning_rate": 9.989761675246278e-07,
"loss": 132.0182,
"step": 6550
},
{
"epoch": 0.5250027823246729,
"grad_norm": 74.25,
"learning_rate": 9.98974604421612e-07,
"loss": 132.1322,
"step": 6560
},
{
"epoch": 0.5258030914440702,
"grad_norm": 77.25,
"learning_rate": 9.989730413185962e-07,
"loss": 131.4318,
"step": 6570
},
{
"epoch": 0.5266034005634677,
"grad_norm": 79.375,
"learning_rate": 9.989714782155802e-07,
"loss": 131.6486,
"step": 6580
},
{
"epoch": 0.527403709682865,
"grad_norm": 72.875,
"learning_rate": 9.989699151125644e-07,
"loss": 132.6251,
"step": 6590
},
{
"epoch": 0.5282040188022624,
"grad_norm": 77.75,
"learning_rate": 9.989683520095487e-07,
"loss": 130.9398,
"step": 6600
},
{
"epoch": 0.5290043279216597,
"grad_norm": 78.0625,
"learning_rate": 9.989667889065329e-07,
"loss": 132.2289,
"step": 6610
},
{
"epoch": 0.5298046370410571,
"grad_norm": 80.75,
"learning_rate": 9.989652258035169e-07,
"loss": 133.6902,
"step": 6620
},
{
"epoch": 0.5306049461604545,
"grad_norm": 77.0,
"learning_rate": 9.989636627005011e-07,
"loss": 131.2565,
"step": 6630
},
{
"epoch": 0.5314052552798518,
"grad_norm": 72.3125,
"learning_rate": 9.989620995974853e-07,
"loss": 133.0545,
"step": 6640
},
{
"epoch": 0.5322055643992493,
"grad_norm": 76.125,
"learning_rate": 9.989605364944693e-07,
"loss": 131.2922,
"step": 6650
},
{
"epoch": 0.5330058735186466,
"grad_norm": 70.8125,
"learning_rate": 9.989589733914536e-07,
"loss": 133.2325,
"step": 6660
},
{
"epoch": 0.5338061826380439,
"grad_norm": 83.5625,
"learning_rate": 9.989574102884378e-07,
"loss": 133.1627,
"step": 6670
},
{
"epoch": 0.5346064917574413,
"grad_norm": 73.4375,
"learning_rate": 9.98955847185422e-07,
"loss": 132.0812,
"step": 6680
},
{
"epoch": 0.5354068008768387,
"grad_norm": 84.8125,
"learning_rate": 9.989542840824062e-07,
"loss": 131.5462,
"step": 6690
},
{
"epoch": 0.536207109996236,
"grad_norm": 75.0,
"learning_rate": 9.989527209793904e-07,
"loss": 132.3298,
"step": 6700
},
{
"epoch": 0.5370074191156334,
"grad_norm": 82.0625,
"learning_rate": 9.989511578763744e-07,
"loss": 133.5979,
"step": 6710
},
{
"epoch": 0.5378077282350308,
"grad_norm": 73.8125,
"learning_rate": 9.989495947733586e-07,
"loss": 130.3924,
"step": 6720
},
{
"epoch": 0.5386080373544282,
"grad_norm": 85.1875,
"learning_rate": 9.989480316703429e-07,
"loss": 130.9538,
"step": 6730
},
{
"epoch": 0.5394083464738255,
"grad_norm": 77.4375,
"learning_rate": 9.989464685673269e-07,
"loss": 132.9888,
"step": 6740
},
{
"epoch": 0.5402086555932228,
"grad_norm": 78.75,
"learning_rate": 9.98944905464311e-07,
"loss": 132.472,
"step": 6750
},
{
"epoch": 0.5410089647126203,
"grad_norm": 73.6875,
"learning_rate": 9.989433423612953e-07,
"loss": 132.7828,
"step": 6760
},
{
"epoch": 0.5418092738320176,
"grad_norm": 74.9375,
"learning_rate": 9.989417792582795e-07,
"loss": 131.1772,
"step": 6770
},
{
"epoch": 0.542609582951415,
"grad_norm": 73.5625,
"learning_rate": 9.989402161552635e-07,
"loss": 132.5478,
"step": 6780
},
{
"epoch": 0.5434098920708124,
"grad_norm": 79.5,
"learning_rate": 9.989386530522478e-07,
"loss": 130.6843,
"step": 6790
},
{
"epoch": 0.5442102011902097,
"grad_norm": 73.5625,
"learning_rate": 9.98937089949232e-07,
"loss": 131.9659,
"step": 6800
},
{
"epoch": 0.5450105103096071,
"grad_norm": 73.3125,
"learning_rate": 9.98935526846216e-07,
"loss": 131.0417,
"step": 6810
},
{
"epoch": 0.5458108194290044,
"grad_norm": 74.3125,
"learning_rate": 9.989339637432002e-07,
"loss": 133.0612,
"step": 6820
},
{
"epoch": 0.5466111285484018,
"grad_norm": 75.25,
"learning_rate": 9.989324006401844e-07,
"loss": 132.5644,
"step": 6830
},
{
"epoch": 0.5474114376677992,
"grad_norm": 77.75,
"learning_rate": 9.989308375371686e-07,
"loss": 132.1365,
"step": 6840
},
{
"epoch": 0.5482117467871965,
"grad_norm": 77.1875,
"learning_rate": 9.989292744341529e-07,
"loss": 132.7418,
"step": 6850
},
{
"epoch": 0.549012055906594,
"grad_norm": 79.125,
"learning_rate": 9.989277113311369e-07,
"loss": 131.9871,
"step": 6860
},
{
"epoch": 0.5498123650259913,
"grad_norm": 75.0,
"learning_rate": 9.98926148228121e-07,
"loss": 131.313,
"step": 6870
},
{
"epoch": 0.5506126741453886,
"grad_norm": 74.6875,
"learning_rate": 9.989245851251053e-07,
"loss": 132.0703,
"step": 6880
},
{
"epoch": 0.551412983264786,
"grad_norm": 73.5625,
"learning_rate": 9.989230220220895e-07,
"loss": 130.9999,
"step": 6890
},
{
"epoch": 0.5522132923841834,
"grad_norm": 75.0625,
"learning_rate": 9.989214589190735e-07,
"loss": 132.2652,
"step": 6900
},
{
"epoch": 0.5530136015035808,
"grad_norm": 74.125,
"learning_rate": 9.989198958160577e-07,
"loss": 131.5572,
"step": 6910
},
{
"epoch": 0.5538139106229781,
"grad_norm": 75.8125,
"learning_rate": 9.98918332713042e-07,
"loss": 130.6713,
"step": 6920
},
{
"epoch": 0.5546142197423755,
"grad_norm": 75.0625,
"learning_rate": 9.98916769610026e-07,
"loss": 132.5074,
"step": 6930
},
{
"epoch": 0.5554145288617729,
"grad_norm": 83.5,
"learning_rate": 9.989152065070102e-07,
"loss": 132.7792,
"step": 6940
},
{
"epoch": 0.5562148379811702,
"grad_norm": 78.6875,
"learning_rate": 9.989136434039944e-07,
"loss": 132.4211,
"step": 6950
},
{
"epoch": 0.5570151471005675,
"grad_norm": 84.125,
"learning_rate": 9.989120803009786e-07,
"loss": 133.1216,
"step": 6960
},
{
"epoch": 0.557815456219965,
"grad_norm": 80.8125,
"learning_rate": 9.989105171979626e-07,
"loss": 131.8983,
"step": 6970
},
{
"epoch": 0.5586157653393623,
"grad_norm": 77.875,
"learning_rate": 9.989089540949468e-07,
"loss": 131.918,
"step": 6980
},
{
"epoch": 0.5594160744587597,
"grad_norm": 73.1875,
"learning_rate": 9.98907390991931e-07,
"loss": 131.1051,
"step": 6990
},
{
"epoch": 0.5602163835781571,
"grad_norm": 78.0625,
"learning_rate": 9.989058278889153e-07,
"loss": 132.776,
"step": 7000
},
{
"epoch": 0.5610166926975544,
"grad_norm": 82.875,
"learning_rate": 9.989042647858995e-07,
"loss": 131.2556,
"step": 7010
},
{
"epoch": 0.5618170018169518,
"grad_norm": 79.8125,
"learning_rate": 9.989027016828835e-07,
"loss": 131.6359,
"step": 7020
},
{
"epoch": 0.5626173109363491,
"grad_norm": 78.875,
"learning_rate": 9.989011385798677e-07,
"loss": 132.1506,
"step": 7030
},
{
"epoch": 0.5634176200557466,
"grad_norm": 74.5625,
"learning_rate": 9.98899575476852e-07,
"loss": 131.6759,
"step": 7040
},
{
"epoch": 0.5642179291751439,
"grad_norm": 76.5,
"learning_rate": 9.988980123738362e-07,
"loss": 131.1462,
"step": 7050
},
{
"epoch": 0.5650182382945412,
"grad_norm": 82.625,
"learning_rate": 9.988964492708202e-07,
"loss": 130.7009,
"step": 7060
},
{
"epoch": 0.5658185474139387,
"grad_norm": 84.875,
"learning_rate": 9.988948861678044e-07,
"loss": 134.6625,
"step": 7070
},
{
"epoch": 0.566618856533336,
"grad_norm": 77.1875,
"learning_rate": 9.988933230647886e-07,
"loss": 132.1597,
"step": 7080
},
{
"epoch": 0.5674191656527333,
"grad_norm": 75.6875,
"learning_rate": 9.988917599617726e-07,
"loss": 131.5931,
"step": 7090
},
{
"epoch": 0.5682194747721308,
"grad_norm": 73.6875,
"learning_rate": 9.988901968587568e-07,
"loss": 131.8727,
"step": 7100
},
{
"epoch": 0.5690197838915281,
"grad_norm": 73.25,
"learning_rate": 9.98888633755741e-07,
"loss": 132.6044,
"step": 7110
},
{
"epoch": 0.5698200930109255,
"grad_norm": 85.75,
"learning_rate": 9.98887070652725e-07,
"loss": 131.7462,
"step": 7120
},
{
"epoch": 0.5706204021303228,
"grad_norm": 74.0625,
"learning_rate": 9.988855075497093e-07,
"loss": 132.0296,
"step": 7130
},
{
"epoch": 0.5714207112497202,
"grad_norm": 80.6875,
"learning_rate": 9.988839444466935e-07,
"loss": 134.3069,
"step": 7140
},
{
"epoch": 0.5722210203691176,
"grad_norm": 76.0,
"learning_rate": 9.988823813436777e-07,
"loss": 131.401,
"step": 7150
},
{
"epoch": 0.5730213294885149,
"grad_norm": 81.5625,
"learning_rate": 9.98880818240662e-07,
"loss": 132.4782,
"step": 7160
},
{
"epoch": 0.5738216386079124,
"grad_norm": 76.4375,
"learning_rate": 9.988792551376461e-07,
"loss": 132.7547,
"step": 7170
},
{
"epoch": 0.5746219477273097,
"grad_norm": 72.9375,
"learning_rate": 9.988776920346301e-07,
"loss": 131.6157,
"step": 7180
},
{
"epoch": 0.575422256846707,
"grad_norm": 75.5625,
"learning_rate": 9.988761289316144e-07,
"loss": 132.9658,
"step": 7190
},
{
"epoch": 0.5762225659661044,
"grad_norm": 73.3125,
"learning_rate": 9.988745658285986e-07,
"loss": 131.4694,
"step": 7200
},
{
"epoch": 0.5770228750855018,
"grad_norm": 76.6875,
"learning_rate": 9.988730027255828e-07,
"loss": 132.546,
"step": 7210
},
{
"epoch": 0.5778231842048991,
"grad_norm": 77.6875,
"learning_rate": 9.988714396225668e-07,
"loss": 131.9255,
"step": 7220
},
{
"epoch": 0.5786234933242965,
"grad_norm": 82.0625,
"learning_rate": 9.98869876519551e-07,
"loss": 133.1269,
"step": 7230
},
{
"epoch": 0.5794238024436938,
"grad_norm": 75.5625,
"learning_rate": 9.988683134165352e-07,
"loss": 131.8465,
"step": 7240
},
{
"epoch": 0.5802241115630913,
"grad_norm": 78.125,
"learning_rate": 9.988667503135193e-07,
"loss": 131.8865,
"step": 7250
},
{
"epoch": 0.5810244206824886,
"grad_norm": 77.625,
"learning_rate": 9.988651872105035e-07,
"loss": 131.2077,
"step": 7260
},
{
"epoch": 0.5818247298018859,
"grad_norm": 73.0,
"learning_rate": 9.988636241074877e-07,
"loss": 132.2729,
"step": 7270
},
{
"epoch": 0.5826250389212834,
"grad_norm": 76.125,
"learning_rate": 9.988620610044717e-07,
"loss": 132.5173,
"step": 7280
},
{
"epoch": 0.5834253480406807,
"grad_norm": 72.875,
"learning_rate": 9.98860497901456e-07,
"loss": 130.8854,
"step": 7290
},
{
"epoch": 0.5842256571600781,
"grad_norm": 76.5625,
"learning_rate": 9.988589347984401e-07,
"loss": 130.3204,
"step": 7300
},
{
"epoch": 0.5850259662794755,
"grad_norm": 79.375,
"learning_rate": 9.988573716954244e-07,
"loss": 131.2047,
"step": 7310
},
{
"epoch": 0.5858262753988728,
"grad_norm": 76.6875,
"learning_rate": 9.988558085924086e-07,
"loss": 132.3948,
"step": 7320
},
{
"epoch": 0.5866265845182702,
"grad_norm": 75.0625,
"learning_rate": 9.988542454893926e-07,
"loss": 131.0941,
"step": 7330
},
{
"epoch": 0.5874268936376675,
"grad_norm": 80.375,
"learning_rate": 9.988526823863768e-07,
"loss": 132.392,
"step": 7340
},
{
"epoch": 0.588227202757065,
"grad_norm": 75.3125,
"learning_rate": 9.98851119283361e-07,
"loss": 132.1595,
"step": 7350
},
{
"epoch": 0.5890275118764623,
"grad_norm": 80.5,
"learning_rate": 9.988495561803452e-07,
"loss": 131.9138,
"step": 7360
},
{
"epoch": 0.5898278209958596,
"grad_norm": 74.5625,
"learning_rate": 9.988479930773294e-07,
"loss": 129.6595,
"step": 7370
},
{
"epoch": 0.590628130115257,
"grad_norm": 74.5625,
"learning_rate": 9.988464299743135e-07,
"loss": 130.2861,
"step": 7380
},
{
"epoch": 0.5914284392346544,
"grad_norm": 71.625,
"learning_rate": 9.988448668712977e-07,
"loss": 132.6865,
"step": 7390
},
{
"epoch": 0.5922287483540517,
"grad_norm": 80.25,
"learning_rate": 9.988433037682819e-07,
"loss": 132.5223,
"step": 7400
},
{
"epoch": 0.5930290574734491,
"grad_norm": 70.5,
"learning_rate": 9.98841740665266e-07,
"loss": 132.5188,
"step": 7410
},
{
"epoch": 0.5938293665928465,
"grad_norm": 71.4375,
"learning_rate": 9.988401775622501e-07,
"loss": 132.2418,
"step": 7420
},
{
"epoch": 0.5946296757122439,
"grad_norm": 75.25,
"learning_rate": 9.988386144592343e-07,
"loss": 130.8538,
"step": 7430
},
{
"epoch": 0.5954299848316412,
"grad_norm": 75.8125,
"learning_rate": 9.988370513562183e-07,
"loss": 131.213,
"step": 7440
},
{
"epoch": 0.5962302939510385,
"grad_norm": 74.0,
"learning_rate": 9.988354882532026e-07,
"loss": 132.8653,
"step": 7450
},
{
"epoch": 0.597030603070436,
"grad_norm": 74.25,
"learning_rate": 9.988339251501868e-07,
"loss": 131.1389,
"step": 7460
},
{
"epoch": 0.5978309121898333,
"grad_norm": 79.0625,
"learning_rate": 9.98832362047171e-07,
"loss": 131.8387,
"step": 7470
},
{
"epoch": 0.5986312213092307,
"grad_norm": 92.9375,
"learning_rate": 9.988307989441552e-07,
"loss": 132.9031,
"step": 7480
},
{
"epoch": 0.5994315304286281,
"grad_norm": 75.75,
"learning_rate": 9.988292358411392e-07,
"loss": 130.4166,
"step": 7490
},
{
"epoch": 0.6002318395480254,
"grad_norm": 75.25,
"learning_rate": 9.988276727381234e-07,
"loss": 131.6747,
"step": 7500
},
{
"epoch": 0.6002318395480254,
"eval_loss": 2.0600602626800537,
"eval_runtime": 418.6157,
"eval_samples_per_second": 1567.447,
"eval_steps_per_second": 48.983,
"step": 7500
},
{
"epoch": 0.6010321486674228,
"grad_norm": 79.875,
"learning_rate": 9.988261096351077e-07,
"loss": 131.0494,
"step": 7510
},
{
"epoch": 0.6018324577868202,
"grad_norm": 72.125,
"learning_rate": 9.988245465320919e-07,
"loss": 131.0795,
"step": 7520
},
{
"epoch": 0.6026327669062175,
"grad_norm": 73.4375,
"learning_rate": 9.98822983429076e-07,
"loss": 131.2448,
"step": 7530
},
{
"epoch": 0.6034330760256149,
"grad_norm": 78.125,
"learning_rate": 9.9882142032606e-07,
"loss": 132.0931,
"step": 7540
},
{
"epoch": 0.6042333851450122,
"grad_norm": 71.25,
"learning_rate": 9.988198572230443e-07,
"loss": 131.5498,
"step": 7550
},
{
"epoch": 0.6050336942644097,
"grad_norm": 77.5625,
"learning_rate": 9.988182941200285e-07,
"loss": 132.7485,
"step": 7560
},
{
"epoch": 0.605834003383807,
"grad_norm": 77.375,
"learning_rate": 9.988167310170125e-07,
"loss": 132.6295,
"step": 7570
},
{
"epoch": 0.6066343125032043,
"grad_norm": 78.1875,
"learning_rate": 9.988151679139968e-07,
"loss": 130.9077,
"step": 7580
},
{
"epoch": 0.6074346216226018,
"grad_norm": 79.9375,
"learning_rate": 9.98813604810981e-07,
"loss": 133.7221,
"step": 7590
},
{
"epoch": 0.6082349307419991,
"grad_norm": 77.9375,
"learning_rate": 9.98812041707965e-07,
"loss": 131.693,
"step": 7600
},
{
"epoch": 0.6090352398613965,
"grad_norm": 79.875,
"learning_rate": 9.988104786049492e-07,
"loss": 131.3416,
"step": 7610
},
{
"epoch": 0.6098355489807938,
"grad_norm": 73.0,
"learning_rate": 9.988089155019334e-07,
"loss": 132.1983,
"step": 7620
},
{
"epoch": 0.6106358581001912,
"grad_norm": 77.375,
"learning_rate": 9.988073523989176e-07,
"loss": 132.4808,
"step": 7630
},
{
"epoch": 0.6114361672195886,
"grad_norm": 76.5625,
"learning_rate": 9.988057892959019e-07,
"loss": 131.0641,
"step": 7640
},
{
"epoch": 0.6122364763389859,
"grad_norm": 84.75,
"learning_rate": 9.988042261928859e-07,
"loss": 132.1883,
"step": 7650
},
{
"epoch": 0.6130367854583832,
"grad_norm": 71.75,
"learning_rate": 9.9880266308987e-07,
"loss": 132.5537,
"step": 7660
},
{
"epoch": 0.6138370945777807,
"grad_norm": 83.5625,
"learning_rate": 9.988010999868543e-07,
"loss": 131.5181,
"step": 7670
},
{
"epoch": 0.614637403697178,
"grad_norm": 79.9375,
"learning_rate": 9.987995368838385e-07,
"loss": 132.7484,
"step": 7680
},
{
"epoch": 0.6154377128165754,
"grad_norm": 76.25,
"learning_rate": 9.987979737808225e-07,
"loss": 132.5646,
"step": 7690
},
{
"epoch": 0.6162380219359728,
"grad_norm": 77.75,
"learning_rate": 9.987964106778067e-07,
"loss": 132.2048,
"step": 7700
},
{
"epoch": 0.6170383310553701,
"grad_norm": 74.125,
"learning_rate": 9.98794847574791e-07,
"loss": 132.1626,
"step": 7710
},
{
"epoch": 0.6178386401747675,
"grad_norm": 79.0,
"learning_rate": 9.987932844717752e-07,
"loss": 131.6138,
"step": 7720
},
{
"epoch": 0.6186389492941649,
"grad_norm": 76.9375,
"learning_rate": 9.987917213687592e-07,
"loss": 130.9984,
"step": 7730
},
{
"epoch": 0.6194392584135623,
"grad_norm": 79.875,
"learning_rate": 9.987901582657434e-07,
"loss": 132.2128,
"step": 7740
},
{
"epoch": 0.6202395675329596,
"grad_norm": 75.5625,
"learning_rate": 9.987885951627276e-07,
"loss": 132.1526,
"step": 7750
},
{
"epoch": 0.6210398766523569,
"grad_norm": 71.75,
"learning_rate": 9.987870320597116e-07,
"loss": 130.6957,
"step": 7760
},
{
"epoch": 0.6218401857717544,
"grad_norm": 80.8125,
"learning_rate": 9.987854689566959e-07,
"loss": 131.2453,
"step": 7770
},
{
"epoch": 0.6226404948911517,
"grad_norm": 82.4375,
"learning_rate": 9.9878390585368e-07,
"loss": 130.4836,
"step": 7780
},
{
"epoch": 0.623440804010549,
"grad_norm": 72.0625,
"learning_rate": 9.987823427506643e-07,
"loss": 131.348,
"step": 7790
},
{
"epoch": 0.6242411131299465,
"grad_norm": 74.125,
"learning_rate": 9.987807796476483e-07,
"loss": 132.7037,
"step": 7800
},
{
"epoch": 0.6250414222493438,
"grad_norm": 71.625,
"learning_rate": 9.987792165446325e-07,
"loss": 132.4081,
"step": 7810
},
{
"epoch": 0.6258417313687412,
"grad_norm": 75.6875,
"learning_rate": 9.987776534416167e-07,
"loss": 130.35,
"step": 7820
},
{
"epoch": 0.6266420404881385,
"grad_norm": 72.8125,
"learning_rate": 9.98776090338601e-07,
"loss": 132.3228,
"step": 7830
},
{
"epoch": 0.6274423496075359,
"grad_norm": 80.6875,
"learning_rate": 9.987745272355852e-07,
"loss": 131.8622,
"step": 7840
},
{
"epoch": 0.6282426587269333,
"grad_norm": 81.4375,
"learning_rate": 9.987729641325692e-07,
"loss": 132.0134,
"step": 7850
},
{
"epoch": 0.6290429678463306,
"grad_norm": 78.9375,
"learning_rate": 9.987714010295534e-07,
"loss": 132.4322,
"step": 7860
},
{
"epoch": 0.6298432769657281,
"grad_norm": 76.5625,
"learning_rate": 9.987698379265376e-07,
"loss": 132.5075,
"step": 7870
},
{
"epoch": 0.6306435860851254,
"grad_norm": 77.75,
"learning_rate": 9.987682748235216e-07,
"loss": 131.9704,
"step": 7880
},
{
"epoch": 0.6314438952045227,
"grad_norm": 78.375,
"learning_rate": 9.987667117205058e-07,
"loss": 131.3642,
"step": 7890
},
{
"epoch": 0.6322442043239201,
"grad_norm": 75.9375,
"learning_rate": 9.9876514861749e-07,
"loss": 132.9338,
"step": 7900
},
{
"epoch": 0.6330445134433175,
"grad_norm": 73.25,
"learning_rate": 9.987635855144743e-07,
"loss": 131.7696,
"step": 7910
},
{
"epoch": 0.6338448225627148,
"grad_norm": 79.4375,
"learning_rate": 9.987620224114583e-07,
"loss": 132.7947,
"step": 7920
},
{
"epoch": 0.6346451316821122,
"grad_norm": 75.3125,
"learning_rate": 9.987604593084425e-07,
"loss": 131.7236,
"step": 7930
},
{
"epoch": 0.6354454408015096,
"grad_norm": 74.1875,
"learning_rate": 9.987588962054267e-07,
"loss": 131.487,
"step": 7940
},
{
"epoch": 0.636245749920907,
"grad_norm": 76.3125,
"learning_rate": 9.98757333102411e-07,
"loss": 132.3433,
"step": 7950
},
{
"epoch": 0.6370460590403043,
"grad_norm": 76.4375,
"learning_rate": 9.98755769999395e-07,
"loss": 132.2295,
"step": 7960
},
{
"epoch": 0.6378463681597016,
"grad_norm": 73.375,
"learning_rate": 9.987542068963792e-07,
"loss": 130.3616,
"step": 7970
},
{
"epoch": 0.6386466772790991,
"grad_norm": 71.75,
"learning_rate": 9.987526437933634e-07,
"loss": 133.0935,
"step": 7980
},
{
"epoch": 0.6394469863984964,
"grad_norm": 74.1875,
"learning_rate": 9.987510806903476e-07,
"loss": 131.3436,
"step": 7990
},
{
"epoch": 0.6402472955178938,
"grad_norm": 73.1875,
"learning_rate": 9.987495175873318e-07,
"loss": 132.8394,
"step": 8000
},
{
"epoch": 0.6410476046372912,
"grad_norm": 77.4375,
"learning_rate": 9.987479544843158e-07,
"loss": 131.8165,
"step": 8010
},
{
"epoch": 0.6418479137566885,
"grad_norm": 76.5,
"learning_rate": 9.987463913813e-07,
"loss": 131.6329,
"step": 8020
},
{
"epoch": 0.6426482228760859,
"grad_norm": 74.3125,
"learning_rate": 9.987448282782843e-07,
"loss": 132.5564,
"step": 8030
},
{
"epoch": 0.6434485319954832,
"grad_norm": 80.25,
"learning_rate": 9.987432651752683e-07,
"loss": 131.5118,
"step": 8040
},
{
"epoch": 0.6442488411148807,
"grad_norm": 81.4375,
"learning_rate": 9.987417020722525e-07,
"loss": 130.5542,
"step": 8050
},
{
"epoch": 0.645049150234278,
"grad_norm": 75.3125,
"learning_rate": 9.987401389692367e-07,
"loss": 132.7824,
"step": 8060
},
{
"epoch": 0.6458494593536753,
"grad_norm": 75.6875,
"learning_rate": 9.98738575866221e-07,
"loss": 131.6364,
"step": 8070
},
{
"epoch": 0.6466497684730728,
"grad_norm": 69.125,
"learning_rate": 9.98737012763205e-07,
"loss": 131.1746,
"step": 8080
},
{
"epoch": 0.6474500775924701,
"grad_norm": 79.6875,
"learning_rate": 9.987354496601891e-07,
"loss": 131.0759,
"step": 8090
},
{
"epoch": 0.6482503867118674,
"grad_norm": 74.0625,
"learning_rate": 9.987338865571734e-07,
"loss": 131.1551,
"step": 8100
},
{
"epoch": 0.6490506958312648,
"grad_norm": 81.8125,
"learning_rate": 9.987323234541576e-07,
"loss": 130.3787,
"step": 8110
},
{
"epoch": 0.6498510049506622,
"grad_norm": 76.125,
"learning_rate": 9.987307603511416e-07,
"loss": 132.5409,
"step": 8120
},
{
"epoch": 0.6506513140700596,
"grad_norm": 74.25,
"learning_rate": 9.987291972481258e-07,
"loss": 132.2028,
"step": 8130
},
{
"epoch": 0.6514516231894569,
"grad_norm": 79.9375,
"learning_rate": 9.9872763414511e-07,
"loss": 130.8431,
"step": 8140
},
{
"epoch": 0.6522519323088543,
"grad_norm": 71.0,
"learning_rate": 9.987260710420942e-07,
"loss": 132.5052,
"step": 8150
},
{
"epoch": 0.6530522414282517,
"grad_norm": 88.9375,
"learning_rate": 9.987245079390785e-07,
"loss": 131.6914,
"step": 8160
},
{
"epoch": 0.653852550547649,
"grad_norm": 71.75,
"learning_rate": 9.987229448360625e-07,
"loss": 131.5299,
"step": 8170
},
{
"epoch": 0.6546528596670464,
"grad_norm": 75.1875,
"learning_rate": 9.987213817330467e-07,
"loss": 131.2931,
"step": 8180
},
{
"epoch": 0.6554531687864438,
"grad_norm": 76.625,
"learning_rate": 9.98719818630031e-07,
"loss": 132.1339,
"step": 8190
},
{
"epoch": 0.6562534779058411,
"grad_norm": 76.5625,
"learning_rate": 9.98718255527015e-07,
"loss": 131.0958,
"step": 8200
},
{
"epoch": 0.6570537870252385,
"grad_norm": 72.6875,
"learning_rate": 9.987166924239991e-07,
"loss": 131.8033,
"step": 8210
},
{
"epoch": 0.6578540961446359,
"grad_norm": 77.4375,
"learning_rate": 9.987151293209833e-07,
"loss": 132.3042,
"step": 8220
},
{
"epoch": 0.6586544052640332,
"grad_norm": 72.0625,
"learning_rate": 9.987135662179674e-07,
"loss": 131.9262,
"step": 8230
},
{
"epoch": 0.6594547143834306,
"grad_norm": 81.125,
"learning_rate": 9.987120031149516e-07,
"loss": 131.4936,
"step": 8240
},
{
"epoch": 0.6602550235028279,
"grad_norm": 72.125,
"learning_rate": 9.987104400119358e-07,
"loss": 131.1532,
"step": 8250
},
{
"epoch": 0.6610553326222254,
"grad_norm": 76.5,
"learning_rate": 9.9870887690892e-07,
"loss": 132.0334,
"step": 8260
},
{
"epoch": 0.6618556417416227,
"grad_norm": 70.9375,
"learning_rate": 9.98707313805904e-07,
"loss": 131.7788,
"step": 8270
},
{
"epoch": 0.66265595086102,
"grad_norm": 78.5,
"learning_rate": 9.987057507028882e-07,
"loss": 131.3623,
"step": 8280
},
{
"epoch": 0.6634562599804175,
"grad_norm": 77.75,
"learning_rate": 9.987041875998724e-07,
"loss": 131.0159,
"step": 8290
},
{
"epoch": 0.6642565690998148,
"grad_norm": 75.0,
"learning_rate": 9.987026244968567e-07,
"loss": 131.802,
"step": 8300
},
{
"epoch": 0.6650568782192122,
"grad_norm": 74.3125,
"learning_rate": 9.987010613938409e-07,
"loss": 130.4846,
"step": 8310
},
{
"epoch": 0.6658571873386095,
"grad_norm": 78.0625,
"learning_rate": 9.98699498290825e-07,
"loss": 130.5051,
"step": 8320
},
{
"epoch": 0.6666574964580069,
"grad_norm": 71.625,
"learning_rate": 9.986979351878091e-07,
"loss": 131.5967,
"step": 8330
},
{
"epoch": 0.6674578055774043,
"grad_norm": 71.5,
"learning_rate": 9.986963720847933e-07,
"loss": 132.5512,
"step": 8340
},
{
"epoch": 0.6682581146968016,
"grad_norm": 78.375,
"learning_rate": 9.986948089817775e-07,
"loss": 131.1753,
"step": 8350
},
{
"epoch": 0.669058423816199,
"grad_norm": 74.1875,
"learning_rate": 9.986932458787616e-07,
"loss": 130.4604,
"step": 8360
},
{
"epoch": 0.6698587329355964,
"grad_norm": 73.4375,
"learning_rate": 9.986916827757458e-07,
"loss": 132.9316,
"step": 8370
},
{
"epoch": 0.6706590420549937,
"grad_norm": 74.875,
"learning_rate": 9.9869011967273e-07,
"loss": 131.4552,
"step": 8380
},
{
"epoch": 0.6714593511743912,
"grad_norm": 78.0,
"learning_rate": 9.98688556569714e-07,
"loss": 131.1365,
"step": 8390
},
{
"epoch": 0.6722596602937885,
"grad_norm": 72.8125,
"learning_rate": 9.986869934666982e-07,
"loss": 132.5655,
"step": 8400
},
{
"epoch": 0.6730599694131858,
"grad_norm": 75.3125,
"learning_rate": 9.986854303636824e-07,
"loss": 132.0653,
"step": 8410
},
{
"epoch": 0.6738602785325832,
"grad_norm": 80.0,
"learning_rate": 9.986838672606667e-07,
"loss": 130.1704,
"step": 8420
},
{
"epoch": 0.6746605876519806,
"grad_norm": 79.9375,
"learning_rate": 9.986823041576507e-07,
"loss": 131.86,
"step": 8430
},
{
"epoch": 0.675460896771378,
"grad_norm": 77.5,
"learning_rate": 9.986807410546349e-07,
"loss": 132.2835,
"step": 8440
},
{
"epoch": 0.6762612058907753,
"grad_norm": 76.125,
"learning_rate": 9.98679177951619e-07,
"loss": 133.6062,
"step": 8450
},
{
"epoch": 0.6770615150101726,
"grad_norm": 74.4375,
"learning_rate": 9.986776148486033e-07,
"loss": 133.1529,
"step": 8460
},
{
"epoch": 0.6778618241295701,
"grad_norm": 77.6875,
"learning_rate": 9.986760517455875e-07,
"loss": 130.7149,
"step": 8470
},
{
"epoch": 0.6786621332489674,
"grad_norm": 76.6875,
"learning_rate": 9.986744886425715e-07,
"loss": 131.2965,
"step": 8480
},
{
"epoch": 0.6794624423683647,
"grad_norm": 78.0625,
"learning_rate": 9.986729255395558e-07,
"loss": 131.7146,
"step": 8490
},
{
"epoch": 0.6802627514877622,
"grad_norm": 77.1875,
"learning_rate": 9.9867136243654e-07,
"loss": 131.4988,
"step": 8500
},
{
"epoch": 0.6810630606071595,
"grad_norm": 75.625,
"learning_rate": 9.986697993335242e-07,
"loss": 131.0816,
"step": 8510
},
{
"epoch": 0.6818633697265569,
"grad_norm": 73.625,
"learning_rate": 9.986682362305082e-07,
"loss": 131.8303,
"step": 8520
},
{
"epoch": 0.6826636788459542,
"grad_norm": 77.625,
"learning_rate": 9.986666731274924e-07,
"loss": 130.0348,
"step": 8530
},
{
"epoch": 0.6834639879653516,
"grad_norm": 79.0,
"learning_rate": 9.986651100244766e-07,
"loss": 132.8476,
"step": 8540
},
{
"epoch": 0.684264297084749,
"grad_norm": 74.625,
"learning_rate": 9.986635469214606e-07,
"loss": 131.6226,
"step": 8550
},
{
"epoch": 0.6850646062041463,
"grad_norm": 79.0,
"learning_rate": 9.986619838184449e-07,
"loss": 131.3241,
"step": 8560
},
{
"epoch": 0.6858649153235438,
"grad_norm": 76.9375,
"learning_rate": 9.98660420715429e-07,
"loss": 131.6735,
"step": 8570
},
{
"epoch": 0.6866652244429411,
"grad_norm": 70.8125,
"learning_rate": 9.986588576124133e-07,
"loss": 131.7806,
"step": 8580
},
{
"epoch": 0.6874655335623384,
"grad_norm": 83.25,
"learning_rate": 9.986572945093973e-07,
"loss": 132.0227,
"step": 8590
},
{
"epoch": 0.6882658426817359,
"grad_norm": 76.5625,
"learning_rate": 9.986557314063815e-07,
"loss": 131.6524,
"step": 8600
},
{
"epoch": 0.6890661518011332,
"grad_norm": 79.875,
"learning_rate": 9.986541683033657e-07,
"loss": 132.2215,
"step": 8610
},
{
"epoch": 0.6898664609205305,
"grad_norm": 85.125,
"learning_rate": 9.9865260520035e-07,
"loss": 132.894,
"step": 8620
},
{
"epoch": 0.6906667700399279,
"grad_norm": 78.625,
"learning_rate": 9.986510420973342e-07,
"loss": 131.7396,
"step": 8630
},
{
"epoch": 0.6914670791593253,
"grad_norm": 84.375,
"learning_rate": 9.986494789943182e-07,
"loss": 132.6336,
"step": 8640
},
{
"epoch": 0.6922673882787227,
"grad_norm": 80.4375,
"learning_rate": 9.986479158913024e-07,
"loss": 130.9795,
"step": 8650
},
{
"epoch": 0.69306769739812,
"grad_norm": 73.0625,
"learning_rate": 9.986463527882866e-07,
"loss": 132.994,
"step": 8660
},
{
"epoch": 0.6938680065175173,
"grad_norm": 77.75,
"learning_rate": 9.986447896852708e-07,
"loss": 131.1462,
"step": 8670
},
{
"epoch": 0.6946683156369148,
"grad_norm": 73.4375,
"learning_rate": 9.986432265822548e-07,
"loss": 132.2478,
"step": 8680
},
{
"epoch": 0.6954686247563121,
"grad_norm": 76.125,
"learning_rate": 9.98641663479239e-07,
"loss": 130.9919,
"step": 8690
},
{
"epoch": 0.6962689338757095,
"grad_norm": 76.0625,
"learning_rate": 9.986401003762233e-07,
"loss": 130.4246,
"step": 8700
},
{
"epoch": 0.6970692429951069,
"grad_norm": 75.25,
"learning_rate": 9.986385372732073e-07,
"loss": 132.2129,
"step": 8710
},
{
"epoch": 0.6978695521145042,
"grad_norm": 77.8125,
"learning_rate": 9.986369741701915e-07,
"loss": 132.2469,
"step": 8720
},
{
"epoch": 0.6986698612339016,
"grad_norm": 74.3125,
"learning_rate": 9.986354110671757e-07,
"loss": 131.4874,
"step": 8730
},
{
"epoch": 0.699470170353299,
"grad_norm": 74.6875,
"learning_rate": 9.986338479641597e-07,
"loss": 131.288,
"step": 8740
},
{
"epoch": 0.7002704794726964,
"grad_norm": 78.875,
"learning_rate": 9.98632284861144e-07,
"loss": 130.9051,
"step": 8750
},
{
"epoch": 0.7010707885920937,
"grad_norm": 70.1875,
"learning_rate": 9.986307217581282e-07,
"loss": 131.5563,
"step": 8760
},
{
"epoch": 0.701871097711491,
"grad_norm": 76.75,
"learning_rate": 9.986291586551124e-07,
"loss": 132.3225,
"step": 8770
},
{
"epoch": 0.7026714068308885,
"grad_norm": 84.9375,
"learning_rate": 9.986275955520966e-07,
"loss": 132.3153,
"step": 8780
},
{
"epoch": 0.7034717159502858,
"grad_norm": 79.0,
"learning_rate": 9.986260324490808e-07,
"loss": 131.7471,
"step": 8790
},
{
"epoch": 0.7042720250696831,
"grad_norm": 79.0,
"learning_rate": 9.986244693460648e-07,
"loss": 132.4772,
"step": 8800
},
{
"epoch": 0.7050723341890806,
"grad_norm": 74.5625,
"learning_rate": 9.98622906243049e-07,
"loss": 130.8647,
"step": 8810
},
{
"epoch": 0.7058726433084779,
"grad_norm": 75.9375,
"learning_rate": 9.986213431400333e-07,
"loss": 131.4361,
"step": 8820
},
{
"epoch": 0.7066729524278753,
"grad_norm": 75.3125,
"learning_rate": 9.986197800370175e-07,
"loss": 131.5961,
"step": 8830
},
{
"epoch": 0.7074732615472726,
"grad_norm": 74.5,
"learning_rate": 9.986182169340015e-07,
"loss": 131.5035,
"step": 8840
},
{
"epoch": 0.70827357066667,
"grad_norm": 77.125,
"learning_rate": 9.986166538309857e-07,
"loss": 131.417,
"step": 8850
},
{
"epoch": 0.7090738797860674,
"grad_norm": 71.4375,
"learning_rate": 9.9861509072797e-07,
"loss": 130.5734,
"step": 8860
},
{
"epoch": 0.7098741889054647,
"grad_norm": 79.75,
"learning_rate": 9.98613527624954e-07,
"loss": 130.9962,
"step": 8870
},
{
"epoch": 0.7106744980248622,
"grad_norm": 71.6875,
"learning_rate": 9.986119645219382e-07,
"loss": 131.3529,
"step": 8880
},
{
"epoch": 0.7114748071442595,
"grad_norm": 72.875,
"learning_rate": 9.986104014189224e-07,
"loss": 131.3156,
"step": 8890
},
{
"epoch": 0.7122751162636568,
"grad_norm": 73.0625,
"learning_rate": 9.986088383159064e-07,
"loss": 130.3418,
"step": 8900
},
{
"epoch": 0.7130754253830542,
"grad_norm": 79.1875,
"learning_rate": 9.986072752128906e-07,
"loss": 131.1646,
"step": 8910
},
{
"epoch": 0.7138757345024516,
"grad_norm": 81.6875,
"learning_rate": 9.986057121098748e-07,
"loss": 131.9824,
"step": 8920
},
{
"epoch": 0.7146760436218489,
"grad_norm": 81.8125,
"learning_rate": 9.98604149006859e-07,
"loss": 132.71,
"step": 8930
},
{
"epoch": 0.7154763527412463,
"grad_norm": 69.9375,
"learning_rate": 9.986025859038433e-07,
"loss": 131.8712,
"step": 8940
},
{
"epoch": 0.7162766618606436,
"grad_norm": 83.1875,
"learning_rate": 9.986010228008275e-07,
"loss": 131.4506,
"step": 8950
},
{
"epoch": 0.7170769709800411,
"grad_norm": 76.5625,
"learning_rate": 9.985994596978115e-07,
"loss": 131.5453,
"step": 8960
},
{
"epoch": 0.7178772800994384,
"grad_norm": 73.625,
"learning_rate": 9.985978965947957e-07,
"loss": 131.9996,
"step": 8970
},
{
"epoch": 0.7186775892188357,
"grad_norm": 78.75,
"learning_rate": 9.9859633349178e-07,
"loss": 132.0983,
"step": 8980
},
{
"epoch": 0.7194778983382332,
"grad_norm": 72.3125,
"learning_rate": 9.98594770388764e-07,
"loss": 131.87,
"step": 8990
},
{
"epoch": 0.7202782074576305,
"grad_norm": 73.625,
"learning_rate": 9.985932072857481e-07,
"loss": 131.6589,
"step": 9000
},
{
"epoch": 0.7210785165770279,
"grad_norm": 76.5,
"learning_rate": 9.985916441827324e-07,
"loss": 132.6549,
"step": 9010
},
{
"epoch": 0.7218788256964253,
"grad_norm": 80.75,
"learning_rate": 9.985900810797166e-07,
"loss": 131.3108,
"step": 9020
},
{
"epoch": 0.7226791348158226,
"grad_norm": 78.9375,
"learning_rate": 9.985885179767006e-07,
"loss": 132.2872,
"step": 9030
},
{
"epoch": 0.72347944393522,
"grad_norm": 76.8125,
"learning_rate": 9.985869548736848e-07,
"loss": 131.6559,
"step": 9040
},
{
"epoch": 0.7242797530546173,
"grad_norm": 80.1875,
"learning_rate": 9.98585391770669e-07,
"loss": 131.6873,
"step": 9050
},
{
"epoch": 0.7250800621740147,
"grad_norm": 76.875,
"learning_rate": 9.98583828667653e-07,
"loss": 131.4747,
"step": 9060
},
{
"epoch": 0.7258803712934121,
"grad_norm": 76.1875,
"learning_rate": 9.985822655646372e-07,
"loss": 131.7952,
"step": 9070
},
{
"epoch": 0.7266806804128094,
"grad_norm": 75.5,
"learning_rate": 9.985807024616215e-07,
"loss": 131.458,
"step": 9080
},
{
"epoch": 0.7274809895322069,
"grad_norm": 73.75,
"learning_rate": 9.985791393586057e-07,
"loss": 132.2186,
"step": 9090
},
{
"epoch": 0.7282812986516042,
"grad_norm": 79.5,
"learning_rate": 9.9857757625559e-07,
"loss": 133.5451,
"step": 9100
},
{
"epoch": 0.7290816077710015,
"grad_norm": 70.6875,
"learning_rate": 9.98576013152574e-07,
"loss": 131.524,
"step": 9110
},
{
"epoch": 0.7298819168903989,
"grad_norm": 78.25,
"learning_rate": 9.985744500495581e-07,
"loss": 132.1586,
"step": 9120
},
{
"epoch": 0.7306822260097963,
"grad_norm": 76.4375,
"learning_rate": 9.985728869465423e-07,
"loss": 131.0122,
"step": 9130
},
{
"epoch": 0.7314825351291937,
"grad_norm": 75.9375,
"learning_rate": 9.985713238435266e-07,
"loss": 132.5875,
"step": 9140
},
{
"epoch": 0.732282844248591,
"grad_norm": 77.8125,
"learning_rate": 9.985697607405106e-07,
"loss": 132.9425,
"step": 9150
},
{
"epoch": 0.7330831533679883,
"grad_norm": 71.6875,
"learning_rate": 9.985681976374948e-07,
"loss": 131.7688,
"step": 9160
},
{
"epoch": 0.7338834624873858,
"grad_norm": 78.0625,
"learning_rate": 9.98566634534479e-07,
"loss": 130.2091,
"step": 9170
},
{
"epoch": 0.7346837716067831,
"grad_norm": 73.375,
"learning_rate": 9.98565071431463e-07,
"loss": 132.0214,
"step": 9180
},
{
"epoch": 0.7354840807261804,
"grad_norm": 72.75,
"learning_rate": 9.985635083284472e-07,
"loss": 131.4804,
"step": 9190
},
{
"epoch": 0.7362843898455779,
"grad_norm": 72.8125,
"learning_rate": 9.985619452254314e-07,
"loss": 130.8607,
"step": 9200
},
{
"epoch": 0.7370846989649752,
"grad_norm": 75.4375,
"learning_rate": 9.985603821224157e-07,
"loss": 131.9145,
"step": 9210
},
{
"epoch": 0.7378850080843726,
"grad_norm": 73.875,
"learning_rate": 9.985588190193997e-07,
"loss": 130.5753,
"step": 9220
},
{
"epoch": 0.73868531720377,
"grad_norm": 81.0,
"learning_rate": 9.985572559163839e-07,
"loss": 132.0938,
"step": 9230
},
{
"epoch": 0.7394856263231673,
"grad_norm": 78.0625,
"learning_rate": 9.98555692813368e-07,
"loss": 132.836,
"step": 9240
},
{
"epoch": 0.7402859354425647,
"grad_norm": 76.25,
"learning_rate": 9.985541297103523e-07,
"loss": 131.6837,
"step": 9250
},
{
"epoch": 0.741086244561962,
"grad_norm": 74.125,
"learning_rate": 9.985525666073365e-07,
"loss": 132.7221,
"step": 9260
},
{
"epoch": 0.7418865536813595,
"grad_norm": 81.875,
"learning_rate": 9.985510035043205e-07,
"loss": 131.037,
"step": 9270
},
{
"epoch": 0.7426868628007568,
"grad_norm": 77.0,
"learning_rate": 9.985494404013048e-07,
"loss": 130.6028,
"step": 9280
},
{
"epoch": 0.7434871719201541,
"grad_norm": 76.4375,
"learning_rate": 9.98547877298289e-07,
"loss": 131.0622,
"step": 9290
},
{
"epoch": 0.7442874810395516,
"grad_norm": 72.3125,
"learning_rate": 9.985463141952732e-07,
"loss": 130.6438,
"step": 9300
},
{
"epoch": 0.7450877901589489,
"grad_norm": 73.75,
"learning_rate": 9.985447510922572e-07,
"loss": 132.5858,
"step": 9310
},
{
"epoch": 0.7458880992783462,
"grad_norm": 81.375,
"learning_rate": 9.985431879892414e-07,
"loss": 131.7563,
"step": 9320
},
{
"epoch": 0.7466884083977436,
"grad_norm": 75.0625,
"learning_rate": 9.985416248862256e-07,
"loss": 131.6322,
"step": 9330
},
{
"epoch": 0.747488717517141,
"grad_norm": 74.375,
"learning_rate": 9.985400617832097e-07,
"loss": 129.3306,
"step": 9340
},
{
"epoch": 0.7482890266365384,
"grad_norm": 77.25,
"learning_rate": 9.985384986801939e-07,
"loss": 131.3398,
"step": 9350
},
{
"epoch": 0.7490893357559357,
"grad_norm": 80.4375,
"learning_rate": 9.98536935577178e-07,
"loss": 131.5261,
"step": 9360
},
{
"epoch": 0.749889644875333,
"grad_norm": 76.625,
"learning_rate": 9.985353724741623e-07,
"loss": 130.4829,
"step": 9370
},
{
"epoch": 0.7506899539947305,
"grad_norm": 75.1875,
"learning_rate": 9.985338093711463e-07,
"loss": 130.9288,
"step": 9380
},
{
"epoch": 0.7514902631141278,
"grad_norm": 74.5,
"learning_rate": 9.985322462681305e-07,
"loss": 131.758,
"step": 9390
},
{
"epoch": 0.7522905722335252,
"grad_norm": 72.9375,
"learning_rate": 9.985306831651148e-07,
"loss": 131.2881,
"step": 9400
},
{
"epoch": 0.7530908813529226,
"grad_norm": 77.875,
"learning_rate": 9.98529120062099e-07,
"loss": 132.0472,
"step": 9410
},
{
"epoch": 0.7538911904723199,
"grad_norm": 81.875,
"learning_rate": 9.985275569590832e-07,
"loss": 133.1372,
"step": 9420
},
{
"epoch": 0.7546914995917173,
"grad_norm": 76.4375,
"learning_rate": 9.985259938560672e-07,
"loss": 132.0327,
"step": 9430
},
{
"epoch": 0.7554918087111147,
"grad_norm": 82.5,
"learning_rate": 9.985244307530514e-07,
"loss": 130.7069,
"step": 9440
},
{
"epoch": 0.7562921178305121,
"grad_norm": 79.9375,
"learning_rate": 9.985228676500356e-07,
"loss": 131.1774,
"step": 9450
},
{
"epoch": 0.7570924269499094,
"grad_norm": 76.9375,
"learning_rate": 9.985213045470198e-07,
"loss": 131.4195,
"step": 9460
},
{
"epoch": 0.7578927360693067,
"grad_norm": 83.25,
"learning_rate": 9.985197414440039e-07,
"loss": 130.8186,
"step": 9470
},
{
"epoch": 0.7586930451887042,
"grad_norm": 76.1875,
"learning_rate": 9.98518178340988e-07,
"loss": 129.8441,
"step": 9480
},
{
"epoch": 0.7594933543081015,
"grad_norm": 71.5,
"learning_rate": 9.985166152379723e-07,
"loss": 131.93,
"step": 9490
},
{
"epoch": 0.7602936634274988,
"grad_norm": 79.0625,
"learning_rate": 9.985150521349563e-07,
"loss": 131.2559,
"step": 9500
},
{
"epoch": 0.7610939725468963,
"grad_norm": 69.3125,
"learning_rate": 9.985134890319405e-07,
"loss": 131.3485,
"step": 9510
},
{
"epoch": 0.7618942816662936,
"grad_norm": 79.5,
"learning_rate": 9.985119259289247e-07,
"loss": 130.6342,
"step": 9520
},
{
"epoch": 0.762694590785691,
"grad_norm": 78.3125,
"learning_rate": 9.985103628259087e-07,
"loss": 132.0574,
"step": 9530
},
{
"epoch": 0.7634948999050883,
"grad_norm": 73.0,
"learning_rate": 9.98508799722893e-07,
"loss": 130.7042,
"step": 9540
},
{
"epoch": 0.7642952090244857,
"grad_norm": 87.75,
"learning_rate": 9.985072366198772e-07,
"loss": 131.7032,
"step": 9550
},
{
"epoch": 0.7650955181438831,
"grad_norm": 78.9375,
"learning_rate": 9.985056735168614e-07,
"loss": 132.2712,
"step": 9560
},
{
"epoch": 0.7658958272632804,
"grad_norm": 75.25,
"learning_rate": 9.985041104138456e-07,
"loss": 132.5755,
"step": 9570
},
{
"epoch": 0.7666961363826779,
"grad_norm": 74.9375,
"learning_rate": 9.985025473108296e-07,
"loss": 131.8575,
"step": 9580
},
{
"epoch": 0.7674964455020752,
"grad_norm": 74.3125,
"learning_rate": 9.985009842078138e-07,
"loss": 131.8135,
"step": 9590
},
{
"epoch": 0.7682967546214725,
"grad_norm": 74.5,
"learning_rate": 9.98499421104798e-07,
"loss": 130.349,
"step": 9600
},
{
"epoch": 0.76909706374087,
"grad_norm": 75.625,
"learning_rate": 9.984978580017823e-07,
"loss": 132.1411,
"step": 9610
},
{
"epoch": 0.7698973728602673,
"grad_norm": 78.4375,
"learning_rate": 9.984962948987665e-07,
"loss": 132.007,
"step": 9620
},
{
"epoch": 0.7706976819796646,
"grad_norm": 76.875,
"learning_rate": 9.984947317957505e-07,
"loss": 131.2099,
"step": 9630
},
{
"epoch": 0.771497991099062,
"grad_norm": 78.875,
"learning_rate": 9.984931686927347e-07,
"loss": 130.7354,
"step": 9640
},
{
"epoch": 0.7722983002184594,
"grad_norm": 79.875,
"learning_rate": 9.98491605589719e-07,
"loss": 132.1408,
"step": 9650
},
{
"epoch": 0.7730986093378568,
"grad_norm": 72.1875,
"learning_rate": 9.98490042486703e-07,
"loss": 130.8756,
"step": 9660
},
{
"epoch": 0.7738989184572541,
"grad_norm": 69.4375,
"learning_rate": 9.984884793836872e-07,
"loss": 131.2112,
"step": 9670
},
{
"epoch": 0.7746992275766514,
"grad_norm": 76.875,
"learning_rate": 9.984869162806714e-07,
"loss": 131.0196,
"step": 9680
},
{
"epoch": 0.7754995366960489,
"grad_norm": 85.0,
"learning_rate": 9.984853531776554e-07,
"loss": 131.2716,
"step": 9690
},
{
"epoch": 0.7762998458154462,
"grad_norm": 72.5,
"learning_rate": 9.984837900746396e-07,
"loss": 130.7754,
"step": 9700
},
{
"epoch": 0.7771001549348436,
"grad_norm": 75.0,
"learning_rate": 9.984822269716238e-07,
"loss": 131.3929,
"step": 9710
},
{
"epoch": 0.777900464054241,
"grad_norm": 75.6875,
"learning_rate": 9.98480663868608e-07,
"loss": 131.4491,
"step": 9720
},
{
"epoch": 0.7787007731736383,
"grad_norm": 77.8125,
"learning_rate": 9.984791007655923e-07,
"loss": 130.3078,
"step": 9730
},
{
"epoch": 0.7795010822930357,
"grad_norm": 79.375,
"learning_rate": 9.984775376625763e-07,
"loss": 132.3819,
"step": 9740
},
{
"epoch": 0.780301391412433,
"grad_norm": 77.875,
"learning_rate": 9.984759745595605e-07,
"loss": 132.8559,
"step": 9750
},
{
"epoch": 0.7811017005318304,
"grad_norm": 76.875,
"learning_rate": 9.984744114565447e-07,
"loss": 130.5132,
"step": 9760
},
{
"epoch": 0.7819020096512278,
"grad_norm": 80.625,
"learning_rate": 9.98472848353529e-07,
"loss": 130.5794,
"step": 9770
},
{
"epoch": 0.7827023187706251,
"grad_norm": 79.1875,
"learning_rate": 9.984712852505131e-07,
"loss": 132.3585,
"step": 9780
},
{
"epoch": 0.7835026278900226,
"grad_norm": 76.625,
"learning_rate": 9.984697221474971e-07,
"loss": 132.778,
"step": 9790
},
{
"epoch": 0.7843029370094199,
"grad_norm": 74.625,
"learning_rate": 9.984681590444814e-07,
"loss": 130.5637,
"step": 9800
},
{
"epoch": 0.7851032461288172,
"grad_norm": 82.9375,
"learning_rate": 9.984665959414656e-07,
"loss": 130.9602,
"step": 9810
},
{
"epoch": 0.7859035552482146,
"grad_norm": 78.5,
"learning_rate": 9.984650328384496e-07,
"loss": 131.8248,
"step": 9820
},
{
"epoch": 0.786703864367612,
"grad_norm": 71.875,
"learning_rate": 9.984634697354338e-07,
"loss": 132.0758,
"step": 9830
},
{
"epoch": 0.7875041734870094,
"grad_norm": 78.5625,
"learning_rate": 9.98461906632418e-07,
"loss": 131.6132,
"step": 9840
},
{
"epoch": 0.7883044826064067,
"grad_norm": 82.9375,
"learning_rate": 9.98460343529402e-07,
"loss": 130.1421,
"step": 9850
},
{
"epoch": 0.7891047917258041,
"grad_norm": 90.0625,
"learning_rate": 9.984587804263863e-07,
"loss": 130.7962,
"step": 9860
},
{
"epoch": 0.7899051008452015,
"grad_norm": 75.5,
"learning_rate": 9.984572173233705e-07,
"loss": 131.8376,
"step": 9870
},
{
"epoch": 0.7907054099645988,
"grad_norm": 75.4375,
"learning_rate": 9.984556542203547e-07,
"loss": 132.0439,
"step": 9880
},
{
"epoch": 0.7915057190839961,
"grad_norm": 86.25,
"learning_rate": 9.98454091117339e-07,
"loss": 130.9918,
"step": 9890
},
{
"epoch": 0.7923060282033936,
"grad_norm": 77.1875,
"learning_rate": 9.98452528014323e-07,
"loss": 132.0871,
"step": 9900
},
{
"epoch": 0.7931063373227909,
"grad_norm": 78.375,
"learning_rate": 9.984509649113071e-07,
"loss": 132.0267,
"step": 9910
},
{
"epoch": 0.7939066464421883,
"grad_norm": 75.125,
"learning_rate": 9.984494018082913e-07,
"loss": 131.7625,
"step": 9920
},
{
"epoch": 0.7947069555615857,
"grad_norm": 78.75,
"learning_rate": 9.984478387052756e-07,
"loss": 131.006,
"step": 9930
},
{
"epoch": 0.795507264680983,
"grad_norm": 72.375,
"learning_rate": 9.984462756022598e-07,
"loss": 130.9044,
"step": 9940
},
{
"epoch": 0.7963075738003804,
"grad_norm": 77.8125,
"learning_rate": 9.984447124992438e-07,
"loss": 131.8305,
"step": 9950
},
{
"epoch": 0.7971078829197777,
"grad_norm": 79.8125,
"learning_rate": 9.98443149396228e-07,
"loss": 129.9232,
"step": 9960
},
{
"epoch": 0.7979081920391752,
"grad_norm": 72.5625,
"learning_rate": 9.984415862932122e-07,
"loss": 131.7845,
"step": 9970
},
{
"epoch": 0.7987085011585725,
"grad_norm": 72.5,
"learning_rate": 9.984400231901962e-07,
"loss": 131.7831,
"step": 9980
},
{
"epoch": 0.7995088102779698,
"grad_norm": 77.125,
"learning_rate": 9.984384600871805e-07,
"loss": 133.024,
"step": 9990
},
{
"epoch": 0.8003091193973673,
"grad_norm": 81.375,
"learning_rate": 9.984368969841647e-07,
"loss": 131.0512,
"step": 10000
},
{
"epoch": 0.8003091193973673,
"eval_loss": 2.0527355670928955,
"eval_runtime": 416.7476,
"eval_samples_per_second": 1574.473,
"eval_steps_per_second": 49.202,
"step": 10000
},
{
"epoch": 0.8011094285167646,
"grad_norm": 77.1875,
"learning_rate": 9.984353338811487e-07,
"loss": 130.3064,
"step": 10010
},
{
"epoch": 0.8019097376361619,
"grad_norm": 74.0,
"learning_rate": 9.98433770778133e-07,
"loss": 132.0653,
"step": 10020
},
{
"epoch": 0.8027100467555593,
"grad_norm": 84.875,
"learning_rate": 9.984322076751171e-07,
"loss": 132.2227,
"step": 10030
},
{
"epoch": 0.8035103558749567,
"grad_norm": 76.625,
"learning_rate": 9.984306445721013e-07,
"loss": 130.7247,
"step": 10040
},
{
"epoch": 0.8043106649943541,
"grad_norm": 75.8125,
"learning_rate": 9.984290814690853e-07,
"loss": 131.7206,
"step": 10050
},
{
"epoch": 0.8051109741137514,
"grad_norm": 73.6875,
"learning_rate": 9.984275183660696e-07,
"loss": 131.013,
"step": 10060
},
{
"epoch": 0.8059112832331488,
"grad_norm": 72.625,
"learning_rate": 9.984259552630538e-07,
"loss": 132.8077,
"step": 10070
},
{
"epoch": 0.8067115923525462,
"grad_norm": 72.375,
"learning_rate": 9.98424392160038e-07,
"loss": 132.2377,
"step": 10080
},
{
"epoch": 0.8075119014719435,
"grad_norm": 82.5625,
"learning_rate": 9.984228290570222e-07,
"loss": 131.6139,
"step": 10090
},
{
"epoch": 0.808312210591341,
"grad_norm": 75.4375,
"learning_rate": 9.984212659540062e-07,
"loss": 130.3295,
"step": 10100
},
{
"epoch": 0.8091125197107383,
"grad_norm": 75.5,
"learning_rate": 9.984197028509904e-07,
"loss": 131.7549,
"step": 10110
},
{
"epoch": 0.8099128288301356,
"grad_norm": 77.875,
"learning_rate": 9.984181397479747e-07,
"loss": 131.4637,
"step": 10120
},
{
"epoch": 0.810713137949533,
"grad_norm": 74.8125,
"learning_rate": 9.984165766449589e-07,
"loss": 131.4822,
"step": 10130
},
{
"epoch": 0.8115134470689304,
"grad_norm": 78.6875,
"learning_rate": 9.984150135419429e-07,
"loss": 131.1819,
"step": 10140
},
{
"epoch": 0.8123137561883278,
"grad_norm": 78.0,
"learning_rate": 9.98413450438927e-07,
"loss": 131.0785,
"step": 10150
},
{
"epoch": 0.8131140653077251,
"grad_norm": 78.9375,
"learning_rate": 9.984118873359113e-07,
"loss": 130.286,
"step": 10160
},
{
"epoch": 0.8139143744271224,
"grad_norm": 78.0625,
"learning_rate": 9.984103242328953e-07,
"loss": 131.6022,
"step": 10170
},
{
"epoch": 0.8147146835465199,
"grad_norm": 78.375,
"learning_rate": 9.984087611298795e-07,
"loss": 131.5716,
"step": 10180
},
{
"epoch": 0.8155149926659172,
"grad_norm": 74.875,
"learning_rate": 9.984071980268638e-07,
"loss": 131.0095,
"step": 10190
},
{
"epoch": 0.8163153017853145,
"grad_norm": 77.875,
"learning_rate": 9.98405634923848e-07,
"loss": 131.2664,
"step": 10200
},
{
"epoch": 0.817115610904712,
"grad_norm": 73.5625,
"learning_rate": 9.98404071820832e-07,
"loss": 131.8767,
"step": 10210
},
{
"epoch": 0.8179159200241093,
"grad_norm": 79.8125,
"learning_rate": 9.984025087178162e-07,
"loss": 128.864,
"step": 10220
},
{
"epoch": 0.8187162291435067,
"grad_norm": 77.375,
"learning_rate": 9.984009456148004e-07,
"loss": 130.4429,
"step": 10230
},
{
"epoch": 0.819516538262904,
"grad_norm": 70.8125,
"learning_rate": 9.983993825117846e-07,
"loss": 128.6023,
"step": 10240
},
{
"epoch": 0.8203168473823014,
"grad_norm": 74.4375,
"learning_rate": 9.983978194087689e-07,
"loss": 130.9503,
"step": 10250
},
{
"epoch": 0.8211171565016988,
"grad_norm": 80.125,
"learning_rate": 9.983962563057529e-07,
"loss": 132.8031,
"step": 10260
},
{
"epoch": 0.8219174656210961,
"grad_norm": 73.8125,
"learning_rate": 9.98394693202737e-07,
"loss": 130.115,
"step": 10270
},
{
"epoch": 0.8227177747404936,
"grad_norm": 73.75,
"learning_rate": 9.983931300997213e-07,
"loss": 130.5055,
"step": 10280
},
{
"epoch": 0.8235180838598909,
"grad_norm": 74.0,
"learning_rate": 9.983915669967053e-07,
"loss": 130.6606,
"step": 10290
},
{
"epoch": 0.8243183929792882,
"grad_norm": 78.6875,
"learning_rate": 9.983900038936895e-07,
"loss": 130.8658,
"step": 10300
},
{
"epoch": 0.8251187020986857,
"grad_norm": 78.0,
"learning_rate": 9.983884407906737e-07,
"loss": 131.8639,
"step": 10310
},
{
"epoch": 0.825919011218083,
"grad_norm": 74.0,
"learning_rate": 9.98386877687658e-07,
"loss": 131.2057,
"step": 10320
},
{
"epoch": 0.8267193203374803,
"grad_norm": 78.0625,
"learning_rate": 9.98385314584642e-07,
"loss": 131.9255,
"step": 10330
},
{
"epoch": 0.8275196294568777,
"grad_norm": 78.75,
"learning_rate": 9.983837514816262e-07,
"loss": 131.7983,
"step": 10340
},
{
"epoch": 0.8283199385762751,
"grad_norm": 81.9375,
"learning_rate": 9.983821883786104e-07,
"loss": 132.316,
"step": 10350
},
{
"epoch": 0.8291202476956725,
"grad_norm": 80.875,
"learning_rate": 9.983806252755946e-07,
"loss": 131.1004,
"step": 10360
},
{
"epoch": 0.8299205568150698,
"grad_norm": 71.0,
"learning_rate": 9.983790621725786e-07,
"loss": 132.1643,
"step": 10370
},
{
"epoch": 0.8307208659344671,
"grad_norm": 74.375,
"learning_rate": 9.983774990695628e-07,
"loss": 130.8654,
"step": 10380
},
{
"epoch": 0.8315211750538646,
"grad_norm": 74.875,
"learning_rate": 9.98375935966547e-07,
"loss": 130.205,
"step": 10390
},
{
"epoch": 0.8323214841732619,
"grad_norm": 73.375,
"learning_rate": 9.983743728635313e-07,
"loss": 130.6661,
"step": 10400
},
{
"epoch": 0.8331217932926593,
"grad_norm": 73.4375,
"learning_rate": 9.983728097605155e-07,
"loss": 131.2944,
"step": 10410
},
{
"epoch": 0.8339221024120567,
"grad_norm": 75.875,
"learning_rate": 9.983712466574995e-07,
"loss": 132.2854,
"step": 10420
},
{
"epoch": 0.834722411531454,
"grad_norm": 80.625,
"learning_rate": 9.983696835544837e-07,
"loss": 130.7651,
"step": 10430
},
{
"epoch": 0.8355227206508514,
"grad_norm": 71.6875,
"learning_rate": 9.98368120451468e-07,
"loss": 130.8648,
"step": 10440
},
{
"epoch": 0.8363230297702487,
"grad_norm": 77.125,
"learning_rate": 9.98366557348452e-07,
"loss": 130.8581,
"step": 10450
},
{
"epoch": 0.8371233388896461,
"grad_norm": 73.625,
"learning_rate": 9.983649942454362e-07,
"loss": 130.3991,
"step": 10460
},
{
"epoch": 0.8379236480090435,
"grad_norm": 79.375,
"learning_rate": 9.983634311424204e-07,
"loss": 131.2294,
"step": 10470
},
{
"epoch": 0.8387239571284408,
"grad_norm": 78.3125,
"learning_rate": 9.983618680394046e-07,
"loss": 129.3754,
"step": 10480
},
{
"epoch": 0.8395242662478383,
"grad_norm": 81.75,
"learning_rate": 9.983603049363886e-07,
"loss": 131.7653,
"step": 10490
},
{
"epoch": 0.8403245753672356,
"grad_norm": 74.9375,
"learning_rate": 9.983587418333728e-07,
"loss": 131.5391,
"step": 10500
},
{
"epoch": 0.8411248844866329,
"grad_norm": 76.75,
"learning_rate": 9.98357178730357e-07,
"loss": 131.4889,
"step": 10510
},
{
"epoch": 0.8419251936060304,
"grad_norm": 72.1875,
"learning_rate": 9.98355615627341e-07,
"loss": 133.0458,
"step": 10520
},
{
"epoch": 0.8427255027254277,
"grad_norm": 81.3125,
"learning_rate": 9.983540525243253e-07,
"loss": 130.8461,
"step": 10530
},
{
"epoch": 0.8435258118448251,
"grad_norm": 74.75,
"learning_rate": 9.983524894213095e-07,
"loss": 131.7515,
"step": 10540
},
{
"epoch": 0.8443261209642224,
"grad_norm": 70.8125,
"learning_rate": 9.983509263182937e-07,
"loss": 132.1545,
"step": 10550
},
{
"epoch": 0.8451264300836198,
"grad_norm": 76.9375,
"learning_rate": 9.98349363215278e-07,
"loss": 130.9946,
"step": 10560
},
{
"epoch": 0.8459267392030172,
"grad_norm": 72.25,
"learning_rate": 9.983478001122621e-07,
"loss": 130.338,
"step": 10570
},
{
"epoch": 0.8467270483224145,
"grad_norm": 75.1875,
"learning_rate": 9.983462370092462e-07,
"loss": 131.5349,
"step": 10580
},
{
"epoch": 0.8475273574418118,
"grad_norm": 70.125,
"learning_rate": 9.983446739062304e-07,
"loss": 130.6219,
"step": 10590
},
{
"epoch": 0.8483276665612093,
"grad_norm": 78.0625,
"learning_rate": 9.983431108032146e-07,
"loss": 131.3493,
"step": 10600
},
{
"epoch": 0.8491279756806066,
"grad_norm": 74.0,
"learning_rate": 9.983415477001986e-07,
"loss": 131.0083,
"step": 10610
},
{
"epoch": 0.849928284800004,
"grad_norm": 80.75,
"learning_rate": 9.983399845971828e-07,
"loss": 131.9277,
"step": 10620
},
{
"epoch": 0.8507285939194014,
"grad_norm": 79.1875,
"learning_rate": 9.98338421494167e-07,
"loss": 130.602,
"step": 10630
},
{
"epoch": 0.8515289030387987,
"grad_norm": 77.5625,
"learning_rate": 9.98336858391151e-07,
"loss": 130.4636,
"step": 10640
},
{
"epoch": 0.8523292121581961,
"grad_norm": 77.75,
"learning_rate": 9.983352952881353e-07,
"loss": 131.2724,
"step": 10650
},
{
"epoch": 0.8531295212775935,
"grad_norm": 78.875,
"learning_rate": 9.983337321851195e-07,
"loss": 130.8466,
"step": 10660
},
{
"epoch": 0.8539298303969909,
"grad_norm": 77.375,
"learning_rate": 9.983321690821037e-07,
"loss": 130.6045,
"step": 10670
},
{
"epoch": 0.8547301395163882,
"grad_norm": 77.875,
"learning_rate": 9.983306059790877e-07,
"loss": 131.8468,
"step": 10680
},
{
"epoch": 0.8555304486357855,
"grad_norm": 75.25,
"learning_rate": 9.98329042876072e-07,
"loss": 130.7249,
"step": 10690
},
{
"epoch": 0.856330757755183,
"grad_norm": 79.25,
"learning_rate": 9.983274797730561e-07,
"loss": 130.1633,
"step": 10700
},
{
"epoch": 0.8571310668745803,
"grad_norm": 74.625,
"learning_rate": 9.983259166700404e-07,
"loss": 130.6425,
"step": 10710
},
{
"epoch": 0.8579313759939776,
"grad_norm": 75.25,
"learning_rate": 9.983243535670246e-07,
"loss": 130.9092,
"step": 10720
},
{
"epoch": 0.858731685113375,
"grad_norm": 77.625,
"learning_rate": 9.983227904640088e-07,
"loss": 129.9958,
"step": 10730
},
{
"epoch": 0.8595319942327724,
"grad_norm": 79.5,
"learning_rate": 9.983212273609928e-07,
"loss": 132.0485,
"step": 10740
},
{
"epoch": 0.8603323033521698,
"grad_norm": 77.0625,
"learning_rate": 9.98319664257977e-07,
"loss": 131.1308,
"step": 10750
},
{
"epoch": 0.8611326124715671,
"grad_norm": 82.4375,
"learning_rate": 9.983181011549612e-07,
"loss": 131.665,
"step": 10760
},
{
"epoch": 0.8619329215909645,
"grad_norm": 74.375,
"learning_rate": 9.983165380519452e-07,
"loss": 131.7367,
"step": 10770
},
{
"epoch": 0.8627332307103619,
"grad_norm": 77.9375,
"learning_rate": 9.983149749489295e-07,
"loss": 130.5749,
"step": 10780
},
{
"epoch": 0.8635335398297592,
"grad_norm": 72.1875,
"learning_rate": 9.983134118459137e-07,
"loss": 129.4679,
"step": 10790
},
{
"epoch": 0.8643338489491567,
"grad_norm": 79.1875,
"learning_rate": 9.983118487428977e-07,
"loss": 132.4236,
"step": 10800
},
{
"epoch": 0.865134158068554,
"grad_norm": 72.75,
"learning_rate": 9.98310285639882e-07,
"loss": 129.7827,
"step": 10810
},
{
"epoch": 0.8659344671879513,
"grad_norm": 73.5625,
"learning_rate": 9.983087225368661e-07,
"loss": 131.0587,
"step": 10820
},
{
"epoch": 0.8667347763073487,
"grad_norm": 75.375,
"learning_rate": 9.983071594338503e-07,
"loss": 131.5654,
"step": 10830
},
{
"epoch": 0.8675350854267461,
"grad_norm": 79.875,
"learning_rate": 9.983055963308343e-07,
"loss": 131.1106,
"step": 10840
},
{
"epoch": 0.8683353945461435,
"grad_norm": 74.0625,
"learning_rate": 9.983040332278186e-07,
"loss": 131.5627,
"step": 10850
},
{
"epoch": 0.8691357036655408,
"grad_norm": 75.0,
"learning_rate": 9.983024701248028e-07,
"loss": 130.7058,
"step": 10860
},
{
"epoch": 0.8699360127849382,
"grad_norm": 77.125,
"learning_rate": 9.98300907021787e-07,
"loss": 131.1618,
"step": 10870
},
{
"epoch": 0.8707363219043356,
"grad_norm": 78.5,
"learning_rate": 9.982993439187712e-07,
"loss": 131.6918,
"step": 10880
},
{
"epoch": 0.8715366310237329,
"grad_norm": 77.0,
"learning_rate": 9.982977808157552e-07,
"loss": 130.2754,
"step": 10890
},
{
"epoch": 0.8723369401431302,
"grad_norm": 76.375,
"learning_rate": 9.982962177127394e-07,
"loss": 130.6358,
"step": 10900
},
{
"epoch": 0.8731372492625277,
"grad_norm": 76.1875,
"learning_rate": 9.982946546097237e-07,
"loss": 131.4433,
"step": 10910
},
{
"epoch": 0.873937558381925,
"grad_norm": 76.8125,
"learning_rate": 9.982930915067079e-07,
"loss": 131.1567,
"step": 10920
},
{
"epoch": 0.8747378675013224,
"grad_norm": 75.0625,
"learning_rate": 9.982915284036919e-07,
"loss": 130.6782,
"step": 10930
},
{
"epoch": 0.8755381766207198,
"grad_norm": 73.4375,
"learning_rate": 9.982899653006761e-07,
"loss": 130.3121,
"step": 10940
},
{
"epoch": 0.8763384857401171,
"grad_norm": 75.25,
"learning_rate": 9.982884021976603e-07,
"loss": 131.139,
"step": 10950
},
{
"epoch": 0.8771387948595145,
"grad_norm": 73.4375,
"learning_rate": 9.982868390946443e-07,
"loss": 130.9144,
"step": 10960
},
{
"epoch": 0.8779391039789118,
"grad_norm": 79.5,
"learning_rate": 9.982852759916286e-07,
"loss": 129.6913,
"step": 10970
},
{
"epoch": 0.8787394130983093,
"grad_norm": 73.3125,
"learning_rate": 9.982837128886128e-07,
"loss": 129.9289,
"step": 10980
},
{
"epoch": 0.8795397222177066,
"grad_norm": 73.8125,
"learning_rate": 9.98282149785597e-07,
"loss": 132.2781,
"step": 10990
},
{
"epoch": 0.8803400313371039,
"grad_norm": 75.9375,
"learning_rate": 9.98280586682581e-07,
"loss": 130.8845,
"step": 11000
},
{
"epoch": 0.8811403404565014,
"grad_norm": 76.25,
"learning_rate": 9.982790235795652e-07,
"loss": 129.7611,
"step": 11010
},
{
"epoch": 0.8819406495758987,
"grad_norm": 83.375,
"learning_rate": 9.982774604765494e-07,
"loss": 129.9167,
"step": 11020
},
{
"epoch": 0.882740958695296,
"grad_norm": 76.3125,
"learning_rate": 9.982758973735337e-07,
"loss": 130.6375,
"step": 11030
},
{
"epoch": 0.8835412678146934,
"grad_norm": 77.4375,
"learning_rate": 9.982743342705179e-07,
"loss": 130.627,
"step": 11040
},
{
"epoch": 0.8843415769340908,
"grad_norm": 71.3125,
"learning_rate": 9.982727711675019e-07,
"loss": 130.6759,
"step": 11050
},
{
"epoch": 0.8851418860534882,
"grad_norm": 77.875,
"learning_rate": 9.98271208064486e-07,
"loss": 131.0551,
"step": 11060
},
{
"epoch": 0.8859421951728855,
"grad_norm": 73.8125,
"learning_rate": 9.982696449614703e-07,
"loss": 130.5227,
"step": 11070
},
{
"epoch": 0.8867425042922829,
"grad_norm": 77.25,
"learning_rate": 9.982680818584545e-07,
"loss": 132.4977,
"step": 11080
},
{
"epoch": 0.8875428134116803,
"grad_norm": 81.375,
"learning_rate": 9.982665187554385e-07,
"loss": 131.0946,
"step": 11090
},
{
"epoch": 0.8883431225310776,
"grad_norm": 74.375,
"learning_rate": 9.982649556524228e-07,
"loss": 130.3095,
"step": 11100
},
{
"epoch": 0.889143431650475,
"grad_norm": 76.3125,
"learning_rate": 9.98263392549407e-07,
"loss": 131.8633,
"step": 11110
},
{
"epoch": 0.8899437407698724,
"grad_norm": 83.125,
"learning_rate": 9.98261829446391e-07,
"loss": 131.3789,
"step": 11120
},
{
"epoch": 0.8907440498892697,
"grad_norm": 71.375,
"learning_rate": 9.982602663433752e-07,
"loss": 131.0635,
"step": 11130
},
{
"epoch": 0.8915443590086671,
"grad_norm": 76.0625,
"learning_rate": 9.982587032403594e-07,
"loss": 131.6485,
"step": 11140
},
{
"epoch": 0.8923446681280645,
"grad_norm": 84.5625,
"learning_rate": 9.982571401373434e-07,
"loss": 131.5727,
"step": 11150
},
{
"epoch": 0.8931449772474618,
"grad_norm": 76.3125,
"learning_rate": 9.982555770343276e-07,
"loss": 131.7047,
"step": 11160
},
{
"epoch": 0.8939452863668592,
"grad_norm": 78.25,
"learning_rate": 9.982540139313119e-07,
"loss": 132.1118,
"step": 11170
},
{
"epoch": 0.8947455954862565,
"grad_norm": 77.5,
"learning_rate": 9.98252450828296e-07,
"loss": 130.6895,
"step": 11180
},
{
"epoch": 0.895545904605654,
"grad_norm": 74.75,
"learning_rate": 9.982508877252803e-07,
"loss": 130.8122,
"step": 11190
},
{
"epoch": 0.8963462137250513,
"grad_norm": 75.0,
"learning_rate": 9.982493246222645e-07,
"loss": 130.9431,
"step": 11200
},
{
"epoch": 0.8971465228444486,
"grad_norm": 75.375,
"learning_rate": 9.982477615192485e-07,
"loss": 131.6024,
"step": 11210
},
{
"epoch": 0.8979468319638461,
"grad_norm": 75.25,
"learning_rate": 9.982461984162327e-07,
"loss": 130.6127,
"step": 11220
},
{
"epoch": 0.8987471410832434,
"grad_norm": 84.375,
"learning_rate": 9.98244635313217e-07,
"loss": 132.7165,
"step": 11230
},
{
"epoch": 0.8995474502026408,
"grad_norm": 76.75,
"learning_rate": 9.982430722102012e-07,
"loss": 129.617,
"step": 11240
},
{
"epoch": 0.9003477593220381,
"grad_norm": 84.6875,
"learning_rate": 9.982415091071852e-07,
"loss": 130.9968,
"step": 11250
},
{
"epoch": 0.9011480684414355,
"grad_norm": 74.1875,
"learning_rate": 9.982399460041694e-07,
"loss": 130.6432,
"step": 11260
},
{
"epoch": 0.9019483775608329,
"grad_norm": 77.9375,
"learning_rate": 9.982383829011536e-07,
"loss": 131.2904,
"step": 11270
},
{
"epoch": 0.9027486866802302,
"grad_norm": 71.9375,
"learning_rate": 9.982368197981376e-07,
"loss": 131.8605,
"step": 11280
},
{
"epoch": 0.9035489957996276,
"grad_norm": 76.75,
"learning_rate": 9.982352566951218e-07,
"loss": 132.6155,
"step": 11290
},
{
"epoch": 0.904349304919025,
"grad_norm": 74.8125,
"learning_rate": 9.98233693592106e-07,
"loss": 130.2238,
"step": 11300
},
{
"epoch": 0.9051496140384223,
"grad_norm": 78.4375,
"learning_rate": 9.9823213048909e-07,
"loss": 131.5677,
"step": 11310
},
{
"epoch": 0.9059499231578197,
"grad_norm": 72.0,
"learning_rate": 9.982305673860743e-07,
"loss": 130.6105,
"step": 11320
},
{
"epoch": 0.9067502322772171,
"grad_norm": 74.375,
"learning_rate": 9.982290042830585e-07,
"loss": 130.9643,
"step": 11330
},
{
"epoch": 0.9075505413966144,
"grad_norm": 80.125,
"learning_rate": 9.982274411800427e-07,
"loss": 131.6026,
"step": 11340
},
{
"epoch": 0.9083508505160118,
"grad_norm": 79.1875,
"learning_rate": 9.98225878077027e-07,
"loss": 131.4425,
"step": 11350
},
{
"epoch": 0.9091511596354092,
"grad_norm": 76.875,
"learning_rate": 9.98224314974011e-07,
"loss": 131.8976,
"step": 11360
},
{
"epoch": 0.9099514687548066,
"grad_norm": 80.0625,
"learning_rate": 9.982227518709952e-07,
"loss": 131.6046,
"step": 11370
},
{
"epoch": 0.9107517778742039,
"grad_norm": 80.0625,
"learning_rate": 9.982211887679794e-07,
"loss": 130.2627,
"step": 11380
},
{
"epoch": 0.9115520869936012,
"grad_norm": 73.6875,
"learning_rate": 9.982196256649636e-07,
"loss": 131.1418,
"step": 11390
},
{
"epoch": 0.9123523961129987,
"grad_norm": 75.3125,
"learning_rate": 9.982180625619476e-07,
"loss": 130.4075,
"step": 11400
},
{
"epoch": 0.913152705232396,
"grad_norm": 81.3125,
"learning_rate": 9.982164994589318e-07,
"loss": 131.0209,
"step": 11410
},
{
"epoch": 0.9139530143517933,
"grad_norm": 76.0625,
"learning_rate": 9.98214936355916e-07,
"loss": 130.3062,
"step": 11420
},
{
"epoch": 0.9147533234711908,
"grad_norm": 73.0,
"learning_rate": 9.982133732529003e-07,
"loss": 131.0498,
"step": 11430
},
{
"epoch": 0.9155536325905881,
"grad_norm": 80.6875,
"learning_rate": 9.982118101498843e-07,
"loss": 130.2114,
"step": 11440
},
{
"epoch": 0.9163539417099855,
"grad_norm": 75.25,
"learning_rate": 9.982102470468685e-07,
"loss": 130.6461,
"step": 11450
},
{
"epoch": 0.9171542508293828,
"grad_norm": 82.5,
"learning_rate": 9.982086839438527e-07,
"loss": 131.5808,
"step": 11460
},
{
"epoch": 0.9179545599487802,
"grad_norm": 75.0625,
"learning_rate": 9.982071208408367e-07,
"loss": 131.6145,
"step": 11470
},
{
"epoch": 0.9187548690681776,
"grad_norm": 77.375,
"learning_rate": 9.98205557737821e-07,
"loss": 131.2061,
"step": 11480
},
{
"epoch": 0.9195551781875749,
"grad_norm": 82.75,
"learning_rate": 9.982039946348052e-07,
"loss": 130.0431,
"step": 11490
},
{
"epoch": 0.9203554873069724,
"grad_norm": 73.8125,
"learning_rate": 9.982024315317894e-07,
"loss": 131.8762,
"step": 11500
},
{
"epoch": 0.9211557964263697,
"grad_norm": 71.25,
"learning_rate": 9.982008684287736e-07,
"loss": 131.6012,
"step": 11510
},
{
"epoch": 0.921956105545767,
"grad_norm": 77.5,
"learning_rate": 9.981993053257576e-07,
"loss": 131.3987,
"step": 11520
},
{
"epoch": 0.9227564146651644,
"grad_norm": 79.3125,
"learning_rate": 9.981977422227418e-07,
"loss": 132.0234,
"step": 11530
},
{
"epoch": 0.9235567237845618,
"grad_norm": 81.0,
"learning_rate": 9.98196179119726e-07,
"loss": 132.7055,
"step": 11540
},
{
"epoch": 0.9243570329039592,
"grad_norm": 81.5,
"learning_rate": 9.981946160167102e-07,
"loss": 131.5491,
"step": 11550
},
{
"epoch": 0.9251573420233565,
"grad_norm": 81.3125,
"learning_rate": 9.981930529136943e-07,
"loss": 129.9155,
"step": 11560
},
{
"epoch": 0.9259576511427539,
"grad_norm": 77.5,
"learning_rate": 9.981914898106785e-07,
"loss": 131.0387,
"step": 11570
},
{
"epoch": 0.9267579602621513,
"grad_norm": 74.5625,
"learning_rate": 9.981899267076627e-07,
"loss": 129.5762,
"step": 11580
},
{
"epoch": 0.9275582693815486,
"grad_norm": 72.5,
"learning_rate": 9.981883636046467e-07,
"loss": 129.8288,
"step": 11590
},
{
"epoch": 0.9283585785009459,
"grad_norm": 74.6875,
"learning_rate": 9.98186800501631e-07,
"loss": 130.526,
"step": 11600
},
{
"epoch": 0.9291588876203434,
"grad_norm": 79.3125,
"learning_rate": 9.981852373986151e-07,
"loss": 130.7928,
"step": 11610
},
{
"epoch": 0.9299591967397407,
"grad_norm": 74.5625,
"learning_rate": 9.981836742955994e-07,
"loss": 130.5095,
"step": 11620
},
{
"epoch": 0.9307595058591381,
"grad_norm": 78.6875,
"learning_rate": 9.981821111925834e-07,
"loss": 130.1216,
"step": 11630
},
{
"epoch": 0.9315598149785355,
"grad_norm": 73.9375,
"learning_rate": 9.981805480895676e-07,
"loss": 130.3142,
"step": 11640
},
{
"epoch": 0.9323601240979328,
"grad_norm": 80.25,
"learning_rate": 9.981789849865518e-07,
"loss": 129.8451,
"step": 11650
},
{
"epoch": 0.9331604332173302,
"grad_norm": 77.75,
"learning_rate": 9.98177421883536e-07,
"loss": 131.5875,
"step": 11660
},
{
"epoch": 0.9339607423367275,
"grad_norm": 75.9375,
"learning_rate": 9.981758587805202e-07,
"loss": 131.1091,
"step": 11670
},
{
"epoch": 0.934761051456125,
"grad_norm": 79.4375,
"learning_rate": 9.981742956775042e-07,
"loss": 129.5825,
"step": 11680
},
{
"epoch": 0.9355613605755223,
"grad_norm": 78.375,
"learning_rate": 9.981727325744885e-07,
"loss": 131.5865,
"step": 11690
},
{
"epoch": 0.9363616696949196,
"grad_norm": 74.25,
"learning_rate": 9.981711694714727e-07,
"loss": 131.3969,
"step": 11700
},
{
"epoch": 0.9371619788143171,
"grad_norm": 82.5625,
"learning_rate": 9.98169606368457e-07,
"loss": 131.5427,
"step": 11710
},
{
"epoch": 0.9379622879337144,
"grad_norm": 81.125,
"learning_rate": 9.98168043265441e-07,
"loss": 131.2085,
"step": 11720
},
{
"epoch": 0.9387625970531117,
"grad_norm": 76.25,
"learning_rate": 9.981664801624251e-07,
"loss": 130.8618,
"step": 11730
},
{
"epoch": 0.9395629061725091,
"grad_norm": 75.0625,
"learning_rate": 9.981649170594093e-07,
"loss": 130.0715,
"step": 11740
},
{
"epoch": 0.9403632152919065,
"grad_norm": 76.4375,
"learning_rate": 9.981633539563933e-07,
"loss": 131.8718,
"step": 11750
},
{
"epoch": 0.9411635244113039,
"grad_norm": 78.1875,
"learning_rate": 9.981617908533776e-07,
"loss": 131.3843,
"step": 11760
},
{
"epoch": 0.9419638335307012,
"grad_norm": 75.3125,
"learning_rate": 9.981602277503618e-07,
"loss": 130.1011,
"step": 11770
},
{
"epoch": 0.9427641426500986,
"grad_norm": 78.0625,
"learning_rate": 9.98158664647346e-07,
"loss": 131.4609,
"step": 11780
},
{
"epoch": 0.943564451769496,
"grad_norm": 73.75,
"learning_rate": 9.9815710154433e-07,
"loss": 130.214,
"step": 11790
},
{
"epoch": 0.9443647608888933,
"grad_norm": 81.3125,
"learning_rate": 9.981555384413142e-07,
"loss": 132.272,
"step": 11800
},
{
"epoch": 0.9451650700082908,
"grad_norm": 76.0625,
"learning_rate": 9.981539753382984e-07,
"loss": 130.5801,
"step": 11810
},
{
"epoch": 0.9459653791276881,
"grad_norm": 71.3125,
"learning_rate": 9.981524122352827e-07,
"loss": 130.7302,
"step": 11820
},
{
"epoch": 0.9467656882470854,
"grad_norm": 84.0625,
"learning_rate": 9.981508491322667e-07,
"loss": 130.839,
"step": 11830
},
{
"epoch": 0.9475659973664828,
"grad_norm": 78.5,
"learning_rate": 9.981492860292509e-07,
"loss": 130.5891,
"step": 11840
},
{
"epoch": 0.9483663064858802,
"grad_norm": 78.4375,
"learning_rate": 9.98147722926235e-07,
"loss": 130.0701,
"step": 11850
},
{
"epoch": 0.9491666156052775,
"grad_norm": 76.1875,
"learning_rate": 9.981461598232193e-07,
"loss": 130.6237,
"step": 11860
},
{
"epoch": 0.9499669247246749,
"grad_norm": 79.625,
"learning_rate": 9.981445967202035e-07,
"loss": 131.0294,
"step": 11870
},
{
"epoch": 0.9507672338440722,
"grad_norm": 79.3125,
"learning_rate": 9.981430336171875e-07,
"loss": 132.2408,
"step": 11880
},
{
"epoch": 0.9515675429634697,
"grad_norm": 72.0625,
"learning_rate": 9.981414705141718e-07,
"loss": 131.3878,
"step": 11890
},
{
"epoch": 0.952367852082867,
"grad_norm": 75.5625,
"learning_rate": 9.98139907411156e-07,
"loss": 131.9279,
"step": 11900
},
{
"epoch": 0.9531681612022643,
"grad_norm": 86.5,
"learning_rate": 9.9813834430814e-07,
"loss": 130.3583,
"step": 11910
},
{
"epoch": 0.9539684703216618,
"grad_norm": 74.4375,
"learning_rate": 9.981367812051242e-07,
"loss": 130.4393,
"step": 11920
},
{
"epoch": 0.9547687794410591,
"grad_norm": 74.6875,
"learning_rate": 9.981352181021084e-07,
"loss": 129.8773,
"step": 11930
},
{
"epoch": 0.9555690885604565,
"grad_norm": 77.6875,
"learning_rate": 9.981336549990924e-07,
"loss": 130.8676,
"step": 11940
},
{
"epoch": 0.9563693976798538,
"grad_norm": 80.5,
"learning_rate": 9.981320918960767e-07,
"loss": 131.1644,
"step": 11950
},
{
"epoch": 0.9571697067992512,
"grad_norm": 81.125,
"learning_rate": 9.981305287930609e-07,
"loss": 131.0869,
"step": 11960
},
{
"epoch": 0.9579700159186486,
"grad_norm": 85.0,
"learning_rate": 9.98128965690045e-07,
"loss": 130.5896,
"step": 11970
},
{
"epoch": 0.9587703250380459,
"grad_norm": 72.625,
"learning_rate": 9.981274025870293e-07,
"loss": 131.133,
"step": 11980
},
{
"epoch": 0.9595706341574433,
"grad_norm": 77.625,
"learning_rate": 9.981258394840133e-07,
"loss": 131.9948,
"step": 11990
},
{
"epoch": 0.9603709432768407,
"grad_norm": 78.25,
"learning_rate": 9.981242763809975e-07,
"loss": 130.3504,
"step": 12000
},
{
"epoch": 0.961171252396238,
"grad_norm": 69.0625,
"learning_rate": 9.981227132779817e-07,
"loss": 131.8812,
"step": 12010
},
{
"epoch": 0.9619715615156355,
"grad_norm": 76.125,
"learning_rate": 9.98121150174966e-07,
"loss": 131.7717,
"step": 12020
},
{
"epoch": 0.9627718706350328,
"grad_norm": 76.6875,
"learning_rate": 9.981195870719502e-07,
"loss": 131.0739,
"step": 12030
},
{
"epoch": 0.9635721797544301,
"grad_norm": 79.5,
"learning_rate": 9.981180239689342e-07,
"loss": 132.3484,
"step": 12040
},
{
"epoch": 0.9643724888738275,
"grad_norm": 81.25,
"learning_rate": 9.981164608659184e-07,
"loss": 131.6032,
"step": 12050
},
{
"epoch": 0.9651727979932249,
"grad_norm": 80.375,
"learning_rate": 9.981148977629026e-07,
"loss": 131.8261,
"step": 12060
},
{
"epoch": 0.9659731071126223,
"grad_norm": 80.9375,
"learning_rate": 9.981133346598866e-07,
"loss": 129.7895,
"step": 12070
},
{
"epoch": 0.9667734162320196,
"grad_norm": 77.75,
"learning_rate": 9.981117715568709e-07,
"loss": 129.8964,
"step": 12080
},
{
"epoch": 0.967573725351417,
"grad_norm": 72.5,
"learning_rate": 9.98110208453855e-07,
"loss": 130.2092,
"step": 12090
},
{
"epoch": 0.9683740344708144,
"grad_norm": 70.75,
"learning_rate": 9.98108645350839e-07,
"loss": 130.064,
"step": 12100
},
{
"epoch": 0.9691743435902117,
"grad_norm": 82.3125,
"learning_rate": 9.981070822478233e-07,
"loss": 133.1195,
"step": 12110
},
{
"epoch": 0.9699746527096091,
"grad_norm": 73.375,
"learning_rate": 9.981055191448075e-07,
"loss": 131.6047,
"step": 12120
},
{
"epoch": 0.9707749618290065,
"grad_norm": 78.0625,
"learning_rate": 9.981039560417917e-07,
"loss": 131.0189,
"step": 12130
},
{
"epoch": 0.9715752709484038,
"grad_norm": 80.375,
"learning_rate": 9.98102392938776e-07,
"loss": 131.1126,
"step": 12140
},
{
"epoch": 0.9723755800678012,
"grad_norm": 74.4375,
"learning_rate": 9.9810082983576e-07,
"loss": 131.885,
"step": 12150
},
{
"epoch": 0.9731758891871986,
"grad_norm": 78.25,
"learning_rate": 9.980992667327442e-07,
"loss": 130.9126,
"step": 12160
},
{
"epoch": 0.9739761983065959,
"grad_norm": 83.875,
"learning_rate": 9.980977036297284e-07,
"loss": 131.2764,
"step": 12170
},
{
"epoch": 0.9747765074259933,
"grad_norm": 72.875,
"learning_rate": 9.980961405267126e-07,
"loss": 132.7158,
"step": 12180
},
{
"epoch": 0.9755768165453906,
"grad_norm": 77.25,
"learning_rate": 9.980945774236968e-07,
"loss": 131.1642,
"step": 12190
},
{
"epoch": 0.9763771256647881,
"grad_norm": 77.125,
"learning_rate": 9.980930143206808e-07,
"loss": 130.3661,
"step": 12200
},
{
"epoch": 0.9771774347841854,
"grad_norm": 81.25,
"learning_rate": 9.98091451217665e-07,
"loss": 132.4058,
"step": 12210
},
{
"epoch": 0.9779777439035827,
"grad_norm": 77.1875,
"learning_rate": 9.980898881146493e-07,
"loss": 131.1993,
"step": 12220
},
{
"epoch": 0.9787780530229802,
"grad_norm": 78.375,
"learning_rate": 9.980883250116333e-07,
"loss": 129.8341,
"step": 12230
},
{
"epoch": 0.9795783621423775,
"grad_norm": 82.3125,
"learning_rate": 9.980867619086175e-07,
"loss": 130.408,
"step": 12240
},
{
"epoch": 0.9803786712617749,
"grad_norm": 79.375,
"learning_rate": 9.980851988056017e-07,
"loss": 132.0676,
"step": 12250
},
{
"epoch": 0.9811789803811722,
"grad_norm": 75.4375,
"learning_rate": 9.980836357025857e-07,
"loss": 130.5135,
"step": 12260
},
{
"epoch": 0.9819792895005696,
"grad_norm": 79.75,
"learning_rate": 9.9808207259957e-07,
"loss": 129.7958,
"step": 12270
},
{
"epoch": 0.982779598619967,
"grad_norm": 78.8125,
"learning_rate": 9.980805094965542e-07,
"loss": 131.8359,
"step": 12280
},
{
"epoch": 0.9835799077393643,
"grad_norm": 73.75,
"learning_rate": 9.980789463935384e-07,
"loss": 130.6134,
"step": 12290
},
{
"epoch": 0.9843802168587616,
"grad_norm": 79.0,
"learning_rate": 9.980773832905224e-07,
"loss": 130.2782,
"step": 12300
},
{
"epoch": 0.9851805259781591,
"grad_norm": 74.0,
"learning_rate": 9.980758201875066e-07,
"loss": 130.808,
"step": 12310
},
{
"epoch": 0.9859808350975564,
"grad_norm": 73.0,
"learning_rate": 9.980742570844908e-07,
"loss": 130.4844,
"step": 12320
},
{
"epoch": 0.9867811442169538,
"grad_norm": 72.4375,
"learning_rate": 9.98072693981475e-07,
"loss": 131.225,
"step": 12330
},
{
"epoch": 0.9875814533363512,
"grad_norm": 78.875,
"learning_rate": 9.980711308784593e-07,
"loss": 132.9991,
"step": 12340
},
{
"epoch": 0.9883817624557485,
"grad_norm": 75.9375,
"learning_rate": 9.980695677754435e-07,
"loss": 131.7026,
"step": 12350
},
{
"epoch": 0.9891820715751459,
"grad_norm": 74.125,
"learning_rate": 9.980680046724275e-07,
"loss": 130.6302,
"step": 12360
},
{
"epoch": 0.9899823806945433,
"grad_norm": 80.75,
"learning_rate": 9.980664415694117e-07,
"loss": 129.955,
"step": 12370
},
{
"epoch": 0.9907826898139407,
"grad_norm": 70.125,
"learning_rate": 9.98064878466396e-07,
"loss": 130.9879,
"step": 12380
},
{
"epoch": 0.991582998933338,
"grad_norm": 79.75,
"learning_rate": 9.9806331536338e-07,
"loss": 130.061,
"step": 12390
},
{
"epoch": 0.9923833080527353,
"grad_norm": 75.375,
"learning_rate": 9.980617522603641e-07,
"loss": 129.8558,
"step": 12400
},
{
"epoch": 0.9931836171721328,
"grad_norm": 78.4375,
"learning_rate": 9.980601891573484e-07,
"loss": 129.8251,
"step": 12410
},
{
"epoch": 0.9939839262915301,
"grad_norm": 70.1875,
"learning_rate": 9.980586260543324e-07,
"loss": 130.4507,
"step": 12420
},
{
"epoch": 0.9947842354109274,
"grad_norm": 74.4375,
"learning_rate": 9.980570629513166e-07,
"loss": 131.6167,
"step": 12430
},
{
"epoch": 0.9955845445303249,
"grad_norm": 76.1875,
"learning_rate": 9.980554998483008e-07,
"loss": 132.0188,
"step": 12440
},
{
"epoch": 0.9963848536497222,
"grad_norm": 79.6875,
"learning_rate": 9.98053936745285e-07,
"loss": 130.7771,
"step": 12450
},
{
"epoch": 0.9971851627691196,
"grad_norm": 74.25,
"learning_rate": 9.98052373642269e-07,
"loss": 130.4612,
"step": 12460
},
{
"epoch": 0.9979854718885169,
"grad_norm": 78.0625,
"learning_rate": 9.980508105392532e-07,
"loss": 129.6428,
"step": 12470
},
{
"epoch": 0.9987857810079143,
"grad_norm": 77.6875,
"learning_rate": 9.980492474362375e-07,
"loss": 130.6133,
"step": 12480
},
{
"epoch": 0.9995860901273117,
"grad_norm": 77.75,
"learning_rate": 9.980476843332217e-07,
"loss": 131.8606,
"step": 12490
}
],
"logging_steps": 10,
"max_steps": 12495,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 2500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.450015163893783e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}