{
"best_global_step": 50000,
"best_metric": 2.1760547161102295,
"best_model_checkpoint": "./sky-389m-tx-project/checkpoint-50000",
"epoch": 1.7269964078474715,
"eval_steps": 1000,
"global_step": 50000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0034539928156949434,
"grad_norm": 3.3880717754364014,
"learning_rate": 2.4750000000000002e-05,
"loss": 8.5386,
"step": 100
},
{
"epoch": 0.006907985631389887,
"grad_norm": 2.3290209770202637,
"learning_rate": 4.975e-05,
"loss": 6.7703,
"step": 200
},
{
"epoch": 0.01036197844708483,
"grad_norm": 2.278391122817993,
"learning_rate": 7.475e-05,
"loss": 6.0778,
"step": 300
},
{
"epoch": 0.013815971262779773,
"grad_norm": 1.8386310338974,
"learning_rate": 9.975000000000001e-05,
"loss": 5.7081,
"step": 400
},
{
"epoch": 0.017269964078474715,
"grad_norm": 1.0589734315872192,
"learning_rate": 0.00012475,
"loss": 5.3779,
"step": 500
},
{
"epoch": 0.02072395689416966,
"grad_norm": 1.060039758682251,
"learning_rate": 0.00014975,
"loss": 5.0624,
"step": 600
},
{
"epoch": 0.024177949709864602,
"grad_norm": 1.0565826892852783,
"learning_rate": 0.00017475,
"loss": 4.8215,
"step": 700
},
{
"epoch": 0.027631942525559547,
"grad_norm": 0.9314415454864502,
"learning_rate": 0.00019975,
"loss": 4.5897,
"step": 800
},
{
"epoch": 0.03108593534125449,
"grad_norm": 0.9964447021484375,
"learning_rate": 0.00022475000000000001,
"loss": 4.4049,
"step": 900
},
{
"epoch": 0.03453992815694943,
"grad_norm": 0.9212857484817505,
"learning_rate": 0.00024975,
"loss": 4.2051,
"step": 1000
},
{
"epoch": 0.03453992815694943,
"eval_loss": 4.235040187835693,
"eval_runtime": 935.7314,
"eval_samples_per_second": 162.843,
"eval_steps_per_second": 1.629,
"step": 1000
},
{
"epoch": 0.037993920972644375,
"grad_norm": 0.8937363028526306,
"learning_rate": 0.00027475,
"loss": 4.0112,
"step": 1100
},
{
"epoch": 0.04144791378833932,
"grad_norm": 0.8522709012031555,
"learning_rate": 0.00029975000000000005,
"loss": 3.8575,
"step": 1200
},
{
"epoch": 0.044901906604034265,
"grad_norm": 0.8380929231643677,
"learning_rate": 0.00032475,
"loss": 3.7379,
"step": 1300
},
{
"epoch": 0.048355899419729204,
"grad_norm": 0.704521894454956,
"learning_rate": 0.00034975,
"loss": 3.6267,
"step": 1400
},
{
"epoch": 0.05180989223542415,
"grad_norm": 0.7526060938835144,
"learning_rate": 0.00037475000000000003,
"loss": 3.5288,
"step": 1500
},
{
"epoch": 0.055263885051119094,
"grad_norm": 0.8213881254196167,
"learning_rate": 0.00039975,
"loss": 3.4769,
"step": 1600
},
{
"epoch": 0.05871787786681404,
"grad_norm": 0.6610364317893982,
"learning_rate": 0.00042475000000000005,
"loss": 3.4406,
"step": 1700
},
{
"epoch": 0.06217187068250898,
"grad_norm": 0.8810706734657288,
"learning_rate": 0.00044975,
"loss": 3.3477,
"step": 1800
},
{
"epoch": 0.06562586349820393,
"grad_norm": 1.3641518354415894,
"learning_rate": 0.00047475,
"loss": 3.3183,
"step": 1900
},
{
"epoch": 0.06907985631389886,
"grad_norm": 0.6155968904495239,
"learning_rate": 0.0004997500000000001,
"loss": 3.3016,
"step": 2000
},
{
"epoch": 0.06907985631389886,
"eval_loss": 3.551506280899048,
"eval_runtime": 937.0574,
"eval_samples_per_second": 162.612,
"eval_steps_per_second": 1.626,
"step": 2000
},
{
"epoch": 0.0725338491295938,
"grad_norm": 0.5413870215415955,
"learning_rate": 0.0004994375,
"loss": 3.2393,
"step": 2100
},
{
"epoch": 0.07598784194528875,
"grad_norm": 0.5018215775489807,
"learning_rate": 0.0004988693181818182,
"loss": 3.1889,
"step": 2200
},
{
"epoch": 0.0794418347609837,
"grad_norm": 0.5307313203811646,
"learning_rate": 0.0004983011363636364,
"loss": 3.1391,
"step": 2300
},
{
"epoch": 0.08289582757667864,
"grad_norm": 0.5371428728103638,
"learning_rate": 0.0004977329545454545,
"loss": 3.1102,
"step": 2400
},
{
"epoch": 0.08634982039237359,
"grad_norm": 0.5079624652862549,
"learning_rate": 0.0004971647727272727,
"loss": 3.0868,
"step": 2500
},
{
"epoch": 0.08980381320806853,
"grad_norm": 0.4562855660915375,
"learning_rate": 0.000496596590909091,
"loss": 3.0448,
"step": 2600
},
{
"epoch": 0.09325780602376348,
"grad_norm": 0.4660443067550659,
"learning_rate": 0.0004960284090909092,
"loss": 3.0341,
"step": 2700
},
{
"epoch": 0.09671179883945841,
"grad_norm": 0.48204541206359863,
"learning_rate": 0.0004954602272727273,
"loss": 2.9917,
"step": 2800
},
{
"epoch": 0.10016579165515335,
"grad_norm": 0.43557730317115784,
"learning_rate": 0.0004948920454545454,
"loss": 2.9464,
"step": 2900
},
{
"epoch": 0.1036197844708483,
"grad_norm": 0.5063506364822388,
"learning_rate": 0.0004943238636363637,
"loss": 2.9463,
"step": 3000
},
{
"epoch": 0.1036197844708483,
"eval_loss": 3.2142741680145264,
"eval_runtime": 937.7254,
"eval_samples_per_second": 162.496,
"eval_steps_per_second": 1.625,
"step": 3000
},
{
"epoch": 0.10707377728654324,
"grad_norm": 0.4616130590438843,
"learning_rate": 0.0004937556818181818,
"loss": 2.9168,
"step": 3100
},
{
"epoch": 0.11052777010223819,
"grad_norm": 0.447933554649353,
"learning_rate": 0.0004931875,
"loss": 2.9172,
"step": 3200
},
{
"epoch": 0.11398176291793313,
"grad_norm": 0.4423768222332001,
"learning_rate": 0.0004926193181818183,
"loss": 2.8891,
"step": 3300
},
{
"epoch": 0.11743575573362808,
"grad_norm": 0.4318563640117645,
"learning_rate": 0.0004920511363636364,
"loss": 2.8479,
"step": 3400
},
{
"epoch": 0.12088974854932302,
"grad_norm": 0.41672056913375854,
"learning_rate": 0.0004914829545454545,
"loss": 2.8462,
"step": 3500
},
{
"epoch": 0.12434374136501795,
"grad_norm": 0.3930817246437073,
"learning_rate": 0.0004909147727272727,
"loss": 2.8219,
"step": 3600
},
{
"epoch": 0.1277977341807129,
"grad_norm": 0.4133651852607727,
"learning_rate": 0.0004903465909090909,
"loss": 2.7957,
"step": 3700
},
{
"epoch": 0.13125172699640786,
"grad_norm": 0.40811586380004883,
"learning_rate": 0.0004897784090909091,
"loss": 2.7879,
"step": 3800
},
{
"epoch": 0.1347057198121028,
"grad_norm": 0.42227643728256226,
"learning_rate": 0.0004892102272727273,
"loss": 2.7716,
"step": 3900
},
{
"epoch": 0.13815971262779772,
"grad_norm": 0.41413313150405884,
"learning_rate": 0.0004886420454545455,
"loss": 2.7606,
"step": 4000
},
{
"epoch": 0.13815971262779772,
"eval_loss": 3.061166286468506,
"eval_runtime": 936.0728,
"eval_samples_per_second": 162.783,
"eval_steps_per_second": 1.628,
"step": 4000
},
{
"epoch": 0.14161370544349267,
"grad_norm": 0.40972092747688293,
"learning_rate": 0.00048807386363636365,
"loss": 2.7386,
"step": 4100
},
{
"epoch": 0.1450676982591876,
"grad_norm": 0.4020697772502899,
"learning_rate": 0.00048750568181818183,
"loss": 2.7522,
"step": 4200
},
{
"epoch": 0.14852169107488256,
"grad_norm": 0.40231621265411377,
"learning_rate": 0.0004869375,
"loss": 2.7442,
"step": 4300
},
{
"epoch": 0.1519756838905775,
"grad_norm": 0.455773264169693,
"learning_rate": 0.0004863693181818182,
"loss": 2.7146,
"step": 4400
},
{
"epoch": 0.15542967670627245,
"grad_norm": 0.38691282272338867,
"learning_rate": 0.00048580113636363637,
"loss": 2.6924,
"step": 4500
},
{
"epoch": 0.1588836695219674,
"grad_norm": 0.3897066116333008,
"learning_rate": 0.00048523295454545455,
"loss": 2.6964,
"step": 4600
},
{
"epoch": 0.16233766233766234,
"grad_norm": 0.3786475360393524,
"learning_rate": 0.00048466477272727273,
"loss": 2.6566,
"step": 4700
},
{
"epoch": 0.16579165515335728,
"grad_norm": 0.3838929235935211,
"learning_rate": 0.0004840965909090909,
"loss": 2.6634,
"step": 4800
},
{
"epoch": 0.16924564796905223,
"grad_norm": 0.3646841049194336,
"learning_rate": 0.0004835284090909091,
"loss": 2.6708,
"step": 4900
},
{
"epoch": 0.17269964078474717,
"grad_norm": 0.37178680300712585,
"learning_rate": 0.0004829602272727273,
"loss": 2.6389,
"step": 5000
},
{
"epoch": 0.17269964078474717,
"eval_loss": 2.940995216369629,
"eval_runtime": 935.4231,
"eval_samples_per_second": 162.896,
"eval_steps_per_second": 1.629,
"step": 5000
},
{
"epoch": 0.17615363360044212,
"grad_norm": 0.37742722034454346,
"learning_rate": 0.00048239204545454545,
"loss": 2.644,
"step": 5100
},
{
"epoch": 0.17960762641613706,
"grad_norm": 0.3702583611011505,
"learning_rate": 0.00048182386363636363,
"loss": 2.624,
"step": 5200
},
{
"epoch": 0.183061619231832,
"grad_norm": 0.4044618308544159,
"learning_rate": 0.0004812556818181818,
"loss": 2.6197,
"step": 5300
},
{
"epoch": 0.18651561204752695,
"grad_norm": 0.3829458951950073,
"learning_rate": 0.0004806875,
"loss": 2.614,
"step": 5400
},
{
"epoch": 0.1899696048632219,
"grad_norm": 0.3829841911792755,
"learning_rate": 0.0004801193181818182,
"loss": 2.6118,
"step": 5500
},
{
"epoch": 0.19342359767891681,
"grad_norm": 0.3528871238231659,
"learning_rate": 0.0004795511363636364,
"loss": 2.6041,
"step": 5600
},
{
"epoch": 0.19687759049461176,
"grad_norm": 0.3476055860519409,
"learning_rate": 0.00047898295454545454,
"loss": 2.5908,
"step": 5700
},
{
"epoch": 0.2003315833103067,
"grad_norm": 0.3490158021450043,
"learning_rate": 0.0004784147727272727,
"loss": 2.569,
"step": 5800
},
{
"epoch": 0.20378557612600165,
"grad_norm": 0.3507535457611084,
"learning_rate": 0.00047784659090909095,
"loss": 2.5502,
"step": 5900
},
{
"epoch": 0.2072395689416966,
"grad_norm": 0.37472763657569885,
"learning_rate": 0.0004772784090909091,
"loss": 2.5656,
"step": 6000
},
{
"epoch": 0.2072395689416966,
"eval_loss": 2.869264602661133,
"eval_runtime": 927.5758,
"eval_samples_per_second": 164.274,
"eval_steps_per_second": 1.027,
"step": 6000
},
{
"epoch": 0.21069356175739154,
"grad_norm": 0.34653300046920776,
"learning_rate": 0.00047671022727272726,
"loss": 2.5509,
"step": 6100
},
{
"epoch": 0.21414755457308648,
"grad_norm": 0.3335779011249542,
"learning_rate": 0.0004761420454545455,
"loss": 2.5421,
"step": 6200
},
{
"epoch": 0.21760154738878143,
"grad_norm": 0.37146443128585815,
"learning_rate": 0.0004755738636363636,
"loss": 2.5438,
"step": 6300
},
{
"epoch": 0.22105554020447638,
"grad_norm": 0.33024120330810547,
"learning_rate": 0.0004750056818181818,
"loss": 2.5318,
"step": 6400
},
{
"epoch": 0.22450953302017132,
"grad_norm": 0.3545812666416168,
"learning_rate": 0.00047443750000000003,
"loss": 2.5167,
"step": 6500
},
{
"epoch": 0.22796352583586627,
"grad_norm": 0.3502351641654968,
"learning_rate": 0.00047386931818181816,
"loss": 2.5247,
"step": 6600
},
{
"epoch": 0.2314175186515612,
"grad_norm": 0.35102933645248413,
"learning_rate": 0.00047330113636363634,
"loss": 2.5271,
"step": 6700
},
{
"epoch": 0.23487151146725616,
"grad_norm": 0.34355252981185913,
"learning_rate": 0.0004727329545454546,
"loss": 2.536,
"step": 6800
},
{
"epoch": 0.2383255042829511,
"grad_norm": 0.3270651400089264,
"learning_rate": 0.00047216477272727275,
"loss": 2.5081,
"step": 6900
},
{
"epoch": 0.24177949709864605,
"grad_norm": 0.35053566098213196,
"learning_rate": 0.0004715965909090909,
"loss": 2.4945,
"step": 7000
},
{
"epoch": 0.24177949709864605,
"eval_loss": 2.804372549057007,
"eval_runtime": 927.2887,
"eval_samples_per_second": 164.325,
"eval_steps_per_second": 1.028,
"step": 7000
},
{
"epoch": 0.245233489914341,
"grad_norm": 0.3321439325809479,
"learning_rate": 0.0004710284090909091,
"loss": 2.482,
"step": 7100
},
{
"epoch": 0.2486874827300359,
"grad_norm": 0.3228578567504883,
"learning_rate": 0.0004704602272727273,
"loss": 2.4787,
"step": 7200
},
{
"epoch": 0.25214147554573085,
"grad_norm": 0.3319440186023712,
"learning_rate": 0.0004698920454545454,
"loss": 2.4704,
"step": 7300
},
{
"epoch": 0.2555954683614258,
"grad_norm": 0.34676915407180786,
"learning_rate": 0.00046932386363636366,
"loss": 2.479,
"step": 7400
},
{
"epoch": 0.25904946117712074,
"grad_norm": 0.3456803560256958,
"learning_rate": 0.00046875568181818184,
"loss": 2.462,
"step": 7500
},
{
"epoch": 0.2625034539928157,
"grad_norm": 0.330388605594635,
"learning_rate": 0.00046818749999999996,
"loss": 2.4638,
"step": 7600
},
{
"epoch": 0.26595744680851063,
"grad_norm": 0.3278537690639496,
"learning_rate": 0.0004676193181818182,
"loss": 2.456,
"step": 7700
},
{
"epoch": 0.2694114396242056,
"grad_norm": 0.331632137298584,
"learning_rate": 0.0004670511363636364,
"loss": 2.4459,
"step": 7800
},
{
"epoch": 0.2728654324399005,
"grad_norm": 0.34204795956611633,
"learning_rate": 0.0004664829545454545,
"loss": 2.4545,
"step": 7900
},
{
"epoch": 0.27631942525559544,
"grad_norm": 0.33582791686058044,
"learning_rate": 0.00046591477272727274,
"loss": 2.4377,
"step": 8000
},
{
"epoch": 0.27631942525559544,
"eval_loss": 2.753157138824463,
"eval_runtime": 927.4163,
"eval_samples_per_second": 164.303,
"eval_steps_per_second": 1.028,
"step": 8000
},
{
"epoch": 0.2797734180712904,
"grad_norm": 0.3626213073730469,
"learning_rate": 0.0004653465909090909,
"loss": 2.4395,
"step": 8100
},
{
"epoch": 0.28322741088698533,
"grad_norm": 0.33439400792121887,
"learning_rate": 0.00046477840909090915,
"loss": 2.4267,
"step": 8200
},
{
"epoch": 0.2866814037026803,
"grad_norm": 0.31855249404907227,
"learning_rate": 0.0004642102272727273,
"loss": 2.4349,
"step": 8300
},
{
"epoch": 0.2901353965183752,
"grad_norm": 0.3519601821899414,
"learning_rate": 0.00046364204545454546,
"loss": 2.4248,
"step": 8400
},
{
"epoch": 0.2935893893340702,
"grad_norm": 0.31838154792785645,
"learning_rate": 0.0004630738636363637,
"loss": 2.3968,
"step": 8500
},
{
"epoch": 0.2970433821497651,
"grad_norm": 0.3294484317302704,
"learning_rate": 0.0004625056818181818,
"loss": 2.4162,
"step": 8600
},
{
"epoch": 0.3004973749654601,
"grad_norm": 0.31714752316474915,
"learning_rate": 0.0004619375,
"loss": 2.4073,
"step": 8700
},
{
"epoch": 0.303951367781155,
"grad_norm": 0.32918691635131836,
"learning_rate": 0.00046136931818181823,
"loss": 2.4229,
"step": 8800
},
{
"epoch": 0.30740536059685,
"grad_norm": 0.3097570538520813,
"learning_rate": 0.00046080113636363636,
"loss": 2.3966,
"step": 8900
},
{
"epoch": 0.3108593534125449,
"grad_norm": 0.3247508406639099,
"learning_rate": 0.00046023295454545454,
"loss": 2.3925,
"step": 9000
},
{
"epoch": 0.3108593534125449,
"eval_loss": 2.705599308013916,
"eval_runtime": 927.4293,
"eval_samples_per_second": 164.3,
"eval_steps_per_second": 1.028,
"step": 9000
},
{
"epoch": 0.31431334622823986,
"grad_norm": 0.3189142048358917,
"learning_rate": 0.0004596647727272728,
"loss": 2.3932,
"step": 9100
},
{
"epoch": 0.3177673390439348,
"grad_norm": 0.3028543293476105,
"learning_rate": 0.0004590965909090909,
"loss": 2.3787,
"step": 9200
},
{
"epoch": 0.32122133185962976,
"grad_norm": 0.3109678030014038,
"learning_rate": 0.0004585284090909091,
"loss": 2.3665,
"step": 9300
},
{
"epoch": 0.3246753246753247,
"grad_norm": 0.31394320726394653,
"learning_rate": 0.0004579602272727273,
"loss": 2.3722,
"step": 9400
},
{
"epoch": 0.32812931749101965,
"grad_norm": 0.3214563727378845,
"learning_rate": 0.0004573920454545455,
"loss": 2.3586,
"step": 9500
},
{
"epoch": 0.33158331030671456,
"grad_norm": 0.33052927255630493,
"learning_rate": 0.0004568238636363636,
"loss": 2.3838,
"step": 9600
},
{
"epoch": 0.3350373031224095,
"grad_norm": 0.3511188328266144,
"learning_rate": 0.00045625568181818186,
"loss": 2.3588,
"step": 9700
},
{
"epoch": 0.33849129593810445,
"grad_norm": 0.31076422333717346,
"learning_rate": 0.00045568750000000004,
"loss": 2.3552,
"step": 9800
},
{
"epoch": 0.34194528875379937,
"grad_norm": 0.32571229338645935,
"learning_rate": 0.00045511931818181816,
"loss": 2.3496,
"step": 9900
},
{
"epoch": 0.34539928156949434,
"grad_norm": 0.3375560939311981,
"learning_rate": 0.0004545511363636364,
"loss": 2.3526,
"step": 10000
},
{
"epoch": 0.34539928156949434,
"eval_loss": 2.6650397777557373,
"eval_runtime": 927.3482,
"eval_samples_per_second": 164.315,
"eval_steps_per_second": 1.028,
"step": 10000
},
{
"epoch": 0.34885327438518926,
"grad_norm": 0.3159004747867584,
"learning_rate": 0.0004539829545454546,
"loss": 2.3606,
"step": 10100
},
{
"epoch": 0.35230726720088423,
"grad_norm": 0.32001519203186035,
"learning_rate": 0.0004534147727272727,
"loss": 2.37,
"step": 10200
},
{
"epoch": 0.35576126001657915,
"grad_norm": 0.328933447599411,
"learning_rate": 0.00045284659090909094,
"loss": 2.3515,
"step": 10300
},
{
"epoch": 0.3592152528322741,
"grad_norm": 0.3076813817024231,
"learning_rate": 0.0004522784090909091,
"loss": 2.3276,
"step": 10400
},
{
"epoch": 0.36266924564796904,
"grad_norm": 0.3153810501098633,
"learning_rate": 0.00045171022727272725,
"loss": 2.3373,
"step": 10500
},
{
"epoch": 0.366123238463664,
"grad_norm": 0.32247108221054077,
"learning_rate": 0.0004511420454545455,
"loss": 2.3335,
"step": 10600
},
{
"epoch": 0.36957723127935893,
"grad_norm": 0.3074076771736145,
"learning_rate": 0.00045057386363636366,
"loss": 2.325,
"step": 10700
},
{
"epoch": 0.3730312240950539,
"grad_norm": 0.31907033920288086,
"learning_rate": 0.0004500056818181818,
"loss": 2.3155,
"step": 10800
},
{
"epoch": 0.3764852169107488,
"grad_norm": 0.32912886142730713,
"learning_rate": 0.0004494375,
"loss": 2.324,
"step": 10900
},
{
"epoch": 0.3799392097264438,
"grad_norm": 0.3003767430782318,
"learning_rate": 0.0004488693181818182,
"loss": 2.3222,
"step": 11000
},
{
"epoch": 0.3799392097264438,
"eval_loss": 2.633434772491455,
"eval_runtime": 927.254,
"eval_samples_per_second": 164.331,
"eval_steps_per_second": 1.028,
"step": 11000
},
{
"epoch": 0.3833932025421387,
"grad_norm": 0.3144666254520416,
"learning_rate": 0.0004483011363636364,
"loss": 2.3251,
"step": 11100
},
{
"epoch": 0.38684719535783363,
"grad_norm": 0.3284156322479248,
"learning_rate": 0.00044773295454545456,
"loss": 2.3033,
"step": 11200
},
{
"epoch": 0.3903011881735286,
"grad_norm": 0.3226972222328186,
"learning_rate": 0.00044716477272727274,
"loss": 2.296,
"step": 11300
},
{
"epoch": 0.3937551809892235,
"grad_norm": 0.34044623374938965,
"learning_rate": 0.0004465965909090909,
"loss": 2.3198,
"step": 11400
},
{
"epoch": 0.3972091738049185,
"grad_norm": 0.3101319372653961,
"learning_rate": 0.0004460284090909091,
"loss": 2.3107,
"step": 11500
},
{
"epoch": 0.4006631666206134,
"grad_norm": 0.3044012486934662,
"learning_rate": 0.0004454602272727273,
"loss": 2.2984,
"step": 11600
},
{
"epoch": 0.4041171594363084,
"grad_norm": 0.3155890107154846,
"learning_rate": 0.00044489204545454546,
"loss": 2.2968,
"step": 11700
},
{
"epoch": 0.4075711522520033,
"grad_norm": 0.33918723464012146,
"learning_rate": 0.00044432386363636364,
"loss": 2.2707,
"step": 11800
},
{
"epoch": 0.41102514506769827,
"grad_norm": 0.30243411660194397,
"learning_rate": 0.0004437556818181818,
"loss": 2.2979,
"step": 11900
},
{
"epoch": 0.4144791378833932,
"grad_norm": 0.3046514391899109,
"learning_rate": 0.0004431875,
"loss": 2.2809,
"step": 12000
},
{
"epoch": 0.4144791378833932,
"eval_loss": 2.61051344871521,
"eval_runtime": 927.0521,
"eval_samples_per_second": 164.367,
"eval_steps_per_second": 1.028,
"step": 12000
},
{
"epoch": 0.41793313069908816,
"grad_norm": 0.32584163546562195,
"learning_rate": 0.0004426193181818182,
"loss": 2.2876,
"step": 12100
},
{
"epoch": 0.4213871235147831,
"grad_norm": 0.34489238262176514,
"learning_rate": 0.00044205113636363637,
"loss": 2.298,
"step": 12200
},
{
"epoch": 0.42484111633047805,
"grad_norm": 0.30355241894721985,
"learning_rate": 0.00044148295454545455,
"loss": 2.2767,
"step": 12300
},
{
"epoch": 0.42829510914617297,
"grad_norm": 0.3140780031681061,
"learning_rate": 0.0004409147727272728,
"loss": 2.2779,
"step": 12400
},
{
"epoch": 0.43174910196186794,
"grad_norm": 0.31298449635505676,
"learning_rate": 0.0004403465909090909,
"loss": 2.2831,
"step": 12500
},
{
"epoch": 0.43520309477756286,
"grad_norm": 0.32630786299705505,
"learning_rate": 0.0004397784090909091,
"loss": 2.2698,
"step": 12600
},
{
"epoch": 0.43865708759325783,
"grad_norm": 0.303371399641037,
"learning_rate": 0.0004392102272727273,
"loss": 2.2767,
"step": 12700
},
{
"epoch": 0.44211108040895275,
"grad_norm": 0.30070436000823975,
"learning_rate": 0.00043864204545454545,
"loss": 2.2449,
"step": 12800
},
{
"epoch": 0.44556507322464767,
"grad_norm": 0.2887287139892578,
"learning_rate": 0.00043807386363636363,
"loss": 2.2688,
"step": 12900
},
{
"epoch": 0.44901906604034264,
"grad_norm": 0.306916743516922,
"learning_rate": 0.00043750568181818186,
"loss": 2.2557,
"step": 13000
},
{
"epoch": 0.44901906604034264,
"eval_loss": 2.5854439735412598,
"eval_runtime": 926.6658,
"eval_samples_per_second": 164.436,
"eval_steps_per_second": 1.028,
"step": 13000
},
{
"epoch": 0.45247305885603756,
"grad_norm": 0.34850597381591797,
"learning_rate": 0.0004369375,
"loss": 2.2423,
"step": 13100
},
{
"epoch": 0.45592705167173253,
"grad_norm": 0.35393500328063965,
"learning_rate": 0.00043636931818181817,
"loss": 2.2543,
"step": 13200
},
{
"epoch": 0.45938104448742745,
"grad_norm": 0.3059336543083191,
"learning_rate": 0.0004358011363636364,
"loss": 2.2516,
"step": 13300
},
{
"epoch": 0.4628350373031224,
"grad_norm": 0.3357197344303131,
"learning_rate": 0.00043523295454545453,
"loss": 2.2328,
"step": 13400
},
{
"epoch": 0.46628903011881734,
"grad_norm": 0.31849172711372375,
"learning_rate": 0.0004346647727272727,
"loss": 2.2424,
"step": 13500
},
{
"epoch": 0.4697430229345123,
"grad_norm": 0.31968438625335693,
"learning_rate": 0.00043409659090909094,
"loss": 2.2228,
"step": 13600
},
{
"epoch": 0.47319701575020723,
"grad_norm": 0.3293677568435669,
"learning_rate": 0.0004335284090909091,
"loss": 2.2555,
"step": 13700
},
{
"epoch": 0.4766510085659022,
"grad_norm": 0.3031880855560303,
"learning_rate": 0.00043296022727272725,
"loss": 2.2387,
"step": 13800
},
{
"epoch": 0.4801050013815971,
"grad_norm": 0.2914179563522339,
"learning_rate": 0.0004323920454545455,
"loss": 2.2494,
"step": 13900
},
{
"epoch": 0.4835589941972921,
"grad_norm": 0.3345280587673187,
"learning_rate": 0.00043182386363636367,
"loss": 2.2322,
"step": 14000
},
{
"epoch": 0.4835589941972921,
"eval_loss": 2.5606906414031982,
"eval_runtime": 929.2705,
"eval_samples_per_second": 163.975,
"eval_steps_per_second": 1.026,
"step": 14000
},
{
"epoch": 0.487012987012987,
"grad_norm": 0.3165434002876282,
"learning_rate": 0.0004312556818181818,
"loss": 2.2266,
"step": 14100
},
{
"epoch": 0.490466979828682,
"grad_norm": 0.30577775835990906,
"learning_rate": 0.0004306875,
"loss": 2.2401,
"step": 14200
},
{
"epoch": 0.4939209726443769,
"grad_norm": 0.2920292019844055,
"learning_rate": 0.0004301193181818182,
"loss": 2.2203,
"step": 14300
},
{
"epoch": 0.4973749654600718,
"grad_norm": 0.32168078422546387,
"learning_rate": 0.00042955113636363633,
"loss": 2.2064,
"step": 14400
},
{
"epoch": 0.5008289582757668,
"grad_norm": 0.31879886984825134,
"learning_rate": 0.00042898295454545457,
"loss": 2.219,
"step": 14500
},
{
"epoch": 0.5042829510914617,
"grad_norm": 0.2906869351863861,
"learning_rate": 0.00042841477272727275,
"loss": 2.2055,
"step": 14600
},
{
"epoch": 0.5077369439071566,
"grad_norm": 0.3648407757282257,
"learning_rate": 0.0004278465909090909,
"loss": 2.2157,
"step": 14700
},
{
"epoch": 0.5111909367228517,
"grad_norm": 0.30823054909706116,
"learning_rate": 0.0004272784090909091,
"loss": 2.2158,
"step": 14800
},
{
"epoch": 0.5146449295385466,
"grad_norm": 0.3004588782787323,
"learning_rate": 0.0004267102272727273,
"loss": 2.2009,
"step": 14900
},
{
"epoch": 0.5180989223542415,
"grad_norm": 0.29552149772644043,
"learning_rate": 0.00042614204545454547,
"loss": 2.2194,
"step": 15000
},
{
"epoch": 0.5180989223542415,
"eval_loss": 2.537440538406372,
"eval_runtime": 928.9697,
"eval_samples_per_second": 164.028,
"eval_steps_per_second": 1.026,
"step": 15000
},
{
"epoch": 0.5215529151699364,
"grad_norm": 0.3077145516872406,
"learning_rate": 0.00042557386363636365,
"loss": 2.199,
"step": 15100
},
{
"epoch": 0.5250069079856314,
"grad_norm": 0.32205095887184143,
"learning_rate": 0.00042500568181818183,
"loss": 2.2045,
"step": 15200
},
{
"epoch": 0.5284609008013263,
"grad_norm": 0.30157867074012756,
"learning_rate": 0.0004244375,
"loss": 2.1926,
"step": 15300
},
{
"epoch": 0.5319148936170213,
"grad_norm": 0.35868486762046814,
"learning_rate": 0.0004238693181818182,
"loss": 2.1911,
"step": 15400
},
{
"epoch": 0.5353688864327162,
"grad_norm": 0.3132970631122589,
"learning_rate": 0.00042330113636363637,
"loss": 2.193,
"step": 15500
},
{
"epoch": 0.5388228792484112,
"grad_norm": 0.31356823444366455,
"learning_rate": 0.00042273295454545455,
"loss": 2.1959,
"step": 15600
},
{
"epoch": 0.5422768720641061,
"grad_norm": 0.31471192836761475,
"learning_rate": 0.00042216477272727273,
"loss": 2.2069,
"step": 15700
},
{
"epoch": 0.545730864879801,
"grad_norm": 0.33163174986839294,
"learning_rate": 0.0004215965909090909,
"loss": 2.1929,
"step": 15800
},
{
"epoch": 0.549184857695496,
"grad_norm": 0.31774455308914185,
"learning_rate": 0.0004210284090909091,
"loss": 2.1816,
"step": 15900
},
{
"epoch": 0.5526388505111909,
"grad_norm": 0.30572381615638733,
"learning_rate": 0.00042046022727272727,
"loss": 2.206,
"step": 16000
},
{
"epoch": 0.5526388505111909,
"eval_loss": 2.52417254447937,
"eval_runtime": 928.7275,
"eval_samples_per_second": 164.071,
"eval_steps_per_second": 1.026,
"step": 16000
},
{
"epoch": 0.5560928433268859,
"grad_norm": 0.3196762502193451,
"learning_rate": 0.00041989204545454545,
"loss": 2.1801,
"step": 16100
},
{
"epoch": 0.5595468361425808,
"grad_norm": 0.3148038685321808,
"learning_rate": 0.00041932386363636363,
"loss": 2.1722,
"step": 16200
},
{
"epoch": 0.5630008289582757,
"grad_norm": 0.32507434487342834,
"learning_rate": 0.0004187556818181818,
"loss": 2.1836,
"step": 16300
},
{
"epoch": 0.5664548217739707,
"grad_norm": 0.3227043151855469,
"learning_rate": 0.0004181875,
"loss": 2.1794,
"step": 16400
},
{
"epoch": 0.5699088145896657,
"grad_norm": 0.3271748721599579,
"learning_rate": 0.0004176193181818182,
"loss": 2.1786,
"step": 16500
},
{
"epoch": 0.5733628074053606,
"grad_norm": 0.31076040863990784,
"learning_rate": 0.0004170511363636364,
"loss": 2.1616,
"step": 16600
},
{
"epoch": 0.5768168002210555,
"grad_norm": 0.32442960143089294,
"learning_rate": 0.00041648295454545453,
"loss": 2.1642,
"step": 16700
},
{
"epoch": 0.5802707930367504,
"grad_norm": 0.2945985794067383,
"learning_rate": 0.0004159147727272727,
"loss": 2.1641,
"step": 16800
},
{
"epoch": 0.5837247858524455,
"grad_norm": 0.32005414366722107,
"learning_rate": 0.00041534659090909095,
"loss": 2.1968,
"step": 16900
},
{
"epoch": 0.5871787786681404,
"grad_norm": 0.31035128235816956,
"learning_rate": 0.0004147784090909091,
"loss": 2.1735,
"step": 17000
},
{
"epoch": 0.5871787786681404,
"eval_loss": 2.5003573894500732,
"eval_runtime": 929.6579,
"eval_samples_per_second": 163.907,
"eval_steps_per_second": 1.025,
"step": 17000
},
{
"epoch": 0.5906327714838353,
"grad_norm": 0.3103092908859253,
"learning_rate": 0.00041421022727272726,
"loss": 2.1625,
"step": 17100
},
{
"epoch": 0.5940867642995302,
"grad_norm": 0.3217906057834625,
"learning_rate": 0.0004136420454545455,
"loss": 2.1485,
"step": 17200
},
{
"epoch": 0.5975407571152253,
"grad_norm": 0.2988424301147461,
"learning_rate": 0.0004130738636363636,
"loss": 2.1628,
"step": 17300
},
{
"epoch": 0.6009947499309202,
"grad_norm": 0.3058546185493469,
"learning_rate": 0.0004125056818181818,
"loss": 2.1701,
"step": 17400
},
{
"epoch": 0.6044487427466151,
"grad_norm": 0.3056589961051941,
"learning_rate": 0.00041193750000000003,
"loss": 2.1515,
"step": 17500
},
{
"epoch": 0.60790273556231,
"grad_norm": 0.31840789318084717,
"learning_rate": 0.00041136931818181816,
"loss": 2.1536,
"step": 17600
},
{
"epoch": 0.6113567283780049,
"grad_norm": 0.3044828772544861,
"learning_rate": 0.00041080113636363634,
"loss": 2.162,
"step": 17700
},
{
"epoch": 0.6148107211937,
"grad_norm": 0.30973371863365173,
"learning_rate": 0.00041023295454545457,
"loss": 2.1498,
"step": 17800
},
{
"epoch": 0.6182647140093949,
"grad_norm": 0.30947718024253845,
"learning_rate": 0.00040966477272727275,
"loss": 2.1435,
"step": 17900
},
{
"epoch": 0.6217187068250898,
"grad_norm": 0.34587281942367554,
"learning_rate": 0.0004090965909090909,
"loss": 2.1504,
"step": 18000
},
{
"epoch": 0.6217187068250898,
"eval_loss": 2.484160900115967,
"eval_runtime": 927.1891,
"eval_samples_per_second": 164.343,
"eval_steps_per_second": 1.028,
"step": 18000
},
{
"epoch": 0.6251726996407847,
"grad_norm": 0.30945053696632385,
"learning_rate": 0.0004085284090909091,
"loss": 2.1545,
"step": 18100
},
{
"epoch": 0.6286266924564797,
"grad_norm": 0.3018719255924225,
"learning_rate": 0.0004079602272727273,
"loss": 2.1439,
"step": 18200
},
{
"epoch": 0.6320806852721746,
"grad_norm": 0.3113386332988739,
"learning_rate": 0.0004073920454545454,
"loss": 2.1225,
"step": 18300
},
{
"epoch": 0.6355346780878696,
"grad_norm": 0.29737088084220886,
"learning_rate": 0.00040682386363636365,
"loss": 2.1286,
"step": 18400
},
{
"epoch": 0.6389886709035645,
"grad_norm": 0.31960177421569824,
"learning_rate": 0.00040625568181818183,
"loss": 2.1249,
"step": 18500
},
{
"epoch": 0.6424426637192595,
"grad_norm": 0.3072162866592407,
"learning_rate": 0.00040568749999999996,
"loss": 2.1348,
"step": 18600
},
{
"epoch": 0.6458966565349544,
"grad_norm": 0.3196597397327423,
"learning_rate": 0.0004051193181818182,
"loss": 2.1408,
"step": 18700
},
{
"epoch": 0.6493506493506493,
"grad_norm": 0.3315812051296234,
"learning_rate": 0.0004045511363636364,
"loss": 2.1439,
"step": 18800
},
{
"epoch": 0.6528046421663443,
"grad_norm": 0.2933200001716614,
"learning_rate": 0.0004039829545454545,
"loss": 2.1465,
"step": 18900
},
{
"epoch": 0.6562586349820393,
"grad_norm": 0.33558085560798645,
"learning_rate": 0.00040341477272727274,
"loss": 2.1416,
"step": 19000
},
{
"epoch": 0.6562586349820393,
"eval_loss": 2.473646640777588,
"eval_runtime": 926.9255,
"eval_samples_per_second": 164.39,
"eval_steps_per_second": 1.028,
"step": 19000
},
{
"epoch": 0.6597126277977342,
"grad_norm": 0.2992997169494629,
"learning_rate": 0.0004028465909090909,
"loss": 2.1386,
"step": 19100
},
{
"epoch": 0.6631666206134291,
"grad_norm": 0.3051714599132538,
"learning_rate": 0.00040227840909090915,
"loss": 2.1225,
"step": 19200
},
{
"epoch": 0.666620613429124,
"grad_norm": 0.31724849343299866,
"learning_rate": 0.0004017102272727273,
"loss": 2.1174,
"step": 19300
},
{
"epoch": 0.670074606244819,
"grad_norm": 0.2937643826007843,
"learning_rate": 0.00040114204545454546,
"loss": 2.1279,
"step": 19400
},
{
"epoch": 0.673528599060514,
"grad_norm": 0.31908687949180603,
"learning_rate": 0.0004005738636363637,
"loss": 2.1151,
"step": 19500
},
{
"epoch": 0.6769825918762089,
"grad_norm": 0.31399762630462646,
"learning_rate": 0.0004000056818181818,
"loss": 2.1378,
"step": 19600
},
{
"epoch": 0.6804365846919038,
"grad_norm": 0.3157575726509094,
"learning_rate": 0.0003994375,
"loss": 2.1149,
"step": 19700
},
{
"epoch": 0.6838905775075987,
"grad_norm": 0.32018882036209106,
"learning_rate": 0.00039886931818181823,
"loss": 2.0993,
"step": 19800
},
{
"epoch": 0.6873445703232938,
"grad_norm": 0.31708574295043945,
"learning_rate": 0.00039830113636363636,
"loss": 2.1132,
"step": 19900
},
{
"epoch": 0.6907985631389887,
"grad_norm": 0.2904827892780304,
"learning_rate": 0.00039773295454545454,
"loss": 2.1088,
"step": 20000
},
{
"epoch": 0.6907985631389887,
"eval_loss": 2.4520211219787598,
"eval_runtime": 926.891,
"eval_samples_per_second": 164.396,
"eval_steps_per_second": 1.028,
"step": 20000
},
{
"epoch": 0.6942525559546836,
"grad_norm": 0.3298169672489166,
"learning_rate": 0.0003971647727272728,
"loss": 2.1177,
"step": 20100
},
{
"epoch": 0.6977065487703785,
"grad_norm": 0.291166752576828,
"learning_rate": 0.0003965965909090909,
"loss": 2.0954,
"step": 20200
},
{
"epoch": 0.7011605415860735,
"grad_norm": 0.3211086094379425,
"learning_rate": 0.0003960284090909091,
"loss": 2.1209,
"step": 20300
},
{
"epoch": 0.7046145344017685,
"grad_norm": 0.3161545395851135,
"learning_rate": 0.0003954602272727273,
"loss": 2.1149,
"step": 20400
},
{
"epoch": 0.7080685272174634,
"grad_norm": 0.3262562155723572,
"learning_rate": 0.0003948920454545455,
"loss": 2.1204,
"step": 20500
},
{
"epoch": 0.7115225200331583,
"grad_norm": 0.3347005546092987,
"learning_rate": 0.0003943238636363636,
"loss": 2.104,
"step": 20600
},
{
"epoch": 0.7149765128488533,
"grad_norm": 0.30474451184272766,
"learning_rate": 0.00039375568181818186,
"loss": 2.0955,
"step": 20700
},
{
"epoch": 0.7184305056645482,
"grad_norm": 0.32672184705734253,
"learning_rate": 0.00039318750000000004,
"loss": 2.0998,
"step": 20800
},
{
"epoch": 0.7218844984802432,
"grad_norm": 0.3041098713874817,
"learning_rate": 0.00039261931818181816,
"loss": 2.0897,
"step": 20900
},
{
"epoch": 0.7253384912959381,
"grad_norm": 0.351904034614563,
"learning_rate": 0.0003920511363636364,
"loss": 2.0925,
"step": 21000
},
{
"epoch": 0.7253384912959381,
"eval_loss": 2.4404454231262207,
"eval_runtime": 927.0383,
"eval_samples_per_second": 164.37,
"eval_steps_per_second": 1.028,
"step": 21000
},
{
"epoch": 0.728792484111633,
"grad_norm": 0.34308210015296936,
"learning_rate": 0.0003914829545454546,
"loss": 2.1039,
"step": 21100
},
{
"epoch": 0.732246476927328,
"grad_norm": 0.3298318088054657,
"learning_rate": 0.0003909147727272727,
"loss": 2.0774,
"step": 21200
},
{
"epoch": 0.735700469743023,
"grad_norm": 0.3102123737335205,
"learning_rate": 0.00039034659090909094,
"loss": 2.1111,
"step": 21300
},
{
"epoch": 0.7391544625587179,
"grad_norm": 0.3186514973640442,
"learning_rate": 0.0003897784090909091,
"loss": 2.084,
"step": 21400
},
{
"epoch": 0.7426084553744128,
"grad_norm": 0.31114721298217773,
"learning_rate": 0.00038921022727272724,
"loss": 2.1037,
"step": 21500
},
{
"epoch": 0.7460624481901078,
"grad_norm": 0.330563485622406,
"learning_rate": 0.0003886420454545455,
"loss": 2.0831,
"step": 21600
},
{
"epoch": 0.7495164410058027,
"grad_norm": 0.3088129460811615,
"learning_rate": 0.00038807386363636366,
"loss": 2.0914,
"step": 21700
},
{
"epoch": 0.7529704338214976,
"grad_norm": 0.28733545541763306,
"learning_rate": 0.0003875056818181818,
"loss": 2.0955,
"step": 21800
},
{
"epoch": 0.7564244266371926,
"grad_norm": 0.3190239667892456,
"learning_rate": 0.0003869375,
"loss": 2.0828,
"step": 21900
},
{
"epoch": 0.7598784194528876,
"grad_norm": 0.3163771331310272,
"learning_rate": 0.0003863693181818182,
"loss": 2.0786,
"step": 22000
},
{
"epoch": 0.7598784194528876,
"eval_loss": 2.4309139251708984,
"eval_runtime": 926.6419,
"eval_samples_per_second": 164.44,
"eval_steps_per_second": 1.028,
"step": 22000
},
{
"epoch": 0.7633324122685825,
"grad_norm": 0.2819238603115082,
"learning_rate": 0.0003858011363636364,
"loss": 2.092,
"step": 22100
},
{
"epoch": 0.7667864050842774,
"grad_norm": 0.31991979479789734,
"learning_rate": 0.00038523295454545456,
"loss": 2.0628,
"step": 22200
},
{
"epoch": 0.7702403978999723,
"grad_norm": 0.3094194233417511,
"learning_rate": 0.00038466477272727274,
"loss": 2.0826,
"step": 22300
},
{
"epoch": 0.7736943907156673,
"grad_norm": 0.30959707498550415,
"learning_rate": 0.0003840965909090909,
"loss": 2.0858,
"step": 22400
},
{
"epoch": 0.7771483835313623,
"grad_norm": 0.30589380860328674,
"learning_rate": 0.0003835284090909091,
"loss": 2.0864,
"step": 22500
},
{
"epoch": 0.7806023763470572,
"grad_norm": 0.3400673270225525,
"learning_rate": 0.0003829602272727273,
"loss": 2.069,
"step": 22600
},
{
"epoch": 0.7840563691627521,
"grad_norm": 0.3428845703601837,
"learning_rate": 0.00038239204545454546,
"loss": 2.0622,
"step": 22700
},
{
"epoch": 0.787510361978447,
"grad_norm": 0.3274592459201813,
"learning_rate": 0.00038182386363636364,
"loss": 2.0714,
"step": 22800
},
{
"epoch": 0.7909643547941421,
"grad_norm": 0.3281017243862152,
"learning_rate": 0.0003812556818181818,
"loss": 2.0856,
"step": 22900
},
{
"epoch": 0.794418347609837,
"grad_norm": 0.32381513714790344,
"learning_rate": 0.0003806875,
"loss": 2.0687,
"step": 23000
},
{
"epoch": 0.794418347609837,
"eval_loss": 2.416405439376831,
"eval_runtime": 926.9677,
"eval_samples_per_second": 164.382,
"eval_steps_per_second": 1.028,
"step": 23000
},
{
"epoch": 0.7978723404255319,
"grad_norm": 0.31997501850128174,
"learning_rate": 0.0003801193181818182,
"loss": 2.0923,
"step": 23100
},
{
"epoch": 0.8013263332412268,
"grad_norm": 0.315775603055954,
"learning_rate": 0.00037955113636363636,
"loss": 2.0578,
"step": 23200
},
{
"epoch": 0.8047803260569218,
"grad_norm": 0.3135242462158203,
"learning_rate": 0.00037898295454545454,
"loss": 2.0604,
"step": 23300
},
{
"epoch": 0.8082343188726168,
"grad_norm": 0.33324697613716125,
"learning_rate": 0.0003784147727272728,
"loss": 2.0776,
"step": 23400
},
{
"epoch": 0.8116883116883117,
"grad_norm": 0.3114740252494812,
"learning_rate": 0.0003778465909090909,
"loss": 2.0679,
"step": 23500
},
{
"epoch": 0.8151423045040066,
"grad_norm": 0.37432342767715454,
"learning_rate": 0.0003772784090909091,
"loss": 2.0685,
"step": 23600
},
{
"epoch": 0.8185962973197016,
"grad_norm": 0.31538712978363037,
"learning_rate": 0.0003767102272727273,
"loss": 2.0687,
"step": 23700
},
{
"epoch": 0.8220502901353965,
"grad_norm": 0.3598659336566925,
"learning_rate": 0.00037614204545454545,
"loss": 2.0909,
"step": 23800
},
{
"epoch": 0.8255042829510915,
"grad_norm": 0.3034459948539734,
"learning_rate": 0.0003755738636363636,
"loss": 2.0588,
"step": 23900
},
{
"epoch": 0.8289582757667864,
"grad_norm": 0.3221229016780853,
"learning_rate": 0.00037500568181818186,
"loss": 2.0661,
"step": 24000
},
{
"epoch": 0.8289582757667864,
"eval_loss": 2.4008617401123047,
"eval_runtime": 927.2556,
"eval_samples_per_second": 164.331,
"eval_steps_per_second": 1.028,
"step": 24000
},
{
"epoch": 0.8324122685824813,
"grad_norm": 0.3049459755420685,
"learning_rate": 0.0003744375,
"loss": 2.0428,
"step": 24100
},
{
"epoch": 0.8358662613981763,
"grad_norm": 0.3034842908382416,
"learning_rate": 0.00037386931818181817,
"loss": 2.0639,
"step": 24200
},
{
"epoch": 0.8393202542138712,
"grad_norm": 0.3170601427555084,
"learning_rate": 0.0003733011363636364,
"loss": 2.0606,
"step": 24300
},
{
"epoch": 0.8427742470295662,
"grad_norm": 0.3232339918613434,
"learning_rate": 0.00037273295454545453,
"loss": 2.0394,
"step": 24400
},
{
"epoch": 0.8462282398452611,
"grad_norm": 0.3366962671279907,
"learning_rate": 0.0003721647727272727,
"loss": 2.0415,
"step": 24500
},
{
"epoch": 0.8496822326609561,
"grad_norm": 0.3091275095939636,
"learning_rate": 0.00037159659090909094,
"loss": 2.0789,
"step": 24600
},
{
"epoch": 0.853136225476651,
"grad_norm": 0.3144051432609558,
"learning_rate": 0.0003710284090909091,
"loss": 2.059,
"step": 24700
},
{
"epoch": 0.8565902182923459,
"grad_norm": 0.3365747332572937,
"learning_rate": 0.00037046022727272725,
"loss": 2.0388,
"step": 24800
},
{
"epoch": 0.8600442111080409,
"grad_norm": 0.2965666949748993,
"learning_rate": 0.0003698920454545455,
"loss": 2.0576,
"step": 24900
},
{
"epoch": 0.8634982039237359,
"grad_norm": 0.3322639465332031,
"learning_rate": 0.00036932386363636366,
"loss": 2.0633,
"step": 25000
},
{
"epoch": 0.8634982039237359,
"eval_loss": 2.392946243286133,
"eval_runtime": 926.5204,
"eval_samples_per_second": 164.462,
"eval_steps_per_second": 1.029,
"step": 25000
},
{
"epoch": 0.8669521967394308,
"grad_norm": 0.3184923827648163,
"learning_rate": 0.0003687556818181818,
"loss": 2.0442,
"step": 25100
},
{
"epoch": 0.8704061895551257,
"grad_norm": 0.30526450276374817,
"learning_rate": 0.0003681875,
"loss": 2.0364,
"step": 25200
},
{
"epoch": 0.8738601823708206,
"grad_norm": 0.3035339117050171,
"learning_rate": 0.0003676193181818182,
"loss": 2.0399,
"step": 25300
},
{
"epoch": 0.8773141751865157,
"grad_norm": 0.3300335705280304,
"learning_rate": 0.00036705113636363633,
"loss": 2.0388,
"step": 25400
},
{
"epoch": 0.8807681680022106,
"grad_norm": 0.33707037568092346,
"learning_rate": 0.00036648295454545457,
"loss": 2.0364,
"step": 25500
},
{
"epoch": 0.8842221608179055,
"grad_norm": 0.3057771623134613,
"learning_rate": 0.00036591477272727275,
"loss": 2.0377,
"step": 25600
},
{
"epoch": 0.8876761536336004,
"grad_norm": 0.33993765711784363,
"learning_rate": 0.00036534659090909087,
"loss": 2.0485,
"step": 25700
},
{
"epoch": 0.8911301464492953,
"grad_norm": 0.3075715899467468,
"learning_rate": 0.0003647784090909091,
"loss": 2.0256,
"step": 25800
},
{
"epoch": 0.8945841392649904,
"grad_norm": 0.30490240454673767,
"learning_rate": 0.0003642102272727273,
"loss": 2.0489,
"step": 25900
},
{
"epoch": 0.8980381320806853,
"grad_norm": 0.3403315246105194,
"learning_rate": 0.00036364204545454547,
"loss": 2.0476,
"step": 26000
},
{
"epoch": 0.8980381320806853,
"eval_loss": 2.382169008255005,
"eval_runtime": 932.2661,
"eval_samples_per_second": 163.448,
"eval_steps_per_second": 1.635,
"step": 26000
},
{
"epoch": 0.9014921248963802,
"grad_norm": 0.31369808316230774,
"learning_rate": 0.00036307386363636365,
"loss": 2.0265,
"step": 26100
},
{
"epoch": 0.9049461177120751,
"grad_norm": 0.30494198203086853,
"learning_rate": 0.00036250568181818183,
"loss": 2.0328,
"step": 26200
},
{
"epoch": 0.9084001105277701,
"grad_norm": 0.2981790006160736,
"learning_rate": 0.0003619375,
"loss": 2.0196,
"step": 26300
},
{
"epoch": 0.9118541033434651,
"grad_norm": 0.3235887587070465,
"learning_rate": 0.0003613693181818182,
"loss": 2.0224,
"step": 26400
},
{
"epoch": 0.91530809615916,
"grad_norm": 0.32602986693382263,
"learning_rate": 0.00036080113636363637,
"loss": 2.0169,
"step": 26500
},
{
"epoch": 0.9187620889748549,
"grad_norm": 0.3355056643486023,
"learning_rate": 0.00036023295454545455,
"loss": 2.0338,
"step": 26600
},
{
"epoch": 0.9222160817905499,
"grad_norm": 0.3180111348628998,
"learning_rate": 0.00035966477272727273,
"loss": 2.0297,
"step": 26700
},
{
"epoch": 0.9256700746062448,
"grad_norm": 0.2988349199295044,
"learning_rate": 0.0003590965909090909,
"loss": 2.0189,
"step": 26800
},
{
"epoch": 0.9291240674219398,
"grad_norm": 0.30824485421180725,
"learning_rate": 0.0003585284090909091,
"loss": 2.0086,
"step": 26900
},
{
"epoch": 0.9325780602376347,
"grad_norm": 0.33140483498573303,
"learning_rate": 0.00035796022727272727,
"loss": 2.0127,
"step": 27000
},
{
"epoch": 0.9325780602376347,
"eval_loss": 2.3799469470977783,
"eval_runtime": 931.9535,
"eval_samples_per_second": 163.503,
"eval_steps_per_second": 1.635,
"step": 27000
},
{
"epoch": 0.9360320530533297,
"grad_norm": 0.31175485253334045,
"learning_rate": 0.00035739204545454545,
"loss": 2.027,
"step": 27100
},
{
"epoch": 0.9394860458690246,
"grad_norm": 0.3109052777290344,
"learning_rate": 0.00035682386363636363,
"loss": 2.029,
"step": 27200
},
{
"epoch": 0.9429400386847195,
"grad_norm": 0.3299388885498047,
"learning_rate": 0.0003562556818181818,
"loss": 2.0194,
"step": 27300
},
{
"epoch": 0.9463940315004145,
"grad_norm": 0.35121017694473267,
"learning_rate": 0.0003556875,
"loss": 2.0158,
"step": 27400
},
{
"epoch": 0.9498480243161094,
"grad_norm": 0.3052006959915161,
"learning_rate": 0.00035511931818181817,
"loss": 2.0109,
"step": 27500
},
{
"epoch": 0.9533020171318044,
"grad_norm": 0.3126027584075928,
"learning_rate": 0.0003545511363636364,
"loss": 2.0215,
"step": 27600
},
{
"epoch": 0.9567560099474993,
"grad_norm": 0.32444655895233154,
"learning_rate": 0.00035398295454545453,
"loss": 2.0108,
"step": 27700
},
{
"epoch": 0.9602100027631942,
"grad_norm": 0.31381282210350037,
"learning_rate": 0.0003534147727272727,
"loss": 2.0151,
"step": 27800
},
{
"epoch": 0.9636639955788892,
"grad_norm": 0.3093770444393158,
"learning_rate": 0.00035284659090909095,
"loss": 1.9959,
"step": 27900
},
{
"epoch": 0.9671179883945842,
"grad_norm": 0.3137684762477875,
"learning_rate": 0.0003522784090909091,
"loss": 2.0223,
"step": 28000
},
{
"epoch": 0.9671179883945842,
"eval_loss": 2.3616411685943604,
"eval_runtime": 936.0723,
"eval_samples_per_second": 162.783,
"eval_steps_per_second": 1.628,
"step": 28000
},
{
"epoch": 0.9705719812102791,
"grad_norm": 0.3130528926849365,
"learning_rate": 0.00035171022727272725,
"loss": 2.0078,
"step": 28100
},
{
"epoch": 0.974025974025974,
"grad_norm": 0.33664995431900024,
"learning_rate": 0.0003511420454545455,
"loss": 2.0087,
"step": 28200
},
{
"epoch": 0.9774799668416689,
"grad_norm": 0.32277122139930725,
"learning_rate": 0.0003505738636363636,
"loss": 2.0106,
"step": 28300
},
{
"epoch": 0.980933959657364,
"grad_norm": 0.33459389209747314,
"learning_rate": 0.0003500056818181818,
"loss": 2.019,
"step": 28400
},
{
"epoch": 0.9843879524730589,
"grad_norm": 0.31769075989723206,
"learning_rate": 0.00034943750000000003,
"loss": 2.0105,
"step": 28500
},
{
"epoch": 0.9878419452887538,
"grad_norm": 0.3090764582157135,
"learning_rate": 0.00034886931818181816,
"loss": 2.0121,
"step": 28600
},
{
"epoch": 0.9912959381044487,
"grad_norm": 0.3254571557044983,
"learning_rate": 0.00034830113636363634,
"loss": 2.0069,
"step": 28700
},
{
"epoch": 0.9947499309201436,
"grad_norm": 0.3087945878505707,
"learning_rate": 0.00034773295454545457,
"loss": 1.9956,
"step": 28800
},
{
"epoch": 0.9982039237358387,
"grad_norm": 0.2959256172180176,
"learning_rate": 0.00034716477272727275,
"loss": 2.0202,
"step": 28900
},
{
"epoch": 1.0016579165515336,
"grad_norm": 0.3626255691051483,
"learning_rate": 0.0003465965909090909,
"loss": 1.9733,
"step": 29000
},
{
"epoch": 1.0016579165515336,
"eval_loss": 2.3518831729888916,
"eval_runtime": 933.6764,
"eval_samples_per_second": 163.201,
"eval_steps_per_second": 1.632,
"step": 29000
},
{
"epoch": 1.0051119093672285,
"grad_norm": 0.3299137353897095,
"learning_rate": 0.0003460284090909091,
"loss": 1.9406,
"step": 29100
},
{
"epoch": 1.0085659021829234,
"grad_norm": 0.3189757168292999,
"learning_rate": 0.0003454602272727273,
"loss": 1.9547,
"step": 29200
},
{
"epoch": 1.0120198949986183,
"grad_norm": 0.33895236253738403,
"learning_rate": 0.0003448920454545454,
"loss": 1.9462,
"step": 29300
},
{
"epoch": 1.0154738878143132,
"grad_norm": 0.3329538106918335,
"learning_rate": 0.00034432386363636365,
"loss": 1.9445,
"step": 29400
},
{
"epoch": 1.0189278806300084,
"grad_norm": 0.33972305059432983,
"learning_rate": 0.00034375568181818183,
"loss": 1.9482,
"step": 29500
},
{
"epoch": 1.0223818734457033,
"grad_norm": 0.3170960545539856,
"learning_rate": 0.00034318749999999996,
"loss": 1.9322,
"step": 29600
},
{
"epoch": 1.0258358662613982,
"grad_norm": 0.3435528576374054,
"learning_rate": 0.0003426193181818182,
"loss": 1.9651,
"step": 29700
},
{
"epoch": 1.0292898590770931,
"grad_norm": 0.3118680715560913,
"learning_rate": 0.0003420511363636364,
"loss": 1.9553,
"step": 29800
},
{
"epoch": 1.032743851892788,
"grad_norm": 0.30952584743499756,
"learning_rate": 0.0003414829545454545,
"loss": 1.9594,
"step": 29900
},
{
"epoch": 1.036197844708483,
"grad_norm": 0.3205563724040985,
"learning_rate": 0.00034091477272727274,
"loss": 1.951,
"step": 30000
},
{
"epoch": 1.036197844708483,
"eval_loss": 2.3421385288238525,
"eval_runtime": 931.9003,
"eval_samples_per_second": 163.512,
"eval_steps_per_second": 1.635,
"step": 30000
},
{
"epoch": 1.039651837524178,
"grad_norm": 0.3193325400352478,
"learning_rate": 0.0003403465909090909,
"loss": 1.9781,
"step": 30100
},
{
"epoch": 1.0431058303398728,
"grad_norm": 0.3476419448852539,
"learning_rate": 0.00033977840909090915,
"loss": 1.9804,
"step": 30200
},
{
"epoch": 1.046559823155568,
"grad_norm": 0.334945946931839,
"learning_rate": 0.0003392102272727273,
"loss": 1.9956,
"step": 30300
},
{
"epoch": 1.0500138159712629,
"grad_norm": 0.3205523192882538,
"learning_rate": 0.00033864204545454546,
"loss": 1.9738,
"step": 30400
},
{
"epoch": 1.0534678087869578,
"grad_norm": 0.3324650824069977,
"learning_rate": 0.0003380738636363637,
"loss": 1.9851,
"step": 30500
},
{
"epoch": 1.0569218016026527,
"grad_norm": 0.3181789815425873,
"learning_rate": 0.0003375056818181818,
"loss": 1.9993,
"step": 30600
},
{
"epoch": 1.0603757944183476,
"grad_norm": 0.3182109594345093,
"learning_rate": 0.0003369375,
"loss": 1.9808,
"step": 30700
},
{
"epoch": 1.0638297872340425,
"grad_norm": 0.3040473163127899,
"learning_rate": 0.00033636931818181823,
"loss": 1.9697,
"step": 30800
},
{
"epoch": 1.0672837800497375,
"grad_norm": 0.3187369108200073,
"learning_rate": 0.00033580113636363636,
"loss": 1.9668,
"step": 30900
},
{
"epoch": 1.0707377728654324,
"grad_norm": 0.31757599115371704,
"learning_rate": 0.00033523295454545454,
"loss": 1.9797,
"step": 31000
},
{
"epoch": 1.0707377728654324,
"eval_loss": 2.334416151046753,
"eval_runtime": 932.4289,
"eval_samples_per_second": 163.419,
"eval_steps_per_second": 1.634,
"step": 31000
},
{
"epoch": 1.0741917656811273,
"grad_norm": 0.3234330713748932,
"learning_rate": 0.0003346647727272728,
"loss": 1.9646,
"step": 31100
},
{
"epoch": 1.0776457584968224,
"grad_norm": 0.346343457698822,
"learning_rate": 0.0003340965909090909,
"loss": 1.9633,
"step": 31200
},
{
"epoch": 1.0810997513125173,
"grad_norm": 0.33652421832084656,
"learning_rate": 0.0003335284090909091,
"loss": 1.9635,
"step": 31300
},
{
"epoch": 1.0845537441282123,
"grad_norm": 0.3355984091758728,
"learning_rate": 0.0003329602272727273,
"loss": 1.9714,
"step": 31400
},
{
"epoch": 1.0880077369439072,
"grad_norm": 0.3155532479286194,
"learning_rate": 0.0003323920454545455,
"loss": 1.9579,
"step": 31500
},
{
"epoch": 1.091461729759602,
"grad_norm": 0.3124435842037201,
"learning_rate": 0.0003318238636363636,
"loss": 1.9896,
"step": 31600
},
{
"epoch": 1.094915722575297,
"grad_norm": 0.3473125100135803,
"learning_rate": 0.00033125568181818185,
"loss": 1.9604,
"step": 31700
},
{
"epoch": 1.098369715390992,
"grad_norm": 0.33051636815071106,
"learning_rate": 0.00033068750000000004,
"loss": 1.9703,
"step": 31800
},
{
"epoch": 1.1018237082066868,
"grad_norm": 0.3092711865901947,
"learning_rate": 0.00033011931818181816,
"loss": 1.9583,
"step": 31900
},
{
"epoch": 1.105277701022382,
"grad_norm": 0.32419732213020325,
"learning_rate": 0.0003295511363636364,
"loss": 1.9603,
"step": 32000
},
{
"epoch": 1.105277701022382,
"eval_loss": 2.3255245685577393,
"eval_runtime": 932.6931,
"eval_samples_per_second": 163.373,
"eval_steps_per_second": 1.634,
"step": 32000
},
{
"epoch": 1.108731693838077,
"grad_norm": 0.332787424325943,
"learning_rate": 0.0003289829545454546,
"loss": 1.992,
"step": 32100
},
{
"epoch": 1.1121856866537718,
"grad_norm": 0.3273712992668152,
"learning_rate": 0.0003284147727272727,
"loss": 1.9632,
"step": 32200
},
{
"epoch": 1.1156396794694667,
"grad_norm": 0.32147789001464844,
"learning_rate": 0.00032784659090909094,
"loss": 1.9838,
"step": 32300
},
{
"epoch": 1.1190936722851617,
"grad_norm": 0.3235771358013153,
"learning_rate": 0.0003272784090909091,
"loss": 1.9594,
"step": 32400
},
{
"epoch": 1.1225476651008566,
"grad_norm": 0.31604549288749695,
"learning_rate": 0.00032671022727272724,
"loss": 1.9716,
"step": 32500
},
{
"epoch": 1.1260016579165515,
"grad_norm": 0.3200394809246063,
"learning_rate": 0.0003261420454545455,
"loss": 1.9598,
"step": 32600
},
{
"epoch": 1.1294556507322464,
"grad_norm": 0.31569093465805054,
"learning_rate": 0.00032557386363636366,
"loss": 1.9598,
"step": 32700
},
{
"epoch": 1.1329096435479413,
"grad_norm": 0.3108920753002167,
"learning_rate": 0.0003250056818181818,
"loss": 1.9333,
"step": 32800
},
{
"epoch": 1.1363636363636362,
"grad_norm": 0.31714916229248047,
"learning_rate": 0.0003244375,
"loss": 1.9665,
"step": 32900
},
{
"epoch": 1.1398176291793314,
"grad_norm": 0.3428919017314911,
"learning_rate": 0.0003238693181818182,
"loss": 1.9367,
"step": 33000
},
{
"epoch": 1.1398176291793314,
"eval_loss": 2.314099073410034,
"eval_runtime": 933.8501,
"eval_samples_per_second": 163.171,
"eval_steps_per_second": 1.632,
"step": 33000
},
{
"epoch": 1.1432716219950263,
"grad_norm": 0.31503021717071533,
"learning_rate": 0.0003233011363636364,
"loss": 1.952,
"step": 33100
},
{
"epoch": 1.1467256148107212,
"grad_norm": 0.3151177763938904,
"learning_rate": 0.00032273295454545456,
"loss": 1.9711,
"step": 33200
},
{
"epoch": 1.1501796076264161,
"grad_norm": 0.33299991488456726,
"learning_rate": 0.00032216477272727274,
"loss": 1.966,
"step": 33300
},
{
"epoch": 1.153633600442111,
"grad_norm": 0.35912394523620605,
"learning_rate": 0.0003215965909090909,
"loss": 1.9345,
"step": 33400
},
{
"epoch": 1.157087593257806,
"grad_norm": 0.3316855728626251,
"learning_rate": 0.0003210284090909091,
"loss": 1.9473,
"step": 33500
},
{
"epoch": 1.1605415860735009,
"grad_norm": 0.32025349140167236,
"learning_rate": 0.0003204602272727273,
"loss": 1.9512,
"step": 33600
},
{
"epoch": 1.163995578889196,
"grad_norm": 0.31566309928894043,
"learning_rate": 0.00031989204545454546,
"loss": 1.9451,
"step": 33700
},
{
"epoch": 1.167449571704891,
"grad_norm": 0.32200607657432556,
"learning_rate": 0.00031932386363636364,
"loss": 1.9382,
"step": 33800
},
{
"epoch": 1.1709035645205859,
"grad_norm": 0.3362364172935486,
"learning_rate": 0.0003187556818181818,
"loss": 1.9504,
"step": 33900
},
{
"epoch": 1.1743575573362808,
"grad_norm": 0.3156588077545166,
"learning_rate": 0.0003181875,
"loss": 1.9488,
"step": 34000
},
{
"epoch": 1.1743575573362808,
"eval_loss": 2.308772563934326,
"eval_runtime": 932.5965,
"eval_samples_per_second": 163.39,
"eval_steps_per_second": 1.634,
"step": 34000
},
{
"epoch": 1.1778115501519757,
"grad_norm": 0.3278816342353821,
"learning_rate": 0.0003176193181818182,
"loss": 1.9547,
"step": 34100
},
{
"epoch": 1.1812655429676706,
"grad_norm": 0.3398403227329254,
"learning_rate": 0.00031705113636363636,
"loss": 1.9293,
"step": 34200
},
{
"epoch": 1.1847195357833655,
"grad_norm": 0.34434807300567627,
"learning_rate": 0.00031648295454545454,
"loss": 1.9497,
"step": 34300
},
{
"epoch": 1.1881735285990604,
"grad_norm": 0.33737897872924805,
"learning_rate": 0.0003159147727272728,
"loss": 1.9471,
"step": 34400
},
{
"epoch": 1.1916275214147554,
"grad_norm": 0.3157757520675659,
"learning_rate": 0.0003153465909090909,
"loss": 1.9395,
"step": 34500
},
{
"epoch": 1.1950815142304503,
"grad_norm": 0.3554360866546631,
"learning_rate": 0.0003147784090909091,
"loss": 1.9589,
"step": 34600
},
{
"epoch": 1.1985355070461454,
"grad_norm": 0.31714192032814026,
"learning_rate": 0.0003142102272727273,
"loss": 1.9382,
"step": 34700
},
{
"epoch": 1.2019894998618403,
"grad_norm": 0.3395540416240692,
"learning_rate": 0.00031364204545454545,
"loss": 1.9245,
"step": 34800
},
{
"epoch": 1.2054434926775353,
"grad_norm": 0.38380250334739685,
"learning_rate": 0.0003130738636363636,
"loss": 1.9379,
"step": 34900
},
{
"epoch": 1.2088974854932302,
"grad_norm": 0.3237415552139282,
"learning_rate": 0.00031250568181818186,
"loss": 1.9433,
"step": 35000
},
{
"epoch": 1.2088974854932302,
"eval_loss": 2.299807548522949,
"eval_runtime": 932.2028,
"eval_samples_per_second": 163.459,
"eval_steps_per_second": 1.635,
"step": 35000
},
{
"epoch": 1.212351478308925,
"grad_norm": 0.3568110764026642,
"learning_rate": 0.0003119375,
"loss": 1.9359,
"step": 35100
},
{
"epoch": 1.21580547112462,
"grad_norm": 0.3228346109390259,
"learning_rate": 0.00031136931818181817,
"loss": 1.9398,
"step": 35200
},
{
"epoch": 1.219259463940315,
"grad_norm": 0.4409060478210449,
"learning_rate": 0.0003108011363636364,
"loss": 1.9271,
"step": 35300
},
{
"epoch": 1.22271345675601,
"grad_norm": 0.3323960602283478,
"learning_rate": 0.0003102329545454545,
"loss": 1.9351,
"step": 35400
},
{
"epoch": 1.226167449571705,
"grad_norm": 0.33286628127098083,
"learning_rate": 0.0003096647727272727,
"loss": 1.9261,
"step": 35500
},
{
"epoch": 1.2296214423874,
"grad_norm": 0.32433241605758667,
"learning_rate": 0.00030909659090909094,
"loss": 1.9235,
"step": 35600
},
{
"epoch": 1.2330754352030948,
"grad_norm": 0.33505016565322876,
"learning_rate": 0.0003085284090909091,
"loss": 1.9463,
"step": 35700
},
{
"epoch": 1.2365294280187897,
"grad_norm": 0.33028197288513184,
"learning_rate": 0.00030796022727272725,
"loss": 1.9425,
"step": 35800
},
{
"epoch": 1.2399834208344847,
"grad_norm": 0.32460519671440125,
"learning_rate": 0.0003073920454545455,
"loss": 1.9237,
"step": 35900
},
{
"epoch": 1.2434374136501796,
"grad_norm": 0.34961310029029846,
"learning_rate": 0.00030682386363636366,
"loss": 1.927,
"step": 36000
},
{
"epoch": 1.2434374136501796,
"eval_loss": 2.2926623821258545,
"eval_runtime": 933.7737,
"eval_samples_per_second": 163.184,
"eval_steps_per_second": 1.632,
"step": 36000
},
{
"epoch": 1.2468914064658745,
"grad_norm": 0.3421266973018646,
"learning_rate": 0.0003062556818181818,
"loss": 1.9172,
"step": 36100
},
{
"epoch": 1.2503453992815694,
"grad_norm": 0.31496691703796387,
"learning_rate": 0.0003056875,
"loss": 1.9283,
"step": 36200
},
{
"epoch": 1.2537993920972643,
"grad_norm": 0.3333700895309448,
"learning_rate": 0.0003051193181818182,
"loss": 1.9083,
"step": 36300
},
{
"epoch": 1.2572533849129595,
"grad_norm": 0.33785733580589294,
"learning_rate": 0.00030455113636363633,
"loss": 1.9364,
"step": 36400
},
{
"epoch": 1.2607073777286544,
"grad_norm": 0.3140362799167633,
"learning_rate": 0.00030398295454545456,
"loss": 1.9202,
"step": 36500
},
{
"epoch": 1.2641613705443493,
"grad_norm": 0.332356721162796,
"learning_rate": 0.00030341477272727275,
"loss": 1.9219,
"step": 36600
},
{
"epoch": 1.2676153633600442,
"grad_norm": 0.30988287925720215,
"learning_rate": 0.00030284659090909087,
"loss": 1.9247,
"step": 36700
},
{
"epoch": 1.2710693561757391,
"grad_norm": 0.3257978856563568,
"learning_rate": 0.0003022784090909091,
"loss": 1.9274,
"step": 36800
},
{
"epoch": 1.274523348991434,
"grad_norm": 0.3108922243118286,
"learning_rate": 0.0003017102272727273,
"loss": 1.9182,
"step": 36900
},
{
"epoch": 1.277977341807129,
"grad_norm": 0.32838690280914307,
"learning_rate": 0.00030114204545454547,
"loss": 1.921,
"step": 37000
},
{
"epoch": 1.277977341807129,
"eval_loss": 2.28013014793396,
"eval_runtime": 932.9674,
"eval_samples_per_second": 163.325,
"eval_steps_per_second": 1.633,
"step": 37000
},
{
"epoch": 1.281431334622824,
"grad_norm": 0.33043205738067627,
"learning_rate": 0.00030057386363636365,
"loss": 1.9282,
"step": 37100
},
{
"epoch": 1.284885327438519,
"grad_norm": 0.3355056047439575,
"learning_rate": 0.00030000568181818183,
"loss": 1.9146,
"step": 37200
},
{
"epoch": 1.288339320254214,
"grad_norm": 0.34499314427375793,
"learning_rate": 0.0002994375,
"loss": 1.9121,
"step": 37300
},
{
"epoch": 1.2917933130699089,
"grad_norm": 0.33857813477516174,
"learning_rate": 0.0002988693181818182,
"loss": 1.906,
"step": 37400
},
{
"epoch": 1.2952473058856038,
"grad_norm": 0.34451091289520264,
"learning_rate": 0.00029830113636363637,
"loss": 1.9069,
"step": 37500
},
{
"epoch": 1.2987012987012987,
"grad_norm": 0.31819987297058105,
"learning_rate": 0.00029773295454545455,
"loss": 1.905,
"step": 37600
},
{
"epoch": 1.3021552915169936,
"grad_norm": 0.32892873883247375,
"learning_rate": 0.00029716477272727273,
"loss": 1.9358,
"step": 37700
},
{
"epoch": 1.3056092843326885,
"grad_norm": 0.3139948844909668,
"learning_rate": 0.0002965965909090909,
"loss": 1.917,
"step": 37800
},
{
"epoch": 1.3090632771483834,
"grad_norm": 0.3358207046985626,
"learning_rate": 0.0002960284090909091,
"loss": 1.8979,
"step": 37900
},
{
"epoch": 1.3125172699640784,
"grad_norm": 0.3274485468864441,
"learning_rate": 0.00029546022727272727,
"loss": 1.9147,
"step": 38000
},
{
"epoch": 1.3125172699640784,
"eval_loss": 2.2716429233551025,
"eval_runtime": 932.4022,
"eval_samples_per_second": 163.424,
"eval_steps_per_second": 1.634,
"step": 38000
},
{
"epoch": 1.3159712627797735,
"grad_norm": 0.3326353430747986,
"learning_rate": 0.00029489204545454545,
"loss": 1.9151,
"step": 38100
},
{
"epoch": 1.3194252555954684,
"grad_norm": 0.33048099279403687,
"learning_rate": 0.00029432386363636363,
"loss": 1.9003,
"step": 38200
},
{
"epoch": 1.3228792484111633,
"grad_norm": 0.3198449909687042,
"learning_rate": 0.0002937556818181818,
"loss": 1.9012,
"step": 38300
},
{
"epoch": 1.3263332412268583,
"grad_norm": 0.3347759246826172,
"learning_rate": 0.0002931875,
"loss": 1.889,
"step": 38400
},
{
"epoch": 1.3297872340425532,
"grad_norm": 0.344235360622406,
"learning_rate": 0.00029261931818181817,
"loss": 1.9096,
"step": 38500
},
{
"epoch": 1.333241226858248,
"grad_norm": 0.34197336435317993,
"learning_rate": 0.0002920511363636364,
"loss": 1.9083,
"step": 38600
},
{
"epoch": 1.336695219673943,
"grad_norm": 0.3257678747177124,
"learning_rate": 0.00029148295454545453,
"loss": 1.9007,
"step": 38700
},
{
"epoch": 1.3401492124896381,
"grad_norm": 0.3299179971218109,
"learning_rate": 0.0002909147727272727,
"loss": 1.8992,
"step": 38800
},
{
"epoch": 1.343603205305333,
"grad_norm": 0.32206007838249207,
"learning_rate": 0.00029034659090909095,
"loss": 1.8853,
"step": 38900
},
{
"epoch": 1.347057198121028,
"grad_norm": 0.3281271159648895,
"learning_rate": 0.0002897784090909091,
"loss": 1.9075,
"step": 39000
},
{
"epoch": 1.347057198121028,
"eval_loss": 2.266144275665283,
"eval_runtime": 933.0379,
"eval_samples_per_second": 163.313,
"eval_steps_per_second": 1.633,
"step": 39000
},
{
"epoch": 1.350511190936723,
"grad_norm": 0.32982590794563293,
"learning_rate": 0.00028921022727272725,
"loss": 1.9255,
"step": 39100
},
{
"epoch": 1.3539651837524178,
"grad_norm": 0.33906838297843933,
"learning_rate": 0.0002886420454545455,
"loss": 1.9119,
"step": 39200
},
{
"epoch": 1.3574191765681127,
"grad_norm": 0.32768332958221436,
"learning_rate": 0.0002880738636363636,
"loss": 1.8838,
"step": 39300
},
{
"epoch": 1.3608731693838076,
"grad_norm": 0.3550179600715637,
"learning_rate": 0.0002875056818181818,
"loss": 1.8889,
"step": 39400
},
{
"epoch": 1.3643271621995026,
"grad_norm": 0.32649099826812744,
"learning_rate": 0.00028693750000000003,
"loss": 1.8983,
"step": 39500
},
{
"epoch": 1.3677811550151975,
"grad_norm": 0.33756542205810547,
"learning_rate": 0.00028636931818181816,
"loss": 1.8982,
"step": 39600
},
{
"epoch": 1.3712351478308924,
"grad_norm": 0.3554450571537018,
"learning_rate": 0.00028580113636363634,
"loss": 1.8831,
"step": 39700
},
{
"epoch": 1.3746891406465875,
"grad_norm": 0.3348751962184906,
"learning_rate": 0.00028523295454545457,
"loss": 1.9022,
"step": 39800
},
{
"epoch": 1.3781431334622825,
"grad_norm": 0.3384929895401001,
"learning_rate": 0.00028466477272727275,
"loss": 1.8973,
"step": 39900
},
{
"epoch": 1.3815971262779774,
"grad_norm": 0.3346748352050781,
"learning_rate": 0.0002840965909090909,
"loss": 1.897,
"step": 40000
},
{
"epoch": 1.3815971262779774,
"eval_loss": 2.258094072341919,
"eval_runtime": 932.2639,
"eval_samples_per_second": 163.448,
"eval_steps_per_second": 1.635,
"step": 40000
},
{
"epoch": 1.3850511190936723,
"grad_norm": 0.3488174378871918,
"learning_rate": 0.0002835284090909091,
"loss": 1.899,
"step": 40100
},
{
"epoch": 1.3885051119093672,
"grad_norm": 0.357048898935318,
"learning_rate": 0.0002829602272727273,
"loss": 1.8874,
"step": 40200
},
{
"epoch": 1.3919591047250621,
"grad_norm": 0.34619608521461487,
"learning_rate": 0.0002823920454545454,
"loss": 1.8971,
"step": 40300
},
{
"epoch": 1.395413097540757,
"grad_norm": 0.3450053930282593,
"learning_rate": 0.00028182386363636365,
"loss": 1.8951,
"step": 40400
},
{
"epoch": 1.3988670903564522,
"grad_norm": 0.3244158923625946,
"learning_rate": 0.00028125568181818183,
"loss": 1.887,
"step": 40500
},
{
"epoch": 1.402321083172147,
"grad_norm": 0.36656075716018677,
"learning_rate": 0.00028068749999999996,
"loss": 1.8961,
"step": 40600
},
{
"epoch": 1.405775075987842,
"grad_norm": 0.3427944481372833,
"learning_rate": 0.0002801193181818182,
"loss": 1.8801,
"step": 40700
},
{
"epoch": 1.409229068803537,
"grad_norm": 0.3511246144771576,
"learning_rate": 0.0002795511363636364,
"loss": 1.8856,
"step": 40800
},
{
"epoch": 1.4126830616192319,
"grad_norm": 0.34178775548934937,
"learning_rate": 0.0002789829545454545,
"loss": 1.8888,
"step": 40900
},
{
"epoch": 1.4161370544349268,
"grad_norm": 0.35453692078590393,
"learning_rate": 0.00027841477272727273,
"loss": 1.8867,
"step": 41000
},
{
"epoch": 1.4161370544349268,
"eval_loss": 2.2483203411102295,
"eval_runtime": 932.6422,
"eval_samples_per_second": 163.382,
"eval_steps_per_second": 1.634,
"step": 41000
},
{
"epoch": 1.4195910472506217,
"grad_norm": 0.38095447421073914,
"learning_rate": 0.0002778465909090909,
"loss": 1.8847,
"step": 41100
},
{
"epoch": 1.4230450400663166,
"grad_norm": 0.3299073576927185,
"learning_rate": 0.00027727840909090915,
"loss": 1.8848,
"step": 41200
},
{
"epoch": 1.4264990328820115,
"grad_norm": 0.3188841640949249,
"learning_rate": 0.0002767102272727273,
"loss": 1.9009,
"step": 41300
},
{
"epoch": 1.4299530256977064,
"grad_norm": 0.3500712811946869,
"learning_rate": 0.00027614204545454546,
"loss": 1.885,
"step": 41400
},
{
"epoch": 1.4334070185134014,
"grad_norm": 0.34655386209487915,
"learning_rate": 0.0002755738636363637,
"loss": 1.8862,
"step": 41500
},
{
"epoch": 1.4368610113290965,
"grad_norm": 0.34666162729263306,
"learning_rate": 0.0002750056818181818,
"loss": 1.8859,
"step": 41600
},
{
"epoch": 1.4403150041447914,
"grad_norm": 0.3630838692188263,
"learning_rate": 0.0002744375,
"loss": 1.8796,
"step": 41700
},
{
"epoch": 1.4437689969604863,
"grad_norm": 0.40710654854774475,
"learning_rate": 0.00027386931818181823,
"loss": 1.8822,
"step": 41800
},
{
"epoch": 1.4472229897761812,
"grad_norm": 0.33801448345184326,
"learning_rate": 0.00027330113636363636,
"loss": 1.8788,
"step": 41900
},
{
"epoch": 1.4506769825918762,
"grad_norm": 0.3448280692100525,
"learning_rate": 0.00027273295454545454,
"loss": 1.8685,
"step": 42000
},
{
"epoch": 1.4506769825918762,
"eval_loss": 2.24458909034729,
"eval_runtime": 932.8306,
"eval_samples_per_second": 163.349,
"eval_steps_per_second": 1.634,
"step": 42000
},
{
"epoch": 1.454130975407571,
"grad_norm": 0.35361775755882263,
"learning_rate": 0.00027216477272727277,
"loss": 1.8657,
"step": 42100
},
{
"epoch": 1.4575849682232662,
"grad_norm": 0.3468896448612213,
"learning_rate": 0.0002715965909090909,
"loss": 1.8701,
"step": 42200
},
{
"epoch": 1.4610389610389611,
"grad_norm": 0.3501305878162384,
"learning_rate": 0.0002710284090909091,
"loss": 1.8729,
"step": 42300
},
{
"epoch": 1.464492953854656,
"grad_norm": 0.3370625078678131,
"learning_rate": 0.0002704602272727273,
"loss": 1.8723,
"step": 42400
},
{
"epoch": 1.467946946670351,
"grad_norm": 0.33096930384635925,
"learning_rate": 0.0002698920454545455,
"loss": 1.8642,
"step": 42500
},
{
"epoch": 1.471400939486046,
"grad_norm": 0.3265809118747711,
"learning_rate": 0.0002693238636363636,
"loss": 1.8757,
"step": 42600
},
{
"epoch": 1.4748549323017408,
"grad_norm": 0.3586813509464264,
"learning_rate": 0.00026875568181818185,
"loss": 1.8639,
"step": 42700
},
{
"epoch": 1.4783089251174357,
"grad_norm": 0.3498245179653168,
"learning_rate": 0.00026818750000000003,
"loss": 1.888,
"step": 42800
},
{
"epoch": 1.4817629179331306,
"grad_norm": 0.34165388345718384,
"learning_rate": 0.00026761931818181816,
"loss": 1.8644,
"step": 42900
},
{
"epoch": 1.4852169107488256,
"grad_norm": 0.32099393010139465,
"learning_rate": 0.0002670511363636364,
"loss": 1.8747,
"step": 43000
},
{
"epoch": 1.4852169107488256,
"eval_loss": 2.237309455871582,
"eval_runtime": 932.7872,
"eval_samples_per_second": 163.357,
"eval_steps_per_second": 1.634,
"step": 43000
},
{
"epoch": 1.4886709035645205,
"grad_norm": 0.4318270981311798,
"learning_rate": 0.0002664829545454546,
"loss": 1.8684,
"step": 43100
},
{
"epoch": 1.4921248963802154,
"grad_norm": 0.34946203231811523,
"learning_rate": 0.0002659147727272727,
"loss": 1.8813,
"step": 43200
},
{
"epoch": 1.4955788891959105,
"grad_norm": 0.33623960614204407,
"learning_rate": 0.00026534659090909094,
"loss": 1.8566,
"step": 43300
},
{
"epoch": 1.4990328820116054,
"grad_norm": 0.3431924283504486,
"learning_rate": 0.0002647784090909091,
"loss": 1.8555,
"step": 43400
},
{
"epoch": 1.5024868748273004,
"grad_norm": 0.3669569492340088,
"learning_rate": 0.00026421022727272724,
"loss": 1.8656,
"step": 43500
},
{
"epoch": 1.5059408676429953,
"grad_norm": 0.3411414623260498,
"learning_rate": 0.0002636420454545455,
"loss": 1.8533,
"step": 43600
},
{
"epoch": 1.5093948604586902,
"grad_norm": 0.348023384809494,
"learning_rate": 0.00026307386363636366,
"loss": 1.8583,
"step": 43700
},
{
"epoch": 1.5128488532743853,
"grad_norm": 0.3822565972805023,
"learning_rate": 0.0002625056818181818,
"loss": 1.8669,
"step": 43800
},
{
"epoch": 1.5163028460900803,
"grad_norm": 0.34821194410324097,
"learning_rate": 0.0002619375,
"loss": 1.8513,
"step": 43900
},
{
"epoch": 1.5197568389057752,
"grad_norm": 0.35662829875946045,
"learning_rate": 0.0002613693181818182,
"loss": 1.8699,
"step": 44000
},
{
"epoch": 1.5197568389057752,
"eval_loss": 2.2242226600646973,
"eval_runtime": 933.1522,
"eval_samples_per_second": 163.293,
"eval_steps_per_second": 1.633,
"step": 44000
},
{
"epoch": 1.52321083172147,
"grad_norm": 0.34279394149780273,
"learning_rate": 0.0002608011363636364,
"loss": 1.8583,
"step": 44100
},
{
"epoch": 1.526664824537165,
"grad_norm": 0.35233989357948303,
"learning_rate": 0.00026023295454545456,
"loss": 1.8434,
"step": 44200
},
{
"epoch": 1.53011881735286,
"grad_norm": 0.34149396419525146,
"learning_rate": 0.00025966477272727274,
"loss": 1.8593,
"step": 44300
},
{
"epoch": 1.5335728101685548,
"grad_norm": 0.35298213362693787,
"learning_rate": 0.0002590965909090909,
"loss": 1.8439,
"step": 44400
},
{
"epoch": 1.5370268029842498,
"grad_norm": 0.3766247630119324,
"learning_rate": 0.0002585284090909091,
"loss": 1.8645,
"step": 44500
},
{
"epoch": 1.5404807957999447,
"grad_norm": 0.3492392301559448,
"learning_rate": 0.0002579602272727273,
"loss": 1.8551,
"step": 44600
},
{
"epoch": 1.5439347886156396,
"grad_norm": 0.324101060628891,
"learning_rate": 0.00025739204545454546,
"loss": 1.8657,
"step": 44700
},
{
"epoch": 1.5473887814313345,
"grad_norm": 0.3346399664878845,
"learning_rate": 0.00025682386363636364,
"loss": 1.8483,
"step": 44800
},
{
"epoch": 1.5508427742470294,
"grad_norm": 0.35447120666503906,
"learning_rate": 0.0002562556818181818,
"loss": 1.8424,
"step": 44900
},
{
"epoch": 1.5542967670627243,
"grad_norm": 0.3583132326602936,
"learning_rate": 0.0002556875,
"loss": 1.8619,
"step": 45000
},
{
"epoch": 1.5542967670627243,
"eval_loss": 2.2166972160339355,
"eval_runtime": 933.428,
"eval_samples_per_second": 163.245,
"eval_steps_per_second": 1.633,
"step": 45000
},
{
"epoch": 1.5577507598784195,
"grad_norm": 0.34049317240715027,
"learning_rate": 0.0002551193181818182,
"loss": 1.8577,
"step": 45100
},
{
"epoch": 1.5612047526941144,
"grad_norm": 0.3376822769641876,
"learning_rate": 0.00025455113636363636,
"loss": 1.8448,
"step": 45200
},
{
"epoch": 1.5646587455098093,
"grad_norm": 0.3559693396091461,
"learning_rate": 0.00025398295454545454,
"loss": 1.8366,
"step": 45300
},
{
"epoch": 1.5681127383255042,
"grad_norm": 0.34435904026031494,
"learning_rate": 0.0002534147727272728,
"loss": 1.8485,
"step": 45400
},
{
"epoch": 1.5715667311411994,
"grad_norm": 0.35500675439834595,
"learning_rate": 0.0002528465909090909,
"loss": 1.8516,
"step": 45500
},
{
"epoch": 1.5750207239568943,
"grad_norm": 0.34272322058677673,
"learning_rate": 0.0002522784090909091,
"loss": 1.8296,
"step": 45600
},
{
"epoch": 1.5784747167725892,
"grad_norm": 0.36497625708580017,
"learning_rate": 0.0002517102272727273,
"loss": 1.8255,
"step": 45700
},
{
"epoch": 1.5819287095882841,
"grad_norm": 0.31943902373313904,
"learning_rate": 0.00025114204545454544,
"loss": 1.8657,
"step": 45800
},
{
"epoch": 1.585382702403979,
"grad_norm": 0.3567992150783539,
"learning_rate": 0.0002505738636363636,
"loss": 1.8727,
"step": 45900
},
{
"epoch": 1.588836695219674,
"grad_norm": 0.3523275554180145,
"learning_rate": 0.00025000568181818186,
"loss": 1.8348,
"step": 46000
},
{
"epoch": 1.588836695219674,
"eval_loss": 2.211845874786377,
"eval_runtime": 932.6736,
"eval_samples_per_second": 163.377,
"eval_steps_per_second": 1.634,
"step": 46000
},
{
"epoch": 1.5922906880353689,
"grad_norm": 0.3533009886741638,
"learning_rate": 0.0002494375,
"loss": 1.8324,
"step": 46100
},
{
"epoch": 1.5957446808510638,
"grad_norm": 0.35436585545539856,
"learning_rate": 0.00024886931818181817,
"loss": 1.8329,
"step": 46200
},
{
"epoch": 1.5991986736667587,
"grad_norm": 0.35463017225265503,
"learning_rate": 0.0002483011363636364,
"loss": 1.848,
"step": 46300
},
{
"epoch": 1.6026526664824536,
"grad_norm": 0.33948197960853577,
"learning_rate": 0.0002477329545454546,
"loss": 1.8416,
"step": 46400
},
{
"epoch": 1.6061066592981486,
"grad_norm": 0.3487997353076935,
"learning_rate": 0.0002471647727272727,
"loss": 1.8331,
"step": 46500
},
{
"epoch": 1.6095606521138435,
"grad_norm": 0.3553692698478699,
"learning_rate": 0.00024659659090909094,
"loss": 1.8443,
"step": 46600
},
{
"epoch": 1.6130146449295384,
"grad_norm": 0.3699355721473694,
"learning_rate": 0.0002460284090909091,
"loss": 1.8396,
"step": 46700
},
{
"epoch": 1.6164686377452335,
"grad_norm": 0.33341851830482483,
"learning_rate": 0.00024546022727272725,
"loss": 1.8266,
"step": 46800
},
{
"epoch": 1.6199226305609284,
"grad_norm": 0.3703523874282837,
"learning_rate": 0.0002448920454545455,
"loss": 1.8356,
"step": 46900
},
{
"epoch": 1.6233766233766234,
"grad_norm": 0.34331998229026794,
"learning_rate": 0.00024432386363636366,
"loss": 1.8506,
"step": 47000
},
{
"epoch": 1.6233766233766234,
"eval_loss": 2.201261281967163,
"eval_runtime": 932.8465,
"eval_samples_per_second": 163.346,
"eval_steps_per_second": 1.634,
"step": 47000
},
{
"epoch": 1.6268306161923183,
"grad_norm": 0.3524048924446106,
"learning_rate": 0.00024375568181818184,
"loss": 1.8276,
"step": 47100
},
{
"epoch": 1.6302846090080134,
"grad_norm": 0.6397112607955933,
"learning_rate": 0.0002431875,
"loss": 1.8358,
"step": 47200
},
{
"epoch": 1.6337386018237083,
"grad_norm": 0.3624354600906372,
"learning_rate": 0.00024261931818181818,
"loss": 1.819,
"step": 47300
},
{
"epoch": 1.6371925946394033,
"grad_norm": 0.3678456246852875,
"learning_rate": 0.00024205113636363638,
"loss": 1.8151,
"step": 47400
},
{
"epoch": 1.6406465874550982,
"grad_norm": 0.38248035311698914,
"learning_rate": 0.00024148295454545454,
"loss": 1.8303,
"step": 47500
},
{
"epoch": 1.644100580270793,
"grad_norm": 0.36703070998191833,
"learning_rate": 0.00024091477272727272,
"loss": 1.8375,
"step": 47600
},
{
"epoch": 1.647554573086488,
"grad_norm": 0.34606924653053284,
"learning_rate": 0.00024034659090909092,
"loss": 1.8261,
"step": 47700
},
{
"epoch": 1.651008565902183,
"grad_norm": 0.35459455847740173,
"learning_rate": 0.00023977840909090908,
"loss": 1.8541,
"step": 47800
},
{
"epoch": 1.6544625587178778,
"grad_norm": 0.35106080770492554,
"learning_rate": 0.00023921022727272728,
"loss": 1.8434,
"step": 47900
},
{
"epoch": 1.6579165515335728,
"grad_norm": 0.3380804657936096,
"learning_rate": 0.00023864204545454547,
"loss": 1.8323,
"step": 48000
},
{
"epoch": 1.6579165515335728,
"eval_loss": 2.1915159225463867,
"eval_runtime": 932.8372,
"eval_samples_per_second": 163.348,
"eval_steps_per_second": 1.634,
"step": 48000
},
{
"epoch": 1.6613705443492677,
"grad_norm": 0.36180025339126587,
"learning_rate": 0.00023807386363636362,
"loss": 1.8347,
"step": 48100
},
{
"epoch": 1.6648245371649626,
"grad_norm": 0.33836793899536133,
"learning_rate": 0.00023750568181818183,
"loss": 1.8169,
"step": 48200
},
{
"epoch": 1.6682785299806575,
"grad_norm": 0.34874165058135986,
"learning_rate": 0.0002369375,
"loss": 1.8206,
"step": 48300
},
{
"epoch": 1.6717325227963524,
"grad_norm": 0.3255716562271118,
"learning_rate": 0.0002363693181818182,
"loss": 1.8319,
"step": 48400
},
{
"epoch": 1.6751865156120476,
"grad_norm": 0.3886810839176178,
"learning_rate": 0.00023580113636363637,
"loss": 1.8208,
"step": 48500
},
{
"epoch": 1.6786405084277425,
"grad_norm": 0.38673707842826843,
"learning_rate": 0.00023523295454545455,
"loss": 1.8294,
"step": 48600
},
{
"epoch": 1.6820945012434374,
"grad_norm": 0.3884912431240082,
"learning_rate": 0.00023466477272727273,
"loss": 1.8137,
"step": 48700
},
{
"epoch": 1.6855484940591323,
"grad_norm": 0.35155996680259705,
"learning_rate": 0.0002340965909090909,
"loss": 1.8135,
"step": 48800
},
{
"epoch": 1.6890024868748275,
"grad_norm": 0.34583061933517456,
"learning_rate": 0.0002335284090909091,
"loss": 1.8125,
"step": 48900
},
{
"epoch": 1.6924564796905224,
"grad_norm": 0.3412420451641083,
"learning_rate": 0.00023296022727272727,
"loss": 1.8238,
"step": 49000
},
{
"epoch": 1.6924564796905224,
"eval_loss": 2.1860053539276123,
"eval_runtime": 932.5574,
"eval_samples_per_second": 163.397,
"eval_steps_per_second": 1.634,
"step": 49000
},
{
"epoch": 1.6959104725062173,
"grad_norm": 0.36108842492103577,
"learning_rate": 0.00023239204545454545,
"loss": 1.8195,
"step": 49100
},
{
"epoch": 1.6993644653219122,
"grad_norm": 0.3617706000804901,
"learning_rate": 0.00023182386363636366,
"loss": 1.8032,
"step": 49200
},
{
"epoch": 1.7028184581376071,
"grad_norm": 0.36145681142807007,
"learning_rate": 0.00023125568181818184,
"loss": 1.8441,
"step": 49300
},
{
"epoch": 1.706272450953302,
"grad_norm": 0.3923262059688568,
"learning_rate": 0.0002306875,
"loss": 1.8136,
"step": 49400
},
{
"epoch": 1.709726443768997,
"grad_norm": 0.3287799656391144,
"learning_rate": 0.0002301193181818182,
"loss": 1.8211,
"step": 49500
},
{
"epoch": 1.7131804365846919,
"grad_norm": 0.35752880573272705,
"learning_rate": 0.00022955113636363638,
"loss": 1.8108,
"step": 49600
},
{
"epoch": 1.7166344294003868,
"grad_norm": 0.3737923204898834,
"learning_rate": 0.00022898295454545456,
"loss": 1.8033,
"step": 49700
},
{
"epoch": 1.7200884222160817,
"grad_norm": 0.374796599149704,
"learning_rate": 0.00022841477272727274,
"loss": 1.8097,
"step": 49800
},
{
"epoch": 1.7235424150317766,
"grad_norm": 0.386203408241272,
"learning_rate": 0.00022784659090909092,
"loss": 1.811,
"step": 49900
},
{
"epoch": 1.7269964078474715,
"grad_norm": 0.3648054003715515,
"learning_rate": 0.0002272784090909091,
"loss": 1.8061,
"step": 50000
},
{
"epoch": 1.7269964078474715,
"eval_loss": 2.1760547161102295,
"eval_runtime": 932.5357,
"eval_samples_per_second": 163.401,
"eval_steps_per_second": 1.634,
"step": 50000
}
],
"logging_steps": 100,
"max_steps": 90000,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.64606111435273e+18,
"train_batch_size": 100,
"trial_name": null,
"trial_params": null
}