{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.4831883251890403,
"eval_steps": 500,
"global_step": 112000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"learning_rate": 4.996689311774132e-05,
"loss": 2.8505,
"step": 500
},
{
"epoch": 0.01,
"learning_rate": 4.9933786235482635e-05,
"loss": 2.0552,
"step": 1000
},
{
"epoch": 0.02,
"learning_rate": 4.990067935322395e-05,
"loss": 1.8823,
"step": 1500
},
{
"epoch": 0.03,
"learning_rate": 4.986757247096527e-05,
"loss": 1.7906,
"step": 2000
},
{
"epoch": 0.03,
"learning_rate": 4.9834465588706583e-05,
"loss": 1.7442,
"step": 2500
},
{
"epoch": 0.04,
"learning_rate": 4.98013587064479e-05,
"loss": 1.6741,
"step": 3000
},
{
"epoch": 0.05,
"learning_rate": 4.9768251824189216e-05,
"loss": 1.6635,
"step": 3500
},
{
"epoch": 0.05,
"learning_rate": 4.9735144941930525e-05,
"loss": 1.6282,
"step": 4000
},
{
"epoch": 0.06,
"learning_rate": 4.970203805967185e-05,
"loss": 1.5507,
"step": 4500
},
{
"epoch": 0.07,
"learning_rate": 4.9668931177413165e-05,
"loss": 1.5326,
"step": 5000
},
{
"epoch": 0.07,
"learning_rate": 4.963582429515448e-05,
"loss": 1.5385,
"step": 5500
},
{
"epoch": 0.08,
"learning_rate": 4.96027174128958e-05,
"loss": 1.5191,
"step": 6000
},
{
"epoch": 0.09,
"learning_rate": 4.956961053063711e-05,
"loss": 1.5043,
"step": 6500
},
{
"epoch": 0.09,
"learning_rate": 4.953650364837843e-05,
"loss": 1.4498,
"step": 7000
},
{
"epoch": 0.1,
"learning_rate": 4.950339676611974e-05,
"loss": 1.4116,
"step": 7500
},
{
"epoch": 0.11,
"learning_rate": 4.947028988386106e-05,
"loss": 1.4536,
"step": 8000
},
{
"epoch": 0.11,
"learning_rate": 4.943718300160238e-05,
"loss": 1.4486,
"step": 8500
},
{
"epoch": 0.12,
"learning_rate": 4.940407611934369e-05,
"loss": 1.3881,
"step": 9000
},
{
"epoch": 0.13,
"learning_rate": 4.937096923708501e-05,
"loss": 1.3976,
"step": 9500
},
{
"epoch": 0.13,
"learning_rate": 4.933786235482633e-05,
"loss": 1.4139,
"step": 10000
},
{
"epoch": 0.14,
"learning_rate": 4.9304755472567636e-05,
"loss": 1.3606,
"step": 10500
},
{
"epoch": 0.15,
"learning_rate": 4.927164859030895e-05,
"loss": 1.3726,
"step": 11000
},
{
"epoch": 0.15,
"learning_rate": 4.9238541708050275e-05,
"loss": 1.3495,
"step": 11500
},
{
"epoch": 0.16,
"learning_rate": 4.9205434825791585e-05,
"loss": 1.3682,
"step": 12000
},
{
"epoch": 0.17,
"learning_rate": 4.91723279435329e-05,
"loss": 1.3387,
"step": 12500
},
{
"epoch": 0.17,
"learning_rate": 4.9139221061274224e-05,
"loss": 1.3478,
"step": 13000
},
{
"epoch": 0.18,
"learning_rate": 4.910611417901554e-05,
"loss": 1.3471,
"step": 13500
},
{
"epoch": 0.19,
"learning_rate": 4.907300729675685e-05,
"loss": 1.3502,
"step": 14000
},
{
"epoch": 0.19,
"learning_rate": 4.9039900414498166e-05,
"loss": 1.3192,
"step": 14500
},
{
"epoch": 0.2,
"learning_rate": 4.900679353223949e-05,
"loss": 1.3058,
"step": 15000
},
{
"epoch": 0.21,
"learning_rate": 4.89736866499808e-05,
"loss": 1.2715,
"step": 15500
},
{
"epoch": 0.21,
"learning_rate": 4.8940579767722115e-05,
"loss": 1.2697,
"step": 16000
},
{
"epoch": 0.22,
"learning_rate": 4.890747288546344e-05,
"loss": 1.2916,
"step": 16500
},
{
"epoch": 0.23,
"learning_rate": 4.887436600320475e-05,
"loss": 1.3412,
"step": 17000
},
{
"epoch": 0.23,
"learning_rate": 4.884125912094606e-05,
"loss": 1.2475,
"step": 17500
},
{
"epoch": 0.24,
"learning_rate": 4.880815223868738e-05,
"loss": 1.2941,
"step": 18000
},
{
"epoch": 0.24,
"learning_rate": 4.8775045356428696e-05,
"loss": 1.2782,
"step": 18500
},
{
"epoch": 0.25,
"learning_rate": 4.874193847417001e-05,
"loss": 1.2175,
"step": 19000
},
{
"epoch": 0.26,
"learning_rate": 4.870883159191133e-05,
"loss": 1.2491,
"step": 19500
},
{
"epoch": 0.26,
"learning_rate": 4.8675724709652644e-05,
"loss": 1.2523,
"step": 20000
},
{
"epoch": 0.27,
"learning_rate": 4.864261782739396e-05,
"loss": 1.2281,
"step": 20500
},
{
"epoch": 0.28,
"learning_rate": 4.860951094513528e-05,
"loss": 1.2957,
"step": 21000
},
{
"epoch": 0.28,
"learning_rate": 4.857640406287659e-05,
"loss": 1.2399,
"step": 21500
},
{
"epoch": 0.29,
"learning_rate": 4.854329718061791e-05,
"loss": 1.1924,
"step": 22000
},
{
"epoch": 0.3,
"learning_rate": 4.8510190298359225e-05,
"loss": 1.2297,
"step": 22500
},
{
"epoch": 0.3,
"learning_rate": 4.847708341610054e-05,
"loss": 1.2049,
"step": 23000
},
{
"epoch": 0.31,
"learning_rate": 4.844397653384186e-05,
"loss": 1.239,
"step": 23500
},
{
"epoch": 0.32,
"learning_rate": 4.8410869651583174e-05,
"loss": 1.2048,
"step": 24000
},
{
"epoch": 0.32,
"learning_rate": 4.837776276932449e-05,
"loss": 1.2137,
"step": 24500
},
{
"epoch": 0.33,
"learning_rate": 4.8344655887065806e-05,
"loss": 1.2287,
"step": 25000
},
{
"epoch": 0.34,
"learning_rate": 4.831154900480712e-05,
"loss": 1.2014,
"step": 25500
},
{
"epoch": 0.34,
"learning_rate": 4.827844212254844e-05,
"loss": 1.1585,
"step": 26000
},
{
"epoch": 0.35,
"learning_rate": 4.8245335240289755e-05,
"loss": 1.1711,
"step": 26500
},
{
"epoch": 0.36,
"learning_rate": 4.821222835803107e-05,
"loss": 1.1586,
"step": 27000
},
{
"epoch": 0.36,
"learning_rate": 4.817912147577239e-05,
"loss": 1.2042,
"step": 27500
},
{
"epoch": 0.37,
"learning_rate": 4.81460145935137e-05,
"loss": 1.1517,
"step": 28000
},
{
"epoch": 0.38,
"learning_rate": 4.811290771125502e-05,
"loss": 1.1824,
"step": 28500
},
{
"epoch": 0.38,
"learning_rate": 4.8079800828996336e-05,
"loss": 1.1687,
"step": 29000
},
{
"epoch": 0.39,
"learning_rate": 4.8046693946737646e-05,
"loss": 1.1349,
"step": 29500
},
{
"epoch": 0.4,
"learning_rate": 4.801358706447897e-05,
"loss": 1.178,
"step": 30000
},
{
"epoch": 0.4,
"learning_rate": 4.7980480182220285e-05,
"loss": 1.1487,
"step": 30500
},
{
"epoch": 0.41,
"learning_rate": 4.79473732999616e-05,
"loss": 1.1274,
"step": 31000
},
{
"epoch": 0.42,
"learning_rate": 4.791426641770291e-05,
"loss": 1.1583,
"step": 31500
},
{
"epoch": 0.42,
"learning_rate": 4.7881159535444233e-05,
"loss": 1.0724,
"step": 32000
},
{
"epoch": 0.43,
"learning_rate": 4.784805265318555e-05,
"loss": 1.1455,
"step": 32500
},
{
"epoch": 0.44,
"learning_rate": 4.781494577092686e-05,
"loss": 1.0937,
"step": 33000
},
{
"epoch": 0.44,
"learning_rate": 4.778183888866818e-05,
"loss": 1.1291,
"step": 33500
},
{
"epoch": 0.45,
"learning_rate": 4.77487320064095e-05,
"loss": 1.0853,
"step": 34000
},
{
"epoch": 0.46,
"learning_rate": 4.771562512415081e-05,
"loss": 1.1579,
"step": 34500
},
{
"epoch": 0.46,
"learning_rate": 4.7682518241892124e-05,
"loss": 1.1043,
"step": 35000
},
{
"epoch": 0.47,
"learning_rate": 4.764941135963345e-05,
"loss": 1.1075,
"step": 35500
},
{
"epoch": 0.48,
"learning_rate": 4.7616304477374756e-05,
"loss": 1.0889,
"step": 36000
},
{
"epoch": 0.48,
"learning_rate": 4.758319759511607e-05,
"loss": 1.0683,
"step": 36500
},
{
"epoch": 0.49,
"learning_rate": 4.7550090712857396e-05,
"loss": 1.088,
"step": 37000
},
{
"epoch": 0.5,
"learning_rate": 4.7516983830598705e-05,
"loss": 1.0876,
"step": 37500
},
{
"epoch": 0.5,
"learning_rate": 4.748387694834002e-05,
"loss": 1.0683,
"step": 38000
},
{
"epoch": 0.51,
"learning_rate": 4.745077006608134e-05,
"loss": 1.1057,
"step": 38500
},
{
"epoch": 0.52,
"learning_rate": 4.7417663183822654e-05,
"loss": 1.0698,
"step": 39000
},
{
"epoch": 0.52,
"learning_rate": 4.738455630156397e-05,
"loss": 1.0771,
"step": 39500
},
{
"epoch": 0.53,
"learning_rate": 4.7351449419305286e-05,
"loss": 1.0594,
"step": 40000
},
{
"epoch": 0.54,
"learning_rate": 4.731834253704661e-05,
"loss": 1.0968,
"step": 40500
},
{
"epoch": 0.54,
"learning_rate": 4.728523565478792e-05,
"loss": 1.0522,
"step": 41000
},
{
"epoch": 0.55,
"learning_rate": 4.7252128772529235e-05,
"loss": 1.0144,
"step": 41500
},
{
"epoch": 0.56,
"learning_rate": 4.721902189027055e-05,
"loss": 1.1111,
"step": 42000
},
{
"epoch": 0.56,
"learning_rate": 4.718591500801187e-05,
"loss": 1.0307,
"step": 42500
},
{
"epoch": 0.57,
"learning_rate": 4.715280812575318e-05,
"loss": 1.0248,
"step": 43000
},
{
"epoch": 0.58,
"learning_rate": 4.71197012434945e-05,
"loss": 1.0452,
"step": 43500
},
{
"epoch": 0.58,
"learning_rate": 4.7086594361235816e-05,
"loss": 1.0108,
"step": 44000
},
{
"epoch": 0.59,
"learning_rate": 4.705348747897713e-05,
"loss": 0.9867,
"step": 44500
},
{
"epoch": 0.6,
"learning_rate": 4.702038059671845e-05,
"loss": 0.9852,
"step": 45000
},
{
"epoch": 0.6,
"learning_rate": 4.6987273714459764e-05,
"loss": 1.023,
"step": 45500
},
{
"epoch": 0.61,
"learning_rate": 4.695416683220108e-05,
"loss": 1.0288,
"step": 46000
},
{
"epoch": 0.62,
"learning_rate": 4.69210599499424e-05,
"loss": 0.9871,
"step": 46500
},
{
"epoch": 0.62,
"learning_rate": 4.688795306768371e-05,
"loss": 0.9374,
"step": 47000
},
{
"epoch": 0.63,
"learning_rate": 4.685484618542503e-05,
"loss": 0.9943,
"step": 47500
},
{
"epoch": 0.64,
"learning_rate": 4.6821739303166346e-05,
"loss": 1.0175,
"step": 48000
},
{
"epoch": 0.64,
"learning_rate": 4.678863242090766e-05,
"loss": 1.0241,
"step": 48500
},
{
"epoch": 0.65,
"learning_rate": 4.675552553864898e-05,
"loss": 0.9686,
"step": 49000
},
{
"epoch": 0.66,
"learning_rate": 4.6722418656390294e-05,
"loss": 0.9604,
"step": 49500
},
{
"epoch": 0.66,
"learning_rate": 4.668931177413161e-05,
"loss": 0.9428,
"step": 50000
},
{
"epoch": 0.67,
"learning_rate": 4.6656204891872927e-05,
"loss": 0.9422,
"step": 50500
},
{
"epoch": 0.68,
"learning_rate": 4.662309800961424e-05,
"loss": 0.9291,
"step": 51000
},
{
"epoch": 0.68,
"learning_rate": 4.658999112735556e-05,
"loss": 0.9709,
"step": 51500
},
{
"epoch": 0.69,
"learning_rate": 4.655688424509687e-05,
"loss": 0.8939,
"step": 52000
},
{
"epoch": 0.7,
"learning_rate": 4.652377736283819e-05,
"loss": 0.9091,
"step": 52500
},
{
"epoch": 0.7,
"learning_rate": 4.649067048057951e-05,
"loss": 0.8881,
"step": 53000
},
{
"epoch": 0.71,
"learning_rate": 4.645756359832082e-05,
"loss": 0.9033,
"step": 53500
},
{
"epoch": 0.72,
"learning_rate": 4.642445671606214e-05,
"loss": 0.9164,
"step": 54000
},
{
"epoch": 0.72,
"learning_rate": 4.6391349833803456e-05,
"loss": 0.8774,
"step": 54500
},
{
"epoch": 0.73,
"learning_rate": 4.6358242951544766e-05,
"loss": 0.8821,
"step": 55000
},
{
"epoch": 0.73,
"learning_rate": 4.632513606928608e-05,
"loss": 0.9005,
"step": 55500
},
{
"epoch": 0.74,
"learning_rate": 4.6292029187027405e-05,
"loss": 0.8919,
"step": 56000
},
{
"epoch": 0.75,
"learning_rate": 4.6258922304768714e-05,
"loss": 0.8867,
"step": 56500
},
{
"epoch": 0.75,
"learning_rate": 4.622581542251003e-05,
"loss": 0.8683,
"step": 57000
},
{
"epoch": 0.76,
"learning_rate": 4.6192708540251354e-05,
"loss": 0.8654,
"step": 57500
},
{
"epoch": 0.77,
"learning_rate": 4.615960165799267e-05,
"loss": 0.8778,
"step": 58000
},
{
"epoch": 0.77,
"learning_rate": 4.612649477573398e-05,
"loss": 0.8472,
"step": 58500
},
{
"epoch": 0.78,
"learning_rate": 4.6093387893475295e-05,
"loss": 0.8738,
"step": 59000
},
{
"epoch": 0.79,
"learning_rate": 4.606028101121662e-05,
"loss": 0.8645,
"step": 59500
},
{
"epoch": 0.79,
"learning_rate": 4.602717412895793e-05,
"loss": 0.8331,
"step": 60000
},
{
"epoch": 0.8,
"learning_rate": 4.5994067246699244e-05,
"loss": 0.8553,
"step": 60500
},
{
"epoch": 0.81,
"learning_rate": 4.596096036444057e-05,
"loss": 0.8358,
"step": 61000
},
{
"epoch": 0.81,
"learning_rate": 4.5927853482181877e-05,
"loss": 0.8024,
"step": 61500
},
{
"epoch": 0.82,
"learning_rate": 4.589474659992319e-05,
"loss": 0.7889,
"step": 62000
},
{
"epoch": 0.83,
"learning_rate": 4.586163971766451e-05,
"loss": 0.8082,
"step": 62500
},
{
"epoch": 0.83,
"learning_rate": 4.5828532835405825e-05,
"loss": 0.8247,
"step": 63000
},
{
"epoch": 0.84,
"learning_rate": 4.579542595314714e-05,
"loss": 0.8027,
"step": 63500
},
{
"epoch": 0.85,
"learning_rate": 4.576231907088846e-05,
"loss": 0.7913,
"step": 64000
},
{
"epoch": 0.85,
"learning_rate": 4.5729212188629774e-05,
"loss": 0.7875,
"step": 64500
},
{
"epoch": 0.86,
"learning_rate": 4.569610530637109e-05,
"loss": 0.7853,
"step": 65000
},
{
"epoch": 0.87,
"learning_rate": 4.5662998424112406e-05,
"loss": 0.809,
"step": 65500
},
{
"epoch": 0.87,
"learning_rate": 4.562989154185372e-05,
"loss": 0.7814,
"step": 66000
},
{
"epoch": 0.88,
"learning_rate": 4.559678465959504e-05,
"loss": 0.7879,
"step": 66500
},
{
"epoch": 0.89,
"learning_rate": 4.5563677777336355e-05,
"loss": 0.7588,
"step": 67000
},
{
"epoch": 0.89,
"learning_rate": 4.553057089507767e-05,
"loss": 0.7654,
"step": 67500
},
{
"epoch": 0.9,
"learning_rate": 4.549746401281899e-05,
"loss": 0.7426,
"step": 68000
},
{
"epoch": 0.91,
"learning_rate": 4.5464357130560304e-05,
"loss": 0.7339,
"step": 68500
},
{
"epoch": 0.91,
"learning_rate": 4.543125024830162e-05,
"loss": 0.7302,
"step": 69000
},
{
"epoch": 0.92,
"learning_rate": 4.5398143366042936e-05,
"loss": 0.7671,
"step": 69500
},
{
"epoch": 0.93,
"learning_rate": 4.536503648378425e-05,
"loss": 0.741,
"step": 70000
},
{
"epoch": 0.93,
"learning_rate": 4.533192960152557e-05,
"loss": 0.7094,
"step": 70500
},
{
"epoch": 0.94,
"learning_rate": 4.5298822719266885e-05,
"loss": 0.6589,
"step": 71000
},
{
"epoch": 0.95,
"learning_rate": 4.52657158370082e-05,
"loss": 0.7308,
"step": 71500
},
{
"epoch": 0.95,
"learning_rate": 4.523260895474952e-05,
"loss": 0.7093,
"step": 72000
},
{
"epoch": 0.96,
"learning_rate": 4.5199502072490827e-05,
"loss": 0.6963,
"step": 72500
},
{
"epoch": 0.97,
"learning_rate": 4.516639519023215e-05,
"loss": 0.6853,
"step": 73000
},
{
"epoch": 0.97,
"learning_rate": 4.5133288307973466e-05,
"loss": 0.7045,
"step": 73500
},
{
"epoch": 0.98,
"learning_rate": 4.5100181425714775e-05,
"loss": 0.6737,
"step": 74000
},
{
"epoch": 0.99,
"learning_rate": 4.50670745434561e-05,
"loss": 0.6671,
"step": 74500
},
{
"epoch": 0.99,
"learning_rate": 4.5033967661197414e-05,
"loss": 0.7017,
"step": 75000
},
{
"epoch": 1.0,
"learning_rate": 4.500086077893873e-05,
"loss": 0.6869,
"step": 75500
},
{
"epoch": 1.01,
"learning_rate": 4.496775389668004e-05,
"loss": 0.6791,
"step": 76000
},
{
"epoch": 1.01,
"learning_rate": 4.493464701442136e-05,
"loss": 0.6988,
"step": 76500
},
{
"epoch": 1.02,
"learning_rate": 4.490154013216268e-05,
"loss": 0.6329,
"step": 77000
},
{
"epoch": 1.03,
"learning_rate": 4.486843324990399e-05,
"loss": 0.6474,
"step": 77500
},
{
"epoch": 1.03,
"learning_rate": 4.483532636764531e-05,
"loss": 0.6792,
"step": 78000
},
{
"epoch": 1.04,
"learning_rate": 4.480221948538663e-05,
"loss": 0.6872,
"step": 78500
},
{
"epoch": 1.05,
"learning_rate": 4.476911260312794e-05,
"loss": 0.6592,
"step": 79000
},
{
"epoch": 1.05,
"learning_rate": 4.4736005720869254e-05,
"loss": 0.6289,
"step": 79500
},
{
"epoch": 1.06,
"learning_rate": 4.4702898838610577e-05,
"loss": 0.6262,
"step": 80000
},
{
"epoch": 1.07,
"learning_rate": 4.4669791956351886e-05,
"loss": 0.6517,
"step": 80500
},
{
"epoch": 1.07,
"learning_rate": 4.46366850740932e-05,
"loss": 0.6525,
"step": 81000
},
{
"epoch": 1.08,
"learning_rate": 4.4603578191834525e-05,
"loss": 0.6421,
"step": 81500
},
{
"epoch": 1.09,
"learning_rate": 4.4570471309575835e-05,
"loss": 0.6109,
"step": 82000
},
{
"epoch": 1.09,
"learning_rate": 4.453736442731715e-05,
"loss": 0.6616,
"step": 82500
},
{
"epoch": 1.1,
"learning_rate": 4.450425754505847e-05,
"loss": 0.6213,
"step": 83000
},
{
"epoch": 1.11,
"learning_rate": 4.447115066279979e-05,
"loss": 0.623,
"step": 83500
},
{
"epoch": 1.11,
"learning_rate": 4.44380437805411e-05,
"loss": 0.6034,
"step": 84000
},
{
"epoch": 1.12,
"learning_rate": 4.4404936898282416e-05,
"loss": 0.589,
"step": 84500
},
{
"epoch": 1.13,
"learning_rate": 4.437183001602374e-05,
"loss": 0.6605,
"step": 85000
},
{
"epoch": 1.13,
"learning_rate": 4.433872313376505e-05,
"loss": 0.6419,
"step": 85500
},
{
"epoch": 1.14,
"learning_rate": 4.4305616251506364e-05,
"loss": 0.636,
"step": 86000
},
{
"epoch": 1.15,
"learning_rate": 4.427250936924768e-05,
"loss": 0.6244,
"step": 86500
},
{
"epoch": 1.15,
"learning_rate": 4.4239402486989e-05,
"loss": 0.62,
"step": 87000
},
{
"epoch": 1.16,
"learning_rate": 4.420629560473031e-05,
"loss": 0.5971,
"step": 87500
},
{
"epoch": 1.17,
"learning_rate": 4.417318872247163e-05,
"loss": 0.6287,
"step": 88000
},
{
"epoch": 1.17,
"learning_rate": 4.4140081840212945e-05,
"loss": 0.5951,
"step": 88500
},
{
"epoch": 1.18,
"learning_rate": 4.410697495795426e-05,
"loss": 0.5648,
"step": 89000
},
{
"epoch": 1.19,
"learning_rate": 4.407386807569558e-05,
"loss": 0.6369,
"step": 89500
},
{
"epoch": 1.19,
"learning_rate": 4.4040761193436894e-05,
"loss": 0.6173,
"step": 90000
},
{
"epoch": 1.2,
"learning_rate": 4.400765431117821e-05,
"loss": 0.5884,
"step": 90500
},
{
"epoch": 1.21,
"learning_rate": 4.3974547428919526e-05,
"loss": 0.5841,
"step": 91000
},
{
"epoch": 1.21,
"learning_rate": 4.3941440546660836e-05,
"loss": 0.5925,
"step": 91500
},
{
"epoch": 1.22,
"learning_rate": 4.390833366440216e-05,
"loss": 0.5721,
"step": 92000
},
{
"epoch": 1.22,
"learning_rate": 4.3875226782143475e-05,
"loss": 0.5779,
"step": 92500
},
{
"epoch": 1.23,
"learning_rate": 4.384211989988479e-05,
"loss": 0.562,
"step": 93000
},
{
"epoch": 1.24,
"learning_rate": 4.380901301762611e-05,
"loss": 0.5614,
"step": 93500
},
{
"epoch": 1.24,
"learning_rate": 4.3775906135367424e-05,
"loss": 0.5836,
"step": 94000
},
{
"epoch": 1.25,
"learning_rate": 4.374279925310874e-05,
"loss": 0.5902,
"step": 94500
},
{
"epoch": 1.26,
"learning_rate": 4.370969237085005e-05,
"loss": 0.5607,
"step": 95000
},
{
"epoch": 1.26,
"learning_rate": 4.367658548859137e-05,
"loss": 0.5557,
"step": 95500
},
{
"epoch": 1.27,
"learning_rate": 4.364347860633269e-05,
"loss": 0.5492,
"step": 96000
},
{
"epoch": 1.28,
"learning_rate": 4.3610371724074e-05,
"loss": 0.5815,
"step": 96500
},
{
"epoch": 1.28,
"learning_rate": 4.357726484181532e-05,
"loss": 0.5609,
"step": 97000
},
{
"epoch": 1.29,
"learning_rate": 4.354415795955664e-05,
"loss": 0.5671,
"step": 97500
},
{
"epoch": 1.3,
"learning_rate": 4.351105107729795e-05,
"loss": 0.5487,
"step": 98000
},
{
"epoch": 1.3,
"learning_rate": 4.347794419503926e-05,
"loss": 0.5282,
"step": 98500
},
{
"epoch": 1.31,
"learning_rate": 4.3444837312780586e-05,
"loss": 0.5501,
"step": 99000
},
{
"epoch": 1.32,
"learning_rate": 4.3411730430521895e-05,
"loss": 0.5601,
"step": 99500
},
{
"epoch": 1.32,
"learning_rate": 4.337862354826321e-05,
"loss": 0.5708,
"step": 100000
},
{
"epoch": 1.33,
"learning_rate": 4.3345516666004535e-05,
"loss": 0.5609,
"step": 100500
},
{
"epoch": 1.34,
"learning_rate": 4.331240978374585e-05,
"loss": 0.5328,
"step": 101000
},
{
"epoch": 1.34,
"learning_rate": 4.327930290148716e-05,
"loss": 0.5491,
"step": 101500
},
{
"epoch": 1.35,
"learning_rate": 4.324619601922848e-05,
"loss": 0.5452,
"step": 102000
},
{
"epoch": 1.36,
"learning_rate": 4.32130891369698e-05,
"loss": 0.5396,
"step": 102500
},
{
"epoch": 1.36,
"learning_rate": 4.317998225471111e-05,
"loss": 0.5432,
"step": 103000
},
{
"epoch": 1.37,
"learning_rate": 4.3146875372452425e-05,
"loss": 0.5524,
"step": 103500
},
{
"epoch": 1.38,
"learning_rate": 4.311376849019375e-05,
"loss": 0.521,
"step": 104000
},
{
"epoch": 1.38,
"learning_rate": 4.308066160793506e-05,
"loss": 0.5557,
"step": 104500
},
{
"epoch": 1.39,
"learning_rate": 4.3047554725676374e-05,
"loss": 0.5342,
"step": 105000
},
{
"epoch": 1.4,
"learning_rate": 4.30144478434177e-05,
"loss": 0.535,
"step": 105500
},
{
"epoch": 1.4,
"learning_rate": 4.2981340961159006e-05,
"loss": 0.5467,
"step": 106000
},
{
"epoch": 1.41,
"learning_rate": 4.294823407890032e-05,
"loss": 0.5082,
"step": 106500
},
{
"epoch": 1.42,
"learning_rate": 4.291512719664164e-05,
"loss": 0.5267,
"step": 107000
},
{
"epoch": 1.42,
"learning_rate": 4.2882020314382955e-05,
"loss": 0.5532,
"step": 107500
},
{
"epoch": 1.43,
"learning_rate": 4.284891343212427e-05,
"loss": 0.5548,
"step": 108000
},
{
"epoch": 1.44,
"learning_rate": 4.281580654986559e-05,
"loss": 0.5538,
"step": 108500
},
{
"epoch": 1.44,
"learning_rate": 4.2782699667606903e-05,
"loss": 0.5501,
"step": 109000
},
{
"epoch": 1.45,
"learning_rate": 4.274959278534822e-05,
"loss": 0.525,
"step": 109500
},
{
"epoch": 1.46,
"learning_rate": 4.2716485903089536e-05,
"loss": 0.5237,
"step": 110000
},
{
"epoch": 1.46,
"learning_rate": 4.268337902083085e-05,
"loss": 0.4852,
"step": 110500
},
{
"epoch": 1.47,
"learning_rate": 4.265027213857217e-05,
"loss": 0.507,
"step": 111000
},
{
"epoch": 1.48,
"learning_rate": 4.2617165256313485e-05,
"loss": 0.5217,
"step": 111500
},
{
"epoch": 1.48,
"learning_rate": 4.25840583740548e-05,
"loss": 0.5229,
"step": 112000
}
],
"logging_steps": 500,
"max_steps": 755130,
"num_train_epochs": 10,
"save_steps": 2000,
"total_flos": 4.16057911148544e+17,
"trial_name": null,
"trial_params": null
}