{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 127.95905310300704,
"eval_steps": 500,
"global_step": 200000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.3198976327575176,
"grad_norm": 4.1601386070251465,
"learning_rate": 5e-06,
"loss": 10.3279,
"step": 500
},
{
"epoch": 0.6397952655150352,
"grad_norm": 4.366061687469482,
"learning_rate": 1e-05,
"loss": 9.3834,
"step": 1000
},
{
"epoch": 0.9596928982725528,
"grad_norm": 4.784337043762207,
"learning_rate": 1.5e-05,
"loss": 8.8888,
"step": 1500
},
{
"epoch": 1.2795905310300704,
"grad_norm": 3.9968652725219727,
"learning_rate": 2e-05,
"loss": 8.6568,
"step": 2000
},
{
"epoch": 1.599488163787588,
"grad_norm": 4.402552127838135,
"learning_rate": 2.5e-05,
"loss": 8.5473,
"step": 2500
},
{
"epoch": 1.9193857965451055,
"grad_norm": 4.639041423797607,
"learning_rate": 3e-05,
"loss": 8.4044,
"step": 3000
},
{
"epoch": 2.239283429302623,
"grad_norm": 5.651747226715088,
"learning_rate": 3.5e-05,
"loss": 8.2868,
"step": 3500
},
{
"epoch": 2.5591810620601407,
"grad_norm": 4.6999359130859375,
"learning_rate": 4e-05,
"loss": 8.1766,
"step": 4000
},
{
"epoch": 2.8790786948176583,
"grad_norm": 4.838181495666504,
"learning_rate": 4.499e-05,
"loss": 8.1118,
"step": 4500
},
{
"epoch": 3.198976327575176,
"grad_norm": 4.238831996917725,
"learning_rate": 4.999e-05,
"loss": 8.0038,
"step": 5000
},
{
"epoch": 3.5188739603326935,
"grad_norm": 4.455530643463135,
"learning_rate": 5.499000000000001e-05,
"loss": 7.9014,
"step": 5500
},
{
"epoch": 3.838771593090211,
"grad_norm": 5.811736583709717,
"learning_rate": 5.999e-05,
"loss": 7.8352,
"step": 6000
},
{
"epoch": 4.158669225847729,
"grad_norm": 4.998301982879639,
"learning_rate": 6.498e-05,
"loss": 7.7613,
"step": 6500
},
{
"epoch": 4.478566858605246,
"grad_norm": 5.011510848999023,
"learning_rate": 6.998e-05,
"loss": 7.6554,
"step": 7000
},
{
"epoch": 4.798464491362764,
"grad_norm": 4.750300884246826,
"learning_rate": 7.498e-05,
"loss": 7.6109,
"step": 7500
},
{
"epoch": 5.1183621241202815,
"grad_norm": 6.24017858505249,
"learning_rate": 7.998e-05,
"loss": 7.5186,
"step": 8000
},
{
"epoch": 5.438259756877799,
"grad_norm": 6.061458587646484,
"learning_rate": 8.497000000000001e-05,
"loss": 7.3966,
"step": 8500
},
{
"epoch": 5.758157389635317,
"grad_norm": 7.151447772979736,
"learning_rate": 8.997000000000001e-05,
"loss": 7.2877,
"step": 9000
},
{
"epoch": 6.078055022392834,
"grad_norm": 7.578985214233398,
"learning_rate": 9.497000000000001e-05,
"loss": 7.1542,
"step": 9500
},
{
"epoch": 6.397952655150352,
"grad_norm": 5.948920726776123,
"learning_rate": 9.997e-05,
"loss": 7.0008,
"step": 10000
},
{
"epoch": 6.717850287907869,
"grad_norm": 8.036959648132324,
"learning_rate": 9.982896551724137e-05,
"loss": 6.8966,
"step": 10500
},
{
"epoch": 7.037747920665387,
"grad_norm": 7.160433292388916,
"learning_rate": 9.965655172413794e-05,
"loss": 6.7509,
"step": 11000
},
{
"epoch": 7.357645553422905,
"grad_norm": 5.934999465942383,
"learning_rate": 9.948413793103449e-05,
"loss": 6.5833,
"step": 11500
},
{
"epoch": 7.677543186180422,
"grad_norm": 7.745622634887695,
"learning_rate": 9.931172413793104e-05,
"loss": 6.4975,
"step": 12000
},
{
"epoch": 7.99744081893794,
"grad_norm": 7.0418477058410645,
"learning_rate": 9.91393103448276e-05,
"loss": 6.4261,
"step": 12500
},
{
"epoch": 8.317338451695457,
"grad_norm": 6.101259708404541,
"learning_rate": 9.896689655172414e-05,
"loss": 6.2092,
"step": 13000
},
{
"epoch": 8.637236084452976,
"grad_norm": 7.289799213409424,
"learning_rate": 9.87944827586207e-05,
"loss": 6.1436,
"step": 13500
},
{
"epoch": 8.957133717210493,
"grad_norm": 8.126811027526855,
"learning_rate": 9.862206896551725e-05,
"loss": 6.0456,
"step": 14000
},
{
"epoch": 9.277031349968011,
"grad_norm": 8.221816062927246,
"learning_rate": 9.845000000000001e-05,
"loss": 5.9141,
"step": 14500
},
{
"epoch": 9.596928982725528,
"grad_norm": 7.361550331115723,
"learning_rate": 9.827793103448277e-05,
"loss": 5.8326,
"step": 15000
},
{
"epoch": 9.916826615483046,
"grad_norm": 7.1737775802612305,
"learning_rate": 9.810551724137932e-05,
"loss": 5.7974,
"step": 15500
},
{
"epoch": 10.236724248240563,
"grad_norm": 9.80185604095459,
"learning_rate": 9.793310344827586e-05,
"loss": 5.6282,
"step": 16000
},
{
"epoch": 10.556621880998081,
"grad_norm": 7.2062153816223145,
"learning_rate": 9.776068965517242e-05,
"loss": 5.5619,
"step": 16500
},
{
"epoch": 10.876519513755598,
"grad_norm": 10.801878929138184,
"learning_rate": 9.758827586206896e-05,
"loss": 5.5155,
"step": 17000
},
{
"epoch": 11.196417146513117,
"grad_norm": 8.48509693145752,
"learning_rate": 9.741586206896553e-05,
"loss": 5.4259,
"step": 17500
},
{
"epoch": 11.516314779270633,
"grad_norm": 8.47572135925293,
"learning_rate": 9.724344827586207e-05,
"loss": 5.3205,
"step": 18000
},
{
"epoch": 11.836212412028152,
"grad_norm": 6.122796535491943,
"learning_rate": 9.707103448275863e-05,
"loss": 5.3025,
"step": 18500
},
{
"epoch": 12.156110044785668,
"grad_norm": 8.210710525512695,
"learning_rate": 9.689896551724139e-05,
"loss": 5.2264,
"step": 19000
},
{
"epoch": 12.476007677543187,
"grad_norm": 7.857537746429443,
"learning_rate": 9.672655172413794e-05,
"loss": 5.1395,
"step": 19500
},
{
"epoch": 12.795905310300704,
"grad_norm": 7.743075370788574,
"learning_rate": 9.655413793103448e-05,
"loss": 5.1109,
"step": 20000
},
{
"epoch": 13.115802943058222,
"grad_norm": 10.574569702148438,
"learning_rate": 9.638172413793104e-05,
"loss": 5.0794,
"step": 20500
},
{
"epoch": 13.435700575815739,
"grad_norm": 8.313858985900879,
"learning_rate": 9.620931034482758e-05,
"loss": 4.921,
"step": 21000
},
{
"epoch": 13.755598208573257,
"grad_norm": 9.096057891845703,
"learning_rate": 9.603689655172414e-05,
"loss": 4.96,
"step": 21500
},
{
"epoch": 14.075495841330774,
"grad_norm": 8.402993202209473,
"learning_rate": 9.58644827586207e-05,
"loss": 4.9062,
"step": 22000
},
{
"epoch": 14.395393474088293,
"grad_norm": 8.110074996948242,
"learning_rate": 9.569206896551725e-05,
"loss": 4.8026,
"step": 22500
},
{
"epoch": 14.71529110684581,
"grad_norm": 7.908292293548584,
"learning_rate": 9.552000000000001e-05,
"loss": 4.82,
"step": 23000
},
{
"epoch": 15.035188739603328,
"grad_norm": 7.991878986358643,
"learning_rate": 9.534758620689655e-05,
"loss": 4.7397,
"step": 23500
},
{
"epoch": 15.355086372360844,
"grad_norm": 8.696029663085938,
"learning_rate": 9.517551724137932e-05,
"loss": 4.6656,
"step": 24000
},
{
"epoch": 15.674984005118363,
"grad_norm": 9.421612739562988,
"learning_rate": 9.500310344827586e-05,
"loss": 4.6412,
"step": 24500
},
{
"epoch": 15.99488163787588,
"grad_norm": 9.747482299804688,
"learning_rate": 9.483068965517242e-05,
"loss": 4.6048,
"step": 25000
},
{
"epoch": 16.314779270633398,
"grad_norm": 10.389492988586426,
"learning_rate": 9.465827586206897e-05,
"loss": 4.481,
"step": 25500
},
{
"epoch": 16.634676903390915,
"grad_norm": 8.661949157714844,
"learning_rate": 9.448586206896553e-05,
"loss": 4.4923,
"step": 26000
},
{
"epoch": 16.95457453614843,
"grad_norm": 12.681297302246094,
"learning_rate": 9.431344827586207e-05,
"loss": 4.4816,
"step": 26500
},
{
"epoch": 17.27447216890595,
"grad_norm": 8.993134498596191,
"learning_rate": 9.414103448275863e-05,
"loss": 4.3512,
"step": 27000
},
{
"epoch": 17.59436980166347,
"grad_norm": 10.020146369934082,
"learning_rate": 9.396862068965517e-05,
"loss": 4.3447,
"step": 27500
},
{
"epoch": 17.914267434420985,
"grad_norm": 9.514701843261719,
"learning_rate": 9.379655172413794e-05,
"loss": 4.3376,
"step": 28000
},
{
"epoch": 18.234165067178502,
"grad_norm": 10.324498176574707,
"learning_rate": 9.362413793103448e-05,
"loss": 4.2612,
"step": 28500
},
{
"epoch": 18.554062699936022,
"grad_norm": 10.682856559753418,
"learning_rate": 9.345172413793104e-05,
"loss": 4.226,
"step": 29000
},
{
"epoch": 18.87396033269354,
"grad_norm": 7.883260726928711,
"learning_rate": 9.327931034482758e-05,
"loss": 4.19,
"step": 29500
},
{
"epoch": 19.193857965451055,
"grad_norm": 12.470623016357422,
"learning_rate": 9.310724137931035e-05,
"loss": 4.1881,
"step": 30000
},
{
"epoch": 19.513755598208572,
"grad_norm": 9.932331085205078,
"learning_rate": 9.29348275862069e-05,
"loss": 4.0853,
"step": 30500
},
{
"epoch": 19.833653230966092,
"grad_norm": 8.153782844543457,
"learning_rate": 9.276241379310345e-05,
"loss": 4.1087,
"step": 31000
},
{
"epoch": 20.15355086372361,
"grad_norm": 8.214093208312988,
"learning_rate": 9.258999999999999e-05,
"loss": 4.0751,
"step": 31500
},
{
"epoch": 20.473448496481126,
"grad_norm": 11.927350044250488,
"learning_rate": 9.241758620689656e-05,
"loss": 3.9686,
"step": 32000
},
{
"epoch": 20.793346129238643,
"grad_norm": 9.67835807800293,
"learning_rate": 9.224551724137932e-05,
"loss": 3.9745,
"step": 32500
},
{
"epoch": 21.113243761996163,
"grad_norm": 9.911735534667969,
"learning_rate": 9.207310344827586e-05,
"loss": 3.9308,
"step": 33000
},
{
"epoch": 21.43314139475368,
"grad_norm": 9.05053424835205,
"learning_rate": 9.190068965517242e-05,
"loss": 3.8718,
"step": 33500
},
{
"epoch": 21.753039027511196,
"grad_norm": 9.588044166564941,
"learning_rate": 9.172827586206897e-05,
"loss": 3.8425,
"step": 34000
},
{
"epoch": 22.072936660268713,
"grad_norm": 8.788230895996094,
"learning_rate": 9.155620689655173e-05,
"loss": 3.8617,
"step": 34500
},
{
"epoch": 22.392834293026233,
"grad_norm": 9.435895919799805,
"learning_rate": 9.138379310344827e-05,
"loss": 3.7524,
"step": 35000
},
{
"epoch": 22.71273192578375,
"grad_norm": 9.870182037353516,
"learning_rate": 9.121137931034483e-05,
"loss": 3.7916,
"step": 35500
},
{
"epoch": 23.032629558541267,
"grad_norm": 9.612881660461426,
"learning_rate": 9.103896551724139e-05,
"loss": 3.8011,
"step": 36000
},
{
"epoch": 23.352527191298783,
"grad_norm": 9.643827438354492,
"learning_rate": 9.086689655172414e-05,
"loss": 3.6478,
"step": 36500
},
{
"epoch": 23.672424824056304,
"grad_norm": 14.105424880981445,
"learning_rate": 9.069448275862069e-05,
"loss": 3.6671,
"step": 37000
},
{
"epoch": 23.99232245681382,
"grad_norm": 10.427962303161621,
"learning_rate": 9.052206896551724e-05,
"loss": 3.6809,
"step": 37500
},
{
"epoch": 24.312220089571337,
"grad_norm": 11.505946159362793,
"learning_rate": 9.03496551724138e-05,
"loss": 3.553,
"step": 38000
},
{
"epoch": 24.632117722328854,
"grad_norm": 10.393635749816895,
"learning_rate": 9.017724137931035e-05,
"loss": 3.5408,
"step": 38500
},
{
"epoch": 24.952015355086374,
"grad_norm": 9.023842811584473,
"learning_rate": 9.00051724137931e-05,
"loss": 3.5915,
"step": 39000
},
{
"epoch": 25.27191298784389,
"grad_norm": 10.69048023223877,
"learning_rate": 8.983275862068967e-05,
"loss": 3.4896,
"step": 39500
},
{
"epoch": 25.591810620601407,
"grad_norm": 10.803936958312988,
"learning_rate": 8.966034482758621e-05,
"loss": 3.4854,
"step": 40000
},
{
"epoch": 25.911708253358924,
"grad_norm": 10.489801406860352,
"learning_rate": 8.948793103448276e-05,
"loss": 3.4871,
"step": 40500
},
{
"epoch": 26.231605886116444,
"grad_norm": 10.558309555053711,
"learning_rate": 8.931586206896552e-05,
"loss": 3.4186,
"step": 41000
},
{
"epoch": 26.55150351887396,
"grad_norm": 12.186748504638672,
"learning_rate": 8.914344827586208e-05,
"loss": 3.4027,
"step": 41500
},
{
"epoch": 26.871401151631478,
"grad_norm": 9.8623046875,
"learning_rate": 8.897103448275862e-05,
"loss": 3.4191,
"step": 42000
},
{
"epoch": 27.191298784388994,
"grad_norm": 11.407792091369629,
"learning_rate": 8.879862068965518e-05,
"loss": 3.341,
"step": 42500
},
{
"epoch": 27.511196417146515,
"grad_norm": 13.37617301940918,
"learning_rate": 8.862655172413794e-05,
"loss": 3.3137,
"step": 43000
},
{
"epoch": 27.83109404990403,
"grad_norm": 10.30826187133789,
"learning_rate": 8.845413793103449e-05,
"loss": 3.3036,
"step": 43500
},
{
"epoch": 28.150991682661548,
"grad_norm": 12.024778366088867,
"learning_rate": 8.828172413793105e-05,
"loss": 3.2678,
"step": 44000
},
{
"epoch": 28.470889315419065,
"grad_norm": 9.730340957641602,
"learning_rate": 8.810931034482759e-05,
"loss": 3.1949,
"step": 44500
},
{
"epoch": 28.790786948176585,
"grad_norm": 9.700602531433105,
"learning_rate": 8.793689655172414e-05,
"loss": 3.2541,
"step": 45000
},
{
"epoch": 29.1106845809341,
"grad_norm": 12.359143257141113,
"learning_rate": 8.77648275862069e-05,
"loss": 3.2456,
"step": 45500
},
{
"epoch": 29.43058221369162,
"grad_norm": 11.989018440246582,
"learning_rate": 8.759241379310346e-05,
"loss": 3.1154,
"step": 46000
},
{
"epoch": 29.750479846449135,
"grad_norm": 10.904190063476562,
"learning_rate": 8.742e-05,
"loss": 3.175,
"step": 46500
},
{
"epoch": 30.070377479206655,
"grad_norm": 11.253949165344238,
"learning_rate": 8.724758620689656e-05,
"loss": 3.1478,
"step": 47000
},
{
"epoch": 30.390275111964172,
"grad_norm": 12.229791641235352,
"learning_rate": 8.707517241379311e-05,
"loss": 3.0632,
"step": 47500
},
{
"epoch": 30.71017274472169,
"grad_norm": 9.516524314880371,
"learning_rate": 8.690275862068967e-05,
"loss": 3.0843,
"step": 48000
},
{
"epoch": 31.030070377479205,
"grad_norm": 13.730731010437012,
"learning_rate": 8.673034482758621e-05,
"loss": 3.098,
"step": 48500
},
{
"epoch": 31.349968010236726,
"grad_norm": 9.73539924621582,
"learning_rate": 8.655827586206897e-05,
"loss": 2.9611,
"step": 49000
},
{
"epoch": 31.669865642994242,
"grad_norm": 12.066815376281738,
"learning_rate": 8.638586206896552e-05,
"loss": 2.9943,
"step": 49500
},
{
"epoch": 31.98976327575176,
"grad_norm": 11.028585433959961,
"learning_rate": 8.621344827586208e-05,
"loss": 3.0424,
"step": 50000
},
{
"epoch": 32.30966090850928,
"grad_norm": 11.2380952835083,
"learning_rate": 8.604103448275862e-05,
"loss": 2.9023,
"step": 50500
},
{
"epoch": 32.629558541266796,
"grad_norm": 9.345772743225098,
"learning_rate": 8.586862068965518e-05,
"loss": 2.9586,
"step": 51000
},
{
"epoch": 32.94945617402431,
"grad_norm": 10.239849090576172,
"learning_rate": 8.569655172413793e-05,
"loss": 2.9461,
"step": 51500
},
{
"epoch": 33.26935380678183,
"grad_norm": 11.058523178100586,
"learning_rate": 8.552413793103449e-05,
"loss": 2.8453,
"step": 52000
},
{
"epoch": 33.589251439539346,
"grad_norm": 12.131317138671875,
"learning_rate": 8.535172413793105e-05,
"loss": 2.8603,
"step": 52500
},
{
"epoch": 33.90914907229686,
"grad_norm": 10.392476081848145,
"learning_rate": 8.517931034482759e-05,
"loss": 2.8817,
"step": 53000
},
{
"epoch": 34.22904670505438,
"grad_norm": 10.749021530151367,
"learning_rate": 8.500724137931036e-05,
"loss": 2.8073,
"step": 53500
},
{
"epoch": 34.5489443378119,
"grad_norm": 12.33171558380127,
"learning_rate": 8.48348275862069e-05,
"loss": 2.7793,
"step": 54000
},
{
"epoch": 34.86884197056942,
"grad_norm": 12.961758613586426,
"learning_rate": 8.466241379310346e-05,
"loss": 2.8066,
"step": 54500
},
{
"epoch": 35.18873960332694,
"grad_norm": 13.320075035095215,
"learning_rate": 8.449e-05,
"loss": 2.7459,
"step": 55000
},
{
"epoch": 35.50863723608445,
"grad_norm": 14.416489601135254,
"learning_rate": 8.431758620689655e-05,
"loss": 2.7321,
"step": 55500
},
{
"epoch": 35.82853486884197,
"grad_norm": 11.203073501586914,
"learning_rate": 8.414551724137931e-05,
"loss": 2.7486,
"step": 56000
},
{
"epoch": 36.14843250159949,
"grad_norm": 10.463476181030273,
"learning_rate": 8.397310344827587e-05,
"loss": 2.7086,
"step": 56500
},
{
"epoch": 36.468330134357004,
"grad_norm": 11.375761985778809,
"learning_rate": 8.380068965517241e-05,
"loss": 2.6387,
"step": 57000
},
{
"epoch": 36.78822776711452,
"grad_norm": 11.649105072021484,
"learning_rate": 8.362827586206897e-05,
"loss": 2.6746,
"step": 57500
},
{
"epoch": 37.108125399872044,
"grad_norm": 12.708244323730469,
"learning_rate": 8.345586206896552e-05,
"loss": 2.6454,
"step": 58000
},
{
"epoch": 37.42802303262956,
"grad_norm": 12.876201629638672,
"learning_rate": 8.328344827586208e-05,
"loss": 2.5798,
"step": 58500
},
{
"epoch": 37.74792066538708,
"grad_norm": 11.92346477508545,
"learning_rate": 8.311103448275862e-05,
"loss": 2.6589,
"step": 59000
},
{
"epoch": 38.067818298144594,
"grad_norm": 10.742238998413086,
"learning_rate": 8.293896551724138e-05,
"loss": 2.5868,
"step": 59500
},
{
"epoch": 38.38771593090211,
"grad_norm": 11.399048805236816,
"learning_rate": 8.276655172413793e-05,
"loss": 2.5124,
"step": 60000
},
{
"epoch": 38.70761356365963,
"grad_norm": 13.563875198364258,
"learning_rate": 8.259413793103449e-05,
"loss": 2.576,
"step": 60500
},
{
"epoch": 39.027511196417144,
"grad_norm": 11.297135353088379,
"learning_rate": 8.242172413793103e-05,
"loss": 2.5671,
"step": 61000
},
{
"epoch": 39.34740882917466,
"grad_norm": 11.336121559143066,
"learning_rate": 8.22496551724138e-05,
"loss": 2.4445,
"step": 61500
},
{
"epoch": 39.667306461932185,
"grad_norm": 9.477692604064941,
"learning_rate": 8.207724137931035e-05,
"loss": 2.4981,
"step": 62000
},
{
"epoch": 39.9872040946897,
"grad_norm": 11.597848892211914,
"learning_rate": 8.19048275862069e-05,
"loss": 2.5382,
"step": 62500
},
{
"epoch": 40.30710172744722,
"grad_norm": 14.910037994384766,
"learning_rate": 8.173241379310346e-05,
"loss": 2.4158,
"step": 63000
},
{
"epoch": 40.626999360204735,
"grad_norm": 11.870673179626465,
"learning_rate": 8.156e-05,
"loss": 2.4395,
"step": 63500
},
{
"epoch": 40.94689699296225,
"grad_norm": 15.279576301574707,
"learning_rate": 8.138758620689655e-05,
"loss": 2.4653,
"step": 64000
},
{
"epoch": 41.26679462571977,
"grad_norm": 11.710406303405762,
"learning_rate": 8.121551724137931e-05,
"loss": 2.3567,
"step": 64500
},
{
"epoch": 41.586692258477285,
"grad_norm": 10.663411140441895,
"learning_rate": 8.104310344827587e-05,
"loss": 2.3655,
"step": 65000
},
{
"epoch": 41.9065898912348,
"grad_norm": 13.946629524230957,
"learning_rate": 8.087068965517241e-05,
"loss": 2.4336,
"step": 65500
},
{
"epoch": 42.226487523992326,
"grad_norm": 13.782262802124023,
"learning_rate": 8.069827586206898e-05,
"loss": 2.3351,
"step": 66000
},
{
"epoch": 42.54638515674984,
"grad_norm": 11.177961349487305,
"learning_rate": 8.052586206896552e-05,
"loss": 2.359,
"step": 66500
},
{
"epoch": 42.86628278950736,
"grad_norm": 15.120301246643066,
"learning_rate": 8.035344827586208e-05,
"loss": 2.355,
"step": 67000
},
{
"epoch": 43.186180422264876,
"grad_norm": 10.805267333984375,
"learning_rate": 8.018103448275862e-05,
"loss": 2.2905,
"step": 67500
},
{
"epoch": 43.50607805502239,
"grad_norm": 11.777176856994629,
"learning_rate": 8.000862068965517e-05,
"loss": 2.2906,
"step": 68000
},
{
"epoch": 43.82597568777991,
"grad_norm": 15.457807540893555,
"learning_rate": 7.983689655172414e-05,
"loss": 2.3269,
"step": 68500
},
{
"epoch": 44.145873320537426,
"grad_norm": 11.639357566833496,
"learning_rate": 7.966448275862069e-05,
"loss": 2.2371,
"step": 69000
},
{
"epoch": 44.46577095329494,
"grad_norm": 11.710591316223145,
"learning_rate": 7.949206896551725e-05,
"loss": 2.2248,
"step": 69500
},
{
"epoch": 44.785668586052466,
"grad_norm": 12.675103187561035,
"learning_rate": 7.93196551724138e-05,
"loss": 2.2586,
"step": 70000
},
{
"epoch": 45.10556621880998,
"grad_norm": 12.752120971679688,
"learning_rate": 7.914724137931034e-05,
"loss": 2.2489,
"step": 70500
},
{
"epoch": 45.4254638515675,
"grad_norm": 11.379339218139648,
"learning_rate": 7.89751724137931e-05,
"loss": 2.1774,
"step": 71000
},
{
"epoch": 45.74536148432502,
"grad_norm": 12.76633358001709,
"learning_rate": 7.880275862068966e-05,
"loss": 2.2007,
"step": 71500
},
{
"epoch": 46.06525911708253,
"grad_norm": 11.421367645263672,
"learning_rate": 7.863034482758621e-05,
"loss": 2.2262,
"step": 72000
},
{
"epoch": 46.38515674984005,
"grad_norm": 14.81748104095459,
"learning_rate": 7.845793103448277e-05,
"loss": 2.101,
"step": 72500
},
{
"epoch": 46.70505438259757,
"grad_norm": 12.902971267700195,
"learning_rate": 7.828551724137931e-05,
"loss": 2.1568,
"step": 73000
},
{
"epoch": 47.02495201535508,
"grad_norm": 10.685113906860352,
"learning_rate": 7.811310344827587e-05,
"loss": 2.1655,
"step": 73500
},
{
"epoch": 47.34484964811261,
"grad_norm": 15.892518043518066,
"learning_rate": 7.794068965517242e-05,
"loss": 2.0551,
"step": 74000
},
{
"epoch": 47.664747280870124,
"grad_norm": 13.730358123779297,
"learning_rate": 7.776862068965518e-05,
"loss": 2.1053,
"step": 74500
},
{
"epoch": 47.98464491362764,
"grad_norm": 13.635787963867188,
"learning_rate": 7.759620689655172e-05,
"loss": 2.1408,
"step": 75000
},
{
"epoch": 48.30454254638516,
"grad_norm": 12.861611366271973,
"learning_rate": 7.742379310344828e-05,
"loss": 2.0104,
"step": 75500
},
{
"epoch": 48.624440179142674,
"grad_norm": 11.84931468963623,
"learning_rate": 7.725137931034483e-05,
"loss": 2.0555,
"step": 76000
},
{
"epoch": 48.94433781190019,
"grad_norm": 15.812765121459961,
"learning_rate": 7.70793103448276e-05,
"loss": 2.1087,
"step": 76500
},
{
"epoch": 49.26423544465771,
"grad_norm": 14.233431816101074,
"learning_rate": 7.690689655172414e-05,
"loss": 1.9998,
"step": 77000
},
{
"epoch": 49.584133077415224,
"grad_norm": 14.329803466796875,
"learning_rate": 7.673448275862069e-05,
"loss": 2.0189,
"step": 77500
},
{
"epoch": 49.90403071017275,
"grad_norm": 11.00400161743164,
"learning_rate": 7.656206896551725e-05,
"loss": 2.059,
"step": 78000
},
{
"epoch": 50.223928342930265,
"grad_norm": 13.582133293151855,
"learning_rate": 7.63896551724138e-05,
"loss": 1.9753,
"step": 78500
},
{
"epoch": 50.54382597568778,
"grad_norm": 12.560907363891602,
"learning_rate": 7.621724137931034e-05,
"loss": 1.9759,
"step": 79000
},
{
"epoch": 50.8637236084453,
"grad_norm": 12.169915199279785,
"learning_rate": 7.60451724137931e-05,
"loss": 1.9944,
"step": 79500
},
{
"epoch": 51.183621241202815,
"grad_norm": 13.5604248046875,
"learning_rate": 7.587275862068966e-05,
"loss": 1.9323,
"step": 80000
},
{
"epoch": 51.50351887396033,
"grad_norm": 15.892741203308105,
"learning_rate": 7.570034482758621e-05,
"loss": 1.9095,
"step": 80500
},
{
"epoch": 51.82341650671785,
"grad_norm": 13.435209274291992,
"learning_rate": 7.552793103448276e-05,
"loss": 1.9508,
"step": 81000
},
{
"epoch": 52.143314139475365,
"grad_norm": 11.180010795593262,
"learning_rate": 7.535586206896551e-05,
"loss": 1.9216,
"step": 81500
},
{
"epoch": 52.46321177223289,
"grad_norm": 12.792661666870117,
"learning_rate": 7.518344827586207e-05,
"loss": 1.8817,
"step": 82000
},
{
"epoch": 52.783109404990405,
"grad_norm": 11.785886764526367,
"learning_rate": 7.501103448275863e-05,
"loss": 1.9121,
"step": 82500
},
{
"epoch": 53.10300703774792,
"grad_norm": 10.568120002746582,
"learning_rate": 7.483862068965518e-05,
"loss": 1.8885,
"step": 83000
},
{
"epoch": 53.42290467050544,
"grad_norm": 14.641459465026855,
"learning_rate": 7.466620689655172e-05,
"loss": 1.8357,
"step": 83500
},
{
"epoch": 53.742802303262955,
"grad_norm": 13.5363187789917,
"learning_rate": 7.449379310344828e-05,
"loss": 1.8515,
"step": 84000
},
{
"epoch": 54.06269993602047,
"grad_norm": 12.997908592224121,
"learning_rate": 7.432172413793104e-05,
"loss": 1.8681,
"step": 84500
},
{
"epoch": 54.38259756877799,
"grad_norm": 12.53503131866455,
"learning_rate": 7.414931034482759e-05,
"loss": 1.7785,
"step": 85000
},
{
"epoch": 54.702495201535505,
"grad_norm": 11.986194610595703,
"learning_rate": 7.397689655172413e-05,
"loss": 1.8507,
"step": 85500
},
{
"epoch": 55.02239283429303,
"grad_norm": 12.089723587036133,
"learning_rate": 7.380448275862069e-05,
"loss": 1.8537,
"step": 86000
},
{
"epoch": 55.342290467050546,
"grad_norm": 13.552453994750977,
"learning_rate": 7.363206896551725e-05,
"loss": 1.7622,
"step": 86500
},
{
"epoch": 55.66218809980806,
"grad_norm": 12.03878116607666,
"learning_rate": 7.346e-05,
"loss": 1.8076,
"step": 87000
},
{
"epoch": 55.98208573256558,
"grad_norm": 11.187782287597656,
"learning_rate": 7.328758620689655e-05,
"loss": 1.8241,
"step": 87500
},
{
"epoch": 56.301983365323096,
"grad_norm": 14.924737930297852,
"learning_rate": 7.311517241379312e-05,
"loss": 1.7077,
"step": 88000
},
{
"epoch": 56.62188099808061,
"grad_norm": 12.302467346191406,
"learning_rate": 7.294275862068966e-05,
"loss": 1.74,
"step": 88500
},
{
"epoch": 56.94177863083813,
"grad_norm": 10.834394454956055,
"learning_rate": 7.277034482758621e-05,
"loss": 1.7827,
"step": 89000
},
{
"epoch": 57.261676263595646,
"grad_norm": 14.356012344360352,
"learning_rate": 7.259793103448276e-05,
"loss": 1.7082,
"step": 89500
},
{
"epoch": 57.58157389635317,
"grad_norm": 14.632678031921387,
"learning_rate": 7.242551724137931e-05,
"loss": 1.7045,
"step": 90000
},
{
"epoch": 57.90147152911069,
"grad_norm": 13.501043319702148,
"learning_rate": 7.225344827586207e-05,
"loss": 1.7522,
"step": 90500
},
{
"epoch": 58.2213691618682,
"grad_norm": 18.839614868164062,
"learning_rate": 7.208103448275862e-05,
"loss": 1.6801,
"step": 91000
},
{
"epoch": 58.54126679462572,
"grad_norm": 13.41618824005127,
"learning_rate": 7.190862068965517e-05,
"loss": 1.7005,
"step": 91500
},
{
"epoch": 58.86116442738324,
"grad_norm": 12.56169605255127,
"learning_rate": 7.173620689655172e-05,
"loss": 1.7018,
"step": 92000
},
{
"epoch": 59.18106206014075,
"grad_norm": 13.447467803955078,
"learning_rate": 7.15641379310345e-05,
"loss": 1.6691,
"step": 92500
},
{
"epoch": 59.50095969289827,
"grad_norm": 12.452493667602539,
"learning_rate": 7.139172413793104e-05,
"loss": 1.651,
"step": 93000
},
{
"epoch": 59.82085732565579,
"grad_norm": 10.552214622497559,
"learning_rate": 7.121931034482759e-05,
"loss": 1.6916,
"step": 93500
},
{
"epoch": 60.14075495841331,
"grad_norm": 11.099422454833984,
"learning_rate": 7.104689655172413e-05,
"loss": 1.6716,
"step": 94000
},
{
"epoch": 60.46065259117083,
"grad_norm": 13.663276672363281,
"learning_rate": 7.08744827586207e-05,
"loss": 1.6105,
"step": 94500
},
{
"epoch": 60.780550223928344,
"grad_norm": 11.783934593200684,
"learning_rate": 7.070206896551725e-05,
"loss": 1.6399,
"step": 95000
},
{
"epoch": 61.10044785668586,
"grad_norm": 12.881340026855469,
"learning_rate": 7.053e-05,
"loss": 1.6058,
"step": 95500
},
{
"epoch": 61.42034548944338,
"grad_norm": 12.405476570129395,
"learning_rate": 7.035758620689656e-05,
"loss": 1.5703,
"step": 96000
},
{
"epoch": 61.740243122200894,
"grad_norm": 11.660452842712402,
"learning_rate": 7.018517241379311e-05,
"loss": 1.6002,
"step": 96500
},
{
"epoch": 62.06014075495841,
"grad_norm": 11.69723892211914,
"learning_rate": 7.001275862068966e-05,
"loss": 1.6186,
"step": 97000
},
{
"epoch": 62.38003838771593,
"grad_norm": 16.210947036743164,
"learning_rate": 6.984034482758621e-05,
"loss": 1.5663,
"step": 97500
},
{
"epoch": 62.69993602047345,
"grad_norm": 11.853803634643555,
"learning_rate": 6.966827586206897e-05,
"loss": 1.5744,
"step": 98000
},
{
"epoch": 63.01983365323097,
"grad_norm": 10.565818786621094,
"learning_rate": 6.949586206896553e-05,
"loss": 1.5829,
"step": 98500
},
{
"epoch": 63.339731285988485,
"grad_norm": 11.621013641357422,
"learning_rate": 6.932344827586207e-05,
"loss": 1.5213,
"step": 99000
},
{
"epoch": 63.659628918746,
"grad_norm": 10.182308197021484,
"learning_rate": 6.915103448275862e-05,
"loss": 1.5291,
"step": 99500
},
{
"epoch": 63.97952655150352,
"grad_norm": 14.434243202209473,
"learning_rate": 6.897862068965517e-05,
"loss": 1.5612,
"step": 100000
},
{
"epoch": 64.29942418426104,
"grad_norm": 12.513864517211914,
"learning_rate": 6.880655172413794e-05,
"loss": 1.5041,
"step": 100500
},
{
"epoch": 64.61932181701856,
"grad_norm": 13.189037322998047,
"learning_rate": 6.863413793103448e-05,
"loss": 1.5097,
"step": 101000
},
{
"epoch": 64.93921944977608,
"grad_norm": 11.867232322692871,
"learning_rate": 6.846172413793104e-05,
"loss": 1.5401,
"step": 101500
},
{
"epoch": 65.25911708253359,
"grad_norm": 13.00894832611084,
"learning_rate": 6.828931034482758e-05,
"loss": 1.4581,
"step": 102000
},
{
"epoch": 65.57901471529111,
"grad_norm": 11.719345092773438,
"learning_rate": 6.811689655172415e-05,
"loss": 1.4807,
"step": 102500
},
{
"epoch": 65.89891234804863,
"grad_norm": 11.355063438415527,
"learning_rate": 6.79444827586207e-05,
"loss": 1.4892,
"step": 103000
},
{
"epoch": 66.21880998080614,
"grad_norm": 13.351948738098145,
"learning_rate": 6.777241379310345e-05,
"loss": 1.4636,
"step": 103500
},
{
"epoch": 66.53870761356366,
"grad_norm": 15.406342506408691,
"learning_rate": 6.76e-05,
"loss": 1.4678,
"step": 104000
},
{
"epoch": 66.85860524632118,
"grad_norm": 14.357329368591309,
"learning_rate": 6.742758620689656e-05,
"loss": 1.4936,
"step": 104500
},
{
"epoch": 67.17850287907869,
"grad_norm": 12.275686264038086,
"learning_rate": 6.725517241379311e-05,
"loss": 1.4566,
"step": 105000
},
{
"epoch": 67.49840051183621,
"grad_norm": 13.198380470275879,
"learning_rate": 6.708310344827586e-05,
"loss": 1.4326,
"step": 105500
},
{
"epoch": 67.81829814459373,
"grad_norm": 13.365631103515625,
"learning_rate": 6.691068965517242e-05,
"loss": 1.426,
"step": 106000
},
{
"epoch": 68.13819577735124,
"grad_norm": 14.106985092163086,
"learning_rate": 6.673827586206897e-05,
"loss": 1.4289,
"step": 106500
},
{
"epoch": 68.45809341010876,
"grad_norm": 10.076281547546387,
"learning_rate": 6.656586206896553e-05,
"loss": 1.396,
"step": 107000
},
{
"epoch": 68.77799104286628,
"grad_norm": 14.63807201385498,
"learning_rate": 6.639344827586207e-05,
"loss": 1.3979,
"step": 107500
},
{
"epoch": 69.0978886756238,
"grad_norm": 13.643959045410156,
"learning_rate": 6.622137931034483e-05,
"loss": 1.4274,
"step": 108000
},
{
"epoch": 69.41778630838132,
"grad_norm": 11.819470405578613,
"learning_rate": 6.604896551724138e-05,
"loss": 1.3769,
"step": 108500
},
{
"epoch": 69.73768394113884,
"grad_norm": 14.33261775970459,
"learning_rate": 6.587655172413794e-05,
"loss": 1.3825,
"step": 109000
},
{
"epoch": 70.05758157389636,
"grad_norm": 10.918536186218262,
"learning_rate": 6.570413793103448e-05,
"loss": 1.3913,
"step": 109500
},
{
"epoch": 70.37747920665387,
"grad_norm": 13.519926071166992,
"learning_rate": 6.553172413793104e-05,
"loss": 1.3341,
"step": 110000
},
{
"epoch": 70.69737683941139,
"grad_norm": 12.5425386428833,
"learning_rate": 6.535931034482759e-05,
"loss": 1.3828,
"step": 110500
},
{
"epoch": 71.0172744721689,
"grad_norm": 11.435805320739746,
"learning_rate": 6.518724137931035e-05,
"loss": 1.3821,
"step": 111000
},
{
"epoch": 71.33717210492642,
"grad_norm": 12.65505313873291,
"learning_rate": 6.501482758620689e-05,
"loss": 1.3083,
"step": 111500
},
{
"epoch": 71.65706973768394,
"grad_norm": 15.489115715026855,
"learning_rate": 6.484241379310345e-05,
"loss": 1.341,
"step": 112000
},
{
"epoch": 71.97696737044146,
"grad_norm": 14.14395809173584,
"learning_rate": 6.467e-05,
"loss": 1.3579,
"step": 112500
},
{
"epoch": 72.29686500319897,
"grad_norm": 13.708014488220215,
"learning_rate": 6.449758620689656e-05,
"loss": 1.3032,
"step": 113000
},
{
"epoch": 72.61676263595649,
"grad_norm": 10.75635814666748,
"learning_rate": 6.432551724137932e-05,
"loss": 1.3045,
"step": 113500
},
{
"epoch": 72.93666026871401,
"grad_norm": 12.12192440032959,
"learning_rate": 6.415310344827586e-05,
"loss": 1.3248,
"step": 114000
},
{
"epoch": 73.25655790147152,
"grad_norm": 13.368456840515137,
"learning_rate": 6.398068965517241e-05,
"loss": 1.287,
"step": 114500
},
{
"epoch": 73.57645553422904,
"grad_norm": 12.584633827209473,
"learning_rate": 6.380827586206897e-05,
"loss": 1.3015,
"step": 115000
},
{
"epoch": 73.89635316698656,
"grad_norm": 13.863194465637207,
"learning_rate": 6.363620689655173e-05,
"loss": 1.2898,
"step": 115500
},
{
"epoch": 74.21625079974409,
"grad_norm": 12.937112808227539,
"learning_rate": 6.346379310344827e-05,
"loss": 1.2456,
"step": 116000
},
{
"epoch": 74.5361484325016,
"grad_norm": 11.274981498718262,
"learning_rate": 6.329137931034484e-05,
"loss": 1.2689,
"step": 116500
},
{
"epoch": 74.85604606525912,
"grad_norm": 14.425061225891113,
"learning_rate": 6.311896551724138e-05,
"loss": 1.2752,
"step": 117000
},
{
"epoch": 75.17594369801664,
"grad_norm": 11.654635429382324,
"learning_rate": 6.294655172413794e-05,
"loss": 1.2442,
"step": 117500
},
{
"epoch": 75.49584133077416,
"grad_norm": 10.00129222869873,
"learning_rate": 6.277413793103448e-05,
"loss": 1.2506,
"step": 118000
},
{
"epoch": 75.81573896353167,
"grad_norm": 11.665295600891113,
"learning_rate": 6.260206896551725e-05,
"loss": 1.2541,
"step": 118500
},
{
"epoch": 76.13563659628919,
"grad_norm": 10.555766105651855,
"learning_rate": 6.24296551724138e-05,
"loss": 1.2486,
"step": 119000
},
{
"epoch": 76.4555342290467,
"grad_norm": 14.879280090332031,
"learning_rate": 6.225724137931035e-05,
"loss": 1.2124,
"step": 119500
},
{
"epoch": 76.77543186180422,
"grad_norm": 15.131136894226074,
"learning_rate": 6.208482758620689e-05,
"loss": 1.2549,
"step": 120000
},
{
"epoch": 77.09532949456174,
"grad_norm": 9.889472961425781,
"learning_rate": 6.191241379310345e-05,
"loss": 1.2376,
"step": 120500
},
{
"epoch": 77.41522712731926,
"grad_norm": 11.307145118713379,
"learning_rate": 6.174e-05,
"loss": 1.1958,
"step": 121000
},
{
"epoch": 77.73512476007677,
"grad_norm": 14.303799629211426,
"learning_rate": 6.156758620689656e-05,
"loss": 1.2009,
"step": 121500
},
{
"epoch": 78.05502239283429,
"grad_norm": 11.318217277526855,
"learning_rate": 6.139517241379311e-05,
"loss": 1.2215,
"step": 122000
},
{
"epoch": 78.3749200255918,
"grad_norm": 13.979291915893555,
"learning_rate": 6.122310344827586e-05,
"loss": 1.1713,
"step": 122500
},
{
"epoch": 78.69481765834932,
"grad_norm": 12.78084945678711,
"learning_rate": 6.105103448275863e-05,
"loss": 1.1901,
"step": 123000
},
{
"epoch": 79.01471529110684,
"grad_norm": 10.332459449768066,
"learning_rate": 6.087862068965517e-05,
"loss": 1.2141,
"step": 123500
},
{
"epoch": 79.33461292386437,
"grad_norm": 11.179670333862305,
"learning_rate": 6.0706206896551735e-05,
"loss": 1.1641,
"step": 124000
},
{
"epoch": 79.65451055662189,
"grad_norm": 12.706995964050293,
"learning_rate": 6.053379310344828e-05,
"loss": 1.1887,
"step": 124500
},
{
"epoch": 79.9744081893794,
"grad_norm": 12.575511932373047,
"learning_rate": 6.036137931034483e-05,
"loss": 1.1994,
"step": 125000
},
{
"epoch": 80.29430582213692,
"grad_norm": 11.299592971801758,
"learning_rate": 6.0189310344827584e-05,
"loss": 1.1535,
"step": 125500
},
{
"epoch": 80.61420345489444,
"grad_norm": 9.741961479187012,
"learning_rate": 6.0016896551724147e-05,
"loss": 1.1508,
"step": 126000
},
{
"epoch": 80.93410108765195,
"grad_norm": 13.86517333984375,
"learning_rate": 5.984448275862069e-05,
"loss": 1.1453,
"step": 126500
},
{
"epoch": 81.25399872040947,
"grad_norm": 12.810471534729004,
"learning_rate": 5.9672068965517244e-05,
"loss": 1.131,
"step": 127000
},
{
"epoch": 81.57389635316699,
"grad_norm": 11.828211784362793,
"learning_rate": 5.949965517241379e-05,
"loss": 1.1462,
"step": 127500
},
{
"epoch": 81.8937939859245,
"grad_norm": 13.588178634643555,
"learning_rate": 5.932724137931035e-05,
"loss": 1.1519,
"step": 128000
},
{
"epoch": 82.21369161868202,
"grad_norm": 13.903426170349121,
"learning_rate": 5.91548275862069e-05,
"loss": 1.1222,
"step": 128500
},
{
"epoch": 82.53358925143954,
"grad_norm": 13.447443962097168,
"learning_rate": 5.898275862068966e-05,
"loss": 1.1173,
"step": 129000
},
{
"epoch": 82.85348688419705,
"grad_norm": 12.132195472717285,
"learning_rate": 5.8810344827586205e-05,
"loss": 1.1262,
"step": 129500
},
{
"epoch": 83.17338451695457,
"grad_norm": 11.170686721801758,
"learning_rate": 5.863793103448276e-05,
"loss": 1.0957,
"step": 130000
},
{
"epoch": 83.49328214971209,
"grad_norm": 12.57539176940918,
"learning_rate": 5.846551724137931e-05,
"loss": 1.0862,
"step": 130500
},
{
"epoch": 83.8131797824696,
"grad_norm": 14.212547302246094,
"learning_rate": 5.8293448275862074e-05,
"loss": 1.0929,
"step": 131000
},
{
"epoch": 84.13307741522712,
"grad_norm": 14.803600311279297,
"learning_rate": 5.8121034482758616e-05,
"loss": 1.0948,
"step": 131500
},
{
"epoch": 84.45297504798465,
"grad_norm": 19.55899429321289,
"learning_rate": 5.794862068965518e-05,
"loss": 1.0788,
"step": 132000
},
{
"epoch": 84.77287268074217,
"grad_norm": 11.086203575134277,
"learning_rate": 5.7776206896551734e-05,
"loss": 1.098,
"step": 132500
},
{
"epoch": 85.09277031349968,
"grad_norm": 10.74999713897705,
"learning_rate": 5.7603793103448276e-05,
"loss": 1.0649,
"step": 133000
},
{
"epoch": 85.4126679462572,
"grad_norm": 14.409449577331543,
"learning_rate": 5.743137931034484e-05,
"loss": 1.0592,
"step": 133500
},
{
"epoch": 85.73256557901472,
"grad_norm": 10.215742111206055,
"learning_rate": 5.725931034482759e-05,
"loss": 1.0765,
"step": 134000
},
{
"epoch": 86.05246321177223,
"grad_norm": 12.911944389343262,
"learning_rate": 5.7086896551724146e-05,
"loss": 1.0504,
"step": 134500
},
{
"epoch": 86.37236084452975,
"grad_norm": 14.987035751342773,
"learning_rate": 5.691448275862069e-05,
"loss": 1.0141,
"step": 135000
},
{
"epoch": 86.69225847728727,
"grad_norm": 11.989995002746582,
"learning_rate": 5.674206896551725e-05,
"loss": 1.0431,
"step": 135500
},
{
"epoch": 87.01215611004478,
"grad_norm": 12.771849632263184,
"learning_rate": 5.657e-05,
"loss": 1.054,
"step": 136000
},
{
"epoch": 87.3320537428023,
"grad_norm": 13.398333549499512,
"learning_rate": 5.639758620689656e-05,
"loss": 0.9984,
"step": 136500
},
{
"epoch": 87.65195137555982,
"grad_norm": 10.814030647277832,
"learning_rate": 5.6225172413793106e-05,
"loss": 1.0283,
"step": 137000
},
{
"epoch": 87.97184900831734,
"grad_norm": 12.13095760345459,
"learning_rate": 5.605275862068966e-05,
"loss": 1.0414,
"step": 137500
},
{
"epoch": 88.29174664107485,
"grad_norm": 12.733049392700195,
"learning_rate": 5.5880344827586204e-05,
"loss": 1.0087,
"step": 138000
},
{
"epoch": 88.61164427383237,
"grad_norm": 16.555213928222656,
"learning_rate": 5.570827586206897e-05,
"loss": 1.0062,
"step": 138500
},
{
"epoch": 88.93154190658989,
"grad_norm": 11.025595664978027,
"learning_rate": 5.553586206896552e-05,
"loss": 1.0212,
"step": 139000
},
{
"epoch": 89.2514395393474,
"grad_norm": 9.93308162689209,
"learning_rate": 5.5363448275862074e-05,
"loss": 0.998,
"step": 139500
},
{
"epoch": 89.57133717210493,
"grad_norm": 14.131500244140625,
"learning_rate": 5.519103448275862e-05,
"loss": 0.9797,
"step": 140000
},
{
"epoch": 89.89123480486245,
"grad_norm": 13.041298866271973,
"learning_rate": 5.501862068965518e-05,
"loss": 1.0205,
"step": 140500
},
{
"epoch": 90.21113243761997,
"grad_norm": 10.885424613952637,
"learning_rate": 5.484655172413793e-05,
"loss": 0.9763,
"step": 141000
},
{
"epoch": 90.53103007037748,
"grad_norm": 17.75884437561035,
"learning_rate": 5.4674137931034485e-05,
"loss": 0.9791,
"step": 141500
},
{
"epoch": 90.850927703135,
"grad_norm": 15.903059005737305,
"learning_rate": 5.4501724137931034e-05,
"loss": 0.9823,
"step": 142000
},
{
"epoch": 91.17082533589252,
"grad_norm": 12.409110069274902,
"learning_rate": 5.432931034482759e-05,
"loss": 0.9732,
"step": 142500
},
{
"epoch": 91.49072296865003,
"grad_norm": 14.427364349365234,
"learning_rate": 5.4157241379310355e-05,
"loss": 0.9594,
"step": 143000
},
{
"epoch": 91.81062060140755,
"grad_norm": 10.96267032623291,
"learning_rate": 5.39848275862069e-05,
"loss": 0.9852,
"step": 143500
},
{
"epoch": 92.13051823416507,
"grad_norm": 9.344204902648926,
"learning_rate": 5.381241379310345e-05,
"loss": 0.9776,
"step": 144000
},
{
"epoch": 92.45041586692258,
"grad_norm": 13.800095558166504,
"learning_rate": 5.364e-05,
"loss": 0.9477,
"step": 144500
},
{
"epoch": 92.7703134996801,
"grad_norm": 14.4652099609375,
"learning_rate": 5.346758620689656e-05,
"loss": 0.9521,
"step": 145000
},
{
"epoch": 93.09021113243762,
"grad_norm": 10.197824478149414,
"learning_rate": 5.3295172413793106e-05,
"loss": 0.9508,
"step": 145500
},
{
"epoch": 93.41010876519513,
"grad_norm": 14.248830795288086,
"learning_rate": 5.312310344827587e-05,
"loss": 0.9298,
"step": 146000
},
{
"epoch": 93.73000639795265,
"grad_norm": 15.136180877685547,
"learning_rate": 5.295068965517241e-05,
"loss": 0.9363,
"step": 146500
},
{
"epoch": 94.04990403071017,
"grad_norm": 14.999555587768555,
"learning_rate": 5.277862068965518e-05,
"loss": 0.9574,
"step": 147000
},
{
"epoch": 94.36980166346768,
"grad_norm": 10.511527061462402,
"learning_rate": 5.260620689655172e-05,
"loss": 0.9429,
"step": 147500
},
{
"epoch": 94.68969929622521,
"grad_norm": 12.433847427368164,
"learning_rate": 5.243379310344828e-05,
"loss": 0.9327,
"step": 148000
},
{
"epoch": 95.00959692898273,
"grad_norm": 11.800546646118164,
"learning_rate": 5.2261379310344825e-05,
"loss": 0.9363,
"step": 148500
},
{
"epoch": 95.32949456174025,
"grad_norm": 11.03012466430664,
"learning_rate": 5.208896551724138e-05,
"loss": 0.9172,
"step": 149000
},
{
"epoch": 95.64939219449776,
"grad_norm": 13.628169059753418,
"learning_rate": 5.191655172413793e-05,
"loss": 0.9101,
"step": 149500
},
{
"epoch": 95.96928982725528,
"grad_norm": 11.726004600524902,
"learning_rate": 5.1744137931034485e-05,
"loss": 0.9292,
"step": 150000
},
{
"epoch": 96.2891874600128,
"grad_norm": 9.179962158203125,
"learning_rate": 5.1571724137931033e-05,
"loss": 0.8977,
"step": 150500
},
{
"epoch": 96.60908509277031,
"grad_norm": 11.146485328674316,
"learning_rate": 5.139931034482759e-05,
"loss": 0.9037,
"step": 151000
},
{
"epoch": 96.92898272552783,
"grad_norm": 14.070140838623047,
"learning_rate": 5.122724137931034e-05,
"loss": 0.9075,
"step": 151500
},
{
"epoch": 97.24888035828535,
"grad_norm": 12.702670097351074,
"learning_rate": 5.1054827586206897e-05,
"loss": 0.8721,
"step": 152000
},
{
"epoch": 97.56877799104286,
"grad_norm": 11.813859939575195,
"learning_rate": 5.088241379310346e-05,
"loss": 0.8876,
"step": 152500
},
{
"epoch": 97.88867562380038,
"grad_norm": 14.402729034423828,
"learning_rate": 5.071e-05,
"loss": 0.8986,
"step": 153000
},
{
"epoch": 98.2085732565579,
"grad_norm": 11.126707077026367,
"learning_rate": 5.0537586206896556e-05,
"loss": 0.8569,
"step": 153500
},
{
"epoch": 98.52847088931541,
"grad_norm": 13.64499282836914,
"learning_rate": 5.0365172413793105e-05,
"loss": 0.8774,
"step": 154000
},
{
"epoch": 98.84836852207293,
"grad_norm": 13.022969245910645,
"learning_rate": 5.019275862068966e-05,
"loss": 0.8801,
"step": 154500
},
{
"epoch": 99.16826615483045,
"grad_norm": 12.945636749267578,
"learning_rate": 5.002034482758621e-05,
"loss": 0.8597,
"step": 155000
},
{
"epoch": 99.48816378758798,
"grad_norm": 12.05784797668457,
"learning_rate": 4.984827586206897e-05,
"loss": 0.8477,
"step": 155500
},
{
"epoch": 99.8080614203455,
"grad_norm": 11.149604797363281,
"learning_rate": 4.967586206896552e-05,
"loss": 0.8533,
"step": 156000
},
{
"epoch": 100.12795905310301,
"grad_norm": 12.993003845214844,
"learning_rate": 4.950344827586207e-05,
"loss": 0.8722,
"step": 156500
},
{
"epoch": 100.44785668586053,
"grad_norm": 12.33105754852295,
"learning_rate": 4.933103448275863e-05,
"loss": 0.8397,
"step": 157000
},
{
"epoch": 100.76775431861805,
"grad_norm": 12.321619987487793,
"learning_rate": 4.9158965517241387e-05,
"loss": 0.8548,
"step": 157500
},
{
"epoch": 101.08765195137556,
"grad_norm": 13.47990894317627,
"learning_rate": 4.8986896551724145e-05,
"loss": 0.8493,
"step": 158000
},
{
"epoch": 101.40754958413308,
"grad_norm": 10.382761001586914,
"learning_rate": 4.8814482758620694e-05,
"loss": 0.8434,
"step": 158500
},
{
"epoch": 101.7274472168906,
"grad_norm": 8.643112182617188,
"learning_rate": 4.864206896551724e-05,
"loss": 0.8408,
"step": 159000
},
{
"epoch": 102.04734484964811,
"grad_norm": 10.81409740447998,
"learning_rate": 4.84696551724138e-05,
"loss": 0.8415,
"step": 159500
},
{
"epoch": 102.36724248240563,
"grad_norm": 11.696605682373047,
"learning_rate": 4.829758620689656e-05,
"loss": 0.8305,
"step": 160000
},
{
"epoch": 102.68714011516315,
"grad_norm": 13.345202445983887,
"learning_rate": 4.8125172413793106e-05,
"loss": 0.8271,
"step": 160500
},
{
"epoch": 103.00703774792066,
"grad_norm": 11.675226211547852,
"learning_rate": 4.795275862068966e-05,
"loss": 0.8491,
"step": 161000
},
{
"epoch": 103.32693538067818,
"grad_norm": 12.083547592163086,
"learning_rate": 4.778034482758621e-05,
"loss": 0.8052,
"step": 161500
},
{
"epoch": 103.6468330134357,
"grad_norm": 9.721264839172363,
"learning_rate": 4.760793103448276e-05,
"loss": 0.8128,
"step": 162000
},
{
"epoch": 103.96673064619321,
"grad_norm": 13.526360511779785,
"learning_rate": 4.743586206896552e-05,
"loss": 0.8225,
"step": 162500
},
{
"epoch": 104.28662827895073,
"grad_norm": 14.503246307373047,
"learning_rate": 4.726344827586207e-05,
"loss": 0.8014,
"step": 163000
},
{
"epoch": 104.60652591170825,
"grad_norm": 12.239891052246094,
"learning_rate": 4.709103448275862e-05,
"loss": 0.8032,
"step": 163500
},
{
"epoch": 104.92642354446578,
"grad_norm": 12.229057312011719,
"learning_rate": 4.691862068965517e-05,
"loss": 0.8073,
"step": 164000
},
{
"epoch": 105.2463211772233,
"grad_norm": 11.960144996643066,
"learning_rate": 4.6746206896551726e-05,
"loss": 0.7956,
"step": 164500
},
{
"epoch": 105.56621880998081,
"grad_norm": 13.662198066711426,
"learning_rate": 4.6573793103448275e-05,
"loss": 0.7875,
"step": 165000
},
{
"epoch": 105.88611644273833,
"grad_norm": 13.428373336791992,
"learning_rate": 4.6401724137931034e-05,
"loss": 0.7967,
"step": 165500
},
{
"epoch": 106.20601407549584,
"grad_norm": 13.212657928466797,
"learning_rate": 4.622931034482759e-05,
"loss": 0.7734,
"step": 166000
},
{
"epoch": 106.52591170825336,
"grad_norm": 13.421162605285645,
"learning_rate": 4.605689655172414e-05,
"loss": 0.7691,
"step": 166500
},
{
"epoch": 106.84580934101088,
"grad_norm": 10.209512710571289,
"learning_rate": 4.588448275862069e-05,
"loss": 0.7952,
"step": 167000
},
{
"epoch": 107.1657069737684,
"grad_norm": 10.482810020446777,
"learning_rate": 4.571206896551725e-05,
"loss": 0.7837,
"step": 167500
},
{
"epoch": 107.48560460652591,
"grad_norm": 13.598471641540527,
"learning_rate": 4.55396551724138e-05,
"loss": 0.778,
"step": 168000
},
{
"epoch": 107.80550223928343,
"grad_norm": 12.402639389038086,
"learning_rate": 4.5367241379310346e-05,
"loss": 0.7711,
"step": 168500
},
{
"epoch": 108.12539987204094,
"grad_norm": 12.243593215942383,
"learning_rate": 4.51948275862069e-05,
"loss": 0.773,
"step": 169000
},
{
"epoch": 108.44529750479846,
"grad_norm": 10.736000061035156,
"learning_rate": 4.502275862068966e-05,
"loss": 0.7477,
"step": 169500
},
{
"epoch": 108.76519513755598,
"grad_norm": 11.13589096069336,
"learning_rate": 4.485034482758621e-05,
"loss": 0.7669,
"step": 170000
},
{
"epoch": 109.0850927703135,
"grad_norm": 10.14847183227539,
"learning_rate": 4.4677931034482765e-05,
"loss": 0.7618,
"step": 170500
},
{
"epoch": 109.40499040307101,
"grad_norm": 10.636765480041504,
"learning_rate": 4.4505517241379314e-05,
"loss": 0.7362,
"step": 171000
},
{
"epoch": 109.72488803582854,
"grad_norm": 12.350906372070312,
"learning_rate": 4.433344827586207e-05,
"loss": 0.7578,
"step": 171500
},
{
"epoch": 110.04478566858606,
"grad_norm": 13.237043380737305,
"learning_rate": 4.416103448275862e-05,
"loss": 0.7662,
"step": 172000
},
{
"epoch": 110.36468330134358,
"grad_norm": 8.747899055480957,
"learning_rate": 4.398862068965518e-05,
"loss": 0.7306,
"step": 172500
},
{
"epoch": 110.68458093410109,
"grad_norm": 11.915460586547852,
"learning_rate": 4.3816206896551725e-05,
"loss": 0.7604,
"step": 173000
},
{
"epoch": 111.00447856685861,
"grad_norm": 10.675039291381836,
"learning_rate": 4.3643793103448274e-05,
"loss": 0.7348,
"step": 173500
},
{
"epoch": 111.32437619961613,
"grad_norm": 11.528447151184082,
"learning_rate": 4.347172413793103e-05,
"loss": 0.7297,
"step": 174000
},
{
"epoch": 111.64427383237364,
"grad_norm": 11.974442481994629,
"learning_rate": 4.329931034482759e-05,
"loss": 0.7398,
"step": 174500
},
{
"epoch": 111.96417146513116,
"grad_norm": 10.059257507324219,
"learning_rate": 4.312689655172414e-05,
"loss": 0.7337,
"step": 175000
},
{
"epoch": 112.28406909788868,
"grad_norm": 11.215494155883789,
"learning_rate": 4.295448275862069e-05,
"loss": 0.7154,
"step": 175500
},
{
"epoch": 112.60396673064619,
"grad_norm": 11.685689926147461,
"learning_rate": 4.278241379310345e-05,
"loss": 0.7268,
"step": 176000
},
{
"epoch": 112.92386436340371,
"grad_norm": 12.056086540222168,
"learning_rate": 4.261e-05,
"loss": 0.7214,
"step": 176500
},
{
"epoch": 113.24376199616123,
"grad_norm": 10.17962646484375,
"learning_rate": 4.243758620689655e-05,
"loss": 0.7309,
"step": 177000
},
{
"epoch": 113.56365962891874,
"grad_norm": 13.576325416564941,
"learning_rate": 4.226517241379311e-05,
"loss": 0.7047,
"step": 177500
},
{
"epoch": 113.88355726167626,
"grad_norm": 10.385096549987793,
"learning_rate": 4.209275862068966e-05,
"loss": 0.7298,
"step": 178000
},
{
"epoch": 114.20345489443378,
"grad_norm": 10.976679801940918,
"learning_rate": 4.192068965517242e-05,
"loss": 0.7079,
"step": 178500
},
{
"epoch": 114.52335252719129,
"grad_norm": 14.960927963256836,
"learning_rate": 4.174827586206897e-05,
"loss": 0.7004,
"step": 179000
},
{
"epoch": 114.84325015994882,
"grad_norm": 11.473701477050781,
"learning_rate": 4.157586206896552e-05,
"loss": 0.7137,
"step": 179500
},
{
"epoch": 115.16314779270634,
"grad_norm": 11.255741119384766,
"learning_rate": 4.140344827586207e-05,
"loss": 0.6994,
"step": 180000
},
{
"epoch": 115.48304542546386,
"grad_norm": 11.247090339660645,
"learning_rate": 4.123103448275862e-05,
"loss": 0.6822,
"step": 180500
},
{
"epoch": 115.80294305822137,
"grad_norm": 10.883934020996094,
"learning_rate": 4.1058620689655176e-05,
"loss": 0.6902,
"step": 181000
},
{
"epoch": 116.12284069097889,
"grad_norm": 12.544395446777344,
"learning_rate": 4.0886551724137935e-05,
"loss": 0.6853,
"step": 181500
},
{
"epoch": 116.4427383237364,
"grad_norm": 10.791812896728516,
"learning_rate": 4.0714137931034484e-05,
"loss": 0.6785,
"step": 182000
},
{
"epoch": 116.76263595649392,
"grad_norm": 13.567925453186035,
"learning_rate": 4.054172413793104e-05,
"loss": 0.6904,
"step": 182500
},
{
"epoch": 117.08253358925144,
"grad_norm": 10.842440605163574,
"learning_rate": 4.036931034482759e-05,
"loss": 0.6851,
"step": 183000
},
{
"epoch": 117.40243122200896,
"grad_norm": 13.358929634094238,
"learning_rate": 4.0196896551724136e-05,
"loss": 0.6629,
"step": 183500
},
{
"epoch": 117.72232885476647,
"grad_norm": 12.40263557434082,
"learning_rate": 4.002448275862069e-05,
"loss": 0.666,
"step": 184000
},
{
"epoch": 118.04222648752399,
"grad_norm": 12.171306610107422,
"learning_rate": 3.985241379310345e-05,
"loss": 0.6875,
"step": 184500
},
{
"epoch": 118.3621241202815,
"grad_norm": 11.05837631225586,
"learning_rate": 3.968e-05,
"loss": 0.6598,
"step": 185000
},
{
"epoch": 118.68202175303902,
"grad_norm": 13.27622127532959,
"learning_rate": 3.9507586206896555e-05,
"loss": 0.6735,
"step": 185500
},
{
"epoch": 119.00191938579654,
"grad_norm": 10.379920959472656,
"learning_rate": 3.9335172413793104e-05,
"loss": 0.6849,
"step": 186000
},
{
"epoch": 119.32181701855406,
"grad_norm": 13.972020149230957,
"learning_rate": 3.916275862068965e-05,
"loss": 0.6641,
"step": 186500
},
{
"epoch": 119.64171465131157,
"grad_norm": 11.595196723937988,
"learning_rate": 3.899068965517241e-05,
"loss": 0.6577,
"step": 187000
},
{
"epoch": 119.9616122840691,
"grad_norm": 11.496007919311523,
"learning_rate": 3.881827586206897e-05,
"loss": 0.6601,
"step": 187500
},
{
"epoch": 120.28150991682662,
"grad_norm": 10.272443771362305,
"learning_rate": 3.864586206896552e-05,
"loss": 0.6365,
"step": 188000
},
{
"epoch": 120.60140754958414,
"grad_norm": 11.38764762878418,
"learning_rate": 3.847344827586207e-05,
"loss": 0.6484,
"step": 188500
},
{
"epoch": 120.92130518234165,
"grad_norm": 14.054584503173828,
"learning_rate": 3.8301034482758627e-05,
"loss": 0.6662,
"step": 189000
},
{
"epoch": 121.24120281509917,
"grad_norm": 10.214823722839355,
"learning_rate": 3.8128620689655175e-05,
"loss": 0.6378,
"step": 189500
},
{
"epoch": 121.56110044785669,
"grad_norm": 10.101186752319336,
"learning_rate": 3.7956551724137934e-05,
"loss": 0.6458,
"step": 190000
},
{
"epoch": 121.8809980806142,
"grad_norm": 9.963375091552734,
"learning_rate": 3.778413793103448e-05,
"loss": 0.6518,
"step": 190500
},
{
"epoch": 122.20089571337172,
"grad_norm": 12.217530250549316,
"learning_rate": 3.761172413793104e-05,
"loss": 0.6382,
"step": 191000
},
{
"epoch": 122.52079334612924,
"grad_norm": 13.154520034790039,
"learning_rate": 3.743931034482759e-05,
"loss": 0.6272,
"step": 191500
},
{
"epoch": 122.84069097888676,
"grad_norm": 12.079745292663574,
"learning_rate": 3.7267241379310346e-05,
"loss": 0.6496,
"step": 192000
},
{
"epoch": 123.16058861164427,
"grad_norm": 9.851350784301758,
"learning_rate": 3.70948275862069e-05,
"loss": 0.6253,
"step": 192500
},
{
"epoch": 123.48048624440179,
"grad_norm": 10.869081497192383,
"learning_rate": 3.692241379310345e-05,
"loss": 0.6307,
"step": 193000
},
{
"epoch": 123.8003838771593,
"grad_norm": 16.297130584716797,
"learning_rate": 3.675034482758621e-05,
"loss": 0.6336,
"step": 193500
},
{
"epoch": 124.12028150991682,
"grad_norm": 11.86780834197998,
"learning_rate": 3.657793103448276e-05,
"loss": 0.626,
"step": 194000
},
{
"epoch": 124.44017914267434,
"grad_norm": 12.359445571899414,
"learning_rate": 3.640551724137931e-05,
"loss": 0.6068,
"step": 194500
},
{
"epoch": 124.76007677543186,
"grad_norm": 10.589780807495117,
"learning_rate": 3.623310344827586e-05,
"loss": 0.6388,
"step": 195000
},
{
"epoch": 125.07997440818939,
"grad_norm": 11.763408660888672,
"learning_rate": 3.606068965517241e-05,
"loss": 0.6132,
"step": 195500
},
{
"epoch": 125.3998720409469,
"grad_norm": 11.668269157409668,
"learning_rate": 3.5888275862068966e-05,
"loss": 0.602,
"step": 196000
},
{
"epoch": 125.71976967370442,
"grad_norm": 11.628352165222168,
"learning_rate": 3.5715862068965515e-05,
"loss": 0.6196,
"step": 196500
},
{
"epoch": 126.03966730646194,
"grad_norm": 11.364988327026367,
"learning_rate": 3.554344827586207e-05,
"loss": 0.6197,
"step": 197000
},
{
"epoch": 126.35956493921945,
"grad_norm": 10.588140487670898,
"learning_rate": 3.5371034482758626e-05,
"loss": 0.6017,
"step": 197500
},
{
"epoch": 126.67946257197697,
"grad_norm": 13.472407341003418,
"learning_rate": 3.5198965517241385e-05,
"loss": 0.6145,
"step": 198000
},
{
"epoch": 126.99936020473449,
"grad_norm": 10.752822875976562,
"learning_rate": 3.502655172413793e-05,
"loss": 0.6211,
"step": 198500
},
{
"epoch": 127.319257837492,
"grad_norm": 11.411170959472656,
"learning_rate": 3.485413793103449e-05,
"loss": 0.5919,
"step": 199000
},
{
"epoch": 127.63915547024952,
"grad_norm": 12.288487434387207,
"learning_rate": 3.468172413793104e-05,
"loss": 0.6032,
"step": 199500
},
{
"epoch": 127.95905310300704,
"grad_norm": 8.647520065307617,
"learning_rate": 3.4509310344827586e-05,
"loss": 0.6228,
"step": 200000
}
],
"logging_steps": 500,
"max_steps": 300000,
"num_input_tokens_seen": 0,
"num_train_epochs": 192,
"save_steps": 20000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.2457759656933786e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}