cc_kaz / checkpoint-140000 /trainer_state.json
DaniilOr's picture
Initial upload of multiple checkpoints
769e510 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 89.57133717210493,
"eval_steps": 500,
"global_step": 140000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.3198976327575176,
"grad_norm": 4.1601386070251465,
"learning_rate": 5e-06,
"loss": 10.3279,
"step": 500
},
{
"epoch": 0.6397952655150352,
"grad_norm": 4.366061687469482,
"learning_rate": 1e-05,
"loss": 9.3834,
"step": 1000
},
{
"epoch": 0.9596928982725528,
"grad_norm": 4.784337043762207,
"learning_rate": 1.5e-05,
"loss": 8.8888,
"step": 1500
},
{
"epoch": 1.2795905310300704,
"grad_norm": 3.9968652725219727,
"learning_rate": 2e-05,
"loss": 8.6568,
"step": 2000
},
{
"epoch": 1.599488163787588,
"grad_norm": 4.402552127838135,
"learning_rate": 2.5e-05,
"loss": 8.5473,
"step": 2500
},
{
"epoch": 1.9193857965451055,
"grad_norm": 4.639041423797607,
"learning_rate": 3e-05,
"loss": 8.4044,
"step": 3000
},
{
"epoch": 2.239283429302623,
"grad_norm": 5.651747226715088,
"learning_rate": 3.5e-05,
"loss": 8.2868,
"step": 3500
},
{
"epoch": 2.5591810620601407,
"grad_norm": 4.6999359130859375,
"learning_rate": 4e-05,
"loss": 8.1766,
"step": 4000
},
{
"epoch": 2.8790786948176583,
"grad_norm": 4.838181495666504,
"learning_rate": 4.499e-05,
"loss": 8.1118,
"step": 4500
},
{
"epoch": 3.198976327575176,
"grad_norm": 4.238831996917725,
"learning_rate": 4.999e-05,
"loss": 8.0038,
"step": 5000
},
{
"epoch": 3.5188739603326935,
"grad_norm": 4.455530643463135,
"learning_rate": 5.499000000000001e-05,
"loss": 7.9014,
"step": 5500
},
{
"epoch": 3.838771593090211,
"grad_norm": 5.811736583709717,
"learning_rate": 5.999e-05,
"loss": 7.8352,
"step": 6000
},
{
"epoch": 4.158669225847729,
"grad_norm": 4.998301982879639,
"learning_rate": 6.498e-05,
"loss": 7.7613,
"step": 6500
},
{
"epoch": 4.478566858605246,
"grad_norm": 5.011510848999023,
"learning_rate": 6.998e-05,
"loss": 7.6554,
"step": 7000
},
{
"epoch": 4.798464491362764,
"grad_norm": 4.750300884246826,
"learning_rate": 7.498e-05,
"loss": 7.6109,
"step": 7500
},
{
"epoch": 5.1183621241202815,
"grad_norm": 6.24017858505249,
"learning_rate": 7.998e-05,
"loss": 7.5186,
"step": 8000
},
{
"epoch": 5.438259756877799,
"grad_norm": 6.061458587646484,
"learning_rate": 8.497000000000001e-05,
"loss": 7.3966,
"step": 8500
},
{
"epoch": 5.758157389635317,
"grad_norm": 7.151447772979736,
"learning_rate": 8.997000000000001e-05,
"loss": 7.2877,
"step": 9000
},
{
"epoch": 6.078055022392834,
"grad_norm": 7.578985214233398,
"learning_rate": 9.497000000000001e-05,
"loss": 7.1542,
"step": 9500
},
{
"epoch": 6.397952655150352,
"grad_norm": 5.948920726776123,
"learning_rate": 9.997e-05,
"loss": 7.0008,
"step": 10000
},
{
"epoch": 6.717850287907869,
"grad_norm": 8.036959648132324,
"learning_rate": 9.982896551724137e-05,
"loss": 6.8966,
"step": 10500
},
{
"epoch": 7.037747920665387,
"grad_norm": 7.160433292388916,
"learning_rate": 9.965655172413794e-05,
"loss": 6.7509,
"step": 11000
},
{
"epoch": 7.357645553422905,
"grad_norm": 5.934999465942383,
"learning_rate": 9.948413793103449e-05,
"loss": 6.5833,
"step": 11500
},
{
"epoch": 7.677543186180422,
"grad_norm": 7.745622634887695,
"learning_rate": 9.931172413793104e-05,
"loss": 6.4975,
"step": 12000
},
{
"epoch": 7.99744081893794,
"grad_norm": 7.0418477058410645,
"learning_rate": 9.91393103448276e-05,
"loss": 6.4261,
"step": 12500
},
{
"epoch": 8.317338451695457,
"grad_norm": 6.101259708404541,
"learning_rate": 9.896689655172414e-05,
"loss": 6.2092,
"step": 13000
},
{
"epoch": 8.637236084452976,
"grad_norm": 7.289799213409424,
"learning_rate": 9.87944827586207e-05,
"loss": 6.1436,
"step": 13500
},
{
"epoch": 8.957133717210493,
"grad_norm": 8.126811027526855,
"learning_rate": 9.862206896551725e-05,
"loss": 6.0456,
"step": 14000
},
{
"epoch": 9.277031349968011,
"grad_norm": 8.221816062927246,
"learning_rate": 9.845000000000001e-05,
"loss": 5.9141,
"step": 14500
},
{
"epoch": 9.596928982725528,
"grad_norm": 7.361550331115723,
"learning_rate": 9.827793103448277e-05,
"loss": 5.8326,
"step": 15000
},
{
"epoch": 9.916826615483046,
"grad_norm": 7.1737775802612305,
"learning_rate": 9.810551724137932e-05,
"loss": 5.7974,
"step": 15500
},
{
"epoch": 10.236724248240563,
"grad_norm": 9.80185604095459,
"learning_rate": 9.793310344827586e-05,
"loss": 5.6282,
"step": 16000
},
{
"epoch": 10.556621880998081,
"grad_norm": 7.2062153816223145,
"learning_rate": 9.776068965517242e-05,
"loss": 5.5619,
"step": 16500
},
{
"epoch": 10.876519513755598,
"grad_norm": 10.801878929138184,
"learning_rate": 9.758827586206896e-05,
"loss": 5.5155,
"step": 17000
},
{
"epoch": 11.196417146513117,
"grad_norm": 8.48509693145752,
"learning_rate": 9.741586206896553e-05,
"loss": 5.4259,
"step": 17500
},
{
"epoch": 11.516314779270633,
"grad_norm": 8.47572135925293,
"learning_rate": 9.724344827586207e-05,
"loss": 5.3205,
"step": 18000
},
{
"epoch": 11.836212412028152,
"grad_norm": 6.122796535491943,
"learning_rate": 9.707103448275863e-05,
"loss": 5.3025,
"step": 18500
},
{
"epoch": 12.156110044785668,
"grad_norm": 8.210710525512695,
"learning_rate": 9.689896551724139e-05,
"loss": 5.2264,
"step": 19000
},
{
"epoch": 12.476007677543187,
"grad_norm": 7.857537746429443,
"learning_rate": 9.672655172413794e-05,
"loss": 5.1395,
"step": 19500
},
{
"epoch": 12.795905310300704,
"grad_norm": 7.743075370788574,
"learning_rate": 9.655413793103448e-05,
"loss": 5.1109,
"step": 20000
},
{
"epoch": 13.115802943058222,
"grad_norm": 10.574569702148438,
"learning_rate": 9.638172413793104e-05,
"loss": 5.0794,
"step": 20500
},
{
"epoch": 13.435700575815739,
"grad_norm": 8.313858985900879,
"learning_rate": 9.620931034482758e-05,
"loss": 4.921,
"step": 21000
},
{
"epoch": 13.755598208573257,
"grad_norm": 9.096057891845703,
"learning_rate": 9.603689655172414e-05,
"loss": 4.96,
"step": 21500
},
{
"epoch": 14.075495841330774,
"grad_norm": 8.402993202209473,
"learning_rate": 9.58644827586207e-05,
"loss": 4.9062,
"step": 22000
},
{
"epoch": 14.395393474088293,
"grad_norm": 8.110074996948242,
"learning_rate": 9.569206896551725e-05,
"loss": 4.8026,
"step": 22500
},
{
"epoch": 14.71529110684581,
"grad_norm": 7.908292293548584,
"learning_rate": 9.552000000000001e-05,
"loss": 4.82,
"step": 23000
},
{
"epoch": 15.035188739603328,
"grad_norm": 7.991878986358643,
"learning_rate": 9.534758620689655e-05,
"loss": 4.7397,
"step": 23500
},
{
"epoch": 15.355086372360844,
"grad_norm": 8.696029663085938,
"learning_rate": 9.517551724137932e-05,
"loss": 4.6656,
"step": 24000
},
{
"epoch": 15.674984005118363,
"grad_norm": 9.421612739562988,
"learning_rate": 9.500310344827586e-05,
"loss": 4.6412,
"step": 24500
},
{
"epoch": 15.99488163787588,
"grad_norm": 9.747482299804688,
"learning_rate": 9.483068965517242e-05,
"loss": 4.6048,
"step": 25000
},
{
"epoch": 16.314779270633398,
"grad_norm": 10.389492988586426,
"learning_rate": 9.465827586206897e-05,
"loss": 4.481,
"step": 25500
},
{
"epoch": 16.634676903390915,
"grad_norm": 8.661949157714844,
"learning_rate": 9.448586206896553e-05,
"loss": 4.4923,
"step": 26000
},
{
"epoch": 16.95457453614843,
"grad_norm": 12.681297302246094,
"learning_rate": 9.431344827586207e-05,
"loss": 4.4816,
"step": 26500
},
{
"epoch": 17.27447216890595,
"grad_norm": 8.993134498596191,
"learning_rate": 9.414103448275863e-05,
"loss": 4.3512,
"step": 27000
},
{
"epoch": 17.59436980166347,
"grad_norm": 10.020146369934082,
"learning_rate": 9.396862068965517e-05,
"loss": 4.3447,
"step": 27500
},
{
"epoch": 17.914267434420985,
"grad_norm": 9.514701843261719,
"learning_rate": 9.379655172413794e-05,
"loss": 4.3376,
"step": 28000
},
{
"epoch": 18.234165067178502,
"grad_norm": 10.324498176574707,
"learning_rate": 9.362413793103448e-05,
"loss": 4.2612,
"step": 28500
},
{
"epoch": 18.554062699936022,
"grad_norm": 10.682856559753418,
"learning_rate": 9.345172413793104e-05,
"loss": 4.226,
"step": 29000
},
{
"epoch": 18.87396033269354,
"grad_norm": 7.883260726928711,
"learning_rate": 9.327931034482758e-05,
"loss": 4.19,
"step": 29500
},
{
"epoch": 19.193857965451055,
"grad_norm": 12.470623016357422,
"learning_rate": 9.310724137931035e-05,
"loss": 4.1881,
"step": 30000
},
{
"epoch": 19.513755598208572,
"grad_norm": 9.932331085205078,
"learning_rate": 9.29348275862069e-05,
"loss": 4.0853,
"step": 30500
},
{
"epoch": 19.833653230966092,
"grad_norm": 8.153782844543457,
"learning_rate": 9.276241379310345e-05,
"loss": 4.1087,
"step": 31000
},
{
"epoch": 20.15355086372361,
"grad_norm": 8.214093208312988,
"learning_rate": 9.258999999999999e-05,
"loss": 4.0751,
"step": 31500
},
{
"epoch": 20.473448496481126,
"grad_norm": 11.927350044250488,
"learning_rate": 9.241758620689656e-05,
"loss": 3.9686,
"step": 32000
},
{
"epoch": 20.793346129238643,
"grad_norm": 9.67835807800293,
"learning_rate": 9.224551724137932e-05,
"loss": 3.9745,
"step": 32500
},
{
"epoch": 21.113243761996163,
"grad_norm": 9.911735534667969,
"learning_rate": 9.207310344827586e-05,
"loss": 3.9308,
"step": 33000
},
{
"epoch": 21.43314139475368,
"grad_norm": 9.05053424835205,
"learning_rate": 9.190068965517242e-05,
"loss": 3.8718,
"step": 33500
},
{
"epoch": 21.753039027511196,
"grad_norm": 9.588044166564941,
"learning_rate": 9.172827586206897e-05,
"loss": 3.8425,
"step": 34000
},
{
"epoch": 22.072936660268713,
"grad_norm": 8.788230895996094,
"learning_rate": 9.155620689655173e-05,
"loss": 3.8617,
"step": 34500
},
{
"epoch": 22.392834293026233,
"grad_norm": 9.435895919799805,
"learning_rate": 9.138379310344827e-05,
"loss": 3.7524,
"step": 35000
},
{
"epoch": 22.71273192578375,
"grad_norm": 9.870182037353516,
"learning_rate": 9.121137931034483e-05,
"loss": 3.7916,
"step": 35500
},
{
"epoch": 23.032629558541267,
"grad_norm": 9.612881660461426,
"learning_rate": 9.103896551724139e-05,
"loss": 3.8011,
"step": 36000
},
{
"epoch": 23.352527191298783,
"grad_norm": 9.643827438354492,
"learning_rate": 9.086689655172414e-05,
"loss": 3.6478,
"step": 36500
},
{
"epoch": 23.672424824056304,
"grad_norm": 14.105424880981445,
"learning_rate": 9.069448275862069e-05,
"loss": 3.6671,
"step": 37000
},
{
"epoch": 23.99232245681382,
"grad_norm": 10.427962303161621,
"learning_rate": 9.052206896551724e-05,
"loss": 3.6809,
"step": 37500
},
{
"epoch": 24.312220089571337,
"grad_norm": 11.505946159362793,
"learning_rate": 9.03496551724138e-05,
"loss": 3.553,
"step": 38000
},
{
"epoch": 24.632117722328854,
"grad_norm": 10.393635749816895,
"learning_rate": 9.017724137931035e-05,
"loss": 3.5408,
"step": 38500
},
{
"epoch": 24.952015355086374,
"grad_norm": 9.023842811584473,
"learning_rate": 9.00051724137931e-05,
"loss": 3.5915,
"step": 39000
},
{
"epoch": 25.27191298784389,
"grad_norm": 10.69048023223877,
"learning_rate": 8.983275862068967e-05,
"loss": 3.4896,
"step": 39500
},
{
"epoch": 25.591810620601407,
"grad_norm": 10.803936958312988,
"learning_rate": 8.966034482758621e-05,
"loss": 3.4854,
"step": 40000
},
{
"epoch": 25.911708253358924,
"grad_norm": 10.489801406860352,
"learning_rate": 8.948793103448276e-05,
"loss": 3.4871,
"step": 40500
},
{
"epoch": 26.231605886116444,
"grad_norm": 10.558309555053711,
"learning_rate": 8.931586206896552e-05,
"loss": 3.4186,
"step": 41000
},
{
"epoch": 26.55150351887396,
"grad_norm": 12.186748504638672,
"learning_rate": 8.914344827586208e-05,
"loss": 3.4027,
"step": 41500
},
{
"epoch": 26.871401151631478,
"grad_norm": 9.8623046875,
"learning_rate": 8.897103448275862e-05,
"loss": 3.4191,
"step": 42000
},
{
"epoch": 27.191298784388994,
"grad_norm": 11.407792091369629,
"learning_rate": 8.879862068965518e-05,
"loss": 3.341,
"step": 42500
},
{
"epoch": 27.511196417146515,
"grad_norm": 13.37617301940918,
"learning_rate": 8.862655172413794e-05,
"loss": 3.3137,
"step": 43000
},
{
"epoch": 27.83109404990403,
"grad_norm": 10.30826187133789,
"learning_rate": 8.845413793103449e-05,
"loss": 3.3036,
"step": 43500
},
{
"epoch": 28.150991682661548,
"grad_norm": 12.024778366088867,
"learning_rate": 8.828172413793105e-05,
"loss": 3.2678,
"step": 44000
},
{
"epoch": 28.470889315419065,
"grad_norm": 9.730340957641602,
"learning_rate": 8.810931034482759e-05,
"loss": 3.1949,
"step": 44500
},
{
"epoch": 28.790786948176585,
"grad_norm": 9.700602531433105,
"learning_rate": 8.793689655172414e-05,
"loss": 3.2541,
"step": 45000
},
{
"epoch": 29.1106845809341,
"grad_norm": 12.359143257141113,
"learning_rate": 8.77648275862069e-05,
"loss": 3.2456,
"step": 45500
},
{
"epoch": 29.43058221369162,
"grad_norm": 11.989018440246582,
"learning_rate": 8.759241379310346e-05,
"loss": 3.1154,
"step": 46000
},
{
"epoch": 29.750479846449135,
"grad_norm": 10.904190063476562,
"learning_rate": 8.742e-05,
"loss": 3.175,
"step": 46500
},
{
"epoch": 30.070377479206655,
"grad_norm": 11.253949165344238,
"learning_rate": 8.724758620689656e-05,
"loss": 3.1478,
"step": 47000
},
{
"epoch": 30.390275111964172,
"grad_norm": 12.229791641235352,
"learning_rate": 8.707517241379311e-05,
"loss": 3.0632,
"step": 47500
},
{
"epoch": 30.71017274472169,
"grad_norm": 9.516524314880371,
"learning_rate": 8.690275862068967e-05,
"loss": 3.0843,
"step": 48000
},
{
"epoch": 31.030070377479205,
"grad_norm": 13.730731010437012,
"learning_rate": 8.673034482758621e-05,
"loss": 3.098,
"step": 48500
},
{
"epoch": 31.349968010236726,
"grad_norm": 9.73539924621582,
"learning_rate": 8.655827586206897e-05,
"loss": 2.9611,
"step": 49000
},
{
"epoch": 31.669865642994242,
"grad_norm": 12.066815376281738,
"learning_rate": 8.638586206896552e-05,
"loss": 2.9943,
"step": 49500
},
{
"epoch": 31.98976327575176,
"grad_norm": 11.028585433959961,
"learning_rate": 8.621344827586208e-05,
"loss": 3.0424,
"step": 50000
},
{
"epoch": 32.30966090850928,
"grad_norm": 11.2380952835083,
"learning_rate": 8.604103448275862e-05,
"loss": 2.9023,
"step": 50500
},
{
"epoch": 32.629558541266796,
"grad_norm": 9.345772743225098,
"learning_rate": 8.586862068965518e-05,
"loss": 2.9586,
"step": 51000
},
{
"epoch": 32.94945617402431,
"grad_norm": 10.239849090576172,
"learning_rate": 8.569655172413793e-05,
"loss": 2.9461,
"step": 51500
},
{
"epoch": 33.26935380678183,
"grad_norm": 11.058523178100586,
"learning_rate": 8.552413793103449e-05,
"loss": 2.8453,
"step": 52000
},
{
"epoch": 33.589251439539346,
"grad_norm": 12.131317138671875,
"learning_rate": 8.535172413793105e-05,
"loss": 2.8603,
"step": 52500
},
{
"epoch": 33.90914907229686,
"grad_norm": 10.392476081848145,
"learning_rate": 8.517931034482759e-05,
"loss": 2.8817,
"step": 53000
},
{
"epoch": 34.22904670505438,
"grad_norm": 10.749021530151367,
"learning_rate": 8.500724137931036e-05,
"loss": 2.8073,
"step": 53500
},
{
"epoch": 34.5489443378119,
"grad_norm": 12.33171558380127,
"learning_rate": 8.48348275862069e-05,
"loss": 2.7793,
"step": 54000
},
{
"epoch": 34.86884197056942,
"grad_norm": 12.961758613586426,
"learning_rate": 8.466241379310346e-05,
"loss": 2.8066,
"step": 54500
},
{
"epoch": 35.18873960332694,
"grad_norm": 13.320075035095215,
"learning_rate": 8.449e-05,
"loss": 2.7459,
"step": 55000
},
{
"epoch": 35.50863723608445,
"grad_norm": 14.416489601135254,
"learning_rate": 8.431758620689655e-05,
"loss": 2.7321,
"step": 55500
},
{
"epoch": 35.82853486884197,
"grad_norm": 11.203073501586914,
"learning_rate": 8.414551724137931e-05,
"loss": 2.7486,
"step": 56000
},
{
"epoch": 36.14843250159949,
"grad_norm": 10.463476181030273,
"learning_rate": 8.397310344827587e-05,
"loss": 2.7086,
"step": 56500
},
{
"epoch": 36.468330134357004,
"grad_norm": 11.375761985778809,
"learning_rate": 8.380068965517241e-05,
"loss": 2.6387,
"step": 57000
},
{
"epoch": 36.78822776711452,
"grad_norm": 11.649105072021484,
"learning_rate": 8.362827586206897e-05,
"loss": 2.6746,
"step": 57500
},
{
"epoch": 37.108125399872044,
"grad_norm": 12.708244323730469,
"learning_rate": 8.345586206896552e-05,
"loss": 2.6454,
"step": 58000
},
{
"epoch": 37.42802303262956,
"grad_norm": 12.876201629638672,
"learning_rate": 8.328344827586208e-05,
"loss": 2.5798,
"step": 58500
},
{
"epoch": 37.74792066538708,
"grad_norm": 11.92346477508545,
"learning_rate": 8.311103448275862e-05,
"loss": 2.6589,
"step": 59000
},
{
"epoch": 38.067818298144594,
"grad_norm": 10.742238998413086,
"learning_rate": 8.293896551724138e-05,
"loss": 2.5868,
"step": 59500
},
{
"epoch": 38.38771593090211,
"grad_norm": 11.399048805236816,
"learning_rate": 8.276655172413793e-05,
"loss": 2.5124,
"step": 60000
},
{
"epoch": 38.70761356365963,
"grad_norm": 13.563875198364258,
"learning_rate": 8.259413793103449e-05,
"loss": 2.576,
"step": 60500
},
{
"epoch": 39.027511196417144,
"grad_norm": 11.297135353088379,
"learning_rate": 8.242172413793103e-05,
"loss": 2.5671,
"step": 61000
},
{
"epoch": 39.34740882917466,
"grad_norm": 11.336121559143066,
"learning_rate": 8.22496551724138e-05,
"loss": 2.4445,
"step": 61500
},
{
"epoch": 39.667306461932185,
"grad_norm": 9.477692604064941,
"learning_rate": 8.207724137931035e-05,
"loss": 2.4981,
"step": 62000
},
{
"epoch": 39.9872040946897,
"grad_norm": 11.597848892211914,
"learning_rate": 8.19048275862069e-05,
"loss": 2.5382,
"step": 62500
},
{
"epoch": 40.30710172744722,
"grad_norm": 14.910037994384766,
"learning_rate": 8.173241379310346e-05,
"loss": 2.4158,
"step": 63000
},
{
"epoch": 40.626999360204735,
"grad_norm": 11.870673179626465,
"learning_rate": 8.156e-05,
"loss": 2.4395,
"step": 63500
},
{
"epoch": 40.94689699296225,
"grad_norm": 15.279576301574707,
"learning_rate": 8.138758620689655e-05,
"loss": 2.4653,
"step": 64000
},
{
"epoch": 41.26679462571977,
"grad_norm": 11.710406303405762,
"learning_rate": 8.121551724137931e-05,
"loss": 2.3567,
"step": 64500
},
{
"epoch": 41.586692258477285,
"grad_norm": 10.663411140441895,
"learning_rate": 8.104310344827587e-05,
"loss": 2.3655,
"step": 65000
},
{
"epoch": 41.9065898912348,
"grad_norm": 13.946629524230957,
"learning_rate": 8.087068965517241e-05,
"loss": 2.4336,
"step": 65500
},
{
"epoch": 42.226487523992326,
"grad_norm": 13.782262802124023,
"learning_rate": 8.069827586206898e-05,
"loss": 2.3351,
"step": 66000
},
{
"epoch": 42.54638515674984,
"grad_norm": 11.177961349487305,
"learning_rate": 8.052586206896552e-05,
"loss": 2.359,
"step": 66500
},
{
"epoch": 42.86628278950736,
"grad_norm": 15.120301246643066,
"learning_rate": 8.035344827586208e-05,
"loss": 2.355,
"step": 67000
},
{
"epoch": 43.186180422264876,
"grad_norm": 10.805267333984375,
"learning_rate": 8.018103448275862e-05,
"loss": 2.2905,
"step": 67500
},
{
"epoch": 43.50607805502239,
"grad_norm": 11.777176856994629,
"learning_rate": 8.000862068965517e-05,
"loss": 2.2906,
"step": 68000
},
{
"epoch": 43.82597568777991,
"grad_norm": 15.457807540893555,
"learning_rate": 7.983689655172414e-05,
"loss": 2.3269,
"step": 68500
},
{
"epoch": 44.145873320537426,
"grad_norm": 11.639357566833496,
"learning_rate": 7.966448275862069e-05,
"loss": 2.2371,
"step": 69000
},
{
"epoch": 44.46577095329494,
"grad_norm": 11.710591316223145,
"learning_rate": 7.949206896551725e-05,
"loss": 2.2248,
"step": 69500
},
{
"epoch": 44.785668586052466,
"grad_norm": 12.675103187561035,
"learning_rate": 7.93196551724138e-05,
"loss": 2.2586,
"step": 70000
},
{
"epoch": 45.10556621880998,
"grad_norm": 12.752120971679688,
"learning_rate": 7.914724137931034e-05,
"loss": 2.2489,
"step": 70500
},
{
"epoch": 45.4254638515675,
"grad_norm": 11.379339218139648,
"learning_rate": 7.89751724137931e-05,
"loss": 2.1774,
"step": 71000
},
{
"epoch": 45.74536148432502,
"grad_norm": 12.76633358001709,
"learning_rate": 7.880275862068966e-05,
"loss": 2.2007,
"step": 71500
},
{
"epoch": 46.06525911708253,
"grad_norm": 11.421367645263672,
"learning_rate": 7.863034482758621e-05,
"loss": 2.2262,
"step": 72000
},
{
"epoch": 46.38515674984005,
"grad_norm": 14.81748104095459,
"learning_rate": 7.845793103448277e-05,
"loss": 2.101,
"step": 72500
},
{
"epoch": 46.70505438259757,
"grad_norm": 12.902971267700195,
"learning_rate": 7.828551724137931e-05,
"loss": 2.1568,
"step": 73000
},
{
"epoch": 47.02495201535508,
"grad_norm": 10.685113906860352,
"learning_rate": 7.811310344827587e-05,
"loss": 2.1655,
"step": 73500
},
{
"epoch": 47.34484964811261,
"grad_norm": 15.892518043518066,
"learning_rate": 7.794068965517242e-05,
"loss": 2.0551,
"step": 74000
},
{
"epoch": 47.664747280870124,
"grad_norm": 13.730358123779297,
"learning_rate": 7.776862068965518e-05,
"loss": 2.1053,
"step": 74500
},
{
"epoch": 47.98464491362764,
"grad_norm": 13.635787963867188,
"learning_rate": 7.759620689655172e-05,
"loss": 2.1408,
"step": 75000
},
{
"epoch": 48.30454254638516,
"grad_norm": 12.861611366271973,
"learning_rate": 7.742379310344828e-05,
"loss": 2.0104,
"step": 75500
},
{
"epoch": 48.624440179142674,
"grad_norm": 11.84931468963623,
"learning_rate": 7.725137931034483e-05,
"loss": 2.0555,
"step": 76000
},
{
"epoch": 48.94433781190019,
"grad_norm": 15.812765121459961,
"learning_rate": 7.70793103448276e-05,
"loss": 2.1087,
"step": 76500
},
{
"epoch": 49.26423544465771,
"grad_norm": 14.233431816101074,
"learning_rate": 7.690689655172414e-05,
"loss": 1.9998,
"step": 77000
},
{
"epoch": 49.584133077415224,
"grad_norm": 14.329803466796875,
"learning_rate": 7.673448275862069e-05,
"loss": 2.0189,
"step": 77500
},
{
"epoch": 49.90403071017275,
"grad_norm": 11.00400161743164,
"learning_rate": 7.656206896551725e-05,
"loss": 2.059,
"step": 78000
},
{
"epoch": 50.223928342930265,
"grad_norm": 13.582133293151855,
"learning_rate": 7.63896551724138e-05,
"loss": 1.9753,
"step": 78500
},
{
"epoch": 50.54382597568778,
"grad_norm": 12.560907363891602,
"learning_rate": 7.621724137931034e-05,
"loss": 1.9759,
"step": 79000
},
{
"epoch": 50.8637236084453,
"grad_norm": 12.169915199279785,
"learning_rate": 7.60451724137931e-05,
"loss": 1.9944,
"step": 79500
},
{
"epoch": 51.183621241202815,
"grad_norm": 13.5604248046875,
"learning_rate": 7.587275862068966e-05,
"loss": 1.9323,
"step": 80000
},
{
"epoch": 51.50351887396033,
"grad_norm": 15.892741203308105,
"learning_rate": 7.570034482758621e-05,
"loss": 1.9095,
"step": 80500
},
{
"epoch": 51.82341650671785,
"grad_norm": 13.435209274291992,
"learning_rate": 7.552793103448276e-05,
"loss": 1.9508,
"step": 81000
},
{
"epoch": 52.143314139475365,
"grad_norm": 11.180010795593262,
"learning_rate": 7.535586206896551e-05,
"loss": 1.9216,
"step": 81500
},
{
"epoch": 52.46321177223289,
"grad_norm": 12.792661666870117,
"learning_rate": 7.518344827586207e-05,
"loss": 1.8817,
"step": 82000
},
{
"epoch": 52.783109404990405,
"grad_norm": 11.785886764526367,
"learning_rate": 7.501103448275863e-05,
"loss": 1.9121,
"step": 82500
},
{
"epoch": 53.10300703774792,
"grad_norm": 10.568120002746582,
"learning_rate": 7.483862068965518e-05,
"loss": 1.8885,
"step": 83000
},
{
"epoch": 53.42290467050544,
"grad_norm": 14.641459465026855,
"learning_rate": 7.466620689655172e-05,
"loss": 1.8357,
"step": 83500
},
{
"epoch": 53.742802303262955,
"grad_norm": 13.5363187789917,
"learning_rate": 7.449379310344828e-05,
"loss": 1.8515,
"step": 84000
},
{
"epoch": 54.06269993602047,
"grad_norm": 12.997908592224121,
"learning_rate": 7.432172413793104e-05,
"loss": 1.8681,
"step": 84500
},
{
"epoch": 54.38259756877799,
"grad_norm": 12.53503131866455,
"learning_rate": 7.414931034482759e-05,
"loss": 1.7785,
"step": 85000
},
{
"epoch": 54.702495201535505,
"grad_norm": 11.986194610595703,
"learning_rate": 7.397689655172413e-05,
"loss": 1.8507,
"step": 85500
},
{
"epoch": 55.02239283429303,
"grad_norm": 12.089723587036133,
"learning_rate": 7.380448275862069e-05,
"loss": 1.8537,
"step": 86000
},
{
"epoch": 55.342290467050546,
"grad_norm": 13.552453994750977,
"learning_rate": 7.363206896551725e-05,
"loss": 1.7622,
"step": 86500
},
{
"epoch": 55.66218809980806,
"grad_norm": 12.03878116607666,
"learning_rate": 7.346e-05,
"loss": 1.8076,
"step": 87000
},
{
"epoch": 55.98208573256558,
"grad_norm": 11.187782287597656,
"learning_rate": 7.328758620689655e-05,
"loss": 1.8241,
"step": 87500
},
{
"epoch": 56.301983365323096,
"grad_norm": 14.924737930297852,
"learning_rate": 7.311517241379312e-05,
"loss": 1.7077,
"step": 88000
},
{
"epoch": 56.62188099808061,
"grad_norm": 12.302467346191406,
"learning_rate": 7.294275862068966e-05,
"loss": 1.74,
"step": 88500
},
{
"epoch": 56.94177863083813,
"grad_norm": 10.834394454956055,
"learning_rate": 7.277034482758621e-05,
"loss": 1.7827,
"step": 89000
},
{
"epoch": 57.261676263595646,
"grad_norm": 14.356012344360352,
"learning_rate": 7.259793103448276e-05,
"loss": 1.7082,
"step": 89500
},
{
"epoch": 57.58157389635317,
"grad_norm": 14.632678031921387,
"learning_rate": 7.242551724137931e-05,
"loss": 1.7045,
"step": 90000
},
{
"epoch": 57.90147152911069,
"grad_norm": 13.501043319702148,
"learning_rate": 7.225344827586207e-05,
"loss": 1.7522,
"step": 90500
},
{
"epoch": 58.2213691618682,
"grad_norm": 18.839614868164062,
"learning_rate": 7.208103448275862e-05,
"loss": 1.6801,
"step": 91000
},
{
"epoch": 58.54126679462572,
"grad_norm": 13.41618824005127,
"learning_rate": 7.190862068965517e-05,
"loss": 1.7005,
"step": 91500
},
{
"epoch": 58.86116442738324,
"grad_norm": 12.56169605255127,
"learning_rate": 7.173620689655172e-05,
"loss": 1.7018,
"step": 92000
},
{
"epoch": 59.18106206014075,
"grad_norm": 13.447467803955078,
"learning_rate": 7.15641379310345e-05,
"loss": 1.6691,
"step": 92500
},
{
"epoch": 59.50095969289827,
"grad_norm": 12.452493667602539,
"learning_rate": 7.139172413793104e-05,
"loss": 1.651,
"step": 93000
},
{
"epoch": 59.82085732565579,
"grad_norm": 10.552214622497559,
"learning_rate": 7.121931034482759e-05,
"loss": 1.6916,
"step": 93500
},
{
"epoch": 60.14075495841331,
"grad_norm": 11.099422454833984,
"learning_rate": 7.104689655172413e-05,
"loss": 1.6716,
"step": 94000
},
{
"epoch": 60.46065259117083,
"grad_norm": 13.663276672363281,
"learning_rate": 7.08744827586207e-05,
"loss": 1.6105,
"step": 94500
},
{
"epoch": 60.780550223928344,
"grad_norm": 11.783934593200684,
"learning_rate": 7.070206896551725e-05,
"loss": 1.6399,
"step": 95000
},
{
"epoch": 61.10044785668586,
"grad_norm": 12.881340026855469,
"learning_rate": 7.053e-05,
"loss": 1.6058,
"step": 95500
},
{
"epoch": 61.42034548944338,
"grad_norm": 12.405476570129395,
"learning_rate": 7.035758620689656e-05,
"loss": 1.5703,
"step": 96000
},
{
"epoch": 61.740243122200894,
"grad_norm": 11.660452842712402,
"learning_rate": 7.018517241379311e-05,
"loss": 1.6002,
"step": 96500
},
{
"epoch": 62.06014075495841,
"grad_norm": 11.69723892211914,
"learning_rate": 7.001275862068966e-05,
"loss": 1.6186,
"step": 97000
},
{
"epoch": 62.38003838771593,
"grad_norm": 16.210947036743164,
"learning_rate": 6.984034482758621e-05,
"loss": 1.5663,
"step": 97500
},
{
"epoch": 62.69993602047345,
"grad_norm": 11.853803634643555,
"learning_rate": 6.966827586206897e-05,
"loss": 1.5744,
"step": 98000
},
{
"epoch": 63.01983365323097,
"grad_norm": 10.565818786621094,
"learning_rate": 6.949586206896553e-05,
"loss": 1.5829,
"step": 98500
},
{
"epoch": 63.339731285988485,
"grad_norm": 11.621013641357422,
"learning_rate": 6.932344827586207e-05,
"loss": 1.5213,
"step": 99000
},
{
"epoch": 63.659628918746,
"grad_norm": 10.182308197021484,
"learning_rate": 6.915103448275862e-05,
"loss": 1.5291,
"step": 99500
},
{
"epoch": 63.97952655150352,
"grad_norm": 14.434243202209473,
"learning_rate": 6.897862068965517e-05,
"loss": 1.5612,
"step": 100000
},
{
"epoch": 64.29942418426104,
"grad_norm": 12.513864517211914,
"learning_rate": 6.880655172413794e-05,
"loss": 1.5041,
"step": 100500
},
{
"epoch": 64.61932181701856,
"grad_norm": 13.189037322998047,
"learning_rate": 6.863413793103448e-05,
"loss": 1.5097,
"step": 101000
},
{
"epoch": 64.93921944977608,
"grad_norm": 11.867232322692871,
"learning_rate": 6.846172413793104e-05,
"loss": 1.5401,
"step": 101500
},
{
"epoch": 65.25911708253359,
"grad_norm": 13.00894832611084,
"learning_rate": 6.828931034482758e-05,
"loss": 1.4581,
"step": 102000
},
{
"epoch": 65.57901471529111,
"grad_norm": 11.719345092773438,
"learning_rate": 6.811689655172415e-05,
"loss": 1.4807,
"step": 102500
},
{
"epoch": 65.89891234804863,
"grad_norm": 11.355063438415527,
"learning_rate": 6.79444827586207e-05,
"loss": 1.4892,
"step": 103000
},
{
"epoch": 66.21880998080614,
"grad_norm": 13.351948738098145,
"learning_rate": 6.777241379310345e-05,
"loss": 1.4636,
"step": 103500
},
{
"epoch": 66.53870761356366,
"grad_norm": 15.406342506408691,
"learning_rate": 6.76e-05,
"loss": 1.4678,
"step": 104000
},
{
"epoch": 66.85860524632118,
"grad_norm": 14.357329368591309,
"learning_rate": 6.742758620689656e-05,
"loss": 1.4936,
"step": 104500
},
{
"epoch": 67.17850287907869,
"grad_norm": 12.275686264038086,
"learning_rate": 6.725517241379311e-05,
"loss": 1.4566,
"step": 105000
},
{
"epoch": 67.49840051183621,
"grad_norm": 13.198380470275879,
"learning_rate": 6.708310344827586e-05,
"loss": 1.4326,
"step": 105500
},
{
"epoch": 67.81829814459373,
"grad_norm": 13.365631103515625,
"learning_rate": 6.691068965517242e-05,
"loss": 1.426,
"step": 106000
},
{
"epoch": 68.13819577735124,
"grad_norm": 14.106985092163086,
"learning_rate": 6.673827586206897e-05,
"loss": 1.4289,
"step": 106500
},
{
"epoch": 68.45809341010876,
"grad_norm": 10.076281547546387,
"learning_rate": 6.656586206896553e-05,
"loss": 1.396,
"step": 107000
},
{
"epoch": 68.77799104286628,
"grad_norm": 14.63807201385498,
"learning_rate": 6.639344827586207e-05,
"loss": 1.3979,
"step": 107500
},
{
"epoch": 69.0978886756238,
"grad_norm": 13.643959045410156,
"learning_rate": 6.622137931034483e-05,
"loss": 1.4274,
"step": 108000
},
{
"epoch": 69.41778630838132,
"grad_norm": 11.819470405578613,
"learning_rate": 6.604896551724138e-05,
"loss": 1.3769,
"step": 108500
},
{
"epoch": 69.73768394113884,
"grad_norm": 14.33261775970459,
"learning_rate": 6.587655172413794e-05,
"loss": 1.3825,
"step": 109000
},
{
"epoch": 70.05758157389636,
"grad_norm": 10.918536186218262,
"learning_rate": 6.570413793103448e-05,
"loss": 1.3913,
"step": 109500
},
{
"epoch": 70.37747920665387,
"grad_norm": 13.519926071166992,
"learning_rate": 6.553172413793104e-05,
"loss": 1.3341,
"step": 110000
},
{
"epoch": 70.69737683941139,
"grad_norm": 12.5425386428833,
"learning_rate": 6.535931034482759e-05,
"loss": 1.3828,
"step": 110500
},
{
"epoch": 71.0172744721689,
"grad_norm": 11.435805320739746,
"learning_rate": 6.518724137931035e-05,
"loss": 1.3821,
"step": 111000
},
{
"epoch": 71.33717210492642,
"grad_norm": 12.65505313873291,
"learning_rate": 6.501482758620689e-05,
"loss": 1.3083,
"step": 111500
},
{
"epoch": 71.65706973768394,
"grad_norm": 15.489115715026855,
"learning_rate": 6.484241379310345e-05,
"loss": 1.341,
"step": 112000
},
{
"epoch": 71.97696737044146,
"grad_norm": 14.14395809173584,
"learning_rate": 6.467e-05,
"loss": 1.3579,
"step": 112500
},
{
"epoch": 72.29686500319897,
"grad_norm": 13.708014488220215,
"learning_rate": 6.449758620689656e-05,
"loss": 1.3032,
"step": 113000
},
{
"epoch": 72.61676263595649,
"grad_norm": 10.75635814666748,
"learning_rate": 6.432551724137932e-05,
"loss": 1.3045,
"step": 113500
},
{
"epoch": 72.93666026871401,
"grad_norm": 12.12192440032959,
"learning_rate": 6.415310344827586e-05,
"loss": 1.3248,
"step": 114000
},
{
"epoch": 73.25655790147152,
"grad_norm": 13.368456840515137,
"learning_rate": 6.398068965517241e-05,
"loss": 1.287,
"step": 114500
},
{
"epoch": 73.57645553422904,
"grad_norm": 12.584633827209473,
"learning_rate": 6.380827586206897e-05,
"loss": 1.3015,
"step": 115000
},
{
"epoch": 73.89635316698656,
"grad_norm": 13.863194465637207,
"learning_rate": 6.363620689655173e-05,
"loss": 1.2898,
"step": 115500
},
{
"epoch": 74.21625079974409,
"grad_norm": 12.937112808227539,
"learning_rate": 6.346379310344827e-05,
"loss": 1.2456,
"step": 116000
},
{
"epoch": 74.5361484325016,
"grad_norm": 11.274981498718262,
"learning_rate": 6.329137931034484e-05,
"loss": 1.2689,
"step": 116500
},
{
"epoch": 74.85604606525912,
"grad_norm": 14.425061225891113,
"learning_rate": 6.311896551724138e-05,
"loss": 1.2752,
"step": 117000
},
{
"epoch": 75.17594369801664,
"grad_norm": 11.654635429382324,
"learning_rate": 6.294655172413794e-05,
"loss": 1.2442,
"step": 117500
},
{
"epoch": 75.49584133077416,
"grad_norm": 10.00129222869873,
"learning_rate": 6.277413793103448e-05,
"loss": 1.2506,
"step": 118000
},
{
"epoch": 75.81573896353167,
"grad_norm": 11.665295600891113,
"learning_rate": 6.260206896551725e-05,
"loss": 1.2541,
"step": 118500
},
{
"epoch": 76.13563659628919,
"grad_norm": 10.555766105651855,
"learning_rate": 6.24296551724138e-05,
"loss": 1.2486,
"step": 119000
},
{
"epoch": 76.4555342290467,
"grad_norm": 14.879280090332031,
"learning_rate": 6.225724137931035e-05,
"loss": 1.2124,
"step": 119500
},
{
"epoch": 76.77543186180422,
"grad_norm": 15.131136894226074,
"learning_rate": 6.208482758620689e-05,
"loss": 1.2549,
"step": 120000
},
{
"epoch": 77.09532949456174,
"grad_norm": 9.889472961425781,
"learning_rate": 6.191241379310345e-05,
"loss": 1.2376,
"step": 120500
},
{
"epoch": 77.41522712731926,
"grad_norm": 11.307145118713379,
"learning_rate": 6.174e-05,
"loss": 1.1958,
"step": 121000
},
{
"epoch": 77.73512476007677,
"grad_norm": 14.303799629211426,
"learning_rate": 6.156758620689656e-05,
"loss": 1.2009,
"step": 121500
},
{
"epoch": 78.05502239283429,
"grad_norm": 11.318217277526855,
"learning_rate": 6.139517241379311e-05,
"loss": 1.2215,
"step": 122000
},
{
"epoch": 78.3749200255918,
"grad_norm": 13.979291915893555,
"learning_rate": 6.122310344827586e-05,
"loss": 1.1713,
"step": 122500
},
{
"epoch": 78.69481765834932,
"grad_norm": 12.78084945678711,
"learning_rate": 6.105103448275863e-05,
"loss": 1.1901,
"step": 123000
},
{
"epoch": 79.01471529110684,
"grad_norm": 10.332459449768066,
"learning_rate": 6.087862068965517e-05,
"loss": 1.2141,
"step": 123500
},
{
"epoch": 79.33461292386437,
"grad_norm": 11.179670333862305,
"learning_rate": 6.0706206896551735e-05,
"loss": 1.1641,
"step": 124000
},
{
"epoch": 79.65451055662189,
"grad_norm": 12.706995964050293,
"learning_rate": 6.053379310344828e-05,
"loss": 1.1887,
"step": 124500
},
{
"epoch": 79.9744081893794,
"grad_norm": 12.575511932373047,
"learning_rate": 6.036137931034483e-05,
"loss": 1.1994,
"step": 125000
},
{
"epoch": 80.29430582213692,
"grad_norm": 11.299592971801758,
"learning_rate": 6.0189310344827584e-05,
"loss": 1.1535,
"step": 125500
},
{
"epoch": 80.61420345489444,
"grad_norm": 9.741961479187012,
"learning_rate": 6.0016896551724147e-05,
"loss": 1.1508,
"step": 126000
},
{
"epoch": 80.93410108765195,
"grad_norm": 13.86517333984375,
"learning_rate": 5.984448275862069e-05,
"loss": 1.1453,
"step": 126500
},
{
"epoch": 81.25399872040947,
"grad_norm": 12.810471534729004,
"learning_rate": 5.9672068965517244e-05,
"loss": 1.131,
"step": 127000
},
{
"epoch": 81.57389635316699,
"grad_norm": 11.828211784362793,
"learning_rate": 5.949965517241379e-05,
"loss": 1.1462,
"step": 127500
},
{
"epoch": 81.8937939859245,
"grad_norm": 13.588178634643555,
"learning_rate": 5.932724137931035e-05,
"loss": 1.1519,
"step": 128000
},
{
"epoch": 82.21369161868202,
"grad_norm": 13.903426170349121,
"learning_rate": 5.91548275862069e-05,
"loss": 1.1222,
"step": 128500
},
{
"epoch": 82.53358925143954,
"grad_norm": 13.447443962097168,
"learning_rate": 5.898275862068966e-05,
"loss": 1.1173,
"step": 129000
},
{
"epoch": 82.85348688419705,
"grad_norm": 12.132195472717285,
"learning_rate": 5.8810344827586205e-05,
"loss": 1.1262,
"step": 129500
},
{
"epoch": 83.17338451695457,
"grad_norm": 11.170686721801758,
"learning_rate": 5.863793103448276e-05,
"loss": 1.0957,
"step": 130000
},
{
"epoch": 83.49328214971209,
"grad_norm": 12.57539176940918,
"learning_rate": 5.846551724137931e-05,
"loss": 1.0862,
"step": 130500
},
{
"epoch": 83.8131797824696,
"grad_norm": 14.212547302246094,
"learning_rate": 5.8293448275862074e-05,
"loss": 1.0929,
"step": 131000
},
{
"epoch": 84.13307741522712,
"grad_norm": 14.803600311279297,
"learning_rate": 5.8121034482758616e-05,
"loss": 1.0948,
"step": 131500
},
{
"epoch": 84.45297504798465,
"grad_norm": 19.55899429321289,
"learning_rate": 5.794862068965518e-05,
"loss": 1.0788,
"step": 132000
},
{
"epoch": 84.77287268074217,
"grad_norm": 11.086203575134277,
"learning_rate": 5.7776206896551734e-05,
"loss": 1.098,
"step": 132500
},
{
"epoch": 85.09277031349968,
"grad_norm": 10.74999713897705,
"learning_rate": 5.7603793103448276e-05,
"loss": 1.0649,
"step": 133000
},
{
"epoch": 85.4126679462572,
"grad_norm": 14.409449577331543,
"learning_rate": 5.743137931034484e-05,
"loss": 1.0592,
"step": 133500
},
{
"epoch": 85.73256557901472,
"grad_norm": 10.215742111206055,
"learning_rate": 5.725931034482759e-05,
"loss": 1.0765,
"step": 134000
},
{
"epoch": 86.05246321177223,
"grad_norm": 12.911944389343262,
"learning_rate": 5.7086896551724146e-05,
"loss": 1.0504,
"step": 134500
},
{
"epoch": 86.37236084452975,
"grad_norm": 14.987035751342773,
"learning_rate": 5.691448275862069e-05,
"loss": 1.0141,
"step": 135000
},
{
"epoch": 86.69225847728727,
"grad_norm": 11.989995002746582,
"learning_rate": 5.674206896551725e-05,
"loss": 1.0431,
"step": 135500
},
{
"epoch": 87.01215611004478,
"grad_norm": 12.771849632263184,
"learning_rate": 5.657e-05,
"loss": 1.054,
"step": 136000
},
{
"epoch": 87.3320537428023,
"grad_norm": 13.398333549499512,
"learning_rate": 5.639758620689656e-05,
"loss": 0.9984,
"step": 136500
},
{
"epoch": 87.65195137555982,
"grad_norm": 10.814030647277832,
"learning_rate": 5.6225172413793106e-05,
"loss": 1.0283,
"step": 137000
},
{
"epoch": 87.97184900831734,
"grad_norm": 12.13095760345459,
"learning_rate": 5.605275862068966e-05,
"loss": 1.0414,
"step": 137500
},
{
"epoch": 88.29174664107485,
"grad_norm": 12.733049392700195,
"learning_rate": 5.5880344827586204e-05,
"loss": 1.0087,
"step": 138000
},
{
"epoch": 88.61164427383237,
"grad_norm": 16.555213928222656,
"learning_rate": 5.570827586206897e-05,
"loss": 1.0062,
"step": 138500
},
{
"epoch": 88.93154190658989,
"grad_norm": 11.025595664978027,
"learning_rate": 5.553586206896552e-05,
"loss": 1.0212,
"step": 139000
},
{
"epoch": 89.2514395393474,
"grad_norm": 9.93308162689209,
"learning_rate": 5.5363448275862074e-05,
"loss": 0.998,
"step": 139500
},
{
"epoch": 89.57133717210493,
"grad_norm": 14.131500244140625,
"learning_rate": 5.519103448275862e-05,
"loss": 0.9797,
"step": 140000
}
],
"logging_steps": 500,
"max_steps": 300000,
"num_input_tokens_seen": 0,
"num_train_epochs": 192,
"save_steps": 20000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.5720679109463245e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}