{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 51.183621241202815,
"eval_steps": 500,
"global_step": 80000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.3198976327575176,
"grad_norm": 4.1601386070251465,
"learning_rate": 5e-06,
"loss": 10.3279,
"step": 500
},
{
"epoch": 0.6397952655150352,
"grad_norm": 4.366061687469482,
"learning_rate": 1e-05,
"loss": 9.3834,
"step": 1000
},
{
"epoch": 0.9596928982725528,
"grad_norm": 4.784337043762207,
"learning_rate": 1.5e-05,
"loss": 8.8888,
"step": 1500
},
{
"epoch": 1.2795905310300704,
"grad_norm": 3.9968652725219727,
"learning_rate": 2e-05,
"loss": 8.6568,
"step": 2000
},
{
"epoch": 1.599488163787588,
"grad_norm": 4.402552127838135,
"learning_rate": 2.5e-05,
"loss": 8.5473,
"step": 2500
},
{
"epoch": 1.9193857965451055,
"grad_norm": 4.639041423797607,
"learning_rate": 3e-05,
"loss": 8.4044,
"step": 3000
},
{
"epoch": 2.239283429302623,
"grad_norm": 5.651747226715088,
"learning_rate": 3.5e-05,
"loss": 8.2868,
"step": 3500
},
{
"epoch": 2.5591810620601407,
"grad_norm": 4.6999359130859375,
"learning_rate": 4e-05,
"loss": 8.1766,
"step": 4000
},
{
"epoch": 2.8790786948176583,
"grad_norm": 4.838181495666504,
"learning_rate": 4.499e-05,
"loss": 8.1118,
"step": 4500
},
{
"epoch": 3.198976327575176,
"grad_norm": 4.238831996917725,
"learning_rate": 4.999e-05,
"loss": 8.0038,
"step": 5000
},
{
"epoch": 3.5188739603326935,
"grad_norm": 4.455530643463135,
"learning_rate": 5.499000000000001e-05,
"loss": 7.9014,
"step": 5500
},
{
"epoch": 3.838771593090211,
"grad_norm": 5.811736583709717,
"learning_rate": 5.999e-05,
"loss": 7.8352,
"step": 6000
},
{
"epoch": 4.158669225847729,
"grad_norm": 4.998301982879639,
"learning_rate": 6.498e-05,
"loss": 7.7613,
"step": 6500
},
{
"epoch": 4.478566858605246,
"grad_norm": 5.011510848999023,
"learning_rate": 6.998e-05,
"loss": 7.6554,
"step": 7000
},
{
"epoch": 4.798464491362764,
"grad_norm": 4.750300884246826,
"learning_rate": 7.498e-05,
"loss": 7.6109,
"step": 7500
},
{
"epoch": 5.1183621241202815,
"grad_norm": 6.24017858505249,
"learning_rate": 7.998e-05,
"loss": 7.5186,
"step": 8000
},
{
"epoch": 5.438259756877799,
"grad_norm": 6.061458587646484,
"learning_rate": 8.497000000000001e-05,
"loss": 7.3966,
"step": 8500
},
{
"epoch": 5.758157389635317,
"grad_norm": 7.151447772979736,
"learning_rate": 8.997000000000001e-05,
"loss": 7.2877,
"step": 9000
},
{
"epoch": 6.078055022392834,
"grad_norm": 7.578985214233398,
"learning_rate": 9.497000000000001e-05,
"loss": 7.1542,
"step": 9500
},
{
"epoch": 6.397952655150352,
"grad_norm": 5.948920726776123,
"learning_rate": 9.997e-05,
"loss": 7.0008,
"step": 10000
},
{
"epoch": 6.717850287907869,
"grad_norm": 8.036959648132324,
"learning_rate": 9.982896551724137e-05,
"loss": 6.8966,
"step": 10500
},
{
"epoch": 7.037747920665387,
"grad_norm": 7.160433292388916,
"learning_rate": 9.965655172413794e-05,
"loss": 6.7509,
"step": 11000
},
{
"epoch": 7.357645553422905,
"grad_norm": 5.934999465942383,
"learning_rate": 9.948413793103449e-05,
"loss": 6.5833,
"step": 11500
},
{
"epoch": 7.677543186180422,
"grad_norm": 7.745622634887695,
"learning_rate": 9.931172413793104e-05,
"loss": 6.4975,
"step": 12000
},
{
"epoch": 7.99744081893794,
"grad_norm": 7.0418477058410645,
"learning_rate": 9.91393103448276e-05,
"loss": 6.4261,
"step": 12500
},
{
"epoch": 8.317338451695457,
"grad_norm": 6.101259708404541,
"learning_rate": 9.896689655172414e-05,
"loss": 6.2092,
"step": 13000
},
{
"epoch": 8.637236084452976,
"grad_norm": 7.289799213409424,
"learning_rate": 9.87944827586207e-05,
"loss": 6.1436,
"step": 13500
},
{
"epoch": 8.957133717210493,
"grad_norm": 8.126811027526855,
"learning_rate": 9.862206896551725e-05,
"loss": 6.0456,
"step": 14000
},
{
"epoch": 9.277031349968011,
"grad_norm": 8.221816062927246,
"learning_rate": 9.845000000000001e-05,
"loss": 5.9141,
"step": 14500
},
{
"epoch": 9.596928982725528,
"grad_norm": 7.361550331115723,
"learning_rate": 9.827793103448277e-05,
"loss": 5.8326,
"step": 15000
},
{
"epoch": 9.916826615483046,
"grad_norm": 7.1737775802612305,
"learning_rate": 9.810551724137932e-05,
"loss": 5.7974,
"step": 15500
},
{
"epoch": 10.236724248240563,
"grad_norm": 9.80185604095459,
"learning_rate": 9.793310344827586e-05,
"loss": 5.6282,
"step": 16000
},
{
"epoch": 10.556621880998081,
"grad_norm": 7.2062153816223145,
"learning_rate": 9.776068965517242e-05,
"loss": 5.5619,
"step": 16500
},
{
"epoch": 10.876519513755598,
"grad_norm": 10.801878929138184,
"learning_rate": 9.758827586206896e-05,
"loss": 5.5155,
"step": 17000
},
{
"epoch": 11.196417146513117,
"grad_norm": 8.48509693145752,
"learning_rate": 9.741586206896553e-05,
"loss": 5.4259,
"step": 17500
},
{
"epoch": 11.516314779270633,
"grad_norm": 8.47572135925293,
"learning_rate": 9.724344827586207e-05,
"loss": 5.3205,
"step": 18000
},
{
"epoch": 11.836212412028152,
"grad_norm": 6.122796535491943,
"learning_rate": 9.707103448275863e-05,
"loss": 5.3025,
"step": 18500
},
{
"epoch": 12.156110044785668,
"grad_norm": 8.210710525512695,
"learning_rate": 9.689896551724139e-05,
"loss": 5.2264,
"step": 19000
},
{
"epoch": 12.476007677543187,
"grad_norm": 7.857537746429443,
"learning_rate": 9.672655172413794e-05,
"loss": 5.1395,
"step": 19500
},
{
"epoch": 12.795905310300704,
"grad_norm": 7.743075370788574,
"learning_rate": 9.655413793103448e-05,
"loss": 5.1109,
"step": 20000
},
{
"epoch": 13.115802943058222,
"grad_norm": 10.574569702148438,
"learning_rate": 9.638172413793104e-05,
"loss": 5.0794,
"step": 20500
},
{
"epoch": 13.435700575815739,
"grad_norm": 8.313858985900879,
"learning_rate": 9.620931034482758e-05,
"loss": 4.921,
"step": 21000
},
{
"epoch": 13.755598208573257,
"grad_norm": 9.096057891845703,
"learning_rate": 9.603689655172414e-05,
"loss": 4.96,
"step": 21500
},
{
"epoch": 14.075495841330774,
"grad_norm": 8.402993202209473,
"learning_rate": 9.58644827586207e-05,
"loss": 4.9062,
"step": 22000
},
{
"epoch": 14.395393474088293,
"grad_norm": 8.110074996948242,
"learning_rate": 9.569206896551725e-05,
"loss": 4.8026,
"step": 22500
},
{
"epoch": 14.71529110684581,
"grad_norm": 7.908292293548584,
"learning_rate": 9.552000000000001e-05,
"loss": 4.82,
"step": 23000
},
{
"epoch": 15.035188739603328,
"grad_norm": 7.991878986358643,
"learning_rate": 9.534758620689655e-05,
"loss": 4.7397,
"step": 23500
},
{
"epoch": 15.355086372360844,
"grad_norm": 8.696029663085938,
"learning_rate": 9.517551724137932e-05,
"loss": 4.6656,
"step": 24000
},
{
"epoch": 15.674984005118363,
"grad_norm": 9.421612739562988,
"learning_rate": 9.500310344827586e-05,
"loss": 4.6412,
"step": 24500
},
{
"epoch": 15.99488163787588,
"grad_norm": 9.747482299804688,
"learning_rate": 9.483068965517242e-05,
"loss": 4.6048,
"step": 25000
},
{
"epoch": 16.314779270633398,
"grad_norm": 10.389492988586426,
"learning_rate": 9.465827586206897e-05,
"loss": 4.481,
"step": 25500
},
{
"epoch": 16.634676903390915,
"grad_norm": 8.661949157714844,
"learning_rate": 9.448586206896553e-05,
"loss": 4.4923,
"step": 26000
},
{
"epoch": 16.95457453614843,
"grad_norm": 12.681297302246094,
"learning_rate": 9.431344827586207e-05,
"loss": 4.4816,
"step": 26500
},
{
"epoch": 17.27447216890595,
"grad_norm": 8.993134498596191,
"learning_rate": 9.414103448275863e-05,
"loss": 4.3512,
"step": 27000
},
{
"epoch": 17.59436980166347,
"grad_norm": 10.020146369934082,
"learning_rate": 9.396862068965517e-05,
"loss": 4.3447,
"step": 27500
},
{
"epoch": 17.914267434420985,
"grad_norm": 9.514701843261719,
"learning_rate": 9.379655172413794e-05,
"loss": 4.3376,
"step": 28000
},
{
"epoch": 18.234165067178502,
"grad_norm": 10.324498176574707,
"learning_rate": 9.362413793103448e-05,
"loss": 4.2612,
"step": 28500
},
{
"epoch": 18.554062699936022,
"grad_norm": 10.682856559753418,
"learning_rate": 9.345172413793104e-05,
"loss": 4.226,
"step": 29000
},
{
"epoch": 18.87396033269354,
"grad_norm": 7.883260726928711,
"learning_rate": 9.327931034482758e-05,
"loss": 4.19,
"step": 29500
},
{
"epoch": 19.193857965451055,
"grad_norm": 12.470623016357422,
"learning_rate": 9.310724137931035e-05,
"loss": 4.1881,
"step": 30000
},
{
"epoch": 19.513755598208572,
"grad_norm": 9.932331085205078,
"learning_rate": 9.29348275862069e-05,
"loss": 4.0853,
"step": 30500
},
{
"epoch": 19.833653230966092,
"grad_norm": 8.153782844543457,
"learning_rate": 9.276241379310345e-05,
"loss": 4.1087,
"step": 31000
},
{
"epoch": 20.15355086372361,
"grad_norm": 8.214093208312988,
"learning_rate": 9.258999999999999e-05,
"loss": 4.0751,
"step": 31500
},
{
"epoch": 20.473448496481126,
"grad_norm": 11.927350044250488,
"learning_rate": 9.241758620689656e-05,
"loss": 3.9686,
"step": 32000
},
{
"epoch": 20.793346129238643,
"grad_norm": 9.67835807800293,
"learning_rate": 9.224551724137932e-05,
"loss": 3.9745,
"step": 32500
},
{
"epoch": 21.113243761996163,
"grad_norm": 9.911735534667969,
"learning_rate": 9.207310344827586e-05,
"loss": 3.9308,
"step": 33000
},
{
"epoch": 21.43314139475368,
"grad_norm": 9.05053424835205,
"learning_rate": 9.190068965517242e-05,
"loss": 3.8718,
"step": 33500
},
{
"epoch": 21.753039027511196,
"grad_norm": 9.588044166564941,
"learning_rate": 9.172827586206897e-05,
"loss": 3.8425,
"step": 34000
},
{
"epoch": 22.072936660268713,
"grad_norm": 8.788230895996094,
"learning_rate": 9.155620689655173e-05,
"loss": 3.8617,
"step": 34500
},
{
"epoch": 22.392834293026233,
"grad_norm": 9.435895919799805,
"learning_rate": 9.138379310344827e-05,
"loss": 3.7524,
"step": 35000
},
{
"epoch": 22.71273192578375,
"grad_norm": 9.870182037353516,
"learning_rate": 9.121137931034483e-05,
"loss": 3.7916,
"step": 35500
},
{
"epoch": 23.032629558541267,
"grad_norm": 9.612881660461426,
"learning_rate": 9.103896551724139e-05,
"loss": 3.8011,
"step": 36000
},
{
"epoch": 23.352527191298783,
"grad_norm": 9.643827438354492,
"learning_rate": 9.086689655172414e-05,
"loss": 3.6478,
"step": 36500
},
{
"epoch": 23.672424824056304,
"grad_norm": 14.105424880981445,
"learning_rate": 9.069448275862069e-05,
"loss": 3.6671,
"step": 37000
},
{
"epoch": 23.99232245681382,
"grad_norm": 10.427962303161621,
"learning_rate": 9.052206896551724e-05,
"loss": 3.6809,
"step": 37500
},
{
"epoch": 24.312220089571337,
"grad_norm": 11.505946159362793,
"learning_rate": 9.03496551724138e-05,
"loss": 3.553,
"step": 38000
},
{
"epoch": 24.632117722328854,
"grad_norm": 10.393635749816895,
"learning_rate": 9.017724137931035e-05,
"loss": 3.5408,
"step": 38500
},
{
"epoch": 24.952015355086374,
"grad_norm": 9.023842811584473,
"learning_rate": 9.00051724137931e-05,
"loss": 3.5915,
"step": 39000
},
{
"epoch": 25.27191298784389,
"grad_norm": 10.69048023223877,
"learning_rate": 8.983275862068967e-05,
"loss": 3.4896,
"step": 39500
},
{
"epoch": 25.591810620601407,
"grad_norm": 10.803936958312988,
"learning_rate": 8.966034482758621e-05,
"loss": 3.4854,
"step": 40000
},
{
"epoch": 25.911708253358924,
"grad_norm": 10.489801406860352,
"learning_rate": 8.948793103448276e-05,
"loss": 3.4871,
"step": 40500
},
{
"epoch": 26.231605886116444,
"grad_norm": 10.558309555053711,
"learning_rate": 8.931586206896552e-05,
"loss": 3.4186,
"step": 41000
},
{
"epoch": 26.55150351887396,
"grad_norm": 12.186748504638672,
"learning_rate": 8.914344827586208e-05,
"loss": 3.4027,
"step": 41500
},
{
"epoch": 26.871401151631478,
"grad_norm": 9.8623046875,
"learning_rate": 8.897103448275862e-05,
"loss": 3.4191,
"step": 42000
},
{
"epoch": 27.191298784388994,
"grad_norm": 11.407792091369629,
"learning_rate": 8.879862068965518e-05,
"loss": 3.341,
"step": 42500
},
{
"epoch": 27.511196417146515,
"grad_norm": 13.37617301940918,
"learning_rate": 8.862655172413794e-05,
"loss": 3.3137,
"step": 43000
},
{
"epoch": 27.83109404990403,
"grad_norm": 10.30826187133789,
"learning_rate": 8.845413793103449e-05,
"loss": 3.3036,
"step": 43500
},
{
"epoch": 28.150991682661548,
"grad_norm": 12.024778366088867,
"learning_rate": 8.828172413793105e-05,
"loss": 3.2678,
"step": 44000
},
{
"epoch": 28.470889315419065,
"grad_norm": 9.730340957641602,
"learning_rate": 8.810931034482759e-05,
"loss": 3.1949,
"step": 44500
},
{
"epoch": 28.790786948176585,
"grad_norm": 9.700602531433105,
"learning_rate": 8.793689655172414e-05,
"loss": 3.2541,
"step": 45000
},
{
"epoch": 29.1106845809341,
"grad_norm": 12.359143257141113,
"learning_rate": 8.77648275862069e-05,
"loss": 3.2456,
"step": 45500
},
{
"epoch": 29.43058221369162,
"grad_norm": 11.989018440246582,
"learning_rate": 8.759241379310346e-05,
"loss": 3.1154,
"step": 46000
},
{
"epoch": 29.750479846449135,
"grad_norm": 10.904190063476562,
"learning_rate": 8.742e-05,
"loss": 3.175,
"step": 46500
},
{
"epoch": 30.070377479206655,
"grad_norm": 11.253949165344238,
"learning_rate": 8.724758620689656e-05,
"loss": 3.1478,
"step": 47000
},
{
"epoch": 30.390275111964172,
"grad_norm": 12.229791641235352,
"learning_rate": 8.707517241379311e-05,
"loss": 3.0632,
"step": 47500
},
{
"epoch": 30.71017274472169,
"grad_norm": 9.516524314880371,
"learning_rate": 8.690275862068967e-05,
"loss": 3.0843,
"step": 48000
},
{
"epoch": 31.030070377479205,
"grad_norm": 13.730731010437012,
"learning_rate": 8.673034482758621e-05,
"loss": 3.098,
"step": 48500
},
{
"epoch": 31.349968010236726,
"grad_norm": 9.73539924621582,
"learning_rate": 8.655827586206897e-05,
"loss": 2.9611,
"step": 49000
},
{
"epoch": 31.669865642994242,
"grad_norm": 12.066815376281738,
"learning_rate": 8.638586206896552e-05,
"loss": 2.9943,
"step": 49500
},
{
"epoch": 31.98976327575176,
"grad_norm": 11.028585433959961,
"learning_rate": 8.621344827586208e-05,
"loss": 3.0424,
"step": 50000
},
{
"epoch": 32.30966090850928,
"grad_norm": 11.2380952835083,
"learning_rate": 8.604103448275862e-05,
"loss": 2.9023,
"step": 50500
},
{
"epoch": 32.629558541266796,
"grad_norm": 9.345772743225098,
"learning_rate": 8.586862068965518e-05,
"loss": 2.9586,
"step": 51000
},
{
"epoch": 32.94945617402431,
"grad_norm": 10.239849090576172,
"learning_rate": 8.569655172413793e-05,
"loss": 2.9461,
"step": 51500
},
{
"epoch": 33.26935380678183,
"grad_norm": 11.058523178100586,
"learning_rate": 8.552413793103449e-05,
"loss": 2.8453,
"step": 52000
},
{
"epoch": 33.589251439539346,
"grad_norm": 12.131317138671875,
"learning_rate": 8.535172413793105e-05,
"loss": 2.8603,
"step": 52500
},
{
"epoch": 33.90914907229686,
"grad_norm": 10.392476081848145,
"learning_rate": 8.517931034482759e-05,
"loss": 2.8817,
"step": 53000
},
{
"epoch": 34.22904670505438,
"grad_norm": 10.749021530151367,
"learning_rate": 8.500724137931036e-05,
"loss": 2.8073,
"step": 53500
},
{
"epoch": 34.5489443378119,
"grad_norm": 12.33171558380127,
"learning_rate": 8.48348275862069e-05,
"loss": 2.7793,
"step": 54000
},
{
"epoch": 34.86884197056942,
"grad_norm": 12.961758613586426,
"learning_rate": 8.466241379310346e-05,
"loss": 2.8066,
"step": 54500
},
{
"epoch": 35.18873960332694,
"grad_norm": 13.320075035095215,
"learning_rate": 8.449e-05,
"loss": 2.7459,
"step": 55000
},
{
"epoch": 35.50863723608445,
"grad_norm": 14.416489601135254,
"learning_rate": 8.431758620689655e-05,
"loss": 2.7321,
"step": 55500
},
{
"epoch": 35.82853486884197,
"grad_norm": 11.203073501586914,
"learning_rate": 8.414551724137931e-05,
"loss": 2.7486,
"step": 56000
},
{
"epoch": 36.14843250159949,
"grad_norm": 10.463476181030273,
"learning_rate": 8.397310344827587e-05,
"loss": 2.7086,
"step": 56500
},
{
"epoch": 36.468330134357004,
"grad_norm": 11.375761985778809,
"learning_rate": 8.380068965517241e-05,
"loss": 2.6387,
"step": 57000
},
{
"epoch": 36.78822776711452,
"grad_norm": 11.649105072021484,
"learning_rate": 8.362827586206897e-05,
"loss": 2.6746,
"step": 57500
},
{
"epoch": 37.108125399872044,
"grad_norm": 12.708244323730469,
"learning_rate": 8.345586206896552e-05,
"loss": 2.6454,
"step": 58000
},
{
"epoch": 37.42802303262956,
"grad_norm": 12.876201629638672,
"learning_rate": 8.328344827586208e-05,
"loss": 2.5798,
"step": 58500
},
{
"epoch": 37.74792066538708,
"grad_norm": 11.92346477508545,
"learning_rate": 8.311103448275862e-05,
"loss": 2.6589,
"step": 59000
},
{
"epoch": 38.067818298144594,
"grad_norm": 10.742238998413086,
"learning_rate": 8.293896551724138e-05,
"loss": 2.5868,
"step": 59500
},
{
"epoch": 38.38771593090211,
"grad_norm": 11.399048805236816,
"learning_rate": 8.276655172413793e-05,
"loss": 2.5124,
"step": 60000
},
{
"epoch": 38.70761356365963,
"grad_norm": 13.563875198364258,
"learning_rate": 8.259413793103449e-05,
"loss": 2.576,
"step": 60500
},
{
"epoch": 39.027511196417144,
"grad_norm": 11.297135353088379,
"learning_rate": 8.242172413793103e-05,
"loss": 2.5671,
"step": 61000
},
{
"epoch": 39.34740882917466,
"grad_norm": 11.336121559143066,
"learning_rate": 8.22496551724138e-05,
"loss": 2.4445,
"step": 61500
},
{
"epoch": 39.667306461932185,
"grad_norm": 9.477692604064941,
"learning_rate": 8.207724137931035e-05,
"loss": 2.4981,
"step": 62000
},
{
"epoch": 39.9872040946897,
"grad_norm": 11.597848892211914,
"learning_rate": 8.19048275862069e-05,
"loss": 2.5382,
"step": 62500
},
{
"epoch": 40.30710172744722,
"grad_norm": 14.910037994384766,
"learning_rate": 8.173241379310346e-05,
"loss": 2.4158,
"step": 63000
},
{
"epoch": 40.626999360204735,
"grad_norm": 11.870673179626465,
"learning_rate": 8.156e-05,
"loss": 2.4395,
"step": 63500
},
{
"epoch": 40.94689699296225,
"grad_norm": 15.279576301574707,
"learning_rate": 8.138758620689655e-05,
"loss": 2.4653,
"step": 64000
},
{
"epoch": 41.26679462571977,
"grad_norm": 11.710406303405762,
"learning_rate": 8.121551724137931e-05,
"loss": 2.3567,
"step": 64500
},
{
"epoch": 41.586692258477285,
"grad_norm": 10.663411140441895,
"learning_rate": 8.104310344827587e-05,
"loss": 2.3655,
"step": 65000
},
{
"epoch": 41.9065898912348,
"grad_norm": 13.946629524230957,
"learning_rate": 8.087068965517241e-05,
"loss": 2.4336,
"step": 65500
},
{
"epoch": 42.226487523992326,
"grad_norm": 13.782262802124023,
"learning_rate": 8.069827586206898e-05,
"loss": 2.3351,
"step": 66000
},
{
"epoch": 42.54638515674984,
"grad_norm": 11.177961349487305,
"learning_rate": 8.052586206896552e-05,
"loss": 2.359,
"step": 66500
},
{
"epoch": 42.86628278950736,
"grad_norm": 15.120301246643066,
"learning_rate": 8.035344827586208e-05,
"loss": 2.355,
"step": 67000
},
{
"epoch": 43.186180422264876,
"grad_norm": 10.805267333984375,
"learning_rate": 8.018103448275862e-05,
"loss": 2.2905,
"step": 67500
},
{
"epoch": 43.50607805502239,
"grad_norm": 11.777176856994629,
"learning_rate": 8.000862068965517e-05,
"loss": 2.2906,
"step": 68000
},
{
"epoch": 43.82597568777991,
"grad_norm": 15.457807540893555,
"learning_rate": 7.983689655172414e-05,
"loss": 2.3269,
"step": 68500
},
{
"epoch": 44.145873320537426,
"grad_norm": 11.639357566833496,
"learning_rate": 7.966448275862069e-05,
"loss": 2.2371,
"step": 69000
},
{
"epoch": 44.46577095329494,
"grad_norm": 11.710591316223145,
"learning_rate": 7.949206896551725e-05,
"loss": 2.2248,
"step": 69500
},
{
"epoch": 44.785668586052466,
"grad_norm": 12.675103187561035,
"learning_rate": 7.93196551724138e-05,
"loss": 2.2586,
"step": 70000
},
{
"epoch": 45.10556621880998,
"grad_norm": 12.752120971679688,
"learning_rate": 7.914724137931034e-05,
"loss": 2.2489,
"step": 70500
},
{
"epoch": 45.4254638515675,
"grad_norm": 11.379339218139648,
"learning_rate": 7.89751724137931e-05,
"loss": 2.1774,
"step": 71000
},
{
"epoch": 45.74536148432502,
"grad_norm": 12.76633358001709,
"learning_rate": 7.880275862068966e-05,
"loss": 2.2007,
"step": 71500
},
{
"epoch": 46.06525911708253,
"grad_norm": 11.421367645263672,
"learning_rate": 7.863034482758621e-05,
"loss": 2.2262,
"step": 72000
},
{
"epoch": 46.38515674984005,
"grad_norm": 14.81748104095459,
"learning_rate": 7.845793103448277e-05,
"loss": 2.101,
"step": 72500
},
{
"epoch": 46.70505438259757,
"grad_norm": 12.902971267700195,
"learning_rate": 7.828551724137931e-05,
"loss": 2.1568,
"step": 73000
},
{
"epoch": 47.02495201535508,
"grad_norm": 10.685113906860352,
"learning_rate": 7.811310344827587e-05,
"loss": 2.1655,
"step": 73500
},
{
"epoch": 47.34484964811261,
"grad_norm": 15.892518043518066,
"learning_rate": 7.794068965517242e-05,
"loss": 2.0551,
"step": 74000
},
{
"epoch": 47.664747280870124,
"grad_norm": 13.730358123779297,
"learning_rate": 7.776862068965518e-05,
"loss": 2.1053,
"step": 74500
},
{
"epoch": 47.98464491362764,
"grad_norm": 13.635787963867188,
"learning_rate": 7.759620689655172e-05,
"loss": 2.1408,
"step": 75000
},
{
"epoch": 48.30454254638516,
"grad_norm": 12.861611366271973,
"learning_rate": 7.742379310344828e-05,
"loss": 2.0104,
"step": 75500
},
{
"epoch": 48.624440179142674,
"grad_norm": 11.84931468963623,
"learning_rate": 7.725137931034483e-05,
"loss": 2.0555,
"step": 76000
},
{
"epoch": 48.94433781190019,
"grad_norm": 15.812765121459961,
"learning_rate": 7.70793103448276e-05,
"loss": 2.1087,
"step": 76500
},
{
"epoch": 49.26423544465771,
"grad_norm": 14.233431816101074,
"learning_rate": 7.690689655172414e-05,
"loss": 1.9998,
"step": 77000
},
{
"epoch": 49.584133077415224,
"grad_norm": 14.329803466796875,
"learning_rate": 7.673448275862069e-05,
"loss": 2.0189,
"step": 77500
},
{
"epoch": 49.90403071017275,
"grad_norm": 11.00400161743164,
"learning_rate": 7.656206896551725e-05,
"loss": 2.059,
"step": 78000
},
{
"epoch": 50.223928342930265,
"grad_norm": 13.582133293151855,
"learning_rate": 7.63896551724138e-05,
"loss": 1.9753,
"step": 78500
},
{
"epoch": 50.54382597568778,
"grad_norm": 12.560907363891602,
"learning_rate": 7.621724137931034e-05,
"loss": 1.9759,
"step": 79000
},
{
"epoch": 50.8637236084453,
"grad_norm": 12.169915199279785,
"learning_rate": 7.60451724137931e-05,
"loss": 1.9944,
"step": 79500
},
{
"epoch": 51.183621241202815,
"grad_norm": 13.5604248046875,
"learning_rate": 7.587275862068966e-05,
"loss": 1.9323,
"step": 80000
}
],
"logging_steps": 500,
"max_steps": 300000,
"num_input_tokens_seen": 0,
"num_train_epochs": 192,
"save_steps": 20000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.984666210441626e+16,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}