Attila1011's picture
Upload folder using huggingface_hub
8a04c02 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 8.596543209876543,
"eval_steps": 256,
"global_step": 17408,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.12641975308641976,
"grad_norm": 0.49028417468070984,
"learning_rate": 0.000498046875,
"loss": 1.2244819402694702,
"step": 256
},
{
"epoch": 0.12641975308641976,
"eval_bleu": 0.023714342494794382,
"eval_loss": 1.1823350772625063,
"eval_mse_loss": 1.1823350772625063,
"step": 256
},
{
"epoch": 0.12641975308641976,
"eval_bleu": 0.023714342494794382,
"eval_loss": 1.1823350772625063,
"eval_mse_loss": 1.1823350772625063,
"eval_runtime": 7.3875,
"eval_samples_per_second": 354.383,
"eval_steps_per_second": 5.55,
"step": 256
},
{
"epoch": 0.2528395061728395,
"grad_norm": 0.578209638595581,
"learning_rate": 0.000998046875,
"loss": 1.1352403163909912,
"step": 512
},
{
"epoch": 0.2528395061728395,
"eval_bleu": 0.022467213421973074,
"eval_loss": 1.0758944168323423,
"eval_mse_loss": 1.0758944168323423,
"step": 512
},
{
"epoch": 0.2528395061728395,
"eval_bleu": 0.022467213421973074,
"eval_loss": 1.0758944168323423,
"eval_mse_loss": 1.0758944168323423,
"eval_runtime": 7.512,
"eval_samples_per_second": 348.51,
"eval_steps_per_second": 5.458,
"step": 512
},
{
"epoch": 0.37925925925925924,
"grad_norm": 1.1504969596862793,
"learning_rate": 0.0009995882304697813,
"loss": 1.0155020952224731,
"step": 768
},
{
"epoch": 0.37925925925925924,
"eval_bleu": 0.03303487719486685,
"eval_loss": 0.9521724465416699,
"eval_mse_loss": 0.9521724465416699,
"step": 768
},
{
"epoch": 0.37925925925925924,
"eval_bleu": 0.03303487719486685,
"eval_loss": 0.9521724465416699,
"eval_mse_loss": 0.9521724465416699,
"eval_runtime": 7.3269,
"eval_samples_per_second": 357.313,
"eval_steps_per_second": 5.596,
"step": 768
},
{
"epoch": 0.505679012345679,
"grad_norm": 0.9992234706878662,
"learning_rate": 0.0009983471408586747,
"loss": 0.9070967435836792,
"step": 1024
},
{
"epoch": 0.505679012345679,
"eval_bleu": 0.05283670615976002,
"eval_loss": 0.8704100134896069,
"eval_mse_loss": 0.8704100134896069,
"step": 1024
},
{
"epoch": 0.505679012345679,
"eval_bleu": 0.05283670615976002,
"eval_loss": 0.8704100134896069,
"eval_mse_loss": 0.8704100134896069,
"eval_runtime": 7.9933,
"eval_samples_per_second": 327.523,
"eval_steps_per_second": 5.129,
"step": 1024
},
{
"epoch": 0.6320987654320988,
"grad_norm": 0.931308925151825,
"learning_rate": 0.000996278785066807,
"loss": 0.8445582389831543,
"step": 1280
},
{
"epoch": 0.6320987654320988,
"eval_bleu": 0.062080457666779255,
"eval_loss": 0.8229870461836094,
"eval_mse_loss": 0.8229870461836094,
"step": 1280
},
{
"epoch": 0.6320987654320988,
"eval_bleu": 0.062080457666779255,
"eval_loss": 0.8229870461836094,
"eval_mse_loss": 0.8229870461836094,
"eval_runtime": 8.2397,
"eval_samples_per_second": 317.731,
"eval_steps_per_second": 4.976,
"step": 1280
},
{
"epoch": 0.7585185185185185,
"grad_norm": 1.1758294105529785,
"learning_rate": 0.0009933865966059944,
"loss": 0.7974789142608643,
"step": 1536
},
{
"epoch": 0.7585185185185185,
"eval_bleu": 0.08761458974204005,
"eval_loss": 0.7755879759788513,
"eval_mse_loss": 0.7755879759788513,
"step": 1536
},
{
"epoch": 0.7585185185185185,
"eval_bleu": 0.08761458974204005,
"eval_loss": 0.7755879759788513,
"eval_mse_loss": 0.7755879759788513,
"eval_runtime": 7.2349,
"eval_samples_per_second": 361.856,
"eval_steps_per_second": 5.667,
"step": 1536
},
{
"epoch": 0.8849382716049383,
"grad_norm": 1.6121838092803955,
"learning_rate": 0.0009896753765666925,
"loss": 0.7649396657943726,
"step": 1792
},
{
"epoch": 0.8849382716049383,
"eval_bleu": 0.09849079764244906,
"eval_loss": 0.7508636625801645,
"eval_mse_loss": 0.7508636625801645,
"step": 1792
},
{
"epoch": 0.8849382716049383,
"eval_bleu": 0.09849079764244906,
"eval_loss": 0.7508636625801645,
"eval_mse_loss": 0.7508636625801645,
"eval_runtime": 7.7694,
"eval_samples_per_second": 336.963,
"eval_steps_per_second": 5.277,
"step": 1792
},
{
"epoch": 1.011358024691358,
"grad_norm": 1.966202974319458,
"learning_rate": 0.0009851512856480936,
"loss": 0.7395287156105042,
"step": 2048
},
{
"epoch": 1.011358024691358,
"eval_bleu": 0.11072005270411464,
"eval_loss": 0.7292228166649981,
"eval_mse_loss": 0.7292228166649981,
"step": 2048
},
{
"epoch": 1.011358024691358,
"eval_bleu": 0.11072005270411464,
"eval_loss": 0.7292228166649981,
"eval_mse_loss": 0.7292228166649981,
"eval_runtime": 7.9736,
"eval_samples_per_second": 328.334,
"eval_steps_per_second": 5.142,
"step": 2048
},
{
"epoch": 1.1377777777777778,
"grad_norm": 1.720017433166504,
"learning_rate": 0.0009798218339312412,
"loss": 0.7159179449081421,
"step": 2304
},
{
"epoch": 1.1377777777777778,
"eval_bleu": 0.11064736251702126,
"eval_loss": 0.7017144793417396,
"eval_mse_loss": 0.7017144793417396,
"step": 2304
},
{
"epoch": 1.1377777777777778,
"eval_bleu": 0.11064736251702126,
"eval_loss": 0.7017144793417396,
"eval_mse_loss": 0.7017144793417396,
"eval_runtime": 7.2741,
"eval_samples_per_second": 359.908,
"eval_steps_per_second": 5.636,
"step": 2304
},
{
"epoch": 1.2641975308641975,
"grad_norm": 1.0777727365493774,
"learning_rate": 0.0009736958684121462,
"loss": 0.6964433789253235,
"step": 2560
},
{
"epoch": 1.2641975308641975,
"eval_bleu": 0.1276517391343738,
"eval_loss": 0.6854300062830855,
"eval_mse_loss": 0.6854300062830855,
"step": 2560
},
{
"epoch": 1.2641975308641975,
"eval_bleu": 0.1276517391343738,
"eval_loss": 0.6854300062830855,
"eval_mse_loss": 0.6854300062830855,
"eval_runtime": 7.5038,
"eval_samples_per_second": 348.888,
"eval_steps_per_second": 5.464,
"step": 2560
},
{
"epoch": 1.3906172839506172,
"grad_norm": 2.31036639213562,
"learning_rate": 0.0009667835583155982,
"loss": 0.6787664890289307,
"step": 2816
},
{
"epoch": 1.3906172839506172,
"eval_bleu": 0.14606448546092324,
"eval_loss": 0.6666799565640892,
"eval_mse_loss": 0.6666799565640892,
"step": 2816
},
{
"epoch": 1.3906172839506172,
"eval_bleu": 0.14606448546092324,
"eval_loss": 0.6666799565640892,
"eval_mse_loss": 0.6666799565640892,
"eval_runtime": 8.1414,
"eval_samples_per_second": 321.565,
"eval_steps_per_second": 5.036,
"step": 2816
},
{
"epoch": 1.5170370370370372,
"grad_norm": 1.8440918922424316,
"learning_rate": 0.0009590963782140483,
"loss": 0.6634169220924377,
"step": 3072
},
{
"epoch": 1.5170370370370372,
"eval_bleu": 0.14900832500200872,
"eval_loss": 0.6537455029603911,
"eval_mse_loss": 0.6537455029603911,
"step": 3072
},
{
"epoch": 1.5170370370370372,
"eval_bleu": 0.14900832500200872,
"eval_loss": 0.6537455029603911,
"eval_mse_loss": 0.6537455029603911,
"eval_runtime": 8.5409,
"eval_samples_per_second": 306.525,
"eval_steps_per_second": 4.8,
"step": 3072
},
{
"epoch": 1.643456790123457,
"grad_norm": 2.082489013671875,
"learning_rate": 0.0009506470889795917,
"loss": 0.6479848027229309,
"step": 3328
},
{
"epoch": 1.643456790123457,
"eval_bleu": 0.15870707580244742,
"eval_loss": 0.6412253728727015,
"eval_mse_loss": 0.6412253728727015,
"step": 3328
},
{
"epoch": 1.643456790123457,
"eval_bleu": 0.15870707580244742,
"eval_loss": 0.6412253728727015,
"eval_mse_loss": 0.6412253728727015,
"eval_runtime": 7.43,
"eval_samples_per_second": 352.355,
"eval_steps_per_second": 5.518,
"step": 3328
},
{
"epoch": 1.7698765432098766,
"grad_norm": 2.5578348636627197,
"learning_rate": 0.0009414497166006652,
"loss": 0.6374112963676453,
"step": 3584
},
{
"epoch": 1.7698765432098766,
"eval_bleu": 0.16517790474378305,
"eval_loss": 0.6340285481476202,
"eval_mse_loss": 0.6340285481476202,
"step": 3584
},
{
"epoch": 1.7698765432098766,
"eval_bleu": 0.16517790474378305,
"eval_loss": 0.6340285481476202,
"eval_mse_loss": 0.6340285481476202,
"eval_runtime": 7.8092,
"eval_samples_per_second": 335.246,
"eval_steps_per_second": 5.25,
"step": 3584
},
{
"epoch": 1.8962962962962964,
"grad_norm": 1.9383974075317383,
"learning_rate": 0.0009315195288986269,
"loss": 0.623887836933136,
"step": 3840
},
{
"epoch": 1.8962962962962964,
"eval_bleu": 0.17847564897785617,
"eval_loss": 0.6204298661976326,
"eval_mse_loss": 0.6204298661976326,
"step": 3840
},
{
"epoch": 1.8962962962962964,
"eval_bleu": 0.17847564897785617,
"eval_loss": 0.6204298661976326,
"eval_mse_loss": 0.6204298661976326,
"eval_runtime": 7.8808,
"eval_samples_per_second": 332.2,
"eval_steps_per_second": 5.203,
"step": 3840
},
{
"epoch": 2.022716049382716,
"grad_norm": 2.074370861053467,
"learning_rate": 0.0009208730101828687,
"loss": 0.612588107585907,
"step": 4096
},
{
"epoch": 2.022716049382716,
"eval_bleu": 0.20019494813992078,
"eval_loss": 0.6014744639396667,
"eval_mse_loss": 0.6014744639396667,
"step": 4096
},
{
"epoch": 2.022716049382716,
"eval_bleu": 0.20019494813992078,
"eval_loss": 0.6014744639396667,
"eval_mse_loss": 0.6014744639396667,
"eval_runtime": 8.3023,
"eval_samples_per_second": 315.333,
"eval_steps_per_second": 4.938,
"step": 4096
},
{
"epoch": 2.149135802469136,
"grad_norm": 2.1671674251556396,
"learning_rate": 0.0009095278338865343,
"loss": 0.5997830629348755,
"step": 4352
},
{
"epoch": 2.149135802469136,
"eval_bleu": 0.1987893921535938,
"eval_loss": 0.5986893453249117,
"eval_mse_loss": 0.5986893453249117,
"step": 4352
},
{
"epoch": 2.149135802469136,
"eval_bleu": 0.1987893921535938,
"eval_loss": 0.5986893453249117,
"eval_mse_loss": 0.5986893453249117,
"eval_runtime": 7.4027,
"eval_samples_per_second": 353.653,
"eval_steps_per_second": 5.538,
"step": 4352
},
{
"epoch": 2.2755555555555556,
"grad_norm": 3.125169038772583,
"learning_rate": 0.0008975028332282684,
"loss": 0.5991740822792053,
"step": 4608
},
{
"epoch": 2.2755555555555556,
"eval_bleu": 0.20658330468877353,
"eval_loss": 0.6155260525098661,
"eval_mse_loss": 0.6155260525098661,
"step": 4608
},
{
"epoch": 2.2755555555555556,
"eval_bleu": 0.20658330468877353,
"eval_loss": 0.6155260525098661,
"eval_mse_loss": 0.6155260525098661,
"eval_runtime": 7.4818,
"eval_samples_per_second": 349.915,
"eval_steps_per_second": 5.48,
"step": 4608
},
{
"epoch": 2.4019753086419753,
"grad_norm": 4.6324992179870605,
"learning_rate": 0.0008848179699486985,
"loss": 0.6436169147491455,
"step": 4864
},
{
"epoch": 2.4019753086419753,
"eval_bleu": 0.19786110402686427,
"eval_loss": 0.6434223041301821,
"eval_mse_loss": 0.6434223041301821,
"step": 4864
},
{
"epoch": 2.4019753086419753,
"eval_bleu": 0.19786110402686427,
"eval_loss": 0.6434223041301821,
"eval_mse_loss": 0.6434223041301821,
"eval_runtime": 8.06,
"eval_samples_per_second": 324.813,
"eval_steps_per_second": 5.087,
"step": 4864
},
{
"epoch": 2.528395061728395,
"grad_norm": 4.647606372833252,
"learning_rate": 0.0008714943011735476,
"loss": 0.6646981239318848,
"step": 5120
},
{
"epoch": 2.528395061728395,
"eval_bleu": 0.17652369495305426,
"eval_loss": 0.6581660363732315,
"eval_mse_loss": 0.6581660363732315,
"step": 5120
},
{
"epoch": 2.528395061728395,
"eval_bleu": 0.17652369495305426,
"eval_loss": 0.6581660363732315,
"eval_mse_loss": 0.6581660363732315,
"eval_runtime": 8.4642,
"eval_samples_per_second": 309.304,
"eval_steps_per_second": 4.844,
"step": 5120
},
{
"epoch": 2.6548148148148147,
"grad_norm": 4.655036926269531,
"learning_rate": 0.000857553944458386,
"loss": 0.670133650302887,
"step": 5376
},
{
"epoch": 2.6548148148148147,
"eval_bleu": 0.16665801459747773,
"eval_loss": 0.6649525950594646,
"eval_mse_loss": 0.6649525950594646,
"step": 5376
},
{
"epoch": 2.6548148148148147,
"eval_bleu": 0.16665801459747773,
"eval_loss": 0.6649525950594646,
"eval_mse_loss": 0.6649525950594646,
"eval_runtime": 7.3082,
"eval_samples_per_second": 358.23,
"eval_steps_per_second": 5.61,
"step": 5376
},
{
"epoch": 2.7812345679012345,
"grad_norm": 6.748809814453125,
"learning_rate": 0.000843020041073049,
"loss": 0.6625139117240906,
"step": 5632
},
{
"epoch": 2.7812345679012345,
"eval_bleu": 0.173997037607852,
"eval_loss": 0.6807852968937014,
"eval_mse_loss": 0.6807852968937014,
"step": 5632
},
{
"epoch": 2.7812345679012345,
"eval_bleu": 0.173997037607852,
"eval_loss": 0.6807852968937014,
"eval_mse_loss": 0.6807852968937014,
"eval_runtime": 7.5532,
"eval_samples_per_second": 346.607,
"eval_steps_per_second": 5.428,
"step": 5632
},
{
"epoch": 2.907654320987654,
"grad_norm": 4.406154632568359,
"learning_rate": 0.0008279167175866678,
"loss": 0.6519719958305359,
"step": 5888
},
{
"epoch": 2.907654320987654,
"eval_bleu": 0.19570266564584513,
"eval_loss": 0.6394063074414323,
"eval_mse_loss": 0.6394063074414323,
"step": 5888
},
{
"epoch": 2.907654320987654,
"eval_bleu": 0.19570266564584513,
"eval_loss": 0.6394063074414323,
"eval_mse_loss": 0.6394063074414323,
"eval_runtime": 7.9229,
"eval_samples_per_second": 330.434,
"eval_steps_per_second": 5.175,
"step": 5888
},
{
"epoch": 3.034074074074074,
"grad_norm": 4.702131271362305,
"learning_rate": 0.0008122690458170833,
"loss": 0.6394258737564087,
"step": 6144
},
{
"epoch": 3.034074074074074,
"eval_bleu": 0.18025210346558498,
"eval_loss": 0.6386287488588472,
"eval_mse_loss": 0.6386287488588472,
"step": 6144
},
{
"epoch": 3.034074074074074,
"eval_bleu": 0.18025210346558498,
"eval_loss": 0.6386287488588472,
"eval_mse_loss": 0.6386287488588472,
"eval_runtime": 8.212,
"eval_samples_per_second": 318.804,
"eval_steps_per_second": 4.993,
"step": 6144
},
{
"epoch": 3.1604938271604937,
"grad_norm": 3.3306374549865723,
"learning_rate": 0.0007961030012111305,
"loss": 0.6313198208808899,
"step": 6400
},
{
"epoch": 3.1604938271604937,
"eval_bleu": 0.19600803362588348,
"eval_loss": 0.6227354814366597,
"eval_mse_loss": 0.6227354814366597,
"step": 6400
},
{
"epoch": 3.1604938271604937,
"eval_bleu": 0.19600803362588348,
"eval_loss": 0.6227354814366597,
"eval_mse_loss": 0.6227354814366597,
"eval_runtime": 7.9462,
"eval_samples_per_second": 329.467,
"eval_steps_per_second": 5.16,
"step": 6400
},
{
"epoch": 3.286913580246914,
"grad_norm": 4.519600868225098,
"learning_rate": 0.0007794454197248784,
"loss": 0.6270281672477722,
"step": 6656
},
{
"epoch": 3.286913580246914,
"eval_bleu": 0.21406015637359221,
"eval_loss": 0.6298785456796971,
"eval_mse_loss": 0.6298785456796971,
"step": 6656
},
{
"epoch": 3.286913580246914,
"eval_bleu": 0.21406015637359221,
"eval_loss": 0.6298785456796971,
"eval_mse_loss": 0.6298785456796971,
"eval_runtime": 7.1729,
"eval_samples_per_second": 364.985,
"eval_steps_per_second": 5.716,
"step": 6656
},
{
"epoch": 3.413333333333333,
"grad_norm": 4.557300090789795,
"learning_rate": 0.0007623239532754083,
"loss": 0.6133501529693604,
"step": 6912
},
{
"epoch": 3.413333333333333,
"eval_bleu": 0.21008851882230098,
"eval_loss": 0.6231076019566234,
"eval_mse_loss": 0.6231076019566234,
"step": 6912
},
{
"epoch": 3.413333333333333,
"eval_bleu": 0.21008851882230098,
"eval_loss": 0.6231076019566234,
"eval_mse_loss": 0.6231076019566234,
"eval_runtime": 7.9356,
"eval_samples_per_second": 329.904,
"eval_steps_per_second": 5.167,
"step": 6912
},
{
"epoch": 3.5397530864197533,
"grad_norm": 4.322099208831787,
"learning_rate": 0.0007447670238380815,
"loss": 0.6112697720527649,
"step": 7168
},
{
"epoch": 3.5397530864197533,
"eval_bleu": 0.2060799623946906,
"eval_loss": 0.6111486045325675,
"eval_mse_loss": 0.6111486045325675,
"step": 7168
},
{
"epoch": 3.5397530864197533,
"eval_bleu": 0.2060799623946906,
"eval_loss": 0.6111486045325675,
"eval_mse_loss": 0.6111486045325675,
"eval_runtime": 8.1866,
"eval_samples_per_second": 319.793,
"eval_steps_per_second": 5.008,
"step": 7168
},
{
"epoch": 3.6661728395061726,
"grad_norm": 4.115531921386719,
"learning_rate": 0.0007268037762654929,
"loss": 0.6039376854896545,
"step": 7424
},
{
"epoch": 3.6661728395061726,
"eval_bleu": 0.21503632161601938,
"eval_loss": 0.6128283637325939,
"eval_mse_loss": 0.6128283637325939,
"step": 7424
},
{
"epoch": 3.6661728395061726,
"eval_bleu": 0.21503632161601938,
"eval_loss": 0.6128283637325939,
"eval_mse_loss": 0.6128283637325939,
"eval_runtime": 8.2856,
"eval_samples_per_second": 315.97,
"eval_steps_per_second": 4.948,
"step": 7424
},
{
"epoch": 3.7925925925925927,
"grad_norm": 4.654777526855469,
"learning_rate": 0.0007084640299064357,
"loss": 0.5996431708335876,
"step": 7680
},
{
"epoch": 3.7925925925925927,
"eval_bleu": 0.22869408936536856,
"eval_loss": 0.5915597095722105,
"eval_mse_loss": 0.5915597095722105,
"step": 7680
},
{
"epoch": 3.7925925925925927,
"eval_bleu": 0.22869408936536856,
"eval_loss": 0.5915597095722105,
"eval_mse_loss": 0.5915597095722105,
"eval_runtime": 8.2255,
"eval_samples_per_second": 318.278,
"eval_steps_per_second": 4.984,
"step": 7680
},
{
"epoch": 3.9190123456790125,
"grad_norm": 3.9882354736328125,
"learning_rate": 0.0006897782291051889,
"loss": 0.5931000709533691,
"step": 7936
},
{
"epoch": 3.9190123456790125,
"eval_bleu": 0.2349496066220062,
"eval_loss": 0.5926558040991062,
"eval_mse_loss": 0.5926558040991062,
"step": 7936
},
{
"epoch": 3.9190123456790125,
"eval_bleu": 0.2349496066220062,
"eval_loss": 0.5926558040991062,
"eval_mse_loss": 0.5926558040991062,
"eval_runtime": 8.1758,
"eval_samples_per_second": 320.212,
"eval_steps_per_second": 5.015,
"step": 7936
},
{
"epoch": 4.045432098765432,
"grad_norm": 4.266123294830322,
"learning_rate": 0.000670777392663298,
"loss": 0.5860158205032349,
"step": 8192
},
{
"epoch": 4.045432098765432,
"eval_bleu": 0.25634671575676854,
"eval_loss": 0.5731269004868298,
"eval_mse_loss": 0.5731269004868298,
"step": 8192
},
{
"epoch": 4.045432098765432,
"eval_bleu": 0.25634671575676854,
"eval_loss": 0.5731269004868298,
"eval_mse_loss": 0.5731269004868298,
"eval_runtime": 7.4662,
"eval_samples_per_second": 350.648,
"eval_steps_per_second": 5.491,
"step": 8192
},
{
"epoch": 4.1718518518518515,
"grad_norm": 5.411952018737793,
"learning_rate": 0.0006514930623477486,
"loss": 0.5818273425102234,
"step": 8448
},
{
"epoch": 4.1718518518518515,
"eval_bleu": 0.2520441554433453,
"eval_loss": 0.5782059794519006,
"eval_mse_loss": 0.5782059794519006,
"step": 8448
},
{
"epoch": 4.1718518518518515,
"eval_bleu": 0.2520441554433453,
"eval_loss": 0.5782059794519006,
"eval_mse_loss": 0.5782059794519006,
"eval_runtime": 7.4057,
"eval_samples_per_second": 353.512,
"eval_steps_per_second": 5.536,
"step": 8448
},
{
"epoch": 4.298271604938272,
"grad_norm": 3.5435891151428223,
"learning_rate": 0.0006319572505310022,
"loss": 0.5773241519927979,
"step": 8704
},
{
"epoch": 4.298271604938272,
"eval_bleu": 0.2534190472706376,
"eval_loss": 0.5589999033183586,
"eval_mse_loss": 0.5589999033183586,
"step": 8704
},
{
"epoch": 4.298271604938272,
"eval_bleu": 0.2534190472706376,
"eval_loss": 0.5589999033183586,
"eval_mse_loss": 0.5589999033183586,
"eval_runtime": 7.7623,
"eval_samples_per_second": 337.273,
"eval_steps_per_second": 5.282,
"step": 8704
},
{
"epoch": 4.424691358024692,
"grad_norm": 3.660297155380249,
"learning_rate": 0.000612202387049823,
"loss": 0.5752817988395691,
"step": 8960
},
{
"epoch": 4.424691358024692,
"eval_bleu": 0.25405581690850026,
"eval_loss": 0.5717670982930718,
"eval_mse_loss": 0.5717670982930718,
"step": 8960
},
{
"epoch": 4.424691358024692,
"eval_bleu": 0.25405581690850026,
"eval_loss": 0.5717670982930718,
"eval_mse_loss": 0.5717670982930718,
"eval_runtime": 8.0671,
"eval_samples_per_second": 324.527,
"eval_steps_per_second": 5.082,
"step": 8960
},
{
"epoch": 4.551111111111111,
"grad_norm": 4.61627197265625,
"learning_rate": 0.0005922612653711009,
"loss": 0.5677421689033508,
"step": 9216
},
{
"epoch": 4.551111111111111,
"eval_bleu": 0.2616680879763428,
"eval_loss": 0.5714971786592065,
"eval_mse_loss": 0.5714971786592065,
"step": 9216
},
{
"epoch": 4.551111111111111,
"eval_bleu": 0.2616680879763428,
"eval_loss": 0.5714971786592065,
"eval_mse_loss": 0.5714971786592065,
"eval_runtime": 8.2747,
"eval_samples_per_second": 316.384,
"eval_steps_per_second": 4.955,
"step": 9216
},
{
"epoch": 4.67753086419753,
"grad_norm": 4.289632797241211,
"learning_rate": 0.0005721669881540442,
"loss": 0.5655048489570618,
"step": 9472
},
{
"epoch": 4.67753086419753,
"eval_bleu": 0.27326453853008387,
"eval_loss": 0.5655288245619797,
"eval_mse_loss": 0.5655288245619797,
"step": 9472
},
{
"epoch": 4.67753086419753,
"eval_bleu": 0.27326453853008387,
"eval_loss": 0.5655288245619797,
"eval_mse_loss": 0.5655288245619797,
"eval_runtime": 8.1079,
"eval_samples_per_second": 322.895,
"eval_steps_per_second": 5.057,
"step": 9472
},
{
"epoch": 4.803950617283951,
"grad_norm": 4.95919132232666,
"learning_rate": 0.0005519529122991056,
"loss": 0.5713540315628052,
"step": 9728
},
{
"epoch": 4.803950617283951,
"eval_bleu": 0.2627683773991363,
"eval_loss": 0.5628622283295888,
"eval_mse_loss": 0.5628622283295888,
"step": 9728
},
{
"epoch": 4.803950617283951,
"eval_bleu": 0.2627683773991363,
"eval_loss": 0.5628622283295888,
"eval_mse_loss": 0.5628622283295888,
"eval_runtime": 7.9584,
"eval_samples_per_second": 328.962,
"eval_steps_per_second": 5.152,
"step": 9728
},
{
"epoch": 4.930370370370371,
"grad_norm": 6.370813369750977,
"learning_rate": 0.0005316525935748631,
"loss": 0.5834425687789917,
"step": 9984
},
{
"epoch": 4.930370370370371,
"eval_bleu": 0.26641828019601,
"eval_loss": 0.5816229349229394,
"eval_mse_loss": 0.5816229349229394,
"step": 9984
},
{
"epoch": 4.930370370370371,
"eval_bleu": 0.26641828019601,
"eval_loss": 0.5816229349229394,
"eval_mse_loss": 0.5816229349229394,
"eval_runtime": 7.1012,
"eval_samples_per_second": 368.672,
"eval_steps_per_second": 5.774,
"step": 9984
},
{
"epoch": 5.05679012345679,
"grad_norm": 6.872233867645264,
"learning_rate": 0.0005112997309147753,
"loss": 0.5829513669013977,
"step": 10240
},
{
"epoch": 5.05679012345679,
"eval_bleu": 0.2556546592309415,
"eval_loss": 0.5965675159198481,
"eval_mse_loss": 0.5965675159198481,
"step": 10240
},
{
"epoch": 5.05679012345679,
"eval_bleu": 0.2556546592309415,
"eval_loss": 0.5965675159198481,
"eval_mse_loss": 0.5965675159198481,
"eval_runtime": 7.6184,
"eval_samples_per_second": 343.643,
"eval_steps_per_second": 5.382,
"step": 10240
},
{
"epoch": 5.18320987654321,
"grad_norm": 6.554806232452393,
"learning_rate": 0.0004909281104762817,
"loss": 0.5979596376419067,
"step": 10496
},
{
"epoch": 5.18320987654321,
"eval_bleu": 0.24496216459175849,
"eval_loss": 0.5925210903330547,
"eval_mse_loss": 0.5925210903330547,
"step": 10496
},
{
"epoch": 5.18320987654321,
"eval_bleu": 0.24496216459175849,
"eval_loss": 0.5925210903330547,
"eval_mse_loss": 0.5925210903330547,
"eval_runtime": 8.1186,
"eval_samples_per_second": 322.468,
"eval_steps_per_second": 5.05,
"step": 10496
},
{
"epoch": 5.3096296296296295,
"grad_norm": 8.099956512451172,
"learning_rate": 0.0004705715495551068,
"loss": 0.5981637835502625,
"step": 10752
},
{
"epoch": 5.3096296296296295,
"eval_bleu": 0.24892557597714382,
"eval_loss": 0.6025734194895116,
"eval_mse_loss": 0.6025734194895116,
"step": 10752
},
{
"epoch": 5.3096296296296295,
"eval_bleu": 0.24892557597714382,
"eval_loss": 0.6025734194895116,
"eval_mse_loss": 0.6025734194895116,
"eval_runtime": 8.3839,
"eval_samples_per_second": 312.265,
"eval_steps_per_second": 4.89,
"step": 10752
},
{
"epoch": 5.43604938271605,
"grad_norm": 6.063875675201416,
"learning_rate": 0.00045026384044787715,
"loss": 0.600553572177887,
"step": 11008
},
{
"epoch": 5.43604938271605,
"eval_bleu": 0.2584749312819124,
"eval_loss": 0.5978762725504433,
"eval_mse_loss": 0.5978762725504433,
"step": 11008
},
{
"epoch": 5.43604938271605,
"eval_bleu": 0.2584749312819124,
"eval_loss": 0.5978762725504433,
"eval_mse_loss": 0.5978762725504433,
"eval_runtime": 8.4473,
"eval_samples_per_second": 309.921,
"eval_steps_per_second": 4.854,
"step": 11008
},
{
"epoch": 5.562469135802469,
"grad_norm": 6.819189071655273,
"learning_rate": 0.0004300386943562342,
"loss": 0.6050165891647339,
"step": 11264
},
{
"epoch": 5.562469135802469,
"eval_bleu": 0.23936172268263006,
"eval_loss": 0.6197842156014791,
"eval_mse_loss": 0.6197842156014791,
"step": 11264
},
{
"epoch": 5.562469135802469,
"eval_bleu": 0.23936172268263006,
"eval_loss": 0.6197842156014791,
"eval_mse_loss": 0.6197842156014791,
"eval_runtime": 7.275,
"eval_samples_per_second": 359.864,
"eval_steps_per_second": 5.636,
"step": 11264
},
{
"epoch": 5.688888888888889,
"grad_norm": 8.074504852294922,
"learning_rate": 0.0004099296854255696,
"loss": 0.6032764315605164,
"step": 11520
},
{
"epoch": 5.688888888888889,
"eval_bleu": 0.2563038856939558,
"eval_loss": 0.5922774854229718,
"eval_mse_loss": 0.5922774854229718,
"step": 11520
},
{
"epoch": 5.688888888888889,
"eval_bleu": 0.2563038856939558,
"eval_loss": 0.5922774854229718,
"eval_mse_loss": 0.5922774854229718,
"eval_runtime": 7.7562,
"eval_samples_per_second": 337.535,
"eval_steps_per_second": 5.286,
"step": 11520
},
{
"epoch": 5.815308641975308,
"grad_norm": 7.949519157409668,
"learning_rate": 0.00038997019501127406,
"loss": 0.5983560681343079,
"step": 11776
},
{
"epoch": 5.815308641975308,
"eval_bleu": 0.25476493381302434,
"eval_loss": 0.6024370585999838,
"eval_mse_loss": 0.6024370585999838,
"step": 11776
},
{
"epoch": 5.815308641975308,
"eval_bleu": 0.25476493381302434,
"eval_loss": 0.6024370585999838,
"eval_mse_loss": 0.6024370585999838,
"eval_runtime": 7.9866,
"eval_samples_per_second": 327.799,
"eval_steps_per_second": 5.134,
"step": 11776
},
{
"epoch": 5.941728395061729,
"grad_norm": 7.21051025390625,
"learning_rate": 0.00037019335626502263,
"loss": 0.5926205515861511,
"step": 12032
},
{
"epoch": 5.941728395061729,
"eval_bleu": 0.2579463063130564,
"eval_loss": 0.5958762241572868,
"eval_mse_loss": 0.5958762241572868,
"step": 12032
},
{
"epoch": 5.941728395061729,
"eval_bleu": 0.2579463063130564,
"eval_loss": 0.5958762241572868,
"eval_mse_loss": 0.5958762241572868,
"eval_runtime": 8.1465,
"eval_samples_per_second": 321.364,
"eval_steps_per_second": 5.033,
"step": 12032
},
{
"epoch": 6.068148148148148,
"grad_norm": 6.125677108764648,
"learning_rate": 0.0003506319991330833,
"loss": 0.5896713137626648,
"step": 12288
},
{
"epoch": 6.068148148148148,
"eval_bleu": 0.2633499094850468,
"eval_loss": 0.5901360758920995,
"eval_mse_loss": 0.5901360758920995,
"step": 12288
},
{
"epoch": 6.068148148148148,
"eval_bleu": 0.2633499094850468,
"eval_loss": 0.5901360758920995,
"eval_mse_loss": 0.5901360758920995,
"eval_runtime": 7.4076,
"eval_samples_per_second": 353.421,
"eval_steps_per_second": 5.535,
"step": 12288
},
{
"epoch": 6.194567901234568,
"grad_norm": 8.257464408874512,
"learning_rate": 0.00033131859585795183,
"loss": 0.5833750367164612,
"step": 12544
},
{
"epoch": 6.194567901234568,
"eval_bleu": 0.2632131293368136,
"eval_loss": 0.5901335754045626,
"eval_mse_loss": 0.5901335754045626,
"step": 12544
},
{
"epoch": 6.194567901234568,
"eval_bleu": 0.2632131293368136,
"eval_loss": 0.5901335754045626,
"eval_mse_loss": 0.5901335754045626,
"eval_runtime": 7.4243,
"eval_samples_per_second": 352.625,
"eval_steps_per_second": 5.522,
"step": 12544
},
{
"epoch": 6.320987654320987,
"grad_norm": 4.863134384155273,
"learning_rate": 0.0003122852070737825,
"loss": 0.581442654132843,
"step": 12800
},
{
"epoch": 6.320987654320987,
"eval_bleu": 0.26764936587915455,
"eval_loss": 0.5768222677998427,
"eval_mse_loss": 0.5768222677998427,
"step": 12800
},
{
"epoch": 6.320987654320987,
"eval_bleu": 0.26764936587915455,
"eval_loss": 0.5768222677998427,
"eval_mse_loss": 0.5768222677998427,
"eval_runtime": 7.9069,
"eval_samples_per_second": 331.105,
"eval_steps_per_second": 5.185,
"step": 12800
},
{
"epoch": 6.4474074074074075,
"grad_norm": 8.697264671325684,
"learning_rate": 0.00029356342858509677,
"loss": 0.5772220492362976,
"step": 13056
},
{
"epoch": 6.4474074074074075,
"eval_bleu": 0.27568777700132663,
"eval_loss": 0.584543146738192,
"eval_mse_loss": 0.584543146738192,
"step": 13056
},
{
"epoch": 6.4474074074074075,
"eval_bleu": 0.27568777700132663,
"eval_loss": 0.584543146738192,
"eval_mse_loss": 0.584543146738192,
"eval_runtime": 8.0062,
"eval_samples_per_second": 326.995,
"eval_steps_per_second": 5.121,
"step": 13056
},
{
"epoch": 6.573827160493828,
"grad_norm": 6.309488773345947,
"learning_rate": 0.0002751843389171185,
"loss": 0.5722501277923584,
"step": 13312
},
{
"epoch": 6.573827160493828,
"eval_bleu": 0.27306378384191476,
"eval_loss": 0.5755636982801484,
"eval_mse_loss": 0.5755636982801484,
"step": 13312
},
{
"epoch": 6.573827160493828,
"eval_bleu": 0.27306378384191476,
"eval_loss": 0.5755636982801484,
"eval_mse_loss": 0.5755636982801484,
"eval_runtime": 8.591,
"eval_samples_per_second": 304.738,
"eval_steps_per_second": 4.772,
"step": 13312
},
{
"epoch": 6.700246913580247,
"grad_norm": 5.390190601348877,
"learning_rate": 0.0002571784477248029,
"loss": 0.5714833736419678,
"step": 13568
},
{
"epoch": 6.700246913580247,
"eval_bleu": 0.2848300030499236,
"eval_loss": 0.570485861563101,
"eval_mse_loss": 0.570485861563101,
"step": 13568
},
{
"epoch": 6.700246913580247,
"eval_bleu": 0.2848300030499236,
"eval_loss": 0.570485861563101,
"eval_mse_loss": 0.570485861563101,
"eval_runtime": 7.2761,
"eval_samples_per_second": 359.808,
"eval_steps_per_second": 5.635,
"step": 13568
},
{
"epoch": 6.826666666666666,
"grad_norm": 7.215004920959473,
"learning_rate": 0.0002395756451462014,
"loss": 0.5674658417701721,
"step": 13824
},
{
"epoch": 6.826666666666666,
"eval_bleu": 0.2756130503152812,
"eval_loss": 0.5792907963438731,
"eval_mse_loss": 0.5792907963438731,
"step": 13824
},
{
"epoch": 6.826666666666666,
"eval_bleu": 0.2756130503152812,
"eval_loss": 0.5792907963438731,
"eval_mse_loss": 0.5792907963438731,
"eval_runtime": 7.3879,
"eval_samples_per_second": 354.362,
"eval_steps_per_second": 5.55,
"step": 13824
},
{
"epoch": 6.953086419753086,
"grad_norm": 6.6079816818237305,
"learning_rate": 0.00022240515218423758,
"loss": 0.5638896226882935,
"step": 14080
},
{
"epoch": 6.953086419753086,
"eval_bleu": 0.2958995495334816,
"eval_loss": 0.5594088427904176,
"eval_mse_loss": 0.5594088427904176,
"step": 14080
},
{
"epoch": 6.953086419753086,
"eval_bleu": 0.2958995495334816,
"eval_loss": 0.5594088427904176,
"eval_mse_loss": 0.5594088427904176,
"eval_runtime": 7.9094,
"eval_samples_per_second": 330.998,
"eval_steps_per_second": 5.184,
"step": 14080
},
{
"epoch": 7.079506172839507,
"grad_norm": 6.562555313110352,
"learning_rate": 0.00020569547219925934,
"loss": 0.5631716251373291,
"step": 14336
},
{
"epoch": 7.079506172839507,
"eval_bleu": 0.2905136606672008,
"eval_loss": 0.5631622737500726,
"eval_mse_loss": 0.5631622737500726,
"step": 14336
},
{
"epoch": 7.079506172839507,
"eval_bleu": 0.2905136606672008,
"eval_loss": 0.5631622737500726,
"eval_mse_loss": 0.5631622737500726,
"eval_runtime": 8.3002,
"eval_samples_per_second": 315.415,
"eval_steps_per_second": 4.94,
"step": 14336
},
{
"epoch": 7.205925925925926,
"grad_norm": 5.195821285247803,
"learning_rate": 0.00018947434359289434,
"loss": 0.5603777170181274,
"step": 14592
},
{
"epoch": 7.205925925925926,
"eval_bleu": 0.29437709495493225,
"eval_loss": 0.5465006559360318,
"eval_mse_loss": 0.5465006559360318,
"step": 14592
},
{
"epoch": 7.205925925925926,
"eval_bleu": 0.29437709495493225,
"eval_loss": 0.5465006559360318,
"eval_mse_loss": 0.5465006559360318,
"eval_runtime": 7.1426,
"eval_samples_per_second": 366.535,
"eval_steps_per_second": 5.74,
"step": 14592
},
{
"epoch": 7.332345679012346,
"grad_norm": 8.156927108764648,
"learning_rate": 0.0001737686937617491,
"loss": 0.5557982325553894,
"step": 14848
},
{
"epoch": 7.332345679012346,
"eval_bleu": 0.29449963629268144,
"eval_loss": 0.557792792959911,
"eval_mse_loss": 0.557792792959911,
"step": 14848
},
{
"epoch": 7.332345679012346,
"eval_bleu": 0.29449963629268144,
"eval_loss": 0.557792792959911,
"eval_mse_loss": 0.557792792959911,
"eval_runtime": 7.7275,
"eval_samples_per_second": 338.789,
"eval_steps_per_second": 5.306,
"step": 14848
},
{
"epoch": 7.458765432098765,
"grad_norm": 5.172349452972412,
"learning_rate": 0.00015860459439739582,
"loss": 0.5558417439460754,
"step": 15104
},
{
"epoch": 7.458765432098765,
"eval_bleu": 0.31102631980865114,
"eval_loss": 0.5469118814642836,
"eval_mse_loss": 0.5469118814642836,
"step": 15104
},
{
"epoch": 7.458765432098765,
"eval_bleu": 0.31102631980865114,
"eval_loss": 0.5469118814642836,
"eval_mse_loss": 0.5469118814642836,
"eval_runtime": 7.9676,
"eval_samples_per_second": 328.58,
"eval_steps_per_second": 5.146,
"step": 15104
},
{
"epoch": 7.5851851851851855,
"grad_norm": 5.335220813751221,
"learning_rate": 0.0001440072182068436,
"loss": 0.5527888536453247,
"step": 15360
},
{
"epoch": 7.5851851851851855,
"eval_bleu": 0.3040853321881768,
"eval_loss": 0.5505978399660529,
"eval_mse_loss": 0.5505978399660529,
"step": 15360
},
{
"epoch": 7.5851851851851855,
"eval_bleu": 0.3040853321881768,
"eval_loss": 0.5505978399660529,
"eval_mse_loss": 0.5505978399660529,
"eval_runtime": 8.0279,
"eval_samples_per_second": 326.113,
"eval_steps_per_second": 5.107,
"step": 15360
},
{
"epoch": 7.711604938271605,
"grad_norm": 3.120297908782959,
"learning_rate": 0.00013000079712534475,
"loss": 0.5498585104942322,
"step": 15616
},
{
"epoch": 7.711604938271605,
"eval_bleu": 0.30098409224195205,
"eval_loss": 0.54582195819878,
"eval_mse_loss": 0.54582195819878,
"step": 15616
},
{
"epoch": 7.711604938271605,
"eval_bleu": 0.30098409224195205,
"eval_loss": 0.54582195819878,
"eval_mse_loss": 0.54582195819878,
"eval_runtime": 8.0703,
"eval_samples_per_second": 324.401,
"eval_steps_per_second": 5.08,
"step": 15616
},
{
"epoch": 7.838024691358025,
"grad_norm": 4.802690029144287,
"learning_rate": 0.00011660858209089819,
"loss": 0.5511511564254761,
"step": 15872
},
{
"epoch": 7.838024691358025,
"eval_bleu": 0.3068134406523234,
"eval_loss": 0.5499871998298459,
"eval_mse_loss": 0.5499871998298459,
"step": 15872
},
{
"epoch": 7.838024691358025,
"eval_bleu": 0.3068134406523234,
"eval_loss": 0.5499871998298459,
"eval_mse_loss": 0.5499871998298459,
"eval_runtime": 7.2713,
"eval_samples_per_second": 360.046,
"eval_steps_per_second": 5.639,
"step": 15872
},
{
"epoch": 7.964444444444444,
"grad_norm": 3.471189260482788,
"learning_rate": 0.00010385280444723056,
"loss": 0.5520000457763672,
"step": 16128
},
{
"epoch": 7.964444444444444,
"eval_bleu": 0.3052295052303889,
"eval_loss": 0.5441080797009352,
"eval_mse_loss": 0.5441080797009352,
"step": 16128
},
{
"epoch": 7.964444444444444,
"eval_bleu": 0.3052295052303889,
"eval_loss": 0.5441080797009352,
"eval_mse_loss": 0.5441080797009352,
"eval_runtime": 7.7809,
"eval_samples_per_second": 336.463,
"eval_steps_per_second": 5.269,
"step": 16128
},
{
"epoch": 8.090864197530864,
"grad_norm": 5.1973466873168945,
"learning_rate": 9.175463903932168e-05,
"loss": 0.5478553771972656,
"step": 16384
},
{
"epoch": 8.090864197530864,
"eval_bleu": 0.2907079883758542,
"eval_loss": 0.5581492403658425,
"eval_mse_loss": 0.5581492403658425,
"step": 16384
},
{
"epoch": 8.090864197530864,
"eval_bleu": 0.2907079883758542,
"eval_loss": 0.5581492403658425,
"eval_mse_loss": 0.5581492403658425,
"eval_runtime": 8.0444,
"eval_samples_per_second": 325.444,
"eval_steps_per_second": 5.097,
"step": 16384
},
{
"epoch": 8.217283950617285,
"grad_norm": 3.1891634464263916,
"learning_rate": 8.033416906274093e-05,
"loss": 0.5465660095214844,
"step": 16640
},
{
"epoch": 8.217283950617285,
"eval_bleu": 0.3151093389207819,
"eval_loss": 0.5382588927338763,
"eval_mse_loss": 0.5382588927338763,
"step": 16640
},
{
"epoch": 8.217283950617285,
"eval_bleu": 0.3151093389207819,
"eval_loss": 0.5382588927338763,
"eval_mse_loss": 0.5382588927338763,
"eval_runtime": 8.1703,
"eval_samples_per_second": 320.43,
"eval_steps_per_second": 5.018,
"step": 16640
},
{
"epoch": 8.343703703703703,
"grad_norm": 3.5024101734161377,
"learning_rate": 6.961035272514177e-05,
"loss": 0.5455322265625,
"step": 16896
},
{
"epoch": 8.343703703703703,
"eval_bleu": 0.31440992086262937,
"eval_loss": 0.5380844590140552,
"eval_mse_loss": 0.5380844590140552,
"step": 16896
},
{
"epoch": 8.343703703703703,
"eval_bleu": 0.31440992086262937,
"eval_loss": 0.5380844590140552,
"eval_mse_loss": 0.5380844590140552,
"eval_runtime": 7.1273,
"eval_samples_per_second": 367.32,
"eval_steps_per_second": 5.753,
"step": 16896
},
{
"epoch": 8.470123456790123,
"grad_norm": 3.242410659790039,
"learning_rate": 5.960099177526024e-05,
"loss": 0.5413248538970947,
"step": 17152
},
{
"epoch": 8.470123456790123,
"eval_bleu": 0.3030533280572146,
"eval_loss": 0.5433335965726434,
"eval_mse_loss": 0.5433335965726434,
"step": 17152
},
{
"epoch": 8.470123456790123,
"eval_bleu": 0.3030533280572146,
"eval_loss": 0.5433335965726434,
"eval_mse_loss": 0.5433335965726434,
"eval_runtime": 8.049,
"eval_samples_per_second": 325.26,
"eval_steps_per_second": 5.094,
"step": 17152
},
{
"epoch": 8.596543209876543,
"grad_norm": 3.3194503784179688,
"learning_rate": 5.032270195165667e-05,
"loss": 0.5402602553367615,
"step": 17408
},
{
"epoch": 8.596543209876543,
"eval_bleu": 0.3284021854773732,
"eval_loss": 0.5290337989969951,
"eval_mse_loss": 0.5290337989969951,
"step": 17408
},
{
"epoch": 8.596543209876543,
"eval_bleu": 0.3284021854773732,
"eval_loss": 0.5290337989969951,
"eval_mse_loss": 0.5290337989969951,
"eval_runtime": 7.8967,
"eval_samples_per_second": 331.532,
"eval_steps_per_second": 5.192,
"step": 17408
}
],
"logging_steps": 256,
"max_steps": 20250,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 256,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}