{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 9213,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0162813415825464,
"grad_norm": 1.3799059391021729,
"learning_rate": 4.974492564854011e-05,
"loss": 4.366279296875,
"step": 50
},
{
"epoch": 0.0325626831650928,
"grad_norm": null,
"learning_rate": 4.9473569955497666e-05,
"loss": 3.2682159423828123,
"step": 100
},
{
"epoch": 0.04884402474763921,
"grad_norm": 1.1759626865386963,
"learning_rate": 4.9207641376316076e-05,
"loss": 2.121284637451172,
"step": 150
},
{
"epoch": 0.0651253663301856,
"grad_norm": 1.299229383468628,
"learning_rate": 4.8936285683273635e-05,
"loss": 1.8733770751953125,
"step": 200
},
{
"epoch": 0.08140670791273201,
"grad_norm": 1.3079231977462769,
"learning_rate": 4.8664929990231194e-05,
"loss": 1.8073320007324218,
"step": 250
},
{
"epoch": 0.09768804949527841,
"grad_norm": 1.5056711435317993,
"learning_rate": 4.839357429718876e-05,
"loss": 1.7036862182617187,
"step": 300
},
{
"epoch": 0.11396939107782482,
"grad_norm": 1.5221471786499023,
"learning_rate": 4.812221860414632e-05,
"loss": 1.6605093383789062,
"step": 350
},
{
"epoch": 0.1302507326603712,
"grad_norm": 1.4612085819244385,
"learning_rate": 4.785086291110388e-05,
"loss": 1.582379608154297,
"step": 400
},
{
"epoch": 0.14653207424291761,
"grad_norm": 1.3166944980621338,
"learning_rate": 4.7579507218061436e-05,
"loss": 1.5711520385742188,
"step": 450
},
{
"epoch": 0.16281341582546402,
"grad_norm": 1.8040547370910645,
"learning_rate": 4.7308151525018995e-05,
"loss": 1.436278076171875,
"step": 500
},
{
"epoch": 0.17909475740801042,
"grad_norm": 1.7718613147735596,
"learning_rate": 4.7036795831976553e-05,
"loss": 1.4956285095214843,
"step": 550
},
{
"epoch": 0.19537609899055683,
"grad_norm": 2.499027967453003,
"learning_rate": 4.676544013893412e-05,
"loss": 1.3423948669433594,
"step": 600
},
{
"epoch": 0.21165744057310323,
"grad_norm": 1.7684857845306396,
"learning_rate": 4.649408444589168e-05,
"loss": 1.358212432861328,
"step": 650
},
{
"epoch": 0.22793878215564964,
"grad_norm": 1.8858190774917603,
"learning_rate": 4.622815586671009e-05,
"loss": 1.3155609130859376,
"step": 700
},
{
"epoch": 0.24422012373819602,
"grad_norm": 1.708154559135437,
"learning_rate": 4.5956800173667646e-05,
"loss": 1.204995346069336,
"step": 750
},
{
"epoch": 0.2605014653207424,
"grad_norm": 2.377797842025757,
"learning_rate": 4.5685444480625205e-05,
"loss": 1.2155376434326173,
"step": 800
},
{
"epoch": 0.2767828069032888,
"grad_norm": 2.3532145023345947,
"learning_rate": 4.5414088787582764e-05,
"loss": 1.2571015930175782,
"step": 850
},
{
"epoch": 0.29306414848583523,
"grad_norm": 2.745908498764038,
"learning_rate": 4.514273309454032e-05,
"loss": 1.1259475708007813,
"step": 900
},
{
"epoch": 0.30934549006838163,
"grad_norm": 4.180660247802734,
"learning_rate": 4.487137740149789e-05,
"loss": 1.1778811645507812,
"step": 950
},
{
"epoch": 0.32562683165092804,
"grad_norm": 2.554922103881836,
"learning_rate": 4.460002170845545e-05,
"loss": 1.144913787841797,
"step": 1000
},
{
"epoch": 0.34190817323347444,
"grad_norm": 2.6831798553466797,
"learning_rate": 4.4328666015413006e-05,
"loss": 1.1192340850830078,
"step": 1050
},
{
"epoch": 0.35818951481602085,
"grad_norm": 2.5000758171081543,
"learning_rate": 4.4057310322370565e-05,
"loss": 1.0886085510253907,
"step": 1100
},
{
"epoch": 0.37447085639856725,
"grad_norm": 2.5406346321105957,
"learning_rate": 4.3785954629328124e-05,
"loss": 1.0647865295410157,
"step": 1150
},
{
"epoch": 0.39075219798111366,
"grad_norm": 2.5966973304748535,
"learning_rate": 4.351459893628568e-05,
"loss": 1.0138130187988281,
"step": 1200
},
{
"epoch": 0.40703353956366006,
"grad_norm": 2.9423012733459473,
"learning_rate": 4.324324324324325e-05,
"loss": 0.971071548461914,
"step": 1250
},
{
"epoch": 0.42331488114620647,
"grad_norm": 2.9763288497924805,
"learning_rate": 4.297188755020081e-05,
"loss": 0.9740264129638672,
"step": 1300
},
{
"epoch": 0.4395962227287529,
"grad_norm": 2.0831127166748047,
"learning_rate": 4.270595897101922e-05,
"loss": 1.0225084686279298,
"step": 1350
},
{
"epoch": 0.4558775643112993,
"grad_norm": 3.093662977218628,
"learning_rate": 4.2434603277976776e-05,
"loss": 0.9085057830810547,
"step": 1400
},
{
"epoch": 0.4721589058938456,
"grad_norm": 3.1048061847686768,
"learning_rate": 4.2163247584934335e-05,
"loss": 0.9657279968261718,
"step": 1450
},
{
"epoch": 0.48844024747639203,
"grad_norm": 89.80404663085938,
"learning_rate": 4.189189189189189e-05,
"loss": 0.9195979309082031,
"step": 1500
},
{
"epoch": 0.5047215890589385,
"grad_norm": 2.7518820762634277,
"learning_rate": 4.162053619884945e-05,
"loss": 0.8844217681884765,
"step": 1550
},
{
"epoch": 0.5210029306414848,
"grad_norm": 2.216895818710327,
"learning_rate": 4.134918050580701e-05,
"loss": 0.9437327575683594,
"step": 1600
},
{
"epoch": 0.5372842722240313,
"grad_norm": 2.756894826889038,
"learning_rate": 4.1077824812764577e-05,
"loss": 0.9072888946533203,
"step": 1650
},
{
"epoch": 0.5535656138065776,
"grad_norm": 2.630861759185791,
"learning_rate": 4.0806469119722135e-05,
"loss": 0.9133613586425782,
"step": 1700
},
{
"epoch": 0.5698469553891241,
"grad_norm": 2.3018958568573,
"learning_rate": 4.0535113426679694e-05,
"loss": 0.9179753875732422,
"step": 1750
},
{
"epoch": 0.5861282969716705,
"grad_norm": 2.2267212867736816,
"learning_rate": 4.026375773363725e-05,
"loss": 0.8736819458007813,
"step": 1800
},
{
"epoch": 0.6024096385542169,
"grad_norm": 3.817021369934082,
"learning_rate": 3.999240204059481e-05,
"loss": 0.8818047332763672,
"step": 1850
},
{
"epoch": 0.6186909801367633,
"grad_norm": 2.8244123458862305,
"learning_rate": 3.972104634755237e-05,
"loss": 0.8710990142822266,
"step": 1900
},
{
"epoch": 0.6349723217193096,
"grad_norm": 2.787409782409668,
"learning_rate": 3.9449690654509936e-05,
"loss": 0.791876220703125,
"step": 1950
},
{
"epoch": 0.6512536633018561,
"grad_norm": 2.5339832305908203,
"learning_rate": 3.9178334961467495e-05,
"loss": 0.8330724334716797,
"step": 2000
},
{
"epoch": 0.6675350048844024,
"grad_norm": 2.2571518421173096,
"learning_rate": 3.8906979268425054e-05,
"loss": 0.8065113067626953,
"step": 2050
},
{
"epoch": 0.6838163464669489,
"grad_norm": 3.3255224227905273,
"learning_rate": 3.863562357538261e-05,
"loss": 0.7839543151855469,
"step": 2100
},
{
"epoch": 0.7000976880494952,
"grad_norm": 2.493654727935791,
"learning_rate": 3.836426788234017e-05,
"loss": 0.7902137756347656,
"step": 2150
},
{
"epoch": 0.7163790296320417,
"grad_norm": 2.943366527557373,
"learning_rate": 3.809291218929774e-05,
"loss": 0.9532376861572266,
"step": 2200
},
{
"epoch": 0.732660371214588,
"grad_norm": 2.404705762863159,
"learning_rate": 3.7821556496255296e-05,
"loss": 0.8227187347412109,
"step": 2250
},
{
"epoch": 0.7489417127971345,
"grad_norm": 8.06905460357666,
"learning_rate": 3.7550200803212855e-05,
"loss": 0.7640556335449219,
"step": 2300
},
{
"epoch": 0.7652230543796809,
"grad_norm": 3.540977954864502,
"learning_rate": 3.7278845110170414e-05,
"loss": 0.8362091064453125,
"step": 2350
},
{
"epoch": 0.7815043959622273,
"grad_norm": 2.233323574066162,
"learning_rate": 3.700748941712797e-05,
"loss": 0.6893608093261718,
"step": 2400
},
{
"epoch": 0.7977857375447737,
"grad_norm": 2.947315216064453,
"learning_rate": 3.673613372408553e-05,
"loss": 0.7564961242675782,
"step": 2450
},
{
"epoch": 0.8140670791273201,
"grad_norm": 2.9839603900909424,
"learning_rate": 3.64647780310431e-05,
"loss": 0.7726463317871094,
"step": 2500
},
{
"epoch": 0.8303484207098665,
"grad_norm": 2.638998508453369,
"learning_rate": 3.6193422338000656e-05,
"loss": 0.7850227355957031,
"step": 2550
},
{
"epoch": 0.8466297622924129,
"grad_norm": 2.203768730163574,
"learning_rate": 3.5922066644958215e-05,
"loss": 0.7540821838378906,
"step": 2600
},
{
"epoch": 0.8629111038749593,
"grad_norm": 2.7057082653045654,
"learning_rate": 3.565071095191577e-05,
"loss": 0.6677760314941407,
"step": 2650
},
{
"epoch": 0.8791924454575057,
"grad_norm": 3.2892088890075684,
"learning_rate": 3.537935525887333e-05,
"loss": 0.74295654296875,
"step": 2700
},
{
"epoch": 0.8954737870400521,
"grad_norm": 2.8778061866760254,
"learning_rate": 3.510799956583089e-05,
"loss": 0.7150550842285156,
"step": 2750
},
{
"epoch": 0.9117551286225986,
"grad_norm": 1.9023234844207764,
"learning_rate": 3.483664387278846e-05,
"loss": 0.7367278289794922,
"step": 2800
},
{
"epoch": 0.9280364702051449,
"grad_norm": 3.3899879455566406,
"learning_rate": 3.4565288179746015e-05,
"loss": 0.7095525360107422,
"step": 2850
},
{
"epoch": 0.9443178117876913,
"grad_norm": 3.202036142349243,
"learning_rate": 3.4293932486703574e-05,
"loss": 0.7237194061279297,
"step": 2900
},
{
"epoch": 0.9605991533702377,
"grad_norm": 2.44712233543396,
"learning_rate": 3.402257679366113e-05,
"loss": 0.710773696899414,
"step": 2950
},
{
"epoch": 0.9768804949527841,
"grad_norm": 3.5873775482177734,
"learning_rate": 3.375122110061869e-05,
"loss": 0.6593586730957032,
"step": 3000
},
{
"epoch": 0.9931618365353305,
"grad_norm": 2.8714234828948975,
"learning_rate": 3.347986540757626e-05,
"loss": 0.7627605438232422,
"step": 3050
},
{
"epoch": 1.0,
"eval_bertscore_f1": 0.9653369394064688,
"eval_bleu": 0.6270834635129311,
"eval_loss": 0.48991522192955017,
"eval_meteor": 0.7251021230424122,
"eval_rouge1": 0.8465042416762141,
"eval_rouge2": 0.738163460778114,
"eval_runtime": 68.0737,
"eval_samples_per_second": 18.979,
"eval_steps_per_second": 2.38,
"step": 3071
},
{
"epoch": 1.009443178117877,
"grad_norm": 3.2640202045440674,
"learning_rate": 3.3208509714533816e-05,
"loss": 0.5927775573730468,
"step": 3100
},
{
"epoch": 1.0257245197004232,
"grad_norm": 3.130765914916992,
"learning_rate": 3.2937154021491375e-05,
"loss": 0.5853068161010743,
"step": 3150
},
{
"epoch": 1.0420058612829697,
"grad_norm": 3.2238473892211914,
"learning_rate": 3.2665798328448934e-05,
"loss": 0.6931375885009765,
"step": 3200
},
{
"epoch": 1.0582872028655161,
"grad_norm": 4.1798176765441895,
"learning_rate": 3.239444263540649e-05,
"loss": 0.6535150146484375,
"step": 3250
},
{
"epoch": 1.0745685444480626,
"grad_norm": 3.4835116863250732,
"learning_rate": 3.212308694236405e-05,
"loss": 0.6570293426513671,
"step": 3300
},
{
"epoch": 1.0908498860306088,
"grad_norm": 3.2468245029449463,
"learning_rate": 3.185173124932162e-05,
"loss": 0.6235344696044922,
"step": 3350
},
{
"epoch": 1.1071312276131553,
"grad_norm": 2.503023862838745,
"learning_rate": 3.1580375556279176e-05,
"loss": 0.6021703720092774,
"step": 3400
},
{
"epoch": 1.1234125691957018,
"grad_norm": 3.5487520694732666,
"learning_rate": 3.1309019863236735e-05,
"loss": 0.6459141540527343,
"step": 3450
},
{
"epoch": 1.1396939107782482,
"grad_norm": 2.8496859073638916,
"learning_rate": 3.1037664170194294e-05,
"loss": 0.5954633712768554,
"step": 3500
},
{
"epoch": 1.1559752523607945,
"grad_norm": 2.746445894241333,
"learning_rate": 3.076630847715185e-05,
"loss": 0.5743134689331054,
"step": 3550
},
{
"epoch": 1.172256593943341,
"grad_norm": 3.843780517578125,
"learning_rate": 3.0494952784109408e-05,
"loss": 0.7025726318359375,
"step": 3600
},
{
"epoch": 1.1885379355258874,
"grad_norm": 2.3990111351013184,
"learning_rate": 3.0223597091066974e-05,
"loss": 0.6482646942138672,
"step": 3650
},
{
"epoch": 1.2048192771084336,
"grad_norm": 3.495655059814453,
"learning_rate": 2.9952241398024532e-05,
"loss": 0.6225572967529297,
"step": 3700
},
{
"epoch": 1.22110061869098,
"grad_norm": 3.0918631553649902,
"learning_rate": 2.968088570498209e-05,
"loss": 0.6018490982055664,
"step": 3750
},
{
"epoch": 1.2373819602735265,
"grad_norm": 3.54016375541687,
"learning_rate": 2.940953001193965e-05,
"loss": 0.6204871749877929,
"step": 3800
},
{
"epoch": 1.253663301856073,
"grad_norm": 3.330631971359253,
"learning_rate": 2.913817431889721e-05,
"loss": 0.5625830459594726,
"step": 3850
},
{
"epoch": 1.2699446434386195,
"grad_norm": 3.4091968536376953,
"learning_rate": 2.8866818625854774e-05,
"loss": 0.6275486755371094,
"step": 3900
},
{
"epoch": 1.2862259850211657,
"grad_norm": 3.535207748413086,
"learning_rate": 2.8595462932812333e-05,
"loss": 0.6113796997070312,
"step": 3950
},
{
"epoch": 1.3025073266037122,
"grad_norm": 2.739208459854126,
"learning_rate": 2.8324107239769892e-05,
"loss": 0.6166405487060547,
"step": 4000
},
{
"epoch": 1.3187886681862586,
"grad_norm": 2.3887178897857666,
"learning_rate": 2.805275154672745e-05,
"loss": 0.6348526000976562,
"step": 4050
},
{
"epoch": 1.3350700097688049,
"grad_norm": 3.2300209999084473,
"learning_rate": 2.778139585368501e-05,
"loss": 0.6592056274414062,
"step": 4100
},
{
"epoch": 1.3513513513513513,
"grad_norm": 2.4417901039123535,
"learning_rate": 2.751004016064257e-05,
"loss": 0.5736191177368164,
"step": 4150
},
{
"epoch": 1.3676326929338978,
"grad_norm": 4.1886467933654785,
"learning_rate": 2.7238684467600134e-05,
"loss": 0.5781734466552735,
"step": 4200
},
{
"epoch": 1.3839140345164442,
"grad_norm": 2.7025551795959473,
"learning_rate": 2.6967328774557693e-05,
"loss": 0.5421427917480469,
"step": 4250
},
{
"epoch": 1.4001953760989905,
"grad_norm": 3.4467735290527344,
"learning_rate": 2.6695973081515252e-05,
"loss": 0.6328504180908203,
"step": 4300
},
{
"epoch": 1.416476717681537,
"grad_norm": 2.252255916595459,
"learning_rate": 2.642461738847281e-05,
"loss": 0.565279884338379,
"step": 4350
},
{
"epoch": 1.4327580592640834,
"grad_norm": 2.3594324588775635,
"learning_rate": 2.615326169543037e-05,
"loss": 0.585950927734375,
"step": 4400
},
{
"epoch": 1.4490394008466296,
"grad_norm": 3.1787843704223633,
"learning_rate": 2.5881906002387928e-05,
"loss": 0.6461568450927735,
"step": 4450
},
{
"epoch": 1.465320742429176,
"grad_norm": 9.052631378173828,
"learning_rate": 2.5610550309345494e-05,
"loss": 0.5787173461914062,
"step": 4500
},
{
"epoch": 1.4816020840117226,
"grad_norm": 3.1000287532806396,
"learning_rate": 2.5339194616303053e-05,
"loss": 0.5753350830078126,
"step": 4550
},
{
"epoch": 1.497883425594269,
"grad_norm": 2.160932779312134,
"learning_rate": 2.506783892326061e-05,
"loss": 0.6055181503295899,
"step": 4600
},
{
"epoch": 1.5141647671768155,
"grad_norm": 5.498105525970459,
"learning_rate": 2.479648323021817e-05,
"loss": 0.5424030303955079,
"step": 4650
},
{
"epoch": 1.530446108759362,
"grad_norm": 2.4782474040985107,
"learning_rate": 2.4525127537175733e-05,
"loss": 0.6082788848876953,
"step": 4700
},
{
"epoch": 1.5467274503419082,
"grad_norm": 2.7400150299072266,
"learning_rate": 2.425377184413329e-05,
"loss": 0.5984983444213867,
"step": 4750
},
{
"epoch": 1.5630087919244544,
"grad_norm": 3.0426690578460693,
"learning_rate": 2.398241615109085e-05,
"loss": 0.6066116333007813,
"step": 4800
},
{
"epoch": 1.5792901335070009,
"grad_norm": 3.5095133781433105,
"learning_rate": 2.3711060458048412e-05,
"loss": 0.605382080078125,
"step": 4850
},
{
"epoch": 1.5955714750895473,
"grad_norm": 3.64323091506958,
"learning_rate": 2.343970476500597e-05,
"loss": 0.5372691726684571,
"step": 4900
},
{
"epoch": 1.6118528166720938,
"grad_norm": 6.410864353179932,
"learning_rate": 2.316834907196353e-05,
"loss": 0.4930916976928711,
"step": 4950
},
{
"epoch": 1.6281341582546403,
"grad_norm": 2.9752631187438965,
"learning_rate": 2.2896993378921092e-05,
"loss": 0.49088024139404296,
"step": 5000
},
{
"epoch": 1.6444154998371867,
"grad_norm": 2.8982131481170654,
"learning_rate": 2.262563768587865e-05,
"loss": 0.5840103912353516,
"step": 5050
},
{
"epoch": 1.660696841419733,
"grad_norm": 3.7222821712493896,
"learning_rate": 2.235428199283621e-05,
"loss": 0.5301944732666015,
"step": 5100
},
{
"epoch": 1.6769781830022794,
"grad_norm": 3.526601791381836,
"learning_rate": 2.2082926299793772e-05,
"loss": 0.4781329345703125,
"step": 5150
},
{
"epoch": 1.6932595245848256,
"grad_norm": 3.4005913734436035,
"learning_rate": 2.181157060675133e-05,
"loss": 0.5219943237304687,
"step": 5200
},
{
"epoch": 1.709540866167372,
"grad_norm": 3.9888486862182617,
"learning_rate": 2.154021491370889e-05,
"loss": 0.5756942367553711,
"step": 5250
},
{
"epoch": 1.7258222077499186,
"grad_norm": 3.6952855587005615,
"learning_rate": 2.1268859220666452e-05,
"loss": 0.5279730606079102,
"step": 5300
},
{
"epoch": 1.742103549332465,
"grad_norm": 3.1715617179870605,
"learning_rate": 2.099750352762401e-05,
"loss": 0.5441674423217774,
"step": 5350
},
{
"epoch": 1.7583848909150115,
"grad_norm": 3.5982584953308105,
"learning_rate": 2.0726147834581573e-05,
"loss": 0.46869205474853515,
"step": 5400
},
{
"epoch": 1.774666232497558,
"grad_norm": 3.594470977783203,
"learning_rate": 2.0454792141539132e-05,
"loss": 0.5004570388793945,
"step": 5450
},
{
"epoch": 1.7909475740801042,
"grad_norm": 3.198012351989746,
"learning_rate": 2.018343644849669e-05,
"loss": 0.49389095306396485,
"step": 5500
},
{
"epoch": 1.8072289156626506,
"grad_norm": 2.3895151615142822,
"learning_rate": 1.9912080755454253e-05,
"loss": 0.5188541793823243,
"step": 5550
},
{
"epoch": 1.8235102572451969,
"grad_norm": 2.874993085861206,
"learning_rate": 1.964072506241181e-05,
"loss": 0.4755914306640625,
"step": 5600
},
{
"epoch": 1.8397915988277433,
"grad_norm": 4.330140590667725,
"learning_rate": 1.936936936936937e-05,
"loss": 0.49986125946044924,
"step": 5650
},
{
"epoch": 1.8560729404102898,
"grad_norm": 3.2301809787750244,
"learning_rate": 1.9098013676326933e-05,
"loss": 0.5472452163696289,
"step": 5700
},
{
"epoch": 1.8723542819928363,
"grad_norm": 2.056736946105957,
"learning_rate": 1.883208509714534e-05,
"loss": 0.5061603164672852,
"step": 5750
},
{
"epoch": 1.8886356235753827,
"grad_norm": 4.6902031898498535,
"learning_rate": 1.85607294041029e-05,
"loss": 0.4669316101074219,
"step": 5800
},
{
"epoch": 1.904916965157929,
"grad_norm": 3.790092945098877,
"learning_rate": 1.828937371106046e-05,
"loss": 0.561137809753418,
"step": 5850
},
{
"epoch": 1.9211983067404754,
"grad_norm": 4.152039527893066,
"learning_rate": 1.801801801801802e-05,
"loss": 0.4813918304443359,
"step": 5900
},
{
"epoch": 1.9374796483230217,
"grad_norm": 3.3476598262786865,
"learning_rate": 1.774666232497558e-05,
"loss": 0.5630344390869141,
"step": 5950
},
{
"epoch": 1.9537609899055681,
"grad_norm": 4.2672810554504395,
"learning_rate": 1.747530663193314e-05,
"loss": 0.48508411407470703,
"step": 6000
},
{
"epoch": 1.9700423314881146,
"grad_norm": 4.236985206604004,
"learning_rate": 1.72039509388907e-05,
"loss": 0.5445558929443359,
"step": 6050
},
{
"epoch": 1.986323673070661,
"grad_norm": 2.686180591583252,
"learning_rate": 1.693259524584826e-05,
"loss": 0.5194969558715821,
"step": 6100
},
{
"epoch": 2.0,
"eval_bertscore_f1": 0.9755530517905858,
"eval_bleu": 0.7363057302997511,
"eval_loss": 0.3618590235710144,
"eval_meteor": 0.813260581053782,
"eval_rouge1": 0.8844645577727277,
"eval_rouge2": 0.8050353100012327,
"eval_runtime": 70.0732,
"eval_samples_per_second": 18.438,
"eval_steps_per_second": 2.312,
"step": 6142
},
{
"epoch": 2.0026050146532075,
"grad_norm": 2.022204637527466,
"learning_rate": 1.666123955280582e-05,
"loss": 0.48952743530273435,
"step": 6150
},
{
"epoch": 2.018886356235754,
"grad_norm": 4.96242094039917,
"learning_rate": 1.638988385976338e-05,
"loss": 0.5839331436157227,
"step": 6200
},
{
"epoch": 2.0351676978183004,
"grad_norm": 3.4074771404266357,
"learning_rate": 1.611852816672094e-05,
"loss": 0.5070013427734374,
"step": 6250
},
{
"epoch": 2.0514490394008464,
"grad_norm": 3.10239577293396,
"learning_rate": 1.58471724736785e-05,
"loss": 0.4913197708129883,
"step": 6300
},
{
"epoch": 2.067730380983393,
"grad_norm": 3.764558792114258,
"learning_rate": 1.557581678063606e-05,
"loss": 0.4683738327026367,
"step": 6350
},
{
"epoch": 2.0840117225659394,
"grad_norm": 4.150667667388916,
"learning_rate": 1.5304461087593617e-05,
"loss": 0.4650471878051758,
"step": 6400
},
{
"epoch": 2.100293064148486,
"grad_norm": 3.9944324493408203,
"learning_rate": 1.5033105394551178e-05,
"loss": 0.5024824905395507,
"step": 6450
},
{
"epoch": 2.1165744057310323,
"grad_norm": 2.410952568054199,
"learning_rate": 1.476174970150874e-05,
"loss": 0.5205254745483399,
"step": 6500
},
{
"epoch": 2.1328557473135787,
"grad_norm": 4.4830098152160645,
"learning_rate": 1.4490394008466299e-05,
"loss": 0.5458049011230469,
"step": 6550
},
{
"epoch": 2.149137088896125,
"grad_norm": 3.420327663421631,
"learning_rate": 1.4219038315423858e-05,
"loss": 0.544830436706543,
"step": 6600
},
{
"epoch": 2.165418430478671,
"grad_norm": 4.262825012207031,
"learning_rate": 1.394768262238142e-05,
"loss": 0.4901109313964844,
"step": 6650
},
{
"epoch": 2.1816997720612177,
"grad_norm": 2.969730854034424,
"learning_rate": 1.3676326929338979e-05,
"loss": 0.48183216094970704,
"step": 6700
},
{
"epoch": 2.197981113643764,
"grad_norm": 2.7617075443267822,
"learning_rate": 1.3404971236296538e-05,
"loss": 0.5208282470703125,
"step": 6750
},
{
"epoch": 2.2142624552263106,
"grad_norm": 2.8121178150177,
"learning_rate": 1.31336155432541e-05,
"loss": 0.47464847564697266,
"step": 6800
},
{
"epoch": 2.230543796808857,
"grad_norm": 2.1643424034118652,
"learning_rate": 1.2862259850211659e-05,
"loss": 0.5135415267944335,
"step": 6850
},
{
"epoch": 2.2468251383914035,
"grad_norm": 3.0597665309906006,
"learning_rate": 1.2590904157169217e-05,
"loss": 0.48383502960205077,
"step": 6900
},
{
"epoch": 2.26310647997395,
"grad_norm": 3.4192488193511963,
"learning_rate": 1.2319548464126778e-05,
"loss": 0.5295528411865235,
"step": 6950
},
{
"epoch": 2.2793878215564964,
"grad_norm": 3.485333204269409,
"learning_rate": 1.2048192771084338e-05,
"loss": 0.5490006637573243,
"step": 7000
},
{
"epoch": 2.295669163139043,
"grad_norm": 3.5061099529266357,
"learning_rate": 1.1776837078041899e-05,
"loss": 0.444782600402832,
"step": 7050
},
{
"epoch": 2.311950504721589,
"grad_norm": 4.059643745422363,
"learning_rate": 1.1505481384999458e-05,
"loss": 0.4735762786865234,
"step": 7100
},
{
"epoch": 2.3282318463041354,
"grad_norm": 3.1162891387939453,
"learning_rate": 1.1234125691957018e-05,
"loss": 0.5211288452148437,
"step": 7150
},
{
"epoch": 2.344513187886682,
"grad_norm": 1.198476791381836,
"learning_rate": 1.0962769998914577e-05,
"loss": 0.5121672439575196,
"step": 7200
},
{
"epoch": 2.3607945294692283,
"grad_norm": 3.9411354064941406,
"learning_rate": 1.0691414305872138e-05,
"loss": 0.5504902267456054,
"step": 7250
},
{
"epoch": 2.3770758710517748,
"grad_norm": 3.590696334838867,
"learning_rate": 1.0420058612829696e-05,
"loss": 0.4592051315307617,
"step": 7300
},
{
"epoch": 2.393357212634321,
"grad_norm": 2.1098175048828125,
"learning_rate": 1.0148702919787257e-05,
"loss": 0.4932923126220703,
"step": 7350
},
{
"epoch": 2.4096385542168672,
"grad_norm": 4.837367057800293,
"learning_rate": 9.877347226744818e-06,
"loss": 0.45726318359375,
"step": 7400
},
{
"epoch": 2.4259198957994137,
"grad_norm": 2.808544874191284,
"learning_rate": 9.605991533702376e-06,
"loss": 0.4931900787353516,
"step": 7450
},
{
"epoch": 2.44220123738196,
"grad_norm": 2.6487984657287598,
"learning_rate": 9.334635840659937e-06,
"loss": 0.4715615844726562,
"step": 7500
},
{
"epoch": 2.4584825789645066,
"grad_norm": 4.251109600067139,
"learning_rate": 9.063280147617497e-06,
"loss": 0.5373792266845703,
"step": 7550
},
{
"epoch": 2.474763920547053,
"grad_norm": 3.84010648727417,
"learning_rate": 8.791924454575056e-06,
"loss": 0.44632495880126954,
"step": 7600
},
{
"epoch": 2.4910452621295995,
"grad_norm": 1.9418392181396484,
"learning_rate": 8.520568761532617e-06,
"loss": 0.48151702880859376,
"step": 7650
},
{
"epoch": 2.507326603712146,
"grad_norm": 4.140622138977051,
"learning_rate": 8.249213068490177e-06,
"loss": 0.4063055419921875,
"step": 7700
},
{
"epoch": 2.5236079452946925,
"grad_norm": 3.0216522216796875,
"learning_rate": 7.977857375447738e-06,
"loss": 0.4796050262451172,
"step": 7750
},
{
"epoch": 2.539889286877239,
"grad_norm": 4.727103233337402,
"learning_rate": 7.706501682405297e-06,
"loss": 0.46068046569824217,
"step": 7800
},
{
"epoch": 2.556170628459785,
"grad_norm": 4.281773567199707,
"learning_rate": 7.435145989362857e-06,
"loss": 0.44071575164794924,
"step": 7850
},
{
"epoch": 2.5724519700423314,
"grad_norm": 3.134763479232788,
"learning_rate": 7.163790296320418e-06,
"loss": 0.4763399887084961,
"step": 7900
},
{
"epoch": 2.588733311624878,
"grad_norm": 3.584044933319092,
"learning_rate": 6.8924346032779764e-06,
"loss": 0.4629644012451172,
"step": 7950
},
{
"epoch": 2.6050146532074243,
"grad_norm": 2.601400852203369,
"learning_rate": 6.621078910235537e-06,
"loss": 0.4727302551269531,
"step": 8000
},
{
"epoch": 2.6212959947899708,
"grad_norm": 3.5354995727539062,
"learning_rate": 6.3497232171930975e-06,
"loss": 0.42160026550292967,
"step": 8050
},
{
"epoch": 2.6375773363725172,
"grad_norm": 2.9206888675689697,
"learning_rate": 6.078367524150657e-06,
"loss": 0.4754294204711914,
"step": 8100
},
{
"epoch": 2.6538586779550632,
"grad_norm": 2.4927732944488525,
"learning_rate": 5.807011831108217e-06,
"loss": 0.5114262390136719,
"step": 8150
},
{
"epoch": 2.6701400195376097,
"grad_norm": 4.378971099853516,
"learning_rate": 5.535656138065777e-06,
"loss": 0.5084254837036133,
"step": 8200
},
{
"epoch": 2.686421361120156,
"grad_norm": 2.4034016132354736,
"learning_rate": 5.264300445023337e-06,
"loss": 0.527303466796875,
"step": 8250
},
{
"epoch": 2.7027027027027026,
"grad_norm": 3.7141177654266357,
"learning_rate": 4.9929447519808975e-06,
"loss": 0.4662747573852539,
"step": 8300
},
{
"epoch": 2.718984044285249,
"grad_norm": 3.871277332305908,
"learning_rate": 4.721589058938457e-06,
"loss": 0.5126468276977539,
"step": 8350
},
{
"epoch": 2.7352653858677956,
"grad_norm": 2.500791072845459,
"learning_rate": 4.450233365896017e-06,
"loss": 0.47957534790039064,
"step": 8400
},
{
"epoch": 2.751546727450342,
"grad_norm": 5.441941738128662,
"learning_rate": 4.1788776728535765e-06,
"loss": 0.38029510498046876,
"step": 8450
},
{
"epoch": 2.7678280690328885,
"grad_norm": 3.3940446376800537,
"learning_rate": 3.907521979811136e-06,
"loss": 0.4626531219482422,
"step": 8500
},
{
"epoch": 2.784109410615435,
"grad_norm": 4.125059127807617,
"learning_rate": 3.6361662867686967e-06,
"loss": 0.4890303039550781,
"step": 8550
},
{
"epoch": 2.800390752197981,
"grad_norm": 2.758863687515259,
"learning_rate": 3.3648105937262564e-06,
"loss": 0.4689041519165039,
"step": 8600
},
{
"epoch": 2.8166720937805274,
"grad_norm": 4.864498138427734,
"learning_rate": 3.0934549006838165e-06,
"loss": 0.46032047271728516,
"step": 8650
},
{
"epoch": 2.832953435363074,
"grad_norm": 3.3108010292053223,
"learning_rate": 2.8220992076413766e-06,
"loss": 0.43362377166748045,
"step": 8700
},
{
"epoch": 2.8492347769456203,
"grad_norm": 2.3421084880828857,
"learning_rate": 2.5507435145989362e-06,
"loss": 0.44478134155273436,
"step": 8750
},
{
"epoch": 2.865516118528167,
"grad_norm": 3.283203601837158,
"learning_rate": 2.2793878215564963e-06,
"loss": 0.5047480392456055,
"step": 8800
},
{
"epoch": 2.8817974601107132,
"grad_norm": 2.0124731063842773,
"learning_rate": 2.0080321285140564e-06,
"loss": 0.4658950424194336,
"step": 8850
},
{
"epoch": 2.8980788016932593,
"grad_norm": 3.839552879333496,
"learning_rate": 1.7366764354716163e-06,
"loss": 0.45034191131591794,
"step": 8900
},
{
"epoch": 2.9143601432758057,
"grad_norm": 4.701524257659912,
"learning_rate": 1.4653207424291762e-06,
"loss": 0.47517498016357423,
"step": 8950
},
{
"epoch": 2.930641484858352,
"grad_norm": 6.58011531829834,
"learning_rate": 1.1939650493867363e-06,
"loss": 0.44451316833496096,
"step": 9000
},
{
"epoch": 2.9469228264408986,
"grad_norm": 2.9627132415771484,
"learning_rate": 9.226093563442963e-07,
"loss": 0.41320926666259766,
"step": 9050
},
{
"epoch": 2.963204168023445,
"grad_norm": 3.003753185272217,
"learning_rate": 6.51253663301856e-07,
"loss": 0.3974274444580078,
"step": 9100
},
{
"epoch": 2.9794855096059916,
"grad_norm": 2.0012876987457275,
"learning_rate": 3.7989797025941607e-07,
"loss": 0.42885406494140627,
"step": 9150
},
{
"epoch": 2.995766851188538,
"grad_norm": 3.7651121616363525,
"learning_rate": 1.0854227721697602e-07,
"loss": 0.4800850296020508,
"step": 9200
},
{
"epoch": 3.0,
"eval_bertscore_f1": 0.9785511039727982,
"eval_bleu": 0.7645620244248046,
"eval_loss": 0.3346184194087982,
"eval_meteor": 0.8355226256477348,
"eval_rouge1": 0.8968326891869934,
"eval_rouge2": 0.8250429516845066,
"eval_runtime": 68.0941,
"eval_samples_per_second": 18.974,
"eval_steps_per_second": 2.379,
"step": 9213
}
],
"logging_steps": 50,
"max_steps": 9213,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.263887217557504e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}