{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 9213, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0162813415825464, "grad_norm": 1.3799059391021729, "learning_rate": 4.974492564854011e-05, "loss": 4.366279296875, "step": 50 }, { "epoch": 0.0325626831650928, "grad_norm": null, "learning_rate": 4.9473569955497666e-05, "loss": 3.2682159423828123, "step": 100 }, { "epoch": 0.04884402474763921, "grad_norm": 1.1759626865386963, "learning_rate": 4.9207641376316076e-05, "loss": 2.121284637451172, "step": 150 }, { "epoch": 0.0651253663301856, "grad_norm": 1.299229383468628, "learning_rate": 4.8936285683273635e-05, "loss": 1.8733770751953125, "step": 200 }, { "epoch": 0.08140670791273201, "grad_norm": 1.3079231977462769, "learning_rate": 4.8664929990231194e-05, "loss": 1.8073320007324218, "step": 250 }, { "epoch": 0.09768804949527841, "grad_norm": 1.5056711435317993, "learning_rate": 4.839357429718876e-05, "loss": 1.7036862182617187, "step": 300 }, { "epoch": 0.11396939107782482, "grad_norm": 1.5221471786499023, "learning_rate": 4.812221860414632e-05, "loss": 1.6605093383789062, "step": 350 }, { "epoch": 0.1302507326603712, "grad_norm": 1.4612085819244385, "learning_rate": 4.785086291110388e-05, "loss": 1.582379608154297, "step": 400 }, { "epoch": 0.14653207424291761, "grad_norm": 1.3166944980621338, "learning_rate": 4.7579507218061436e-05, "loss": 1.5711520385742188, "step": 450 }, { "epoch": 0.16281341582546402, "grad_norm": 1.8040547370910645, "learning_rate": 4.7308151525018995e-05, "loss": 1.436278076171875, "step": 500 }, { "epoch": 0.17909475740801042, "grad_norm": 1.7718613147735596, "learning_rate": 4.7036795831976553e-05, "loss": 1.4956285095214843, "step": 550 }, { "epoch": 0.19537609899055683, "grad_norm": 2.499027967453003, "learning_rate": 4.676544013893412e-05, "loss": 1.3423948669433594, "step": 600 }, 
{ "epoch": 0.21165744057310323, "grad_norm": 1.7684857845306396, "learning_rate": 4.649408444589168e-05, "loss": 1.358212432861328, "step": 650 }, { "epoch": 0.22793878215564964, "grad_norm": 1.8858190774917603, "learning_rate": 4.622815586671009e-05, "loss": 1.3155609130859376, "step": 700 }, { "epoch": 0.24422012373819602, "grad_norm": 1.708154559135437, "learning_rate": 4.5956800173667646e-05, "loss": 1.204995346069336, "step": 750 }, { "epoch": 0.2605014653207424, "grad_norm": 2.377797842025757, "learning_rate": 4.5685444480625205e-05, "loss": 1.2155376434326173, "step": 800 }, { "epoch": 0.2767828069032888, "grad_norm": 2.3532145023345947, "learning_rate": 4.5414088787582764e-05, "loss": 1.2571015930175782, "step": 850 }, { "epoch": 0.29306414848583523, "grad_norm": 2.745908498764038, "learning_rate": 4.514273309454032e-05, "loss": 1.1259475708007813, "step": 900 }, { "epoch": 0.30934549006838163, "grad_norm": 4.180660247802734, "learning_rate": 4.487137740149789e-05, "loss": 1.1778811645507812, "step": 950 }, { "epoch": 0.32562683165092804, "grad_norm": 2.554922103881836, "learning_rate": 4.460002170845545e-05, "loss": 1.144913787841797, "step": 1000 }, { "epoch": 0.34190817323347444, "grad_norm": 2.6831798553466797, "learning_rate": 4.4328666015413006e-05, "loss": 1.1192340850830078, "step": 1050 }, { "epoch": 0.35818951481602085, "grad_norm": 2.5000758171081543, "learning_rate": 4.4057310322370565e-05, "loss": 1.0886085510253907, "step": 1100 }, { "epoch": 0.37447085639856725, "grad_norm": 2.5406346321105957, "learning_rate": 4.3785954629328124e-05, "loss": 1.0647865295410157, "step": 1150 }, { "epoch": 0.39075219798111366, "grad_norm": 2.5966973304748535, "learning_rate": 4.351459893628568e-05, "loss": 1.0138130187988281, "step": 1200 }, { "epoch": 0.40703353956366006, "grad_norm": 2.9423012733459473, "learning_rate": 4.324324324324325e-05, "loss": 0.971071548461914, "step": 1250 }, { "epoch": 0.42331488114620647, "grad_norm": 2.9763288497924805, 
"learning_rate": 4.297188755020081e-05, "loss": 0.9740264129638672, "step": 1300 }, { "epoch": 0.4395962227287529, "grad_norm": 2.0831127166748047, "learning_rate": 4.270595897101922e-05, "loss": 1.0225084686279298, "step": 1350 }, { "epoch": 0.4558775643112993, "grad_norm": 3.093662977218628, "learning_rate": 4.2434603277976776e-05, "loss": 0.9085057830810547, "step": 1400 }, { "epoch": 0.4721589058938456, "grad_norm": 3.1048061847686768, "learning_rate": 4.2163247584934335e-05, "loss": 0.9657279968261718, "step": 1450 }, { "epoch": 0.48844024747639203, "grad_norm": 89.80404663085938, "learning_rate": 4.189189189189189e-05, "loss": 0.9195979309082031, "step": 1500 }, { "epoch": 0.5047215890589385, "grad_norm": 2.7518820762634277, "learning_rate": 4.162053619884945e-05, "loss": 0.8844217681884765, "step": 1550 }, { "epoch": 0.5210029306414848, "grad_norm": 2.216895818710327, "learning_rate": 4.134918050580701e-05, "loss": 0.9437327575683594, "step": 1600 }, { "epoch": 0.5372842722240313, "grad_norm": 2.756894826889038, "learning_rate": 4.1077824812764577e-05, "loss": 0.9072888946533203, "step": 1650 }, { "epoch": 0.5535656138065776, "grad_norm": 2.630861759185791, "learning_rate": 4.0806469119722135e-05, "loss": 0.9133613586425782, "step": 1700 }, { "epoch": 0.5698469553891241, "grad_norm": 2.3018958568573, "learning_rate": 4.0535113426679694e-05, "loss": 0.9179753875732422, "step": 1750 }, { "epoch": 0.5861282969716705, "grad_norm": 2.2267212867736816, "learning_rate": 4.026375773363725e-05, "loss": 0.8736819458007813, "step": 1800 }, { "epoch": 0.6024096385542169, "grad_norm": 3.817021369934082, "learning_rate": 3.999240204059481e-05, "loss": 0.8818047332763672, "step": 1850 }, { "epoch": 0.6186909801367633, "grad_norm": 2.8244123458862305, "learning_rate": 3.972104634755237e-05, "loss": 0.8710990142822266, "step": 1900 }, { "epoch": 0.6349723217193096, "grad_norm": 2.787409782409668, "learning_rate": 3.9449690654509936e-05, "loss": 0.791876220703125, "step": 
1950 }, { "epoch": 0.6512536633018561, "grad_norm": 2.5339832305908203, "learning_rate": 3.9178334961467495e-05, "loss": 0.8330724334716797, "step": 2000 }, { "epoch": 0.6675350048844024, "grad_norm": 2.2571518421173096, "learning_rate": 3.8906979268425054e-05, "loss": 0.8065113067626953, "step": 2050 }, { "epoch": 0.6838163464669489, "grad_norm": 3.3255224227905273, "learning_rate": 3.863562357538261e-05, "loss": 0.7839543151855469, "step": 2100 }, { "epoch": 0.7000976880494952, "grad_norm": 2.493654727935791, "learning_rate": 3.836426788234017e-05, "loss": 0.7902137756347656, "step": 2150 }, { "epoch": 0.7163790296320417, "grad_norm": 2.943366527557373, "learning_rate": 3.809291218929774e-05, "loss": 0.9532376861572266, "step": 2200 }, { "epoch": 0.732660371214588, "grad_norm": 2.404705762863159, "learning_rate": 3.7821556496255296e-05, "loss": 0.8227187347412109, "step": 2250 }, { "epoch": 0.7489417127971345, "grad_norm": 8.06905460357666, "learning_rate": 3.7550200803212855e-05, "loss": 0.7640556335449219, "step": 2300 }, { "epoch": 0.7652230543796809, "grad_norm": 3.540977954864502, "learning_rate": 3.7278845110170414e-05, "loss": 0.8362091064453125, "step": 2350 }, { "epoch": 0.7815043959622273, "grad_norm": 2.233323574066162, "learning_rate": 3.700748941712797e-05, "loss": 0.6893608093261718, "step": 2400 }, { "epoch": 0.7977857375447737, "grad_norm": 2.947315216064453, "learning_rate": 3.673613372408553e-05, "loss": 0.7564961242675782, "step": 2450 }, { "epoch": 0.8140670791273201, "grad_norm": 2.9839603900909424, "learning_rate": 3.64647780310431e-05, "loss": 0.7726463317871094, "step": 2500 }, { "epoch": 0.8303484207098665, "grad_norm": 2.638998508453369, "learning_rate": 3.6193422338000656e-05, "loss": 0.7850227355957031, "step": 2550 }, { "epoch": 0.8466297622924129, "grad_norm": 2.203768730163574, "learning_rate": 3.5922066644958215e-05, "loss": 0.7540821838378906, "step": 2600 }, { "epoch": 0.8629111038749593, "grad_norm": 2.7057082653045654, 
"learning_rate": 3.565071095191577e-05, "loss": 0.6677760314941407, "step": 2650 }, { "epoch": 0.8791924454575057, "grad_norm": 3.2892088890075684, "learning_rate": 3.537935525887333e-05, "loss": 0.74295654296875, "step": 2700 }, { "epoch": 0.8954737870400521, "grad_norm": 2.8778061866760254, "learning_rate": 3.510799956583089e-05, "loss": 0.7150550842285156, "step": 2750 }, { "epoch": 0.9117551286225986, "grad_norm": 1.9023234844207764, "learning_rate": 3.483664387278846e-05, "loss": 0.7367278289794922, "step": 2800 }, { "epoch": 0.9280364702051449, "grad_norm": 3.3899879455566406, "learning_rate": 3.4565288179746015e-05, "loss": 0.7095525360107422, "step": 2850 }, { "epoch": 0.9443178117876913, "grad_norm": 3.202036142349243, "learning_rate": 3.4293932486703574e-05, "loss": 0.7237194061279297, "step": 2900 }, { "epoch": 0.9605991533702377, "grad_norm": 2.44712233543396, "learning_rate": 3.402257679366113e-05, "loss": 0.710773696899414, "step": 2950 }, { "epoch": 0.9768804949527841, "grad_norm": 3.5873775482177734, "learning_rate": 3.375122110061869e-05, "loss": 0.6593586730957032, "step": 3000 }, { "epoch": 0.9931618365353305, "grad_norm": 2.8714234828948975, "learning_rate": 3.347986540757626e-05, "loss": 0.7627605438232422, "step": 3050 }, { "epoch": 1.0, "eval_bertscore_f1": 0.9653369394064688, "eval_bleu": 0.6270834635129311, "eval_loss": 0.48991522192955017, "eval_meteor": 0.7251021230424122, "eval_rouge1": 0.8465042416762141, "eval_rouge2": 0.738163460778114, "eval_runtime": 68.0737, "eval_samples_per_second": 18.979, "eval_steps_per_second": 2.38, "step": 3071 }, { "epoch": 1.009443178117877, "grad_norm": 3.2640202045440674, "learning_rate": 3.3208509714533816e-05, "loss": 0.5927775573730468, "step": 3100 }, { "epoch": 1.0257245197004232, "grad_norm": 3.130765914916992, "learning_rate": 3.2937154021491375e-05, "loss": 0.5853068161010743, "step": 3150 }, { "epoch": 1.0420058612829697, "grad_norm": 3.2238473892211914, "learning_rate": 3.2665798328448934e-05, 
"loss": 0.6931375885009765, "step": 3200 }, { "epoch": 1.0582872028655161, "grad_norm": 4.1798176765441895, "learning_rate": 3.239444263540649e-05, "loss": 0.6535150146484375, "step": 3250 }, { "epoch": 1.0745685444480626, "grad_norm": 3.4835116863250732, "learning_rate": 3.212308694236405e-05, "loss": 0.6570293426513671, "step": 3300 }, { "epoch": 1.0908498860306088, "grad_norm": 3.2468245029449463, "learning_rate": 3.185173124932162e-05, "loss": 0.6235344696044922, "step": 3350 }, { "epoch": 1.1071312276131553, "grad_norm": 2.503023862838745, "learning_rate": 3.1580375556279176e-05, "loss": 0.6021703720092774, "step": 3400 }, { "epoch": 1.1234125691957018, "grad_norm": 3.5487520694732666, "learning_rate": 3.1309019863236735e-05, "loss": 0.6459141540527343, "step": 3450 }, { "epoch": 1.1396939107782482, "grad_norm": 2.8496859073638916, "learning_rate": 3.1037664170194294e-05, "loss": 0.5954633712768554, "step": 3500 }, { "epoch": 1.1559752523607945, "grad_norm": 2.746445894241333, "learning_rate": 3.076630847715185e-05, "loss": 0.5743134689331054, "step": 3550 }, { "epoch": 1.172256593943341, "grad_norm": 3.843780517578125, "learning_rate": 3.0494952784109408e-05, "loss": 0.7025726318359375, "step": 3600 }, { "epoch": 1.1885379355258874, "grad_norm": 2.3990111351013184, "learning_rate": 3.0223597091066974e-05, "loss": 0.6482646942138672, "step": 3650 }, { "epoch": 1.2048192771084336, "grad_norm": 3.495655059814453, "learning_rate": 2.9952241398024532e-05, "loss": 0.6225572967529297, "step": 3700 }, { "epoch": 1.22110061869098, "grad_norm": 3.0918631553649902, "learning_rate": 2.968088570498209e-05, "loss": 0.6018490982055664, "step": 3750 }, { "epoch": 1.2373819602735265, "grad_norm": 3.54016375541687, "learning_rate": 2.940953001193965e-05, "loss": 0.6204871749877929, "step": 3800 }, { "epoch": 1.253663301856073, "grad_norm": 3.330631971359253, "learning_rate": 2.913817431889721e-05, "loss": 0.5625830459594726, "step": 3850 }, { "epoch": 1.2699446434386195, 
"grad_norm": 3.4091968536376953, "learning_rate": 2.8866818625854774e-05, "loss": 0.6275486755371094, "step": 3900 }, { "epoch": 1.2862259850211657, "grad_norm": 3.535207748413086, "learning_rate": 2.8595462932812333e-05, "loss": 0.6113796997070312, "step": 3950 }, { "epoch": 1.3025073266037122, "grad_norm": 2.739208459854126, "learning_rate": 2.8324107239769892e-05, "loss": 0.6166405487060547, "step": 4000 }, { "epoch": 1.3187886681862586, "grad_norm": 2.3887178897857666, "learning_rate": 2.805275154672745e-05, "loss": 0.6348526000976562, "step": 4050 }, { "epoch": 1.3350700097688049, "grad_norm": 3.2300209999084473, "learning_rate": 2.778139585368501e-05, "loss": 0.6592056274414062, "step": 4100 }, { "epoch": 1.3513513513513513, "grad_norm": 2.4417901039123535, "learning_rate": 2.751004016064257e-05, "loss": 0.5736191177368164, "step": 4150 }, { "epoch": 1.3676326929338978, "grad_norm": 4.1886467933654785, "learning_rate": 2.7238684467600134e-05, "loss": 0.5781734466552735, "step": 4200 }, { "epoch": 1.3839140345164442, "grad_norm": 2.7025551795959473, "learning_rate": 2.6967328774557693e-05, "loss": 0.5421427917480469, "step": 4250 }, { "epoch": 1.4001953760989905, "grad_norm": 3.4467735290527344, "learning_rate": 2.6695973081515252e-05, "loss": 0.6328504180908203, "step": 4300 }, { "epoch": 1.416476717681537, "grad_norm": 2.252255916595459, "learning_rate": 2.642461738847281e-05, "loss": 0.565279884338379, "step": 4350 }, { "epoch": 1.4327580592640834, "grad_norm": 2.3594324588775635, "learning_rate": 2.615326169543037e-05, "loss": 0.585950927734375, "step": 4400 }, { "epoch": 1.4490394008466296, "grad_norm": 3.1787843704223633, "learning_rate": 2.5881906002387928e-05, "loss": 0.6461568450927735, "step": 4450 }, { "epoch": 1.465320742429176, "grad_norm": 9.052631378173828, "learning_rate": 2.5610550309345494e-05, "loss": 0.5787173461914062, "step": 4500 }, { "epoch": 1.4816020840117226, "grad_norm": 3.1000287532806396, "learning_rate": 2.5339194616303053e-05, 
"loss": 0.5753350830078126, "step": 4550 }, { "epoch": 1.497883425594269, "grad_norm": 2.160932779312134, "learning_rate": 2.506783892326061e-05, "loss": 0.6055181503295899, "step": 4600 }, { "epoch": 1.5141647671768155, "grad_norm": 5.498105525970459, "learning_rate": 2.479648323021817e-05, "loss": 0.5424030303955079, "step": 4650 }, { "epoch": 1.530446108759362, "grad_norm": 2.4782474040985107, "learning_rate": 2.4525127537175733e-05, "loss": 0.6082788848876953, "step": 4700 }, { "epoch": 1.5467274503419082, "grad_norm": 2.7400150299072266, "learning_rate": 2.425377184413329e-05, "loss": 0.5984983444213867, "step": 4750 }, { "epoch": 1.5630087919244544, "grad_norm": 3.0426690578460693, "learning_rate": 2.398241615109085e-05, "loss": 0.6066116333007813, "step": 4800 }, { "epoch": 1.5792901335070009, "grad_norm": 3.5095133781433105, "learning_rate": 2.3711060458048412e-05, "loss": 0.605382080078125, "step": 4850 }, { "epoch": 1.5955714750895473, "grad_norm": 3.64323091506958, "learning_rate": 2.343970476500597e-05, "loss": 0.5372691726684571, "step": 4900 }, { "epoch": 1.6118528166720938, "grad_norm": 6.410864353179932, "learning_rate": 2.316834907196353e-05, "loss": 0.4930916976928711, "step": 4950 }, { "epoch": 1.6281341582546403, "grad_norm": 2.9752631187438965, "learning_rate": 2.2896993378921092e-05, "loss": 0.49088024139404296, "step": 5000 }, { "epoch": 1.6444154998371867, "grad_norm": 2.8982131481170654, "learning_rate": 2.262563768587865e-05, "loss": 0.5840103912353516, "step": 5050 }, { "epoch": 1.660696841419733, "grad_norm": 3.7222821712493896, "learning_rate": 2.235428199283621e-05, "loss": 0.5301944732666015, "step": 5100 }, { "epoch": 1.6769781830022794, "grad_norm": 3.526601791381836, "learning_rate": 2.2082926299793772e-05, "loss": 0.4781329345703125, "step": 5150 }, { "epoch": 1.6932595245848256, "grad_norm": 3.4005913734436035, "learning_rate": 2.181157060675133e-05, "loss": 0.5219943237304687, "step": 5200 }, { "epoch": 1.709540866167372, 
"grad_norm": 3.9888486862182617, "learning_rate": 2.154021491370889e-05, "loss": 0.5756942367553711, "step": 5250 }, { "epoch": 1.7258222077499186, "grad_norm": 3.6952855587005615, "learning_rate": 2.1268859220666452e-05, "loss": 0.5279730606079102, "step": 5300 }, { "epoch": 1.742103549332465, "grad_norm": 3.1715617179870605, "learning_rate": 2.099750352762401e-05, "loss": 0.5441674423217774, "step": 5350 }, { "epoch": 1.7583848909150115, "grad_norm": 3.5982584953308105, "learning_rate": 2.0726147834581573e-05, "loss": 0.46869205474853515, "step": 5400 }, { "epoch": 1.774666232497558, "grad_norm": 3.594470977783203, "learning_rate": 2.0454792141539132e-05, "loss": 0.5004570388793945, "step": 5450 }, { "epoch": 1.7909475740801042, "grad_norm": 3.198012351989746, "learning_rate": 2.018343644849669e-05, "loss": 0.49389095306396485, "step": 5500 }, { "epoch": 1.8072289156626506, "grad_norm": 2.3895151615142822, "learning_rate": 1.9912080755454253e-05, "loss": 0.5188541793823243, "step": 5550 }, { "epoch": 1.8235102572451969, "grad_norm": 2.874993085861206, "learning_rate": 1.964072506241181e-05, "loss": 0.4755914306640625, "step": 5600 }, { "epoch": 1.8397915988277433, "grad_norm": 4.330140590667725, "learning_rate": 1.936936936936937e-05, "loss": 0.49986125946044924, "step": 5650 }, { "epoch": 1.8560729404102898, "grad_norm": 3.2301809787750244, "learning_rate": 1.9098013676326933e-05, "loss": 0.5472452163696289, "step": 5700 }, { "epoch": 1.8723542819928363, "grad_norm": 2.056736946105957, "learning_rate": 1.883208509714534e-05, "loss": 0.5061603164672852, "step": 5750 }, { "epoch": 1.8886356235753827, "grad_norm": 4.6902031898498535, "learning_rate": 1.85607294041029e-05, "loss": 0.4669316101074219, "step": 5800 }, { "epoch": 1.904916965157929, "grad_norm": 3.790092945098877, "learning_rate": 1.828937371106046e-05, "loss": 0.561137809753418, "step": 5850 }, { "epoch": 1.9211983067404754, "grad_norm": 4.152039527893066, "learning_rate": 1.801801801801802e-05, 
"loss": 0.4813918304443359, "step": 5900 }, { "epoch": 1.9374796483230217, "grad_norm": 3.3476598262786865, "learning_rate": 1.774666232497558e-05, "loss": 0.5630344390869141, "step": 5950 }, { "epoch": 1.9537609899055681, "grad_norm": 4.2672810554504395, "learning_rate": 1.747530663193314e-05, "loss": 0.48508411407470703, "step": 6000 }, { "epoch": 1.9700423314881146, "grad_norm": 4.236985206604004, "learning_rate": 1.72039509388907e-05, "loss": 0.5445558929443359, "step": 6050 }, { "epoch": 1.986323673070661, "grad_norm": 2.686180591583252, "learning_rate": 1.693259524584826e-05, "loss": 0.5194969558715821, "step": 6100 }, { "epoch": 2.0, "eval_bertscore_f1": 0.9755530517905858, "eval_bleu": 0.7363057302997511, "eval_loss": 0.3618590235710144, "eval_meteor": 0.813260581053782, "eval_rouge1": 0.8844645577727277, "eval_rouge2": 0.8050353100012327, "eval_runtime": 70.0732, "eval_samples_per_second": 18.438, "eval_steps_per_second": 2.312, "step": 6142 }, { "epoch": 2.0026050146532075, "grad_norm": 2.022204637527466, "learning_rate": 1.666123955280582e-05, "loss": 0.48952743530273435, "step": 6150 }, { "epoch": 2.018886356235754, "grad_norm": 4.96242094039917, "learning_rate": 1.638988385976338e-05, "loss": 0.5839331436157227, "step": 6200 }, { "epoch": 2.0351676978183004, "grad_norm": 3.4074771404266357, "learning_rate": 1.611852816672094e-05, "loss": 0.5070013427734374, "step": 6250 }, { "epoch": 2.0514490394008464, "grad_norm": 3.10239577293396, "learning_rate": 1.58471724736785e-05, "loss": 0.4913197708129883, "step": 6300 }, { "epoch": 2.067730380983393, "grad_norm": 3.764558792114258, "learning_rate": 1.557581678063606e-05, "loss": 0.4683738327026367, "step": 6350 }, { "epoch": 2.0840117225659394, "grad_norm": 4.150667667388916, "learning_rate": 1.5304461087593617e-05, "loss": 0.4650471878051758, "step": 6400 }, { "epoch": 2.100293064148486, "grad_norm": 3.9944324493408203, "learning_rate": 1.5033105394551178e-05, "loss": 0.5024824905395507, "step": 6450 }, { 
"epoch": 2.1165744057310323, "grad_norm": 2.410952568054199, "learning_rate": 1.476174970150874e-05, "loss": 0.5205254745483399, "step": 6500 }, { "epoch": 2.1328557473135787, "grad_norm": 4.4830098152160645, "learning_rate": 1.4490394008466299e-05, "loss": 0.5458049011230469, "step": 6550 }, { "epoch": 2.149137088896125, "grad_norm": 3.420327663421631, "learning_rate": 1.4219038315423858e-05, "loss": 0.544830436706543, "step": 6600 }, { "epoch": 2.165418430478671, "grad_norm": 4.262825012207031, "learning_rate": 1.394768262238142e-05, "loss": 0.4901109313964844, "step": 6650 }, { "epoch": 2.1816997720612177, "grad_norm": 2.969730854034424, "learning_rate": 1.3676326929338979e-05, "loss": 0.48183216094970704, "step": 6700 }, { "epoch": 2.197981113643764, "grad_norm": 2.7617075443267822, "learning_rate": 1.3404971236296538e-05, "loss": 0.5208282470703125, "step": 6750 }, { "epoch": 2.2142624552263106, "grad_norm": 2.8121178150177, "learning_rate": 1.31336155432541e-05, "loss": 0.47464847564697266, "step": 6800 }, { "epoch": 2.230543796808857, "grad_norm": 2.1643424034118652, "learning_rate": 1.2862259850211659e-05, "loss": 0.5135415267944335, "step": 6850 }, { "epoch": 2.2468251383914035, "grad_norm": 3.0597665309906006, "learning_rate": 1.2590904157169217e-05, "loss": 0.48383502960205077, "step": 6900 }, { "epoch": 2.26310647997395, "grad_norm": 3.4192488193511963, "learning_rate": 1.2319548464126778e-05, "loss": 0.5295528411865235, "step": 6950 }, { "epoch": 2.2793878215564964, "grad_norm": 3.485333204269409, "learning_rate": 1.2048192771084338e-05, "loss": 0.5490006637573243, "step": 7000 }, { "epoch": 2.295669163139043, "grad_norm": 3.5061099529266357, "learning_rate": 1.1776837078041899e-05, "loss": 0.444782600402832, "step": 7050 }, { "epoch": 2.311950504721589, "grad_norm": 4.059643745422363, "learning_rate": 1.1505481384999458e-05, "loss": 0.4735762786865234, "step": 7100 }, { "epoch": 2.3282318463041354, "grad_norm": 3.1162891387939453, "learning_rate": 
1.1234125691957018e-05, "loss": 0.5211288452148437, "step": 7150 }, { "epoch": 2.344513187886682, "grad_norm": 1.198476791381836, "learning_rate": 1.0962769998914577e-05, "loss": 0.5121672439575196, "step": 7200 }, { "epoch": 2.3607945294692283, "grad_norm": 3.9411354064941406, "learning_rate": 1.0691414305872138e-05, "loss": 0.5504902267456054, "step": 7250 }, { "epoch": 2.3770758710517748, "grad_norm": 3.590696334838867, "learning_rate": 1.0420058612829696e-05, "loss": 0.4592051315307617, "step": 7300 }, { "epoch": 2.393357212634321, "grad_norm": 2.1098175048828125, "learning_rate": 1.0148702919787257e-05, "loss": 0.4932923126220703, "step": 7350 }, { "epoch": 2.4096385542168672, "grad_norm": 4.837367057800293, "learning_rate": 9.877347226744818e-06, "loss": 0.45726318359375, "step": 7400 }, { "epoch": 2.4259198957994137, "grad_norm": 2.808544874191284, "learning_rate": 9.605991533702376e-06, "loss": 0.4931900787353516, "step": 7450 }, { "epoch": 2.44220123738196, "grad_norm": 2.6487984657287598, "learning_rate": 9.334635840659937e-06, "loss": 0.4715615844726562, "step": 7500 }, { "epoch": 2.4584825789645066, "grad_norm": 4.251109600067139, "learning_rate": 9.063280147617497e-06, "loss": 0.5373792266845703, "step": 7550 }, { "epoch": 2.474763920547053, "grad_norm": 3.84010648727417, "learning_rate": 8.791924454575056e-06, "loss": 0.44632495880126954, "step": 7600 }, { "epoch": 2.4910452621295995, "grad_norm": 1.9418392181396484, "learning_rate": 8.520568761532617e-06, "loss": 0.48151702880859376, "step": 7650 }, { "epoch": 2.507326603712146, "grad_norm": 4.140622138977051, "learning_rate": 8.249213068490177e-06, "loss": 0.4063055419921875, "step": 7700 }, { "epoch": 2.5236079452946925, "grad_norm": 3.0216522216796875, "learning_rate": 7.977857375447738e-06, "loss": 0.4796050262451172, "step": 7750 }, { "epoch": 2.539889286877239, "grad_norm": 4.727103233337402, "learning_rate": 7.706501682405297e-06, "loss": 0.46068046569824217, "step": 7800 }, { "epoch": 
2.556170628459785, "grad_norm": 4.281773567199707, "learning_rate": 7.435145989362857e-06, "loss": 0.44071575164794924, "step": 7850 }, { "epoch": 2.5724519700423314, "grad_norm": 3.134763479232788, "learning_rate": 7.163790296320418e-06, "loss": 0.4763399887084961, "step": 7900 }, { "epoch": 2.588733311624878, "grad_norm": 3.584044933319092, "learning_rate": 6.8924346032779764e-06, "loss": 0.4629644012451172, "step": 7950 }, { "epoch": 2.6050146532074243, "grad_norm": 2.601400852203369, "learning_rate": 6.621078910235537e-06, "loss": 0.4727302551269531, "step": 8000 }, { "epoch": 2.6212959947899708, "grad_norm": 3.5354995727539062, "learning_rate": 6.3497232171930975e-06, "loss": 0.42160026550292967, "step": 8050 }, { "epoch": 2.6375773363725172, "grad_norm": 2.9206888675689697, "learning_rate": 6.078367524150657e-06, "loss": 0.4754294204711914, "step": 8100 }, { "epoch": 2.6538586779550632, "grad_norm": 2.4927732944488525, "learning_rate": 5.807011831108217e-06, "loss": 0.5114262390136719, "step": 8150 }, { "epoch": 2.6701400195376097, "grad_norm": 4.378971099853516, "learning_rate": 5.535656138065777e-06, "loss": 0.5084254837036133, "step": 8200 }, { "epoch": 2.686421361120156, "grad_norm": 2.4034016132354736, "learning_rate": 5.264300445023337e-06, "loss": 0.527303466796875, "step": 8250 }, { "epoch": 2.7027027027027026, "grad_norm": 3.7141177654266357, "learning_rate": 4.9929447519808975e-06, "loss": 0.4662747573852539, "step": 8300 }, { "epoch": 2.718984044285249, "grad_norm": 3.871277332305908, "learning_rate": 4.721589058938457e-06, "loss": 0.5126468276977539, "step": 8350 }, { "epoch": 2.7352653858677956, "grad_norm": 2.500791072845459, "learning_rate": 4.450233365896017e-06, "loss": 0.47957534790039064, "step": 8400 }, { "epoch": 2.751546727450342, "grad_norm": 5.441941738128662, "learning_rate": 4.1788776728535765e-06, "loss": 0.38029510498046876, "step": 8450 }, { "epoch": 2.7678280690328885, "grad_norm": 3.3940446376800537, "learning_rate": 
3.907521979811136e-06, "loss": 0.4626531219482422, "step": 8500 }, { "epoch": 2.784109410615435, "grad_norm": 4.125059127807617, "learning_rate": 3.6361662867686967e-06, "loss": 0.4890303039550781, "step": 8550 }, { "epoch": 2.800390752197981, "grad_norm": 2.758863687515259, "learning_rate": 3.3648105937262564e-06, "loss": 0.4689041519165039, "step": 8600 }, { "epoch": 2.8166720937805274, "grad_norm": 4.864498138427734, "learning_rate": 3.0934549006838165e-06, "loss": 0.46032047271728516, "step": 8650 }, { "epoch": 2.832953435363074, "grad_norm": 3.3108010292053223, "learning_rate": 2.8220992076413766e-06, "loss": 0.43362377166748045, "step": 8700 }, { "epoch": 2.8492347769456203, "grad_norm": 2.3421084880828857, "learning_rate": 2.5507435145989362e-06, "loss": 0.44478134155273436, "step": 8750 }, { "epoch": 2.865516118528167, "grad_norm": 3.283203601837158, "learning_rate": 2.2793878215564963e-06, "loss": 0.5047480392456055, "step": 8800 }, { "epoch": 2.8817974601107132, "grad_norm": 2.0124731063842773, "learning_rate": 2.0080321285140564e-06, "loss": 0.4658950424194336, "step": 8850 }, { "epoch": 2.8980788016932593, "grad_norm": 3.839552879333496, "learning_rate": 1.7366764354716163e-06, "loss": 0.45034191131591794, "step": 8900 }, { "epoch": 2.9143601432758057, "grad_norm": 4.701524257659912, "learning_rate": 1.4653207424291762e-06, "loss": 0.47517498016357423, "step": 8950 }, { "epoch": 2.930641484858352, "grad_norm": 6.58011531829834, "learning_rate": 1.1939650493867363e-06, "loss": 0.44451316833496096, "step": 9000 }, { "epoch": 2.9469228264408986, "grad_norm": 2.9627132415771484, "learning_rate": 9.226093563442963e-07, "loss": 0.41320926666259766, "step": 9050 }, { "epoch": 2.963204168023445, "grad_norm": 3.003753185272217, "learning_rate": 6.51253663301856e-07, "loss": 0.3974274444580078, "step": 9100 }, { "epoch": 2.9794855096059916, "grad_norm": 2.0012876987457275, "learning_rate": 3.7989797025941607e-07, "loss": 0.42885406494140627, "step": 9150 }, { 
"epoch": 2.995766851188538, "grad_norm": 3.7651121616363525, "learning_rate": 1.0854227721697602e-07, "loss": 0.4800850296020508, "step": 9200 }, { "epoch": 3.0, "eval_bertscore_f1": 0.9785511039727982, "eval_bleu": 0.7645620244248046, "eval_loss": 0.3346184194087982, "eval_meteor": 0.8355226256477348, "eval_rouge1": 0.8968326891869934, "eval_rouge2": 0.8250429516845066, "eval_runtime": 68.0941, "eval_samples_per_second": 18.974, "eval_steps_per_second": 2.379, "step": 9213 } ], "logging_steps": 50, "max_steps": 9213, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.263887217557504e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }