diff --git "a/checkpoint-7665/trainer_state.json" "b/checkpoint-7665/trainer_state.json" --- "a/checkpoint-7665/trainer_state.json" +++ "b/checkpoint-7665/trainer_state.json" @@ -11,5749 +11,5749 @@ "log_history": [ { "epoch": 0.045662100456621, - "grad_norm": 3.6685774326324463, + "grad_norm": 7.013392925262451, "learning_rate": 4.994129158512721e-05, - "loss": 3.1406, + "loss": 2.6272, "step": 10 }, { "epoch": 0.091324200913242, - "grad_norm": 1.5006240606307983, + "grad_norm": 2.9674174785614014, "learning_rate": 4.987606001304632e-05, - "loss": 1.6914, + "loss": 0.6549, "step": 20 }, { "epoch": 0.136986301369863, - "grad_norm": 1.5749850273132324, + "grad_norm": 2.2170982360839844, "learning_rate": 4.981082844096543e-05, - "loss": 1.0225, + "loss": 0.274, "step": 30 }, { "epoch": 0.182648401826484, - "grad_norm": 1.263222575187683, + "grad_norm": 1.2071510553359985, "learning_rate": 4.974559686888454e-05, - "loss": 0.5903, + "loss": 0.1543, "step": 40 }, { "epoch": 0.228310502283105, - "grad_norm": 1.1013864278793335, + "grad_norm": 0.8103004693984985, "learning_rate": 4.9680365296803655e-05, - "loss": 0.3666, + "loss": 0.146, "step": 50 }, { "epoch": 0.273972602739726, - "grad_norm": 0.7766101956367493, + "grad_norm": 0.8910240530967712, "learning_rate": 4.961513372472277e-05, - "loss": 0.2453, + "loss": 0.1213, "step": 60 }, { "epoch": 0.319634703196347, - "grad_norm": 0.699148416519165, + "grad_norm": 1.551806926727295, "learning_rate": 4.954990215264188e-05, - "loss": 0.166, + "loss": 0.1473, "step": 70 }, { "epoch": 0.365296803652968, - "grad_norm": 0.8860347270965576, + "grad_norm": 0.5918143391609192, "learning_rate": 4.948467058056099e-05, - "loss": 0.1778, + "loss": 0.0856, "step": 80 }, { "epoch": 0.410958904109589, - "grad_norm": 1.2396528720855713, + "grad_norm": 1.8692866563796997, "learning_rate": 4.9419439008480104e-05, - "loss": 0.169, + "loss": 0.0968, "step": 90 }, { "epoch": 0.45662100456621, - "grad_norm": 0.5089202523231506, + "grad_norm": 1.902163028717041, "learning_rate": 4.9354207436399216e-05, - "loss": 0.1386, + "loss": 0.0993, "step": 100 }, { "epoch": 0.502283105022831, - "grad_norm": 4.915615081787109, + "grad_norm": 0.6189514398574829, "learning_rate": 4.9288975864318335e-05, - "loss": 0.1191, + "loss": 0.0896, "step": 110 }, { "epoch": 0.547945205479452, - "grad_norm": 0.44728991389274597, + "grad_norm": 0.9506679773330688, "learning_rate": 4.922374429223745e-05, - "loss": 0.1137, + "loss": 0.0816, "step": 120 }, { "epoch": 0.593607305936073, - "grad_norm": 0.48462724685668945, + "grad_norm": 1.173135757446289, "learning_rate": 4.915851272015656e-05, - "loss": 0.1289, + "loss": 0.0794, "step": 130 }, { "epoch": 0.639269406392694, - "grad_norm": 0.428950697183609, + "grad_norm": 0.889118492603302, "learning_rate": 4.909328114807567e-05, - "loss": 0.1206, + "loss": 0.0592, "step": 140 }, { "epoch": 0.684931506849315, - "grad_norm": 0.41929879784584045, + "grad_norm": 0.6385554671287537, "learning_rate": 4.9028049575994784e-05, - "loss": 0.0686, + "loss": 0.052, "step": 150 }, { "epoch": 0.730593607305936, - "grad_norm": 0.4039075970649719, + "grad_norm": 0.889135479927063, "learning_rate": 4.8962818003913896e-05, - "loss": 0.0717, + "loss": 0.0607, "step": 160 }, { "epoch": 0.776255707762557, - "grad_norm": 0.9115980863571167, + "grad_norm": 5.079071998596191, "learning_rate": 4.889758643183301e-05, - "loss": 0.1013, + "loss": 0.0888, "step": 170 }, { "epoch": 0.821917808219178, - "grad_norm": 0.37252315878868103, + "grad_norm": 4.792678356170654, "learning_rate": 4.883235485975212e-05, - "loss": 0.0831, + "loss": 0.0767, "step": 180 }, { "epoch": 0.867579908675799, - "grad_norm": 0.594161331653595, + "grad_norm": 0.6212195754051208, "learning_rate": 4.876712328767123e-05, - "loss": 0.0694, + "loss": 0.121, "step": 190 }, { "epoch": 0.91324200913242, - "grad_norm": 0.3747114837169647, + "grad_norm": 0.9692168235778809, "learning_rate": 4.8701891715590345e-05, - "loss": 0.09, + "loss": 0.0716, "step": 200 }, { "epoch": 0.958904109589041, - "grad_norm": 0.3669376075267792, + "grad_norm": 0.5348126888275146, "learning_rate": 4.8636660143509464e-05, - "loss": 0.0591, + "loss": 0.0828, "step": 210 }, { "epoch": 1.0, - "eval_bertscore_f1": 0.8792882986807934, - "eval_bleu": 0.7712648078243911, - "eval_loss": 0.057190317660570145, - "eval_rougeL": 0.29946539136986206, - "eval_runtime": 99.9649, - "eval_samples_per_second": 15.035, - "eval_steps_per_second": 0.94, + "eval_bertscore_f1": 0.8785514626912252, + "eval_bleu": 0.6709298648976769, + "eval_loss": 0.059928230941295624, + "eval_rougeL": 0.2992309920195003, + "eval_runtime": 78.4856, + "eval_samples_per_second": 19.15, + "eval_steps_per_second": 1.198, "step": 219 }, { "epoch": 1.004566210045662, - "grad_norm": 1.0320205688476562, + "grad_norm": 0.592220664024353, "learning_rate": 4.8571428571428576e-05, - "loss": 0.0721, + "loss": 0.1077, "step": 220 }, { "epoch": 1.0502283105022832, - "grad_norm": 0.32598379254341125, + "grad_norm": 0.6266194581985474, "learning_rate": 4.850619699934769e-05, - "loss": 0.0588, + "loss": 0.0584, "step": 230 }, { "epoch": 1.095890410958904, - "grad_norm": 0.3275991678237915, + "grad_norm": 0.7085559964179993, "learning_rate": 4.84409654272668e-05, - "loss": 0.0578, + "loss": 0.0663, "step": 240 }, { "epoch": 1.1415525114155252, - "grad_norm": 0.32213452458381653, + "grad_norm": 0.4316001832485199, "learning_rate": 4.837573385518591e-05, - "loss": 0.0804, + "loss": 0.0528, "step": 250 }, { "epoch": 1.187214611872146, - "grad_norm": 0.22395305335521698, + "grad_norm": 0.4922642707824707, "learning_rate": 4.8310502283105025e-05, - "loss": 0.0618, + "loss": 0.0502, "step": 260 }, { "epoch": 1.2328767123287672, - "grad_norm": 0.2619684338569641, + "grad_norm": 0.4204493761062622, "learning_rate": 4.824527071102414e-05, - "loss": 0.1243, + "loss": 0.0567, "step": 270 }, { "epoch": 1.278538812785388, - "grad_norm": 0.4555642604827881, + "grad_norm": 0.3499491810798645, "learning_rate": 4.818003913894325e-05, - "loss": 0.0678, + "loss": 0.055, "step": 280 }, { "epoch": 1.3242009132420092, - "grad_norm": 0.39453741908073425, + "grad_norm": 0.4341481328010559, "learning_rate": 4.811480756686236e-05, - "loss": 0.0625, + "loss": 0.0462, "step": 290 }, { "epoch": 1.36986301369863, - "grad_norm": 0.5136148929595947, + "grad_norm": 0.7632847428321838, "learning_rate": 4.804957599478147e-05, - "loss": 0.0607, + "loss": 0.0578, "step": 300 }, { "epoch": 1.4155251141552512, - "grad_norm": 0.2983837425708771, + "grad_norm": 0.9663671255111694, "learning_rate": 4.798434442270059e-05, - "loss": 0.0582, + "loss": 0.0421, "step": 310 }, { "epoch": 1.461187214611872, - "grad_norm": 0.21343404054641724, + "grad_norm": 0.42978063225746155, "learning_rate": 4.7919112850619704e-05, - "loss": 0.0532, + "loss": 0.0489, "step": 320 }, { "epoch": 1.5068493150684932, - "grad_norm": 0.25173795223236084, + "grad_norm": 0.4420313239097595, "learning_rate": 4.7853881278538817e-05, - "loss": 0.0437, + "loss": 0.0572, "step": 330 }, { "epoch": 1.5525114155251143, - "grad_norm": 0.631598949432373, + "grad_norm": 0.5001460909843445, "learning_rate": 4.778864970645793e-05, - "loss": 0.1192, + "loss": 0.0526, "step": 340 }, { "epoch": 1.5981735159817352, - "grad_norm": 0.2316834181547165, + "grad_norm": 0.3782072961330414, "learning_rate": 4.772341813437704e-05, - "loss": 0.0566, + "loss": 0.0656, "step": 350 }, { "epoch": 1.643835616438356, - "grad_norm": 0.15093661844730377, + "grad_norm": 0.7828798294067383, "learning_rate": 4.765818656229615e-05, - "loss": 0.0772, + "loss": 0.0545, "step": 360 }, { "epoch": 1.6894977168949772, - "grad_norm": 0.21192322671413422, + "grad_norm": 0.477966845035553, "learning_rate": 4.7592954990215265e-05, - "loss": 0.0812, + "loss": 0.0595, "step": 370 }, { "epoch": 1.7351598173515983, - "grad_norm": 0.21022455394268036, + "grad_norm": 0.3790488541126251, "learning_rate": 4.752772341813438e-05, - "loss": 0.0624, + "loss": 0.0784, "step": 380 }, { "epoch": 1.7808219178082192, - "grad_norm": 0.2214512974023819, + "grad_norm": 0.551754355430603, "learning_rate": 4.746249184605349e-05, - "loss": 0.0714, + "loss": 0.071, "step": 390 }, { "epoch": 1.82648401826484, - "grad_norm": 0.5700469017028809, + "grad_norm": 0.34281566739082336, "learning_rate": 4.73972602739726e-05, - "loss": 0.0503, + "loss": 0.0405, "step": 400 }, { "epoch": 1.8721461187214612, - "grad_norm": 0.15681976079940796, + "grad_norm": 1.1913338899612427, "learning_rate": 4.733202870189172e-05, - "loss": 0.0601, + "loss": 0.0731, "step": 410 }, { "epoch": 1.9178082191780823, - "grad_norm": 0.5676538348197937, + "grad_norm": 0.34058138728141785, "learning_rate": 4.726679712981083e-05, - "loss": 0.0539, + "loss": 0.0564, "step": 420 }, { "epoch": 1.9634703196347032, - "grad_norm": 0.26402613520622253, + "grad_norm": 0.311128169298172, "learning_rate": 4.7201565557729945e-05, - "loss": 0.0459, + "loss": 0.0534, "step": 430 }, { "epoch": 2.0, - "eval_bertscore_f1": 0.8805641725471, - "eval_bleu": 0.5660743945408303, - "eval_loss": 0.043494194746017456, - "eval_rougeL": 0.31573197270677456, - "eval_runtime": 86.9461, - "eval_samples_per_second": 17.287, - "eval_steps_per_second": 1.081, + "eval_bertscore_f1": 0.8737317200351062, + "eval_bleu": 0.4862796407603182, + "eval_loss": 0.04997913911938667, + "eval_rougeL": 0.3070874128312525, + "eval_runtime": 65.3099, + "eval_samples_per_second": 23.013, + "eval_steps_per_second": 1.439, "step": 438 }, { "epoch": 2.009132420091324, - "grad_norm": 0.8411938548088074, + "grad_norm": 0.5538708567619324, "learning_rate": 4.713633398564906e-05, - "loss": 0.097, + "loss": 0.0767, "step": 440 }, { "epoch": 2.0547945205479454, - "grad_norm": 0.20442543923854828, + "grad_norm": 0.40191444754600525, "learning_rate": 4.707110241356817e-05, - "loss": 0.0657, + "loss": 0.0413, "step": 450 }, { "epoch": 2.1004566210045663, - "grad_norm": 0.20288924872875214, + "grad_norm": 0.5775645971298218, "learning_rate": 4.700587084148728e-05, - "loss": 0.0548, + "loss": 0.0527, "step": 460 }, { "epoch": 2.146118721461187, - "grad_norm": 0.3183929920196533, + "grad_norm": 0.6470644474029541, "learning_rate": 4.6940639269406394e-05, - "loss": 0.0465, + "loss": 0.0467, "step": 470 }, { "epoch": 2.191780821917808, - "grad_norm": 0.28004202246665955, + "grad_norm": 1.177239179611206, "learning_rate": 4.6875407697325506e-05, - "loss": 0.0622, + "loss": 0.0526, "step": 480 }, { "epoch": 2.237442922374429, - "grad_norm": 0.33859482407569885, + "grad_norm": 0.42161795496940613, "learning_rate": 4.681017612524462e-05, - "loss": 0.0521, + "loss": 0.0549, "step": 490 }, { "epoch": 2.2831050228310503, - "grad_norm": 0.17048068344593048, + "grad_norm": 0.24745285511016846, "learning_rate": 4.674494455316373e-05, - "loss": 0.0662, + "loss": 0.04, "step": 500 }, { "epoch": 2.328767123287671, - "grad_norm": 0.18371596932411194, + "grad_norm": 0.301503449678421, "learning_rate": 4.667971298108285e-05, - "loss": 0.0718, + "loss": 0.0599, "step": 510 }, { "epoch": 2.374429223744292, - "grad_norm": 0.15471260249614716, + "grad_norm": 0.8956980109214783, "learning_rate": 4.661448140900196e-05, - "loss": 0.0527, + "loss": 0.0622, "step": 520 }, { "epoch": 2.4200913242009134, - "grad_norm": 0.25129422545433044, + "grad_norm": 0.32675889134407043, "learning_rate": 4.6549249836921074e-05, - "loss": 0.0644, + "loss": 0.0398, "step": 530 }, { "epoch": 2.4657534246575343, - "grad_norm": 0.1418524533510208, + "grad_norm": 0.32268226146698, "learning_rate": 4.6484018264840186e-05, - "loss": 0.0415, + "loss": 0.0423, "step": 540 }, { "epoch": 2.5114155251141552, - "grad_norm": 0.14907291531562805, + "grad_norm": 0.3572223484516144, "learning_rate": 4.64187866927593e-05, - "loss": 0.0369, + "loss": 0.0433, "step": 550 }, { "epoch": 2.557077625570776, - "grad_norm": 0.2596917748451233, + "grad_norm": 0.32862910628318787, "learning_rate": 4.635355512067841e-05, - "loss": 0.0492, + "loss": 0.0471, "step": 560 }, { "epoch": 2.602739726027397, - "grad_norm": 0.4573097229003906, + "grad_norm": 0.413512259721756, "learning_rate": 4.628832354859752e-05, - "loss": 0.0542, + "loss": 0.0575, "step": 570 }, { "epoch": 2.6484018264840183, - "grad_norm": 0.17177747189998627, + "grad_norm": 0.6657458543777466, "learning_rate": 4.6223091976516634e-05, - "loss": 0.0492, + "loss": 0.0463, "step": 580 }, { "epoch": 2.6940639269406392, - "grad_norm": 0.160361185669899, + "grad_norm": 0.3851383626461029, "learning_rate": 4.6157860404435747e-05, - "loss": 0.0574, + "loss": 0.0543, "step": 590 }, { "epoch": 2.73972602739726, - "grad_norm": 0.14502376317977905, + "grad_norm": 0.5988168716430664, "learning_rate": 4.609262883235486e-05, - "loss": 0.0487, + "loss": 0.0546, "step": 600 }, { "epoch": 2.7853881278538815, - "grad_norm": 0.15075726807117462, + "grad_norm": 0.45219168066978455, "learning_rate": 4.602739726027398e-05, - "loss": 0.0584, + "loss": 0.0472, "step": 610 }, { "epoch": 2.8310502283105023, - "grad_norm": 0.29456523060798645, + "grad_norm": 0.3951447010040283, "learning_rate": 4.596216568819309e-05, - "loss": 0.0661, + "loss": 0.0478, "step": 620 }, { "epoch": 2.8767123287671232, - "grad_norm": 0.20278586447238922, + "grad_norm": 0.2301015555858612, "learning_rate": 4.58969341161122e-05, - "loss": 0.0463, + "loss": 0.041, "step": 630 }, { "epoch": 2.922374429223744, - "grad_norm": 0.12769503891468048, + "grad_norm": 0.556537926197052, "learning_rate": 4.5831702544031314e-05, - "loss": 0.0496, + "loss": 0.0386, "step": 640 }, { "epoch": 2.968036529680365, - "grad_norm": 0.2201366275548935, + "grad_norm": 1.080448865890503, "learning_rate": 4.5766470971950426e-05, - "loss": 0.0376, + "loss": 0.0499, "step": 650 }, { "epoch": 3.0, - "eval_bertscore_f1": 0.8785978137693322, - "eval_bleu": 0.7393077025329909, - "eval_loss": 0.040518876165151596, - "eval_rougeL": 0.298014790853214, - "eval_runtime": 85.5219, - "eval_samples_per_second": 17.574, - "eval_steps_per_second": 1.099, + "eval_bertscore_f1": 0.8748933539260488, + "eval_bleu": 0.9425275721886326, + "eval_loss": 0.04728687182068825, + "eval_rougeL": 0.28689011243059326, + "eval_runtime": 65.6139, + "eval_samples_per_second": 22.907, + "eval_steps_per_second": 1.433, "step": 657 }, { "epoch": 3.0136986301369864, - "grad_norm": 0.4501720666885376, + "grad_norm": 0.2687000334262848, "learning_rate": 4.570123939986954e-05, - "loss": 0.0526, + "loss": 0.0616, "step": 660 }, { "epoch": 3.0593607305936072, - "grad_norm": 0.18561075627803802, + "grad_norm": 0.45356887578964233, "learning_rate": 4.563600782778865e-05, - "loss": 0.048, + "loss": 0.0462, "step": 670 }, { "epoch": 3.105022831050228, - "grad_norm": 0.13908711075782776, + "grad_norm": 0.24934875965118408, "learning_rate": 4.557077625570776e-05, - "loss": 0.0445, + "loss": 0.0412, "step": 680 }, { "epoch": 3.1506849315068495, - "grad_norm": 0.1476353406906128, + "grad_norm": 0.3431055545806885, "learning_rate": 4.5505544683626875e-05, - "loss": 0.0587, + "loss": 0.0391, "step": 690 }, { "epoch": 3.1963470319634704, - "grad_norm": 0.19854065775871277, + "grad_norm": 0.46532484889030457, "learning_rate": 4.544031311154599e-05, - "loss": 0.041, + "loss": 0.0571, "step": 700 }, { "epoch": 3.2420091324200913, - "grad_norm": 0.17018119990825653, + "grad_norm": 0.24521034955978394, "learning_rate": 4.5375081539465106e-05, - "loss": 0.0355, + "loss": 0.0479, "step": 710 }, { "epoch": 3.287671232876712, - "grad_norm": 0.45481783151626587, + "grad_norm": 0.22706200182437897, "learning_rate": 4.530984996738422e-05, - "loss": 0.0647, + "loss": 0.04, "step": 720 }, { "epoch": 3.3333333333333335, - "grad_norm": 0.13204658031463623, + "grad_norm": 0.4912814199924469, "learning_rate": 4.524461839530333e-05, - "loss": 0.0533, + "loss": 0.0399, "step": 730 }, { "epoch": 3.3789954337899544, - "grad_norm": 0.1771598607301712, + "grad_norm": 0.3373461365699768, "learning_rate": 4.517938682322244e-05, - "loss": 0.0407, + "loss": 0.0402, "step": 740 }, { "epoch": 3.4246575342465753, - "grad_norm": 0.32262006402015686, + "grad_norm": 0.6649588942527771, "learning_rate": 4.5114155251141555e-05, - "loss": 0.0361, + "loss": 0.0391, "step": 750 }, { "epoch": 3.470319634703196, - "grad_norm": 0.16268010437488556, + "grad_norm": 0.34905117750167847, "learning_rate": 4.504892367906067e-05, - "loss": 0.0401, + "loss": 0.0482, "step": 760 }, { "epoch": 3.5159817351598175, - "grad_norm": 0.15328466892242432, + "grad_norm": 0.24569763243198395, "learning_rate": 4.498369210697978e-05, - "loss": 0.068, + "loss": 0.0359, "step": 770 }, { "epoch": 3.5616438356164384, - "grad_norm": 0.11483100056648254, + "grad_norm": 0.294628769159317, "learning_rate": 4.491846053489889e-05, - "loss": 0.0405, + "loss": 0.0429, "step": 780 }, { "epoch": 3.6073059360730593, - "grad_norm": 0.12601321935653687, + "grad_norm": 0.23211832344532013, "learning_rate": 4.4853228962818004e-05, - "loss": 0.0552, + "loss": 0.0328, "step": 790 }, { "epoch": 3.65296803652968, - "grad_norm": 0.141522616147995, + "grad_norm": 0.19963033497333527, "learning_rate": 4.4787997390737116e-05, - "loss": 0.0542, + "loss": 0.0397, "step": 800 }, { "epoch": 3.6986301369863015, - "grad_norm": 0.3403891324996948, + "grad_norm": 0.2943974733352661, "learning_rate": 4.4722765818656235e-05, - "loss": 0.0495, + "loss": 0.0423, "step": 810 }, { "epoch": 3.7442922374429224, - "grad_norm": 0.1596749722957611, + "grad_norm": 0.20433223247528076, "learning_rate": 4.465753424657535e-05, - "loss": 0.0441, + "loss": 0.0375, "step": 820 }, { "epoch": 3.7899543378995433, - "grad_norm": 0.3847573697566986, + "grad_norm": 0.22242455184459686, "learning_rate": 4.459230267449446e-05, - "loss": 0.0542, + "loss": 0.0384, "step": 830 }, { "epoch": 3.8356164383561646, - "grad_norm": 0.2511195242404938, + "grad_norm": 0.3566916882991791, "learning_rate": 4.452707110241357e-05, - "loss": 0.0434, + "loss": 0.0383, "step": 840 }, { "epoch": 3.8812785388127855, - "grad_norm": 0.13712382316589355, + "grad_norm": 0.9326187372207642, "learning_rate": 4.446183953033268e-05, - "loss": 0.0347, + "loss": 0.0486, "step": 850 }, { "epoch": 3.9269406392694064, - "grad_norm": 0.1607000082731247, + "grad_norm": 0.547247588634491, "learning_rate": 4.4396607958251795e-05, - "loss": 0.0461, + "loss": 0.0478, "step": 860 }, { "epoch": 3.9726027397260273, - "grad_norm": 0.26414012908935547, + "grad_norm": 0.5731884241104126, "learning_rate": 4.433137638617091e-05, - "loss": 0.0663, + "loss": 0.038, "step": 870 }, { "epoch": 4.0, - "eval_bertscore_f1": 0.8776752933294711, - "eval_bleu": 0.7660390563095643, - "eval_loss": 0.03776135668158531, - "eval_rougeL": 0.2900147272161592, - "eval_runtime": 85.7788, - "eval_samples_per_second": 17.522, - "eval_steps_per_second": 1.096, + "eval_bertscore_f1": 0.8795713798966475, + "eval_bleu": 0.6914713325727705, + "eval_loss": 0.04532833769917488, + "eval_rougeL": 0.30243117527372954, + "eval_runtime": 65.1517, + "eval_samples_per_second": 23.069, + "eval_steps_per_second": 1.443, "step": 876 }, { "epoch": 4.018264840182648, - "grad_norm": 0.39637818932533264, + "grad_norm": 0.7525059580802917, "learning_rate": 4.426614481409002e-05, - "loss": 0.0413, + "loss": 0.0482, "step": 880 }, { "epoch": 4.063926940639269, - "grad_norm": 0.1926361322402954, + "grad_norm": 0.4412573277950287, "learning_rate": 4.420091324200913e-05, - "loss": 0.0653, + "loss": 0.0383, "step": 890 }, { "epoch": 4.109589041095891, - "grad_norm": 0.18935950100421906, + "grad_norm": 0.4645381569862366, "learning_rate": 4.4135681669928244e-05, - "loss": 0.0435, + "loss": 0.041, "step": 900 }, { "epoch": 4.155251141552512, - "grad_norm": 0.17452946305274963, + "grad_norm": 0.2361067682504654, "learning_rate": 4.407045009784736e-05, - "loss": 0.0441, + "loss": 0.0412, "step": 910 }, { "epoch": 4.200913242009133, - "grad_norm": 0.13483966886997223, + "grad_norm": 0.2725714445114136, "learning_rate": 4.4005218525766475e-05, - "loss": 0.047, + "loss": 0.0409, "step": 920 }, { "epoch": 4.2465753424657535, - "grad_norm": 0.12774349749088287, + "grad_norm": 0.25887736678123474, "learning_rate": 4.393998695368559e-05, - "loss": 0.0438, + "loss": 0.0391, "step": 930 }, { "epoch": 4.292237442922374, - "grad_norm": 0.3918684720993042, + "grad_norm": 0.22646628320217133, "learning_rate": 4.38747553816047e-05, - "loss": 0.0592, + "loss": 0.0416, "step": 940 }, { "epoch": 4.337899543378995, - "grad_norm": 0.12250568717718124, + "grad_norm": 0.30533650517463684, "learning_rate": 4.380952380952381e-05, - "loss": 0.0375, + "loss": 0.0412, "step": 950 }, { "epoch": 4.383561643835616, - "grad_norm": 0.10988382250070572, + "grad_norm": 0.2850722670555115, "learning_rate": 4.3744292237442924e-05, - "loss": 0.0393, + "loss": 0.0341, "step": 960 }, { "epoch": 4.429223744292237, - "grad_norm": 0.2036619633436203, + "grad_norm": 0.6640023589134216, "learning_rate": 4.3679060665362036e-05, - "loss": 0.0426, + "loss": 0.0448, "step": 970 }, { "epoch": 4.474885844748858, - "grad_norm": 0.1382388472557068, + "grad_norm": 0.4189640283584595, "learning_rate": 4.361382909328115e-05, - "loss": 0.0405, + "loss": 0.0364, "step": 980 }, { "epoch": 4.52054794520548, - "grad_norm": 0.18227025866508484, + "grad_norm": 0.27891141176223755, "learning_rate": 4.354859752120026e-05, - "loss": 0.057, + "loss": 0.0377, "step": 990 }, { "epoch": 4.566210045662101, - "grad_norm": 0.11121729016304016, + "grad_norm": 0.24627180397510529, "learning_rate": 4.348336594911937e-05, - "loss": 0.0462, + "loss": 0.0401, "step": 1000 }, { "epoch": 4.6118721461187215, - "grad_norm": 0.10684996098279953, + "grad_norm": 0.23066160082817078, "learning_rate": 4.341813437703849e-05, - "loss": 0.0491, + "loss": 0.0366, "step": 1010 }, { "epoch": 4.657534246575342, - "grad_norm": 0.20612068474292755, + "grad_norm": 0.2346690148115158, "learning_rate": 4.3352902804957604e-05, - "loss": 0.0444, + "loss": 0.0441, "step": 1020 }, { "epoch": 4.703196347031963, - "grad_norm": 0.18251581490039825, + "grad_norm": 0.2368006706237793, "learning_rate": 4.3287671232876716e-05, "loss": 0.0406, "step": 1030 }, { "epoch": 4.748858447488584, - "grad_norm": 0.12324585765600204, + "grad_norm": 0.19953212141990662, "learning_rate": 4.322243966079583e-05, - "loss": 0.0622, + "loss": 0.0398, "step": 1040 }, { "epoch": 4.794520547945205, - "grad_norm": 0.24980604648590088, + "grad_norm": 0.8159717321395874, "learning_rate": 4.315720808871494e-05, - "loss": 0.0369, + "loss": 0.0426, "step": 1050 }, { "epoch": 4.840182648401827, - "grad_norm": 0.11414594203233719, + "grad_norm": 0.29258477687835693, "learning_rate": 4.309197651663405e-05, - "loss": 0.0337, + "loss": 0.0376, "step": 1060 }, { "epoch": 4.885844748858448, - "grad_norm": 0.09755056351423264, + "grad_norm": 0.22379961609840393, "learning_rate": 4.3026744944553165e-05, - "loss": 0.0375, + "loss": 0.0461, "step": 1070 }, { "epoch": 4.931506849315069, - "grad_norm": 0.17053422331809998, + "grad_norm": 0.36883947253227234, "learning_rate": 4.296151337247228e-05, - "loss": 0.037, + "loss": 0.0464, "step": 1080 }, { "epoch": 4.9771689497716896, - "grad_norm": 0.1424807757139206, + "grad_norm": 0.26361939311027527, "learning_rate": 4.289628180039139e-05, - "loss": 0.0335, + "loss": 0.0425, "step": 1090 }, { "epoch": 5.0, - "eval_bertscore_f1": 0.8830310614285117, - "eval_bleu": 0.6535866956857873, - "eval_loss": 0.036899685859680176, - "eval_rougeL": 0.315426501376588, - "eval_runtime": 86.4385, - "eval_samples_per_second": 17.388, - "eval_steps_per_second": 1.087, + "eval_bertscore_f1": 0.874491546682255, + "eval_bleu": 0.5863769810718094, + "eval_loss": 0.044686976820230484, + "eval_rougeL": 0.30493134131703603, + "eval_runtime": 65.1008, + "eval_samples_per_second": 23.087, + "eval_steps_per_second": 1.444, "step": 1095 }, { "epoch": 5.0228310502283104, - "grad_norm": 0.16293296217918396, + "grad_norm": 0.29032638669013977, "learning_rate": 4.28310502283105e-05, - "loss": 0.0436, + "loss": 0.0407, "step": 1100 }, { "epoch": 5.068493150684931, - "grad_norm": 0.12882739305496216, + "grad_norm": 0.1798933893442154, "learning_rate": 4.276581865622962e-05, - "loss": 0.0422, + "loss": 0.0369, "step": 1110 }, { "epoch": 5.114155251141552, - "grad_norm": 0.1830970197916031, + "grad_norm": 0.20421646535396576, "learning_rate": 4.270058708414873e-05, - "loss": 0.0429, + "loss": 0.0366, "step": 1120 }, { "epoch": 5.159817351598173, - "grad_norm": 0.18178118765354156, + "grad_norm": 0.4460963308811188, "learning_rate": 4.2635355512067844e-05, - "loss": 0.0377, + "loss": 0.037, "step": 1130 }, { "epoch": 5.205479452054795, - "grad_norm": 0.11018156260251999, + "grad_norm": 0.19764983654022217, "learning_rate": 4.257012393998696e-05, - "loss": 0.0524, + "loss": 0.0369, "step": 1140 }, { "epoch": 5.251141552511416, - "grad_norm": 0.14554737508296967, + "grad_norm": 0.15257489681243896, "learning_rate": 4.250489236790607e-05, - "loss": 0.0518, + "loss": 0.0357, "step": 1150 }, { "epoch": 5.296803652968037, - "grad_norm": 0.13553589582443237, + "grad_norm": 0.9983281493186951, "learning_rate": 4.243966079582518e-05, - "loss": 0.0385, + "loss": 0.0406, "step": 1160 }, { "epoch": 5.342465753424658, - "grad_norm": 0.4197877049446106, + "grad_norm": 0.253713995218277, "learning_rate": 4.237442922374429e-05, - "loss": 0.0462, + "loss": 0.0353, "step": 1170 }, { "epoch": 5.3881278538812785, - "grad_norm": 0.08472498506307602, + "grad_norm": 0.24854066967964172, "learning_rate": 4.2309197651663405e-05, - "loss": 0.0439, + "loss": 0.0352, "step": 1180 }, { "epoch": 5.433789954337899, - "grad_norm": 0.13307222723960876, + "grad_norm": 0.29193824529647827, "learning_rate": 4.224396607958252e-05, - "loss": 0.0511, + "loss": 0.0452, "step": 1190 }, { "epoch": 5.47945205479452, - "grad_norm": 0.11114447563886642, + "grad_norm": 0.40590381622314453, "learning_rate": 4.217873450750163e-05, - "loss": 0.0359, + "loss": 0.0378, "step": 1200 }, { "epoch": 5.525114155251142, - "grad_norm": 0.09421250224113464, + "grad_norm": 0.18774063885211945, "learning_rate": 4.211350293542075e-05, - "loss": 0.0343, + "loss": 0.0425, "step": 1210 }, { "epoch": 5.570776255707763, - "grad_norm": 0.12763556838035583, + "grad_norm": 0.26123547554016113, "learning_rate": 4.204827136333986e-05, - "loss": 0.0478, + "loss": 0.0421, "step": 1220 }, { "epoch": 5.616438356164384, - "grad_norm": 0.3674803078174591, + "grad_norm": 0.6077787280082703, "learning_rate": 4.198303979125897e-05, - "loss": 0.0424, + "loss": 0.0342, "step": 1230 }, { "epoch": 5.662100456621005, - "grad_norm": 0.19763565063476562, + "grad_norm": 0.3458491563796997, "learning_rate": 4.1917808219178085e-05, - "loss": 0.0375, + "loss": 0.0357, "step": 1240 }, { "epoch": 5.707762557077626, - "grad_norm": 0.15177254378795624, + "grad_norm": 0.4896857440471649, "learning_rate": 4.18525766470972e-05, - "loss": 0.0397, + "loss": 0.0396, "step": 1250 }, { "epoch": 5.7534246575342465, - "grad_norm": 0.1431536078453064, + "grad_norm": 0.16601338982582092, "learning_rate": 4.178734507501631e-05, - "loss": 0.0393, + "loss": 0.0389, "step": 1260 }, { "epoch": 5.799086757990867, - "grad_norm": 0.1322961449623108, + "grad_norm": 0.2273034006357193, "learning_rate": 4.172211350293542e-05, - "loss": 0.0342, + "loss": 0.0434, "step": 1270 }, { "epoch": 5.844748858447488, - "grad_norm": 0.15804563462734222, + "grad_norm": 0.3890644907951355, "learning_rate": 4.1656881930854534e-05, - "loss": 0.0499, + "loss": 0.0437, "step": 1280 }, { "epoch": 5.890410958904109, - "grad_norm": 0.09370295703411102, + "grad_norm": 0.20492355525493622, "learning_rate": 4.1591650358773646e-05, - "loss": 0.0339, + "loss": 0.0382, "step": 1290 }, { "epoch": 5.936073059360731, - "grad_norm": 0.2806883454322815, + "grad_norm": 0.34492093324661255, "learning_rate": 4.152641878669276e-05, - "loss": 0.035, + "loss": 0.0334, "step": 1300 }, { "epoch": 5.981735159817352, - "grad_norm": 0.09985365718603134, + "grad_norm": 0.20600809156894684, "learning_rate": 4.146118721461188e-05, - "loss": 0.0461, + "loss": 0.0345, "step": 1310 }, { "epoch": 6.0, - "eval_bertscore_f1": 0.8816708240680352, - "eval_bleu": 0.6032876773464483, - "eval_loss": 0.036029841750860214, - "eval_rougeL": 0.31834801433173276, - "eval_runtime": 86.97, - "eval_samples_per_second": 17.282, - "eval_steps_per_second": 1.081, + "eval_bertscore_f1": 0.8773880335226586, + "eval_bleu": 0.7957797365038125, + "eval_loss": 0.04304474964737892, + "eval_rougeL": 0.298324896779863, + "eval_runtime": 65.1124, + "eval_samples_per_second": 23.083, + "eval_steps_per_second": 1.444, "step": 1314 }, { "epoch": 6.027397260273973, - "grad_norm": 0.10236047208309174, + "grad_norm": 0.28056013584136963, "learning_rate": 4.139595564253099e-05, - "loss": 0.0336, + "loss": 0.0351, "step": 1320 }, { "epoch": 6.073059360730594, - "grad_norm": 0.24231013655662537, + "grad_norm": 12.815491676330566, "learning_rate": 4.13307240704501e-05, - "loss": 0.0496, + "loss": 0.0398, "step": 1330 }, { "epoch": 6.1187214611872145, - "grad_norm": 0.11029026657342911, + "grad_norm": 0.3111923635005951, "learning_rate": 4.1265492498369214e-05, - "loss": 0.0403, + "loss": 0.0428, "step": 1340 }, { "epoch": 6.164383561643835, - "grad_norm": 0.14063166081905365, + "grad_norm": 0.23117075860500336, "learning_rate": 4.1200260926288326e-05, - "loss": 0.0473, + "loss": 0.0351, "step": 1350 }, { "epoch": 6.210045662100456, - "grad_norm": 0.10346683859825134, + "grad_norm": 0.22561274468898773, "learning_rate": 4.113502935420744e-05, - "loss": 0.0362, + "loss": 0.0385, "step": 1360 }, { "epoch": 6.255707762557078, - "grad_norm": 0.1653313785791397, + "grad_norm": 0.16258157789707184, "learning_rate": 4.106979778212655e-05, - "loss": 0.0327, + "loss": 0.0339, "step": 1370 }, { "epoch": 6.301369863013699, - "grad_norm": 0.1193709447979927, + "grad_norm": 0.6510432362556458, "learning_rate": 4.100456621004566e-05, - "loss": 0.0341, + "loss": 0.036, "step": 1380 }, { "epoch": 6.34703196347032, - "grad_norm": 0.34356066584587097, + "grad_norm": 0.24988140165805817, "learning_rate": 4.0939334637964774e-05, - "loss": 0.0431, + "loss": 0.0355, "step": 1390 }, { "epoch": 6.392694063926941, - "grad_norm": 0.2641497552394867, + "grad_norm": 0.3678286671638489, "learning_rate": 4.087410306588389e-05, - "loss": 0.0423, + "loss": 0.0338, "step": 1400 }, { "epoch": 6.438356164383562, - "grad_norm": 0.14067210257053375, + "grad_norm": 0.2946147918701172, "learning_rate": 4.0808871493803006e-05, - "loss": 0.0354, + "loss": 0.0419, "step": 1410 }, { "epoch": 6.4840182648401825, - "grad_norm": 0.11391960829496384, + "grad_norm": 0.21736349165439606, "learning_rate": 4.074363992172212e-05, - "loss": 0.0321, + "loss": 0.0344, "step": 1420 }, { "epoch": 6.529680365296803, - "grad_norm": 0.15210473537445068, + "grad_norm": 0.3145897388458252, "learning_rate": 4.067840834964123e-05, - "loss": 0.038, + "loss": 0.0385, "step": 1430 }, { "epoch": 6.575342465753424, - "grad_norm": 0.22131510078907013, + "grad_norm": 0.24647307395935059, "learning_rate": 4.061317677756034e-05, - "loss": 0.0385, + "loss": 0.0384, "step": 1440 }, { "epoch": 6.621004566210045, - "grad_norm": 0.10140376538038254, + "grad_norm": 0.31906986236572266, "learning_rate": 4.0547945205479454e-05, - "loss": 0.0499, + "loss": 0.0361, "step": 1450 }, { "epoch": 6.666666666666667, - "grad_norm": 0.15117721259593964, + "grad_norm": 0.23453353345394135, "learning_rate": 4.0482713633398566e-05, - "loss": 0.0375, + "loss": 0.0353, "step": 1460 }, { "epoch": 6.712328767123288, - "grad_norm": 0.15028707683086395, + "grad_norm": 3.637096405029297, "learning_rate": 4.041748206131768e-05, - "loss": 0.0528, + "loss": 0.0412, "step": 1470 }, { "epoch": 6.757990867579909, - "grad_norm": 0.16894899308681488, + "grad_norm": 0.21208681166172028, "learning_rate": 4.035225048923679e-05, - "loss": 0.0381, + "loss": 0.0354, "step": 1480 }, { "epoch": 6.80365296803653, - "grad_norm": 0.10997270792722702, + "grad_norm": 0.29757997393608093, "learning_rate": 4.02870189171559e-05, - "loss": 0.0345, + "loss": 0.0372, "step": 1490 }, { "epoch": 6.8493150684931505, - "grad_norm": 0.2839312255382538, + "grad_norm": 0.21638333797454834, "learning_rate": 4.0221787345075015e-05, - "loss": 0.0507, + "loss": 0.0343, "step": 1500 }, { "epoch": 6.894977168949771, - "grad_norm": 0.1316707879304886, + "grad_norm": 0.24771864712238312, "learning_rate": 4.0156555772994134e-05, - "loss": 0.0375, + "loss": 0.0384, "step": 1510 }, { "epoch": 6.940639269406392, - "grad_norm": 0.4014754295349121, + "grad_norm": 0.6341008543968201, "learning_rate": 4.0091324200913246e-05, - "loss": 0.0444, + "loss": 0.0457, "step": 1520 }, { "epoch": 6.986301369863014, - "grad_norm": 0.10761623084545135, + "grad_norm": 0.2649524509906769, "learning_rate": 4.002609262883236e-05, - "loss": 0.0334, + "loss": 0.0362, "step": 1530 }, { "epoch": 7.0, - "eval_bertscore_f1": 0.8739348690825467, - "eval_bleu": 0.5975825525895913, - "eval_loss": 0.034862346947193146, - "eval_rougeL": 0.2734166668626543, - "eval_runtime": 86.9967, - "eval_samples_per_second": 17.277, - "eval_steps_per_second": 1.081, + "eval_bertscore_f1": 0.8794697234809835, + "eval_bleu": 0.7000552060314711, + "eval_loss": 0.0426379069685936, + "eval_rougeL": 0.30292468350769075, + "eval_runtime": 65.6416, + "eval_samples_per_second": 22.897, + "eval_steps_per_second": 1.432, "step": 1533 }, { "epoch": 7.031963470319635, - "grad_norm": 0.2581014931201935, + "grad_norm": 0.2106972336769104, "learning_rate": 3.996086105675147e-05, - "loss": 0.0377, + "loss": 0.0376, "step": 1540 }, { "epoch": 7.077625570776256, - "grad_norm": 0.3633580803871155, + "grad_norm": 0.19316372275352478, "learning_rate": 3.989562948467058e-05, - "loss": 0.0409, + "loss": 0.0357, "step": 1550 }, { "epoch": 7.123287671232877, - "grad_norm": 0.2476946860551834, + "grad_norm": 0.2257992923259735, "learning_rate": 3.9830397912589695e-05, - "loss": 0.0432, + "loss": 0.037, "step": 1560 }, { "epoch": 7.168949771689498, - "grad_norm": 0.12708741426467896, + "grad_norm": 0.2736322283744812, "learning_rate": 3.976516634050881e-05, - "loss": 0.0344, + "loss": 0.0424, "step": 1570 }, { "epoch": 7.2146118721461185, - "grad_norm": 0.0819052904844284, + "grad_norm": 0.32095804810523987, "learning_rate": 3.969993476842792e-05, - "loss": 0.0411, + "loss": 0.0364, "step": 1580 }, { "epoch": 7.260273972602739, - "grad_norm": 0.09047359228134155, + "grad_norm": 0.17005418241024017, "learning_rate": 3.963470319634703e-05, - "loss": 0.031, + "loss": 0.0365, "step": 1590 }, { "epoch": 7.30593607305936, - "grad_norm": 0.15134736895561218, + "grad_norm": 0.5590267777442932, "learning_rate": 3.9569471624266144e-05, - "loss": 0.038, + "loss": 0.0345, "step": 1600 }, { "epoch": 7.351598173515982, - "grad_norm": 0.09916142374277115, + "grad_norm": 0.3150683641433716, "learning_rate": 3.950424005218526e-05, - "loss": 0.0318, + "loss": 0.0382, "step": 1610 }, { "epoch": 7.397260273972603, - "grad_norm": 0.11288363486528397, + "grad_norm": 0.22695161402225494, "learning_rate": 3.9439008480104375e-05, - "loss": 0.0411, + "loss": 0.036, "step": 1620 }, { "epoch": 7.442922374429224, - "grad_norm": 0.1381136178970337, + "grad_norm": 0.428966760635376, "learning_rate": 3.937377690802349e-05, - "loss": 0.0396, + "loss": 0.0362, "step": 1630 }, { "epoch": 7.488584474885845, - "grad_norm": 0.08881582319736481, + "grad_norm": 0.45022791624069214, "learning_rate": 3.93085453359426e-05, - "loss": 0.0298, + "loss": 0.0359, "step": 1640 }, { "epoch": 7.534246575342466, - "grad_norm": 0.10169164836406708, + "grad_norm": 1.2701294422149658, "learning_rate": 3.924331376386171e-05, - "loss": 0.0391, + "loss": 0.0326, "step": 1650 }, { "epoch": 7.579908675799087, - "grad_norm": 0.24670983850955963, + "grad_norm": 0.2580886483192444, "learning_rate": 3.9178082191780823e-05, - "loss": 0.041, + "loss": 0.0333, "step": 1660 }, { "epoch": 7.6255707762557075, - "grad_norm": 0.10547634214162827, + "grad_norm": 0.3360602855682373, "learning_rate": 3.9112850619699936e-05, - "loss": 0.042, + "loss": 0.0339, "step": 1670 }, { "epoch": 7.671232876712329, - "grad_norm": 0.30908140540122986, + "grad_norm": 0.39842474460601807, "learning_rate": 3.904761904761905e-05, - "loss": 0.0382, + "loss": 0.0378, "step": 1680 }, { "epoch": 7.71689497716895, - "grad_norm": 0.19618338346481323, + "grad_norm": 0.8376930356025696, "learning_rate": 3.898238747553816e-05, - "loss": 0.0414, + "loss": 0.0389, "step": 1690 }, { "epoch": 7.762557077625571, - "grad_norm": 0.0793214961886406, + "grad_norm": 0.23643815517425537, "learning_rate": 3.891715590345727e-05, - "loss": 0.032, + "loss": 0.038, "step": 1700 }, { "epoch": 7.808219178082192, - "grad_norm": 0.12131261080503464, + "grad_norm": 0.26686662435531616, "learning_rate": 3.885192433137639e-05, - "loss": 0.0417, + "loss": 0.0339, "step": 1710 }, { "epoch": 7.853881278538813, - "grad_norm": 0.41418784856796265, + "grad_norm": 0.16813237965106964, "learning_rate": 3.87866927592955e-05, - "loss": 0.0395, + "loss": 0.0328, "step": 1720 }, { "epoch": 7.899543378995434, - "grad_norm": 0.1027241051197052, + "grad_norm": 0.28077274560928345, "learning_rate": 3.8721461187214615e-05, - "loss": 0.0299, + "loss": 0.0326, "step": 1730 }, { "epoch": 7.945205479452055, - "grad_norm": 0.1369609236717224, + "grad_norm": 0.3392268717288971, "learning_rate": 3.865622961513373e-05, - "loss": 0.0375, + "loss": 0.0416, "step": 1740 }, { "epoch": 7.9908675799086755, - "grad_norm": 0.0653514638543129, + "grad_norm": 0.24137546122074127, "learning_rate": 3.859099804305284e-05, - "loss": 0.0466, + "loss": 0.036, "step": 1750 }, { "epoch": 8.0, - "eval_bertscore_f1": 0.882495258898237, - "eval_bleu": 0.6683821586112519, - "eval_loss": 0.033862482756376266, - "eval_rougeL": 0.3127432355714671, - "eval_runtime": 86.7914, - "eval_samples_per_second": 17.317, - "eval_steps_per_second": 1.083, + "eval_bertscore_f1": 0.8751992449630361, + "eval_bleu": 0.7231744684160546, + "eval_loss": 0.042749155312776566, + "eval_rougeL": 0.2994421875548862, + "eval_runtime": 65.5673, + "eval_samples_per_second": 22.923, + "eval_steps_per_second": 1.434, "step": 1752 }, { "epoch": 8.036529680365296, - "grad_norm": 0.07512130588293076, + "grad_norm": 0.3088216185569763, "learning_rate": 3.852576647097195e-05, - "loss": 0.0428, + "loss": 0.0347, "step": 1760 }, { "epoch": 8.082191780821917, - "grad_norm": 0.10082229226827621, + "grad_norm": 0.3175930678844452, "learning_rate": 3.8460534898891064e-05, - "loss": 0.0394, + "loss": 0.0348, "step": 1770 }, { "epoch": 8.127853881278538, - "grad_norm": 0.08132240921258926, + "grad_norm": 0.4923621416091919, "learning_rate": 3.8395303326810176e-05, - "loss": 0.0315, + "loss": 0.0354, "step": 1780 }, { "epoch": 8.173515981735159, - "grad_norm": 0.09185563027858734, + "grad_norm": 0.23712977766990662, "learning_rate": 3.833007175472929e-05, - "loss": 0.0403, + "loss": 0.039, "step": 1790 }, { "epoch": 8.219178082191782, - "grad_norm": 0.08592450618743896, + "grad_norm": 0.24011415243148804, "learning_rate": 3.82648401826484e-05, - "loss": 0.0356, + "loss": 0.036, "step": 1800 }, { "epoch": 8.264840182648403, - "grad_norm": 0.11714768409729004, + "grad_norm": 0.23044519126415253, "learning_rate": 3.819960861056752e-05, - "loss": 0.0374, + "loss": 0.0317, "step": 1810 }, { "epoch": 8.310502283105023, - "grad_norm": 0.0861973762512207, + "grad_norm": 0.25046950578689575, "learning_rate": 3.813437703848663e-05, - "loss": 0.043, + "loss": 0.0361, "step": 1820 }, { "epoch": 8.356164383561644, - "grad_norm": 0.36329740285873413, + "grad_norm": 0.2792360782623291, "learning_rate": 3.8069145466405744e-05, - "loss": 0.0379, + "loss": 0.0372, "step": 1830 }, { "epoch": 8.401826484018265, - "grad_norm": 0.0876651480793953, + "grad_norm": 0.7518793940544128, "learning_rate": 3.8003913894324856e-05, - "loss": 0.0332, + "loss": 0.0372, "step": 1840 }, { "epoch": 8.447488584474886, - "grad_norm": 0.08967719227075577, + "grad_norm": 0.2685680389404297, "learning_rate": 3.793868232224397e-05, - "loss": 0.0337, + "loss": 0.0338, "step": 1850 }, { "epoch": 8.493150684931507, - "grad_norm": 0.07374356687068939, + "grad_norm": 0.19953973591327667, "learning_rate": 3.787345075016308e-05, - "loss": 0.0285, + "loss": 0.0342, "step": 1860 }, { "epoch": 8.538812785388128, - "grad_norm": 0.345527321100235, + "grad_norm": 0.2889842987060547, "learning_rate": 3.780821917808219e-05, - "loss": 0.0459, + "loss": 0.0347, "step": 1870 }, { "epoch": 8.584474885844749, - "grad_norm": 0.07992502301931381, + "grad_norm": 0.2976756691932678, "learning_rate": 3.7742987606001305e-05, - "loss": 0.0384, + "loss": 0.0357, "step": 1880 }, { "epoch": 8.63013698630137, - "grad_norm": 0.1169479638338089, + "grad_norm": 0.2278045117855072, "learning_rate": 3.767775603392042e-05, - "loss": 0.0329, + "loss": 0.0367, "step": 1890 }, { "epoch": 8.67579908675799, - "grad_norm": 0.2200576215982437, + "grad_norm": 0.14058364927768707, "learning_rate": 3.761252446183953e-05, - "loss": 0.0392, + "loss": 0.0323, "step": 1900 }, { "epoch": 8.721461187214611, - "grad_norm": 0.07725539803504944, + "grad_norm": 0.28625041246414185, "learning_rate": 3.754729288975865e-05, - "loss": 0.0365, + "loss": 0.035, "step": 1910 }, { "epoch": 8.767123287671232, - "grad_norm": 0.17150481045246124, + "grad_norm": 0.2872960567474365, "learning_rate": 3.748206131767776e-05, "loss": 0.0402, "step": 1920 }, { "epoch": 8.812785388127853, - "grad_norm": 0.09744448214769363, + "grad_norm": 0.34051889181137085, "learning_rate": 3.741682974559687e-05, - "loss": 0.0345, + "loss": 0.0342, "step": 1930 }, { "epoch": 8.858447488584474, - "grad_norm": 0.13161571323871613, + "grad_norm": 0.23217986524105072, "learning_rate": 3.7351598173515985e-05, - "loss": 0.0294, + "loss": 0.0375, "step": 1940 }, { "epoch": 8.904109589041095, - "grad_norm": 0.12368807941675186, + "grad_norm": 0.38106223940849304, "learning_rate": 3.72863666014351e-05, - "loss": 0.0402, + "loss": 0.0363, "step": 1950 }, { "epoch": 8.949771689497716, - "grad_norm": 0.2823167145252228, + "grad_norm": 0.17972268164157867, "learning_rate": 3.722113502935421e-05, - "loss": 0.0396, + "loss": 0.0367, "step": 1960 }, { "epoch": 8.995433789954339, - "grad_norm": 0.0739816278219223, + "grad_norm": 0.27785488963127136, "learning_rate": 3.715590345727332e-05, - "loss": 0.0367, + "loss": 0.0341, "step": 1970 }, { "epoch": 9.0, - "eval_bertscore_f1": 0.8818957697766191, - "eval_bleu": 0.7007771398124651, - "eval_loss": 0.03325749561190605, - "eval_rougeL": 0.30904254601890724, - "eval_runtime": 85.8594, - "eval_samples_per_second": 17.505, - "eval_steps_per_second": 1.095, + "eval_bertscore_f1": 0.8794770974836902, + "eval_bleu": 0.701948780010586, + "eval_loss": 0.043190885335206985, + "eval_rougeL": 0.30334992897033064, + "eval_runtime": 64.8862, + "eval_samples_per_second": 23.164, + "eval_steps_per_second": 1.449, "step": 1971 }, { "epoch": 9.04109589041096, - "grad_norm": 0.1445605307817459, + "grad_norm": 0.2479076385498047, "learning_rate": 3.709067188519243e-05, - "loss": 0.0325, + "loss": 0.0314, "step": 1980 }, { "epoch": 9.08675799086758, - "grad_norm": 0.09838061034679413, + "grad_norm": 0.3738096058368683, "learning_rate": 3.7025440313111545e-05, - "loss": 0.0313, + "loss": 0.0381, "step": 1990 }, { "epoch": 9.132420091324201, - "grad_norm": 0.1458427757024765, + "grad_norm": 0.2466166913509369, "learning_rate": 3.696020874103066e-05, - "loss": 0.0369, + "loss": 0.0322, "step": 2000 }, { "epoch": 9.178082191780822, - "grad_norm": 0.08086104691028595, + "grad_norm": 0.5537144541740417, "learning_rate": 3.6894977168949777e-05, - "loss": 0.0376, + "loss": 0.0362, "step": 2010 }, { "epoch": 9.223744292237443, - "grad_norm": 0.08939240872859955, + "grad_norm": 0.711529552936554, "learning_rate": 3.682974559686889e-05, - "loss": 0.0366, + "loss": 0.0373, "step": 2020 }, { "epoch": 9.269406392694064, - "grad_norm": 0.3728352189064026, + "grad_norm": 0.15315942466259003, "learning_rate": 3.6764514024788e-05, - "loss": 0.0371, + "loss": 0.0375, "step": 2030 }, { "epoch": 9.315068493150685, - "grad_norm": 0.17691729962825775, + "grad_norm": 0.2318936139345169, "learning_rate": 3.669928245270711e-05, - "loss": 0.0393, + "loss": 0.0337, "step": 2040 }, { "epoch": 9.360730593607306, - "grad_norm": 0.1187521442770958, + "grad_norm": 0.15388214588165283, "learning_rate": 3.6634050880626225e-05, - "loss": 0.0392, + "loss": 0.0363, "step": 2050 }, { "epoch": 9.406392694063927, - "grad_norm": 0.05309230834245682, + "grad_norm": 0.2665681540966034, "learning_rate": 3.656881930854534e-05, - "loss": 0.0351, + "loss": 0.0321, "step": 2060 }, { "epoch": 9.452054794520548, - "grad_norm": 0.11482471972703934, + "grad_norm": 0.3937646150588989, "learning_rate": 3.650358773646445e-05, - "loss": 0.03, + "loss": 0.0372, "step": 2070 }, { "epoch": 9.497716894977168, - "grad_norm": 0.1264190524816513, + "grad_norm": 0.6450703144073486, "learning_rate": 3.643835616438356e-05, - "loss": 0.0391, + "loss": 0.0327, "step": 2080 }, { "epoch": 9.54337899543379, - "grad_norm": 0.3746449947357178, + "grad_norm": 0.8227304816246033, "learning_rate": 3.6373124592302674e-05, - "loss": 0.0303, + "loss": 0.0328, "step": 2090 }, { "epoch": 9.58904109589041, - "grad_norm": 0.0804123654961586, + "grad_norm": 0.5075876712799072, "learning_rate": 3.6307893020221786e-05, - "loss": 0.0307, + "loss": 0.0344, "step": 2100 }, { "epoch": 9.634703196347033, - "grad_norm": 0.12022325396537781, + "grad_norm": 0.25154736638069153, "learning_rate": 3.6242661448140905e-05, - "loss": 0.0419, + "loss": 0.0345, "step": 2110 }, { "epoch": 9.680365296803654, - "grad_norm": 0.07230564951896667, + "grad_norm": 0.4204483926296234, "learning_rate": 3.617742987606002e-05, - "loss": 0.0347, + "loss": 0.0342, "step": 2120 }, { "epoch": 9.726027397260275, - "grad_norm": 0.0902918353676796, + "grad_norm": 0.3545476198196411, "learning_rate": 3.611219830397913e-05, - "loss": 0.0311, + "loss": 0.0358, "step": 2130 }, { "epoch": 9.771689497716896, - "grad_norm": 0.13435712456703186, + "grad_norm": 0.26954710483551025, "learning_rate": 3.604696673189824e-05, - "loss": 0.0381, + "loss": 0.0375, "step": 2140 }, { "epoch": 9.817351598173516, - "grad_norm": 0.24455106258392334, + "grad_norm": 0.1699344962835312, "learning_rate": 3.5981735159817354e-05, - "loss": 0.0343, + "loss": 0.0338, "step": 2150 }, { "epoch": 9.863013698630137, - "grad_norm": 0.4970339238643646, + "grad_norm": 0.24426551163196564, "learning_rate": 3.5916503587736466e-05, - "loss": 0.0498, + "loss": 0.0366, "step": 2160 }, { "epoch": 9.908675799086758, - "grad_norm": 0.08798874914646149, + "grad_norm": 0.17183125019073486, "learning_rate": 3.585127201565558e-05, - "loss": 0.035, + "loss": 0.0316, "step": 2170 }, { "epoch": 9.954337899543379, - "grad_norm": 0.10993966460227966, + "grad_norm": 0.21993212401866913, "learning_rate": 3.578604044357469e-05, - "loss": 0.0378, + "loss": 0.0314, "step": 2180 }, { "epoch": 10.0, - "grad_norm": 0.09713996201753616, + "grad_norm": 0.24478712677955627, "learning_rate": 3.57208088714938e-05, - "loss": 0.0311, + "loss": 0.0359, "step": 2190 }, { "epoch": 10.0, - "eval_bertscore_f1": 0.8794062694786234, - "eval_bleu": 0.533594752651969, - "eval_loss": 0.03252607583999634, - "eval_rougeL": 0.3176795038867278, - "eval_runtime": 86.4689, - "eval_samples_per_second": 17.382, - "eval_steps_per_second": 1.087, + "eval_bertscore_f1": 0.8741392707792981, + "eval_bleu": 0.5581555403302182, + "eval_loss": 0.041961919516325, + "eval_rougeL": 0.3069551250109942, + "eval_runtime": 65.0323, + "eval_samples_per_second": 23.112, + "eval_steps_per_second": 1.445, "step": 2190 }, { "epoch": 10.045662100456621, - "grad_norm": 0.15792271494865417, + "grad_norm": 0.3946993947029114, "learning_rate": 3.5655577299412915e-05, - "loss": 0.0351, + "loss": 0.0349, "step": 2200 }, { "epoch": 10.091324200913242, - "grad_norm": 0.13184691965579987, + "grad_norm": 0.15859711170196533, "learning_rate": 3.5590345727332033e-05, - "loss": 0.0374, + "loss": 0.0322, "step": 2210 }, { "epoch": 10.136986301369863, - "grad_norm": 0.22351279854774475, + "grad_norm": 0.766708493232727, "learning_rate": 3.5525114155251146e-05, - "loss": 0.0315, + "loss": 0.0343, "step": 2220 }, { "epoch": 10.182648401826484, - "grad_norm": 0.07609914988279343, + "grad_norm": 0.17286263406276703, "learning_rate": 3.545988258317026e-05, - "loss": 0.0347, + "loss": 0.0341, "step": 2230 }, { "epoch": 10.228310502283104, - "grad_norm": 0.10054602473974228, + "grad_norm": 0.2328823357820511, "learning_rate": 3.539465101108937e-05, - "loss": 0.0328, + "loss": 0.0343, "step": 2240 }, { "epoch": 10.273972602739725, - "grad_norm": 0.06633490324020386, + "grad_norm": 0.23741354048252106, "learning_rate": 3.532941943900848e-05, - "loss": 0.0312, + "loss": 0.0334, "step": 2250 }, { "epoch": 10.319634703196346, - "grad_norm": 0.09284522384405136, + "grad_norm": 0.23443607985973358, "learning_rate": 3.5264187866927594e-05, - "loss": 0.0345, + "loss": 0.0317, "step": 2260 }, { "epoch": 10.365296803652967, - "grad_norm": 0.09927036613225937, + "grad_norm": 0.29927483201026917, "learning_rate": 3.5198956294846707e-05, - "loss": 0.0353, + "loss": 0.0334, "step": 2270 }, { "epoch": 10.41095890410959, - "grad_norm": 0.32926803827285767, + "grad_norm": 0.5162585377693176, "learning_rate": 3.513372472276582e-05, - "loss": 0.0362, + "loss": 0.0368, "step": 2280 }, { "epoch": 10.45662100456621, - "grad_norm": 0.1537335067987442, + "grad_norm": 0.20458880066871643, "learning_rate": 3.506849315068493e-05, - "loss": 0.038, + "loss": 0.0333, "step": 2290 }, { "epoch": 10.502283105022832, - "grad_norm": 0.27000853419303894, + "grad_norm": 0.6160792708396912, "learning_rate": 3.500326157860404e-05, - "loss": 0.0372, + "loss": 0.0358, "step": 2300 }, { "epoch": 10.547945205479452, - "grad_norm": 0.19232715666294098, + "grad_norm": 0.34151145815849304, "learning_rate": 3.493803000652316e-05, - "loss": 0.0325, + "loss": 0.0332, "step": 2310 }, { "epoch": 10.593607305936073, - "grad_norm": 0.09830646961927414, + "grad_norm": 0.2050519436597824, "learning_rate": 3.4872798434442274e-05, - "loss": 0.0399, + "loss": 0.0333, "step": 2320 }, { "epoch": 10.639269406392694, - "grad_norm": 0.16243436932563782, + "grad_norm": 0.33147433400154114, "learning_rate": 3.4807566862361386e-05, - "loss": 0.0358, + "loss": 0.034, "step": 2330 }, { "epoch": 10.684931506849315, - "grad_norm": 0.11637797206640244, + "grad_norm": 0.20570585131645203, "learning_rate": 3.47423352902805e-05, - "loss": 0.0397, + "loss": 0.0315, "step": 2340 }, { "epoch": 10.730593607305936, - "grad_norm": 0.06575558334589005, + "grad_norm": 0.21926531195640564, "learning_rate": 3.467710371819961e-05, - "loss": 0.0313, + "loss": 0.0335, "step": 2350 }, { "epoch": 10.776255707762557, - "grad_norm": 0.18646180629730225, + "grad_norm": 0.20111322402954102, "learning_rate": 3.461187214611872e-05, - "loss": 0.0334, + "loss": 0.038, "step": 2360 }, { "epoch": 10.821917808219178, - "grad_norm": 0.14705593883991241, + "grad_norm": 0.37514954805374146, "learning_rate": 3.4546640574037835e-05, - "loss": 0.0389, + "loss": 0.0346, "step": 2370 }, { "epoch": 10.867579908675799, - "grad_norm": 0.14052802324295044, + "grad_norm": 0.19650527834892273, "learning_rate": 3.448140900195695e-05, - "loss": 0.0331, + "loss": 0.0323, "step": 2380 }, { "epoch": 10.91324200913242, - "grad_norm": 0.0970313772559166, + "grad_norm": 0.3676673471927643, "learning_rate": 3.441617742987606e-05, - "loss": 0.0331, + "loss": 0.0322, "step": 2390 }, { "epoch": 10.95890410958904, - "grad_norm": 0.0760510265827179, + "grad_norm": 0.2045547366142273, "learning_rate": 3.435094585779517e-05, - "loss": 0.0339, + "loss": 0.036, "step": 2400 }, { "epoch": 11.0, - "eval_bertscore_f1": 0.8777465161211239, - "eval_bleu": 0.8421703823372678, - "eval_loss": 0.032254405319690704, - "eval_rougeL": 0.28874573142254134, - "eval_runtime": 85.9081, - "eval_samples_per_second": 17.495, - "eval_steps_per_second": 1.094, + "eval_bertscore_f1": 0.8752922487909288, + "eval_bleu": 0.9581851182952255, + "eval_loss": 0.0423484668135643, + "eval_rougeL": 0.28949663545817383, + "eval_runtime": 64.9513, + "eval_samples_per_second": 23.14, + "eval_steps_per_second": 1.447, "step": 2409 }, { "epoch": 11.004566210045661, - "grad_norm": 0.07779296487569809, + "grad_norm": 0.21690687537193298, "learning_rate": 3.428571428571429e-05, - "loss": 0.0346, + "loss": 0.0348, "step": 2410 }, { "epoch": 11.050228310502282, - "grad_norm": 0.25582781434059143, + "grad_norm": 0.5675901174545288, "learning_rate": 3.42204827136334e-05, - "loss": 0.0362, + "loss": 0.0339, "step": 2420 }, { "epoch": 11.095890410958905, - "grad_norm": 0.09182050079107285, + "grad_norm": 0.18212980031967163, "learning_rate": 3.4155251141552515e-05, - "loss": 0.0326, + "loss": 0.0361, "step": 2430 }, { "epoch": 11.141552511415526, - "grad_norm": 0.35790833830833435, + "grad_norm": 0.23558968305587769, "learning_rate": 3.409001956947163e-05, - "loss": 0.0337, + "loss": 0.0376, "step": 2440 }, { "epoch": 11.187214611872147, - "grad_norm": 0.3586607873439789, + "grad_norm": 0.1530950367450714, "learning_rate": 3.402478799739074e-05, - "loss": 0.0427, + "loss": 0.0382, "step": 2450 }, { "epoch": 11.232876712328768, - "grad_norm": 0.17306306958198547, + "grad_norm": 0.2215454876422882, "learning_rate": 3.395955642530985e-05, - "loss": 0.0311, + "loss": 0.0335, "step": 2460 }, { "epoch": 11.278538812785389, - "grad_norm": 0.08027535676956177, + "grad_norm": 0.31875351071357727, "learning_rate": 3.3894324853228963e-05, - "loss": 0.0298, + "loss": 0.0336, "step": 2470 }, { "epoch": 11.32420091324201, - "grad_norm": 0.05666491016745567, + "grad_norm": 0.18925060331821442, "learning_rate": 3.3829093281148076e-05, - "loss": 0.0286, + "loss": 0.033, "step": 2480 }, { "epoch": 11.36986301369863, - "grad_norm": 0.0904633179306984, + "grad_norm": 0.1896040290594101, "learning_rate": 3.376386170906719e-05, - "loss": 0.0339, + "loss": 0.0366, "step": 2490 }, { "epoch": 11.415525114155251, - "grad_norm": 0.12512832880020142, + "grad_norm": 0.18136650323867798, "learning_rate": 3.36986301369863e-05, - "loss": 0.0451, + "loss": 0.0325, "step": 2500 }, { "epoch": 11.461187214611872, - "grad_norm": 0.061826564371585846, + "grad_norm": 0.1443028599023819, "learning_rate": 3.363339856490542e-05, - "loss": 0.0355, + "loss": 0.0342, "step": 2510 }, { "epoch": 11.506849315068493, - "grad_norm": 0.19998124241828918, + "grad_norm": 0.3814453184604645, "learning_rate": 3.356816699282453e-05, - "loss": 0.0308, + "loss": 0.0342, "step": 2520 }, { "epoch": 11.552511415525114, - "grad_norm": 0.08238628506660461, + "grad_norm": 0.19911924004554749, "learning_rate": 3.350293542074364e-05, - "loss": 0.0336, + "loss": 0.034, "step": 2530 }, { "epoch": 11.598173515981735, - "grad_norm": 0.18144136667251587, + "grad_norm": 0.16410912573337555, "learning_rate": 3.3437703848662755e-05, - "loss": 0.0303, + "loss": 0.034, "step": 2540 }, { "epoch": 11.643835616438356, - "grad_norm": 0.13037236034870148, + "grad_norm": 0.18103031814098358, "learning_rate": 3.337247227658187e-05, - "loss": 0.0308, + "loss": 0.0309, "step": 2550 }, { "epoch": 11.689497716894977, - "grad_norm": 0.16914191842079163, + "grad_norm": 0.25762510299682617, "learning_rate": 3.330724070450098e-05, - "loss": 0.0302, + "loss": 0.0325, "step": 2560 }, { "epoch": 11.735159817351597, - "grad_norm": 0.24281352758407593, + "grad_norm": 0.20280788838863373, "learning_rate": 3.324200913242009e-05, - "loss": 0.0306, + "loss": 0.0326, "step": 2570 }, { "epoch": 11.780821917808218, - "grad_norm": 0.22488336265087128, + "grad_norm": 0.2774021029472351, "learning_rate": 3.3176777560339204e-05, - "loss": 0.0362, + "loss": 0.0331, "step": 2580 }, { "epoch": 11.826484018264841, - "grad_norm": 0.06625436991453171, + "grad_norm": 0.187673881649971, "learning_rate": 3.3111545988258316e-05, - "loss": 0.0321, + "loss": 0.0342, "step": 2590 }, { "epoch": 11.872146118721462, - "grad_norm": 0.07758279889822006, + "grad_norm": 0.1484459787607193, "learning_rate": 3.304631441617743e-05, - "loss": 0.0372, + "loss": 0.0312, "step": 2600 }, { "epoch": 11.917808219178083, - "grad_norm": 0.08518276363611221, + "grad_norm": 0.25383421778678894, "learning_rate": 3.298108284409655e-05, - "loss": 0.0378, + "loss": 0.0331, "step": 2610 }, { "epoch": 11.963470319634704, - "grad_norm": 0.0794219970703125, + "grad_norm": 0.1816657930612564, "learning_rate": 3.291585127201566e-05, - "loss": 0.0294, + "loss": 0.0324, "step": 2620 }, { "epoch": 12.0, - "eval_bertscore_f1": 0.877530678739884, - "eval_bleu": 0.8141384615121525, - "eval_loss": 0.032114505767822266, - "eval_rougeL": 0.2890521718339253, - "eval_runtime": 85.8584, - "eval_samples_per_second": 17.506, - "eval_steps_per_second": 1.095, + "eval_bertscore_f1": 0.8740238581430254, + "eval_bleu": 0.9457621299589393, + "eval_loss": 0.04242411255836487, + "eval_rougeL": 0.2864058958363045, + "eval_runtime": 65.3663, + "eval_samples_per_second": 22.994, + "eval_steps_per_second": 1.438, "step": 2628 }, { "epoch": 12.009132420091325, - "grad_norm": 0.1260496973991394, + "grad_norm": 0.22657038271427155, "learning_rate": 3.285061969993477e-05, - "loss": 0.0365, + "loss": 0.0328, "step": 2630 }, { "epoch": 12.054794520547945, - "grad_norm": 0.09780646860599518, + "grad_norm": 0.36884865164756775, "learning_rate": 3.2785388127853884e-05, - "loss": 0.0331, + "loss": 0.0344, "step": 2640 }, { "epoch": 12.100456621004566, - "grad_norm": 0.14175942540168762, + "grad_norm": 0.24654433131217957, "learning_rate": 3.2720156555772996e-05, - "loss": 0.0364, + "loss": 0.0335, "step": 2650 }, { "epoch": 12.146118721461187, - "grad_norm": 0.12439887225627899, + "grad_norm": 0.21425046026706696, "learning_rate": 3.265492498369211e-05, - "loss": 0.0291, + "loss": 0.0353, "step": 2660 }, { "epoch": 12.191780821917808, - "grad_norm": 0.09981077909469604, + "grad_norm": 0.1591794192790985, "learning_rate": 3.258969341161122e-05, - "loss": 0.0363, + "loss": 0.03, "step": 2670 }, { "epoch": 12.237442922374429, - "grad_norm": 0.1319703310728073, + "grad_norm": 0.16401417553424835, "learning_rate": 3.252446183953033e-05, - "loss": 0.0307, + "loss": 0.0319, "step": 2680 }, { "epoch": 12.28310502283105, - "grad_norm": 0.10203209519386292, + "grad_norm": 0.37971609830856323, "learning_rate": 3.2459230267449445e-05, - "loss": 0.0308, + "loss": 0.0318, "step": 2690 }, { "epoch": 12.32876712328767, - "grad_norm": 0.3156202435493469, + "grad_norm": 0.17105679214000702, "learning_rate": 3.239399869536856e-05, - "loss": 0.0333, + "loss": 0.0329, "step": 2700 }, { "epoch": 12.374429223744292, - "grad_norm": 0.08748015016317368, + "grad_norm": 0.1671990305185318, "learning_rate": 3.2328767123287676e-05, - "loss": 0.0389, + "loss": 0.0321, "step": 2710 }, { "epoch": 12.420091324200913, - "grad_norm": 0.1384272575378418, + "grad_norm": 0.18382422626018524, "learning_rate": 3.226353555120679e-05, - "loss": 0.0282, + "loss": 0.0344, "step": 2720 }, { "epoch": 12.465753424657533, - "grad_norm": 0.052130818367004395, + "grad_norm": 0.4917747378349304, "learning_rate": 3.21983039791259e-05, - "loss": 0.0283, + "loss": 0.0368, "step": 2730 }, { "epoch": 12.511415525114156, - "grad_norm": 0.14489006996154785, + "grad_norm": 0.274912565946579, "learning_rate": 3.213307240704501e-05, - "loss": 0.0398, + "loss": 0.0341, "step": 2740 }, { "epoch": 12.557077625570777, - "grad_norm": 0.09531650692224503, + "grad_norm": 0.34153324365615845, "learning_rate": 3.2067840834964125e-05, - "loss": 0.0289, + "loss": 0.0343, "step": 2750 }, { "epoch": 12.602739726027398, - "grad_norm": 0.0696859359741211, + "grad_norm": 0.5951138138771057, "learning_rate": 3.200260926288324e-05, - "loss": 0.034, + "loss": 0.0314, "step": 2760 }, { "epoch": 12.648401826484019, - "grad_norm": 0.14592404663562775, + "grad_norm": 0.1907246708869934, "learning_rate": 3.193737769080235e-05, - "loss": 0.0334, + "loss": 0.0298, "step": 2770 }, { "epoch": 12.69406392694064, - "grad_norm": 0.08082027733325958, + "grad_norm": 0.2845422625541687, "learning_rate": 3.187214611872146e-05, - "loss": 0.0295, + "loss": 0.0315, "step": 2780 }, { "epoch": 12.73972602739726, - "grad_norm": 0.08294638246297836, + "grad_norm": 0.5828273892402649, "learning_rate": 3.180691454664057e-05, - "loss": 0.0276, + "loss": 0.0339, "step": 2790 }, { "epoch": 12.785388127853881, - "grad_norm": 0.05551337078213692, + "grad_norm": 0.12505371868610382, "learning_rate": 3.174168297455969e-05, - "loss": 0.0297, + "loss": 0.0329, "step": 2800 }, { "epoch": 12.831050228310502, - "grad_norm": 0.09836950153112411, + "grad_norm": 0.18723076581954956, "learning_rate": 3.1676451402478804e-05, - "loss": 0.0374, + "loss": 0.0296, "step": 2810 }, { "epoch": 12.876712328767123, - "grad_norm": 0.13783608376979828, + "grad_norm": 0.20346902310848236, "learning_rate": 3.1611219830397917e-05, - "loss": 0.0366, + "loss": 0.034, "step": 2820 }, { "epoch": 12.922374429223744, - "grad_norm": 0.11985262483358383, + "grad_norm": 0.21218132972717285, "learning_rate": 3.154598825831703e-05, - "loss": 0.0351, + "loss": 0.0308, "step": 2830 }, { "epoch": 12.968036529680365, - "grad_norm": 0.18932116031646729, + "grad_norm": 0.2717015743255615, "learning_rate": 3.148075668623614e-05, - "loss": 0.0406, + "loss": 0.0331, "step": 2840 }, { "epoch": 13.0, - "eval_bertscore_f1": 0.8794350181907633, - "eval_bleu": 0.7714270941582966, - "eval_loss": 0.03142493963241577, - "eval_rougeL": 0.2958524493369111, - "eval_runtime": 85.3679, - "eval_samples_per_second": 17.606, - "eval_steps_per_second": 1.101, + "eval_bertscore_f1": 0.8743115136088805, + "eval_bleu": 0.5546674313556451, + "eval_loss": 0.041579220443964005, + "eval_rougeL": 0.30863203533209793, + "eval_runtime": 65.4384, + "eval_samples_per_second": 22.968, + "eval_steps_per_second": 1.436, "step": 2847 }, { "epoch": 13.013698630136986, - "grad_norm": 0.09223882853984833, + "grad_norm": 0.45697784423828125, "learning_rate": 3.141552511415525e-05, - "loss": 0.0323, + "loss": 0.034, "step": 2850 }, { "epoch": 13.059360730593607, - "grad_norm": 0.10966315865516663, + "grad_norm": 0.2796197235584259, "learning_rate": 3.1350293542074365e-05, - "loss": 0.0308, + "loss": 0.0349, "step": 2860 }, { "epoch": 13.105022831050228, - "grad_norm": 0.09420251101255417, + "grad_norm": 0.2153574377298355, "learning_rate": 3.128506196999348e-05, "loss": 0.0323, "step": 2870 }, { "epoch": 13.150684931506849, - "grad_norm": 0.14217647910118103, + "grad_norm": 0.19059832394123077, "learning_rate": 3.121983039791259e-05, - "loss": 0.0341, + "loss": 0.0342, "step": 2880 }, { "epoch": 13.19634703196347, - "grad_norm": 0.13941463828086853, + "grad_norm": 0.2336965799331665, "learning_rate": 3.11545988258317e-05, - "loss": 0.0322, + "loss": 0.0335, "step": 2890 }, { "epoch": 13.242009132420092, - "grad_norm": 0.05103166028857231, + "grad_norm": 0.2201681286096573, "learning_rate": 3.108936725375082e-05, - "loss": 0.0323, + "loss": 0.0324, "step": 2900 }, { "epoch": 13.287671232876713, - "grad_norm": 0.13112640380859375, + "grad_norm": 0.4124416708946228, "learning_rate": 3.102413568166993e-05, - "loss": 0.0291, + "loss": 0.0332, "step": 2910 }, { "epoch": 13.333333333333334, - "grad_norm": 0.10072794556617737, + "grad_norm": 0.24998348951339722, "learning_rate": 3.0958904109589045e-05, - "loss": 0.0336, + "loss": 0.0318, "step": 2920 }, { "epoch": 13.378995433789955, - "grad_norm": 0.05783500149846077, + "grad_norm": 0.22465960681438446, "learning_rate": 3.089367253750816e-05, - "loss": 0.0308, + "loss": 0.0335, "step": 2930 }, { "epoch": 13.424657534246576, - "grad_norm": 0.06233891472220421, + "grad_norm": 0.21678076684474945, "learning_rate": 3.082844096542727e-05, - "loss": 0.0316, + "loss": 0.0339, "step": 2940 }, { "epoch": 13.470319634703197, - "grad_norm": 0.08696655184030533, + "grad_norm": 0.1709524393081665, "learning_rate": 3.076320939334638e-05, - "loss": 0.0303, + "loss": 0.0331, "step": 2950 }, { "epoch": 13.515981735159817, - "grad_norm": 0.12923157215118408, + "grad_norm": 0.1303175985813141, "learning_rate": 3.0697977821265494e-05, - "loss": 0.0317, + "loss": 0.0288, "step": 2960 }, { "epoch": 13.561643835616438, - "grad_norm": 0.06819671392440796, + "grad_norm": 0.2762468159198761, "learning_rate": 3.0632746249184606e-05, - "loss": 0.0338, + "loss": 0.0303, "step": 2970 }, { "epoch": 13.60730593607306, - "grad_norm": 0.05921826884150505, + "grad_norm": 0.12408158928155899, "learning_rate": 3.056751467710372e-05, - "loss": 0.0306, + "loss": 0.0311, "step": 2980 }, { "epoch": 13.65296803652968, - "grad_norm": 0.12270516902208328, + "grad_norm": 0.1823064684867859, "learning_rate": 3.0502283105022834e-05, - "loss": 0.0354, + "loss": 0.0294, "step": 2990 }, { "epoch": 13.698630136986301, - "grad_norm": 0.08732906728982925, + "grad_norm": 0.23568011820316315, "learning_rate": 3.0437051532941946e-05, - "loss": 0.0357, + "loss": 0.0329, "step": 3000 }, { "epoch": 13.744292237442922, - "grad_norm": 0.23829276859760284, + "grad_norm": 0.19115784764289856, "learning_rate": 3.0371819960861058e-05, - "loss": 0.0343, + "loss": 0.0317, "step": 3010 }, { "epoch": 13.789954337899543, - "grad_norm": 0.05741345137357712, + "grad_norm": 0.149480938911438, "learning_rate": 3.030658838878017e-05, - "loss": 0.0296, + "loss": 0.0332, "step": 3020 }, { "epoch": 13.835616438356164, - "grad_norm": 0.08565385639667511, + "grad_norm": 0.21910887956619263, "learning_rate": 3.0241356816699286e-05, - "loss": 0.0355, + "loss": 0.0352, "step": 3030 }, { "epoch": 13.881278538812785, - "grad_norm": 0.07784215360879898, + "grad_norm": 0.20704659819602966, "learning_rate": 3.0176125244618398e-05, - "loss": 0.0339, + "loss": 0.0312, "step": 3040 }, { "epoch": 13.926940639269407, - "grad_norm": 0.059760138392448425, + "grad_norm": 0.15948213636875153, "learning_rate": 3.011089367253751e-05, - "loss": 0.0294, + "loss": 0.0317, "step": 3050 }, { "epoch": 13.972602739726028, - "grad_norm": 0.11744826287031174, + "grad_norm": 0.2612301707267761, "learning_rate": 3.0045662100456622e-05, - "loss": 0.0359, + "loss": 0.0337, "step": 3060 }, { "epoch": 14.0, - "eval_bertscore_f1": 0.8785086652872488, - "eval_bleu": 0.44968903656050474, - "eval_loss": 0.03112851269543171, - "eval_rougeL": 0.3259152581485867, - "eval_runtime": 86.2048, - "eval_samples_per_second": 17.435, - "eval_steps_per_second": 1.09, + "eval_bertscore_f1": 0.8789519665246001, + "eval_bleu": 0.7617727244561832, + "eval_loss": 0.042337022721767426, + "eval_rougeL": 0.30510897093069805, + "eval_runtime": 65.1106, + "eval_samples_per_second": 23.084, + "eval_steps_per_second": 1.444, "step": 3066 }, { "epoch": 14.018264840182649, - "grad_norm": 0.08186227828264236, + "grad_norm": 0.1484575718641281, "learning_rate": 2.9980430528375734e-05, - "loss": 0.0382, + "loss": 0.0331, "step": 3070 }, { "epoch": 14.06392694063927, - "grad_norm": 0.3178916871547699, + "grad_norm": 0.2205040156841278, "learning_rate": 2.991519895629485e-05, - "loss": 0.0377, + "loss": 0.0318, "step": 3080 }, { "epoch": 14.10958904109589, - "grad_norm": 0.07393322885036469, + "grad_norm": 0.18296614289283752, "learning_rate": 2.9849967384213962e-05, - "loss": 0.0284, + "loss": 0.0315, "step": 3090 }, { "epoch": 14.155251141552512, - "grad_norm": 0.07416381686925888, + "grad_norm": 0.38665133714675903, "learning_rate": 2.9784735812133074e-05, - "loss": 0.0301, + "loss": 0.0329, "step": 3100 }, { "epoch": 14.200913242009133, - "grad_norm": 0.1198846623301506, + "grad_norm": 0.21823015809059143, "learning_rate": 2.9719504240052186e-05, - "loss": 0.0316, + "loss": 0.0319, "step": 3110 }, { "epoch": 14.246575342465754, - "grad_norm": 0.07470612972974777, + "grad_norm": 0.16656526923179626, "learning_rate": 2.96542726679713e-05, - "loss": 0.0335, + "loss": 0.0349, "step": 3120 }, { "epoch": 14.292237442922374, - "grad_norm": 0.18774984776973724, + "grad_norm": 0.15548262000083923, "learning_rate": 2.9589041095890414e-05, - "loss": 0.0366, + "loss": 0.0322, "step": 3130 }, { "epoch": 14.337899543378995, - "grad_norm": 0.11751188337802887, + "grad_norm": 0.15261350572109222, "learning_rate": 2.9523809523809526e-05, - "loss": 0.0382, + "loss": 0.0327, "step": 3140 }, { "epoch": 14.383561643835616, - "grad_norm": 0.08535225689411163, + "grad_norm": 0.37383410334587097, "learning_rate": 2.945857795172864e-05, - "loss": 0.0331, + "loss": 0.0305, "step": 3150 }, { "epoch": 14.429223744292237, - "grad_norm": 0.07403095066547394, + "grad_norm": 0.25296688079833984, "learning_rate": 2.939334637964775e-05, - "loss": 0.0261, + "loss": 0.0331, "step": 3160 }, { "epoch": 14.474885844748858, - "grad_norm": 0.19732074439525604, + "grad_norm": 0.1821436882019043, "learning_rate": 2.9328114807566863e-05, - "loss": 0.0332, + "loss": 0.0323, "step": 3170 }, { "epoch": 14.520547945205479, - "grad_norm": 0.2637515366077423, + "grad_norm": 0.24707984924316406, "learning_rate": 2.926288323548598e-05, - "loss": 0.0327, + "loss": 0.0379, "step": 3180 }, { "epoch": 14.5662100456621, - "grad_norm": 0.057027578353881836, + "grad_norm": 0.47426894307136536, "learning_rate": 2.919765166340509e-05, - "loss": 0.0304, + "loss": 0.0297, "step": 3190 }, { "epoch": 14.61187214611872, - "grad_norm": 0.0682603120803833, + "grad_norm": 1.039271593093872, "learning_rate": 2.9132420091324203e-05, - "loss": 0.0286, + "loss": 0.0333, "step": 3200 }, { "epoch": 14.657534246575342, - "grad_norm": 0.07055100798606873, + "grad_norm": 0.3582177460193634, "learning_rate": 2.9067188519243315e-05, - "loss": 0.0332, + "loss": 0.0319, "step": 3210 }, { "epoch": 14.703196347031964, - "grad_norm": 0.3047225773334503, + "grad_norm": 0.23270468413829803, "learning_rate": 2.9001956947162427e-05, - "loss": 0.0364, + "loss": 0.0313, "step": 3220 }, { "epoch": 14.748858447488585, - "grad_norm": 0.052217088639736176, + "grad_norm": 0.18566164374351501, "learning_rate": 2.8936725375081543e-05, - "loss": 0.0305, + "loss": 0.0313, "step": 3230 }, { "epoch": 14.794520547945206, - "grad_norm": 0.13912373781204224, + "grad_norm": 0.1770157665014267, "learning_rate": 2.8871493803000655e-05, - "loss": 0.0356, + "loss": 0.0328, "step": 3240 }, { "epoch": 14.840182648401827, - "grad_norm": 0.06369337439537048, + "grad_norm": 0.18902353942394257, "learning_rate": 2.8806262230919767e-05, - "loss": 0.0304, + "loss": 0.0305, "step": 3250 }, { "epoch": 14.885844748858448, - "grad_norm": 0.09859542548656464, + "grad_norm": 0.21729163825511932, "learning_rate": 2.874103065883888e-05, - "loss": 0.0305, + "loss": 0.0348, "step": 3260 }, { "epoch": 14.931506849315069, - "grad_norm": 0.08918995410203934, + "grad_norm": 0.1655968874692917, "learning_rate": 2.867579908675799e-05, - "loss": 0.0282, + "loss": 0.0359, "step": 3270 }, { "epoch": 14.97716894977169, - "grad_norm": 0.09965560585260391, + "grad_norm": 0.21756260097026825, "learning_rate": 2.8610567514677107e-05, - "loss": 0.0298, + "loss": 0.0342, "step": 3280 }, { "epoch": 15.0, - "eval_bertscore_f1": 0.8813119770603345, - "eval_bleu": 0.6140646535986611, - "eval_loss": 0.030848076567053795, - "eval_rougeL": 0.3100828866535566, - "eval_runtime": 85.4538, - "eval_samples_per_second": 17.588, - "eval_steps_per_second": 1.1, + "eval_bertscore_f1": 0.874911447445393, + "eval_bleu": 0.7648419907762714, + "eval_loss": 0.042606160044670105, + "eval_rougeL": 0.29841511614623334, + "eval_runtime": 65.1464, + "eval_samples_per_second": 23.071, + "eval_steps_per_second": 1.443, "step": 3285 }, { "epoch": 15.02283105022831, - "grad_norm": 0.07896128296852112, + "grad_norm": 0.23614604771137238, "learning_rate": 2.854533594259622e-05, - "loss": 0.0273, + "loss": 0.0325, "step": 3290 }, { "epoch": 15.068493150684931, - "grad_norm": 0.25050729513168335, + "grad_norm": 0.25791773200035095, "learning_rate": 2.848010437051533e-05, - "loss": 0.0339, + "loss": 0.0328, "step": 3300 }, { "epoch": 15.114155251141552, - "grad_norm": 0.19186432659626007, + "grad_norm": 0.23111367225646973, "learning_rate": 2.8414872798434443e-05, - "loss": 0.0273, + "loss": 0.033, "step": 3310 }, { "epoch": 15.159817351598173, - "grad_norm": 0.09528006613254547, + "grad_norm": 0.1846420168876648, "learning_rate": 2.8349641226353556e-05, - "loss": 0.0354, + "loss": 0.0307, "step": 3320 }, { "epoch": 15.205479452054794, - "grad_norm": 0.09040423482656479, + "grad_norm": 0.18300461769104004, "learning_rate": 2.828440965427267e-05, - "loss": 0.0311, + "loss": 0.0338, "step": 3330 }, { "epoch": 15.251141552511415, - "grad_norm": 0.0519646555185318, + "grad_norm": 0.18972980976104736, "learning_rate": 2.8219178082191783e-05, - "loss": 0.0275, + "loss": 0.0328, "step": 3340 }, { "epoch": 15.296803652968036, - "grad_norm": 0.17647899687290192, + "grad_norm": 0.19306117296218872, "learning_rate": 2.8153946510110896e-05, - "loss": 0.0358, + "loss": 0.0303, "step": 3350 }, { "epoch": 15.342465753424657, - "grad_norm": 0.099759042263031, + "grad_norm": 0.1415315568447113, "learning_rate": 2.8088714938030008e-05, - "loss": 0.0277, + "loss": 0.0314, "step": 3360 }, { "epoch": 15.38812785388128, - "grad_norm": 0.060709647834300995, + "grad_norm": 0.17149841785430908, "learning_rate": 2.802348336594912e-05, - "loss": 0.029, + "loss": 0.0318, "step": 3370 }, { "epoch": 15.4337899543379, - "grad_norm": 0.08363103866577148, + "grad_norm": 0.19151033461093903, "learning_rate": 2.7958251793868235e-05, - "loss": 0.0289, + "loss": 0.0319, "step": 3380 }, { "epoch": 15.479452054794521, - "grad_norm": 0.0995960682630539, + "grad_norm": 0.24431711435317993, "learning_rate": 2.7893020221787348e-05, - "loss": 0.0342, + "loss": 0.0318, "step": 3390 }, { "epoch": 15.525114155251142, - "grad_norm": 0.0788845494389534, + "grad_norm": 0.21643275022506714, "learning_rate": 2.782778864970646e-05, - "loss": 0.0286, + "loss": 0.0305, "step": 3400 }, { "epoch": 15.570776255707763, - "grad_norm": 0.07989612221717834, + "grad_norm": 0.4153854250907898, "learning_rate": 2.7762557077625572e-05, - "loss": 0.0308, + "loss": 0.0319, "step": 3410 }, { "epoch": 15.616438356164384, - "grad_norm": 0.05240185931324959, + "grad_norm": 0.3891514539718628, "learning_rate": 2.7697325505544684e-05, - "loss": 0.0282, + "loss": 0.0306, "step": 3420 }, { "epoch": 15.662100456621005, - "grad_norm": 0.12877096235752106, + "grad_norm": 0.23639145493507385, "learning_rate": 2.76320939334638e-05, - "loss": 0.0288, + "loss": 0.0321, "step": 3430 }, { "epoch": 15.707762557077626, - "grad_norm": 0.25699493288993835, + "grad_norm": 0.19188259541988373, "learning_rate": 2.7566862361382912e-05, - "loss": 0.0346, + "loss": 0.0348, "step": 3440 }, { "epoch": 15.753424657534246, - "grad_norm": 0.37610942125320435, + "grad_norm": 0.1795605719089508, "learning_rate": 2.7501630789302024e-05, - "loss": 0.0325, + "loss": 0.0332, "step": 3450 }, { "epoch": 15.799086757990867, - "grad_norm": 0.19347558915615082, + "grad_norm": 0.1636660099029541, "learning_rate": 2.7436399217221136e-05, - "loss": 0.0332, + "loss": 0.0315, "step": 3460 }, { "epoch": 15.844748858447488, - "grad_norm": 0.148736372590065, + "grad_norm": 0.19387169182300568, "learning_rate": 2.737116764514025e-05, - "loss": 0.0269, + "loss": 0.0321, "step": 3470 }, { "epoch": 15.89041095890411, - "grad_norm": 0.3108772039413452, + "grad_norm": 0.19701571762561798, "learning_rate": 2.7305936073059364e-05, - "loss": 0.0376, + "loss": 0.0306, "step": 3480 }, { "epoch": 15.93607305936073, - "grad_norm": 0.1941206008195877, + "grad_norm": 0.11701110750436783, "learning_rate": 2.7240704500978476e-05, - "loss": 0.034, + "loss": 0.0318, "step": 3490 }, { "epoch": 15.981735159817351, - "grad_norm": 0.1268136203289032, + "grad_norm": 0.18387459218502045, "learning_rate": 2.7175472928897588e-05, - "loss": 0.0308, + "loss": 0.0318, "step": 3500 }, { "epoch": 16.0, - "eval_bertscore_f1": 0.8762329206098657, - "eval_bleu": 0.5864006312262637, - "eval_loss": 0.030562568455934525, - "eval_rougeL": 0.2886638006885884, - "eval_runtime": 85.2823, - "eval_samples_per_second": 17.624, - "eval_steps_per_second": 1.102, + "eval_bertscore_f1": 0.8746434982744599, + "eval_bleu": 0.5087846068780025, + "eval_loss": 0.04214566573500633, + "eval_rougeL": 0.3110731653536748, + "eval_runtime": 65.1075, + "eval_samples_per_second": 23.085, + "eval_steps_per_second": 1.444, "step": 3504 }, { "epoch": 16.027397260273972, - "grad_norm": 0.07895983755588531, + "grad_norm": 0.23977895081043243, "learning_rate": 2.71102413568167e-05, - "loss": 0.0374, + "loss": 0.0313, "step": 3510 }, { "epoch": 16.073059360730593, - "grad_norm": 0.09038267284631729, + "grad_norm": 0.1618836671113968, "learning_rate": 2.7045009784735813e-05, - "loss": 0.0304, + "loss": 0.0332, "step": 3520 }, { "epoch": 16.118721461187214, - "grad_norm": 0.06773549318313599, + "grad_norm": 0.16630738973617554, "learning_rate": 2.6979778212654928e-05, - "loss": 0.0301, + "loss": 0.0314, "step": 3530 }, { "epoch": 16.164383561643834, - "grad_norm": 0.08219348639249802, + "grad_norm": 0.12681370973587036, "learning_rate": 2.691454664057404e-05, - "loss": 0.03, + "loss": 0.0358, "step": 3540 }, { "epoch": 16.210045662100455, - "grad_norm": 0.08681906014680862, + "grad_norm": 0.2200222760438919, "learning_rate": 2.6849315068493153e-05, - "loss": 0.0337, + "loss": 0.0313, "step": 3550 }, { "epoch": 16.255707762557076, - "grad_norm": 0.07358887046575546, + "grad_norm": 0.13934072852134705, "learning_rate": 2.6784083496412265e-05, - "loss": 0.0281, + "loss": 0.0317, "step": 3560 }, { "epoch": 16.301369863013697, - "grad_norm": 0.07765143364667892, + "grad_norm": 0.2129465490579605, "learning_rate": 2.6718851924331377e-05, - "loss": 0.0321, + "loss": 0.0317, "step": 3570 }, { "epoch": 16.347031963470318, - "grad_norm": 0.05562649667263031, + "grad_norm": 0.19109800457954407, "learning_rate": 2.6653620352250492e-05, - "loss": 0.0308, + "loss": 0.0326, "step": 3580 }, { "epoch": 16.39269406392694, - "grad_norm": 0.055072009563446045, + "grad_norm": 0.20696400105953217, "learning_rate": 2.6588388780169605e-05, - "loss": 0.0306, + "loss": 0.0319, "step": 3590 }, { "epoch": 16.438356164383563, - "grad_norm": 0.07029841840267181, + "grad_norm": 0.21380408108234406, "learning_rate": 2.6523157208088717e-05, - "loss": 0.0304, + "loss": 0.0329, "step": 3600 }, { "epoch": 16.484018264840184, - "grad_norm": 0.06442932784557343, + "grad_norm": 0.2107531875371933, "learning_rate": 2.645792563600783e-05, - "loss": 0.0285, + "loss": 0.0328, "step": 3610 }, { "epoch": 16.529680365296805, - "grad_norm": 0.10228294134140015, + "grad_norm": 0.1593928039073944, "learning_rate": 2.639269406392694e-05, - "loss": 0.0299, + "loss": 0.0334, "step": 3620 }, { "epoch": 16.575342465753426, - "grad_norm": 0.060761261731386185, + "grad_norm": 0.2697247862815857, "learning_rate": 2.6327462491846057e-05, - "loss": 0.0288, + "loss": 0.0315, "step": 3630 }, { "epoch": 16.621004566210047, - "grad_norm": 0.1510060727596283, + "grad_norm": 0.18833734095096588, "learning_rate": 2.626223091976517e-05, - "loss": 0.0286, + "loss": 0.0317, "step": 3640 }, { "epoch": 16.666666666666668, - "grad_norm": 0.09816893935203552, + "grad_norm": 0.13584983348846436, "learning_rate": 2.619699934768428e-05, - "loss": 0.0293, + "loss": 0.0316, "step": 3650 }, { "epoch": 16.71232876712329, - "grad_norm": 0.10027530789375305, + "grad_norm": 0.15701667964458466, "learning_rate": 2.6131767775603393e-05, - "loss": 0.0287, + "loss": 0.033, "step": 3660 }, { "epoch": 16.75799086757991, - "grad_norm": 0.08933715522289276, + "grad_norm": 0.233077272772789, "learning_rate": 2.6066536203522505e-05, - "loss": 0.03, + "loss": 0.0323, "step": 3670 }, { "epoch": 16.80365296803653, - "grad_norm": 0.06297653168439865, + "grad_norm": 0.3123913109302521, "learning_rate": 2.600130463144162e-05, - "loss": 0.0291, + "loss": 0.0316, "step": 3680 }, { "epoch": 16.84931506849315, - "grad_norm": 0.1338263303041458, + "grad_norm": 0.13571007549762726, "learning_rate": 2.5936073059360733e-05, - "loss": 0.0369, + "loss": 0.0327, "step": 3690 }, { "epoch": 16.894977168949772, - "grad_norm": 0.25006791949272156, + "grad_norm": 0.2144760936498642, "learning_rate": 2.5870841487279845e-05, - "loss": 0.0301, + "loss": 0.0419, "step": 3700 }, { "epoch": 16.940639269406393, - "grad_norm": 0.26022225618362427, + "grad_norm": 0.17451731860637665, "learning_rate": 2.5805609915198957e-05, - "loss": 0.0335, + "loss": 0.0315, "step": 3710 }, { "epoch": 16.986301369863014, - "grad_norm": 0.10207168757915497, + "grad_norm": 0.18419808149337769, "learning_rate": 2.574037834311807e-05, - "loss": 0.03, + "loss": 0.036, "step": 3720 }, { "epoch": 17.0, - "eval_bertscore_f1": 0.879923531196947, - "eval_bleu": 0.6651508967347922, - "eval_loss": 0.030300738289952278, - "eval_rougeL": 0.3055340180010153, - "eval_runtime": 85.0894, - "eval_samples_per_second": 17.664, - "eval_steps_per_second": 1.105, + "eval_bertscore_f1": 0.8797636342223136, + "eval_bleu": 0.706349516211386, + "eval_loss": 0.041200146079063416, + "eval_rougeL": 0.3065169488015689, + "eval_runtime": 65.4904, + "eval_samples_per_second": 22.95, + "eval_steps_per_second": 1.435, "step": 3723 }, { "epoch": 17.031963470319635, - "grad_norm": 0.05873997136950493, + "grad_norm": 0.19375832378864288, "learning_rate": 2.5675146771037185e-05, - "loss": 0.0333, + "loss": 0.0318, "step": 3730 }, { "epoch": 17.077625570776256, - "grad_norm": 0.1252429187297821, + "grad_norm": 0.18487004935741425, "learning_rate": 2.5609915198956297e-05, - "loss": 0.0299, + "loss": 0.0295, "step": 3740 }, { "epoch": 17.123287671232877, - "grad_norm": 0.07967006415128708, + "grad_norm": 0.25660863518714905, "learning_rate": 2.554468362687541e-05, - "loss": 0.0295, + "loss": 0.0308, "step": 3750 }, { "epoch": 17.168949771689498, - "grad_norm": 0.09141400456428528, + "grad_norm": 0.16052749752998352, "learning_rate": 2.547945205479452e-05, - "loss": 0.0282, + "loss": 0.0325, "step": 3760 }, { "epoch": 17.21461187214612, - "grad_norm": 0.18087051808834076, + "grad_norm": 0.22260412573814392, "learning_rate": 2.5414220482713634e-05, - "loss": 0.0342, + "loss": 0.0326, "step": 3770 }, { "epoch": 17.26027397260274, - "grad_norm": 0.06933823972940445, + "grad_norm": 0.1811288297176361, "learning_rate": 2.534898891063275e-05, - "loss": 0.0334, + "loss": 0.032, "step": 3780 }, { "epoch": 17.30593607305936, - "grad_norm": 0.06062929704785347, + "grad_norm": 0.24194012582302094, "learning_rate": 2.528375733855186e-05, - "loss": 0.0304, + "loss": 0.0318, "step": 3790 }, { "epoch": 17.35159817351598, - "grad_norm": 0.05274573713541031, + "grad_norm": 0.14382942020893097, "learning_rate": 2.5218525766470974e-05, - "loss": 0.0299, + "loss": 0.0307, "step": 3800 }, { "epoch": 17.397260273972602, - "grad_norm": 0.11018598824739456, + "grad_norm": 0.1479591429233551, "learning_rate": 2.5153294194390086e-05, "loss": 0.0297, "step": 3810 }, { "epoch": 17.442922374429223, - "grad_norm": 0.2097170352935791, + "grad_norm": 0.18002314865589142, "learning_rate": 2.5088062622309198e-05, - "loss": 0.0311, + "loss": 0.0299, "step": 3820 }, { "epoch": 17.488584474885844, - "grad_norm": 0.06161818653345108, + "grad_norm": 0.20746544003486633, "learning_rate": 2.5022831050228314e-05, - "loss": 0.0261, + "loss": 0.0301, "step": 3830 }, { "epoch": 17.534246575342465, - "grad_norm": 0.10092420876026154, + "grad_norm": 0.18796005845069885, "learning_rate": 2.4957599478147426e-05, - "loss": 0.0348, + "loss": 0.0313, "step": 3840 }, { "epoch": 17.579908675799086, - "grad_norm": 0.06519993394613266, + "grad_norm": 0.1633671671152115, "learning_rate": 2.4892367906066538e-05, - "loss": 0.0296, + "loss": 0.031, "step": 3850 }, { "epoch": 17.625570776255707, - "grad_norm": 0.08260706067085266, + "grad_norm": 0.20464873313903809, "learning_rate": 2.482713633398565e-05, - "loss": 0.0362, + "loss": 0.0294, "step": 3860 }, { "epoch": 17.671232876712327, - "grad_norm": 0.07475756853818893, + "grad_norm": 0.1803971529006958, "learning_rate": 2.4761904761904762e-05, - "loss": 0.0296, + "loss": 0.0322, "step": 3870 }, { "epoch": 17.71689497716895, - "grad_norm": 0.05199064686894417, + "grad_norm": 0.2384488433599472, "learning_rate": 2.4696673189823878e-05, - "loss": 0.0281, + "loss": 0.0319, "step": 3880 }, { "epoch": 17.76255707762557, - "grad_norm": 0.09524153172969818, + "grad_norm": 0.11598934233188629, "learning_rate": 2.463144161774299e-05, - "loss": 0.031, + "loss": 0.0334, "step": 3890 }, { "epoch": 17.80821917808219, - "grad_norm": 0.05123337730765343, + "grad_norm": 0.2237691730260849, "learning_rate": 2.4566210045662102e-05, - "loss": 0.0306, + "loss": 0.0317, "step": 3900 }, { "epoch": 17.853881278538815, - "grad_norm": 0.06459668278694153, + "grad_norm": 0.23967674374580383, "learning_rate": 2.4500978473581214e-05, - "loss": 0.0287, + "loss": 0.0325, "step": 3910 }, { "epoch": 17.899543378995435, - "grad_norm": 0.06326840072870255, + "grad_norm": 0.15783822536468506, "learning_rate": 2.4435746901500327e-05, - "loss": 0.0287, + "loss": 0.0318, "step": 3920 }, { "epoch": 17.945205479452056, - "grad_norm": 0.060352522879838943, + "grad_norm": 0.1800832897424698, "learning_rate": 2.4370515329419442e-05, - "loss": 0.0344, + "loss": 0.0317, "step": 3930 }, { "epoch": 17.990867579908677, - "grad_norm": 0.08868108689785004, + "grad_norm": 0.5130301713943481, "learning_rate": 2.4305283757338554e-05, - "loss": 0.0276, + "loss": 0.0319, "step": 3940 }, { "epoch": 18.0, - "eval_bertscore_f1": 0.8797207474311985, - "eval_bleu": 0.5060863215577377, - "eval_loss": 0.030214933678507805, - "eval_rougeL": 0.3247787979416723, - "eval_runtime": 85.499, - "eval_samples_per_second": 17.579, - "eval_steps_per_second": 1.099, + "eval_bertscore_f1": 0.8798081558780201, + "eval_bleu": 0.7220797778715596, + "eval_loss": 0.042013321071863174, + "eval_rougeL": 0.30594782179206126, + "eval_runtime": 65.524, + "eval_samples_per_second": 22.938, + "eval_steps_per_second": 1.435, "step": 3942 }, { "epoch": 18.036529680365298, - "grad_norm": 0.10745666921138763, + "grad_norm": 0.16154253482818604, "learning_rate": 2.4240052185257666e-05, - "loss": 0.0318, + "loss": 0.03, "step": 3950 }, { "epoch": 18.08219178082192, - "grad_norm": 0.05487235262989998, + "grad_norm": 0.16206321120262146, "learning_rate": 2.417482061317678e-05, - "loss": 0.0279, + "loss": 0.0297, "step": 3960 }, { "epoch": 18.12785388127854, - "grad_norm": 0.14988921582698822, + "grad_norm": 0.22228258848190308, "learning_rate": 2.410958904109589e-05, - "loss": 0.0323, + "loss": 0.0302, "step": 3970 }, { "epoch": 18.17351598173516, - "grad_norm": 0.05322180688381195, + "grad_norm": 0.2068241685628891, "learning_rate": 2.4044357469015006e-05, - "loss": 0.0305, + "loss": 0.0322, "step": 3980 }, { "epoch": 18.21917808219178, - "grad_norm": 0.08688576519489288, + "grad_norm": 0.14935638010501862, "learning_rate": 2.397912589693412e-05, - "loss": 0.0285, + "loss": 0.0319, "step": 3990 }, { "epoch": 18.264840182648403, - "grad_norm": 0.064597949385643, + "grad_norm": 0.16079388558864594, "learning_rate": 2.391389432485323e-05, - "loss": 0.0283, + "loss": 0.0312, "step": 4000 }, { "epoch": 18.310502283105023, - "grad_norm": 0.23966668546199799, + "grad_norm": 0.1624707281589508, "learning_rate": 2.3848662752772343e-05, - "loss": 0.0307, + "loss": 0.0316, "step": 4010 }, { "epoch": 18.356164383561644, - "grad_norm": 0.27511870861053467, + "grad_norm": 0.1676281988620758, "learning_rate": 2.3783431180691455e-05, - "loss": 0.0281, + "loss": 0.0305, "step": 4020 }, { "epoch": 18.401826484018265, - "grad_norm": 0.23338817059993744, + "grad_norm": 0.16586793959140778, "learning_rate": 2.371819960861057e-05, - "loss": 0.0345, + "loss": 0.0323, "step": 4030 }, { "epoch": 18.447488584474886, - "grad_norm": 0.07026497274637222, + "grad_norm": 0.1340363472700119, "learning_rate": 2.3652968036529683e-05, - "loss": 0.0333, + "loss": 0.0289, "step": 4040 }, { "epoch": 18.493150684931507, - "grad_norm": 0.11002654582262039, + "grad_norm": 0.14134575426578522, "learning_rate": 2.3587736464448795e-05, - "loss": 0.0278, + "loss": 0.0318, "step": 4050 }, { "epoch": 18.538812785388128, - "grad_norm": 0.18548107147216797, + "grad_norm": 0.16164334118366241, "learning_rate": 2.3522504892367907e-05, - "loss": 0.0299, + "loss": 0.0303, "step": 4060 }, { "epoch": 18.58447488584475, - "grad_norm": 0.053734730929136276, + "grad_norm": 0.3421485722064972, "learning_rate": 2.345727332028702e-05, - "loss": 0.029, + "loss": 0.0313, "step": 4070 }, { "epoch": 18.63013698630137, - "grad_norm": 0.08840513974428177, + "grad_norm": 0.17548483610153198, "learning_rate": 2.3392041748206135e-05, - "loss": 0.0276, + "loss": 0.0322, "step": 4080 }, { "epoch": 18.67579908675799, - "grad_norm": 0.07163436710834503, + "grad_norm": 0.11101207137107849, "learning_rate": 2.3326810176125247e-05, - "loss": 0.0327, + "loss": 0.033, "step": 4090 }, { "epoch": 18.72146118721461, - "grad_norm": 0.07794025540351868, + "grad_norm": 0.19600647687911987, "learning_rate": 2.326157860404436e-05, - "loss": 0.0277, + "loss": 0.0314, "step": 4100 }, { "epoch": 18.767123287671232, - "grad_norm": 0.25026071071624756, + "grad_norm": 0.4979359805583954, "learning_rate": 2.319634703196347e-05, - "loss": 0.0271, + "loss": 0.0304, "step": 4110 }, { "epoch": 18.812785388127853, - "grad_norm": 0.05697787553071976, + "grad_norm": 0.16611818969249725, "learning_rate": 2.3131115459882584e-05, - "loss": 0.0368, + "loss": 0.0326, "step": 4120 }, { "epoch": 18.858447488584474, - "grad_norm": 0.31116795539855957, + "grad_norm": 0.2194747030735016, "learning_rate": 2.30658838878017e-05, - "loss": 0.0358, + "loss": 0.0336, "step": 4130 }, { "epoch": 18.904109589041095, - "grad_norm": 0.07639028131961823, + "grad_norm": 0.2581609785556793, "learning_rate": 2.300065231572081e-05, - "loss": 0.0268, + "loss": 0.0306, "step": 4140 }, { "epoch": 18.949771689497716, - "grad_norm": 0.10291515290737152, + "grad_norm": 0.20705465972423553, "learning_rate": 2.2935420743639923e-05, - "loss": 0.03, + "loss": 0.031, "step": 4150 }, { "epoch": 18.995433789954337, - "grad_norm": 0.06285667419433594, + "grad_norm": 0.18591341376304626, "learning_rate": 2.2870189171559036e-05, - "loss": 0.0293, + "loss": 0.0331, "step": 4160 }, { "epoch": 19.0, - "eval_bertscore_f1": 0.8820803382360214, - "eval_bleu": 0.6120517633158683, - "eval_loss": 0.03010978177189827, - "eval_rougeL": 0.3162528158829476, - "eval_runtime": 85.0314, - "eval_samples_per_second": 17.676, - "eval_steps_per_second": 1.105, + "eval_bertscore_f1": 0.8795998138502606, + "eval_bleu": 0.7183983246376581, + "eval_loss": 0.04113374277949333, + "eval_rougeL": 0.3053785040223903, + "eval_runtime": 64.8761, + "eval_samples_per_second": 23.167, + "eval_steps_per_second": 1.449, "step": 4161 }, { "epoch": 19.041095890410958, - "grad_norm": 0.06922904402017593, + "grad_norm": 0.2786708176136017, "learning_rate": 2.2804957599478148e-05, - "loss": 0.0306, + "loss": 0.0326, "step": 4170 }, { "epoch": 19.08675799086758, - "grad_norm": 0.07178625464439392, + "grad_norm": 0.17057956755161285, "learning_rate": 2.2739726027397263e-05, - "loss": 0.0296, + "loss": 0.0306, "step": 4180 }, { "epoch": 19.1324200913242, - "grad_norm": 0.18297427892684937, + "grad_norm": 0.15491874516010284, "learning_rate": 2.2674494455316376e-05, - "loss": 0.0277, + "loss": 0.0313, "step": 4190 }, { "epoch": 19.17808219178082, - "grad_norm": 0.08464095741510391, + "grad_norm": 0.319887638092041, "learning_rate": 2.2609262883235488e-05, - "loss": 0.0276, + "loss": 0.0316, "step": 4200 }, { "epoch": 19.22374429223744, - "grad_norm": 0.09229449927806854, + "grad_norm": 0.18852587044239044, "learning_rate": 2.25440313111546e-05, - "loss": 0.0269, + "loss": 0.0299, "step": 4210 }, { "epoch": 19.269406392694062, - "grad_norm": 0.05837394297122955, + "grad_norm": 0.17552684247493744, "learning_rate": 2.2478799739073715e-05, - "loss": 0.0292, + "loss": 0.0306, "step": 4220 }, { "epoch": 19.315068493150687, - "grad_norm": 0.0631113052368164, + "grad_norm": 0.25381070375442505, "learning_rate": 2.2413568166992828e-05, - "loss": 0.0316, + "loss": 0.0325, "step": 4230 }, { "epoch": 19.360730593607308, - "grad_norm": 0.06033403053879738, + "grad_norm": 0.23695605993270874, "learning_rate": 2.234833659491194e-05, - "loss": 0.0341, + "loss": 0.0317, "step": 4240 }, { "epoch": 19.40639269406393, - "grad_norm": 0.04350203275680542, + "grad_norm": 0.15467983484268188, "learning_rate": 2.2283105022831052e-05, - "loss": 0.0261, + "loss": 0.0307, "step": 4250 }, { "epoch": 19.45205479452055, - "grad_norm": 0.07761911302804947, + "grad_norm": 0.19283008575439453, "learning_rate": 2.2217873450750164e-05, - "loss": 0.0331, + "loss": 0.0312, "step": 4260 }, { "epoch": 19.49771689497717, - "grad_norm": 0.06852933019399643, + "grad_norm": 0.130938321352005, "learning_rate": 2.215264187866928e-05, - "loss": 0.0302, + "loss": 0.0304, "step": 4270 }, { "epoch": 19.54337899543379, - "grad_norm": 0.06854939460754395, + "grad_norm": 0.17690137028694153, "learning_rate": 2.2087410306588392e-05, - "loss": 0.0349, + "loss": 0.0326, "step": 4280 }, { "epoch": 19.589041095890412, - "grad_norm": 0.12086188793182373, + "grad_norm": 0.15000636875629425, "learning_rate": 2.2022178734507504e-05, - "loss": 0.0291, + "loss": 0.0299, "step": 4290 }, { "epoch": 19.634703196347033, - "grad_norm": 0.06224190816283226, + "grad_norm": 0.19072392582893372, "learning_rate": 2.1956947162426616e-05, - "loss": 0.0264, + "loss": 0.0311, "step": 4300 }, { "epoch": 19.680365296803654, - "grad_norm": 0.06803935021162033, + "grad_norm": 0.24786625802516937, "learning_rate": 2.189171559034573e-05, - "loss": 0.0294, + "loss": 0.0325, "step": 4310 }, { "epoch": 19.726027397260275, - "grad_norm": 0.26598790287971497, + "grad_norm": 1.5711880922317505, "learning_rate": 2.1826484018264844e-05, - "loss": 0.0318, + "loss": 0.0354, "step": 4320 }, { "epoch": 19.771689497716896, - "grad_norm": 0.05770856514573097, + "grad_norm": 0.2299468070268631, "learning_rate": 2.1761252446183956e-05, - "loss": 0.0285, + "loss": 0.0308, "step": 4330 }, { "epoch": 19.817351598173516, - "grad_norm": 0.06879496574401855, + "grad_norm": 0.14930671453475952, "learning_rate": 2.1696020874103068e-05, - "loss": 0.0279, + "loss": 0.0295, "step": 4340 }, { "epoch": 19.863013698630137, - "grad_norm": 0.14509794116020203, + "grad_norm": 0.263234406709671, "learning_rate": 2.163078930202218e-05, - "loss": 0.0309, + "loss": 0.0305, "step": 4350 }, { "epoch": 19.908675799086758, - "grad_norm": 0.0720428079366684, + "grad_norm": 0.2260236144065857, "learning_rate": 2.1565557729941293e-05, - "loss": 0.0289, + "loss": 0.0332, "step": 4360 }, { "epoch": 19.95433789954338, - "grad_norm": 0.060434482991695404, + "grad_norm": 0.23456306755542755, "learning_rate": 2.1500326157860408e-05, - "loss": 0.0315, + "loss": 0.0303, "step": 4370 }, { "epoch": 20.0, - "grad_norm": 0.06360732018947601, + "grad_norm": 0.18612632155418396, "learning_rate": 2.143509458577952e-05, - "loss": 0.0296, + "loss": 0.0305, "step": 4380 }, { "epoch": 20.0, - "eval_bertscore_f1": 0.8803962610280284, - "eval_bleu": 0.663448215955616, - "eval_loss": 0.030089378356933594, - "eval_rougeL": 0.31135197769275447, - "eval_runtime": 85.3087, - "eval_samples_per_second": 17.618, - "eval_steps_per_second": 1.102, + "eval_bertscore_f1": 0.8794884783223876, + "eval_bleu": 0.7197986607742484, + "eval_loss": 0.04177827760577202, + "eval_rougeL": 0.30584228187550916, + "eval_runtime": 64.9233, + "eval_samples_per_second": 23.15, + "eval_steps_per_second": 1.448, "step": 4380 }, { "epoch": 20.04566210045662, - "grad_norm": 0.04677814245223999, + "grad_norm": 1.3157190084457397, "learning_rate": 2.1369863013698632e-05, - "loss": 0.0296, + "loss": 0.0319, "step": 4390 }, { "epoch": 20.091324200913242, - "grad_norm": 0.054254136979579926, + "grad_norm": 0.2511720061302185, "learning_rate": 2.1304631441617745e-05, - "loss": 0.029, + "loss": 0.0316, "step": 4400 }, { "epoch": 20.136986301369863, - "grad_norm": 0.0981835126876831, + "grad_norm": 0.15252014994621277, "learning_rate": 2.1239399869536857e-05, - "loss": 0.0307, + "loss": 0.0304, "step": 4410 }, { "epoch": 20.182648401826484, - "grad_norm": 0.11695002764463425, + "grad_norm": 0.27835097908973694, "learning_rate": 2.1174168297455972e-05, - "loss": 0.0305, + "loss": 0.0318, "step": 4420 }, { "epoch": 20.228310502283104, - "grad_norm": 0.07786712050437927, + "grad_norm": 0.1549067199230194, "learning_rate": 2.1108936725375085e-05, - "loss": 0.0276, + "loss": 0.0317, "step": 4430 }, { "epoch": 20.273972602739725, - "grad_norm": 0.07433762401342392, + "grad_norm": 0.28859424591064453, "learning_rate": 2.1043705153294197e-05, - "loss": 0.0311, + "loss": 0.0321, "step": 4440 }, { "epoch": 20.319634703196346, - "grad_norm": 0.13224515318870544, + "grad_norm": 0.18468542397022247, "learning_rate": 2.097847358121331e-05, - "loss": 0.0305, + "loss": 0.0319, "step": 4450 }, { "epoch": 20.365296803652967, - "grad_norm": 0.07441609352827072, + "grad_norm": 0.14522314071655273, "learning_rate": 2.091324200913242e-05, - "loss": 0.0296, + "loss": 0.0298, "step": 4460 }, { "epoch": 20.410958904109588, - "grad_norm": 0.17592406272888184, + "grad_norm": 0.2167425900697708, "learning_rate": 2.0848010437051537e-05, - "loss": 0.0298, + "loss": 0.0301, "step": 4470 }, { "epoch": 20.45662100456621, - "grad_norm": 0.0610370934009552, + "grad_norm": 0.17523987591266632, "learning_rate": 2.078277886497065e-05, - "loss": 0.0273, + "loss": 0.0299, "step": 4480 }, { "epoch": 20.50228310502283, - "grad_norm": 0.09273794293403625, + "grad_norm": 0.2222181111574173, "learning_rate": 2.071754729288976e-05, - "loss": 0.0292, + "loss": 0.0318, "step": 4490 }, { "epoch": 20.54794520547945, - "grad_norm": 0.07045309990644455, + "grad_norm": 0.15766695141792297, "learning_rate": 2.0652315720808873e-05, - "loss": 0.031, + "loss": 0.0315, "step": 4500 }, { "epoch": 20.59360730593607, - "grad_norm": 0.09215089678764343, + "grad_norm": 0.23727627098560333, "learning_rate": 2.0587084148727985e-05, - "loss": 0.0275, + "loss": 0.0293, "step": 4510 }, { "epoch": 20.639269406392692, - "grad_norm": 0.1282932609319687, + "grad_norm": 0.18761341273784637, "learning_rate": 2.05218525766471e-05, - "loss": 0.0289, + "loss": 0.0298, "step": 4520 }, { "epoch": 20.684931506849313, - "grad_norm": 0.05365551635622978, + "grad_norm": 0.22776687145233154, "learning_rate": 2.0456621004566213e-05, - "loss": 0.0294, + "loss": 0.0326, "step": 4530 }, { "epoch": 20.730593607305934, - "grad_norm": 0.06082382798194885, + "grad_norm": 0.17745907604694366, "learning_rate": 2.0391389432485325e-05, - "loss": 0.0327, + "loss": 0.0302, "step": 4540 }, { "epoch": 20.77625570776256, - "grad_norm": 0.09889397025108337, + "grad_norm": 0.19407926499843597, "learning_rate": 2.0326157860404437e-05, - "loss": 0.0286, + "loss": 0.0323, "step": 4550 }, { "epoch": 20.82191780821918, - "grad_norm": 0.06901293992996216, + "grad_norm": 0.1550801694393158, "learning_rate": 2.026092628832355e-05, - "loss": 0.0297, + "loss": 0.0301, "step": 4560 }, { "epoch": 20.8675799086758, - "grad_norm": 0.05301612243056297, + "grad_norm": 0.2518145740032196, "learning_rate": 2.0195694716242665e-05, - "loss": 0.0284, + "loss": 0.0313, "step": 4570 }, { "epoch": 20.91324200913242, - "grad_norm": 0.07805132865905762, + "grad_norm": 0.18234769999980927, "learning_rate": 2.0130463144161777e-05, - "loss": 0.0289, + "loss": 0.0313, "step": 4580 }, { "epoch": 20.958904109589042, - "grad_norm": 0.16798809170722961, + "grad_norm": 0.17262135446071625, "learning_rate": 2.006523157208089e-05, - "loss": 0.0293, + "loss": 0.0302, "step": 4590 }, { "epoch": 21.0, - "eval_bertscore_f1": 0.8794706105710027, - "eval_bleu": 0.6654093778968074, - "eval_loss": 0.030202506110072136, - "eval_rougeL": 0.306777092559367, - "eval_runtime": 85.0172, - "eval_samples_per_second": 17.679, - "eval_steps_per_second": 1.106, + "eval_bertscore_f1": 0.8762413332720875, + "eval_bleu": 0.8743328548000596, + "eval_loss": 0.04259444400668144, + "eval_rougeL": 0.29478954803373814, + "eval_runtime": 65.0506, + "eval_samples_per_second": 23.105, + "eval_steps_per_second": 1.445, "step": 4599 }, { "epoch": 21.004566210045663, - "grad_norm": 0.11417368054389954, + "grad_norm": 0.22366954386234283, "learning_rate": 2e-05, - "loss": 0.0266, + "loss": 0.0327, "step": 4600 }, { "epoch": 21.050228310502284, - "grad_norm": 0.12795647978782654, + "grad_norm": 0.19344310462474823, "learning_rate": 1.9934768427919114e-05, - "loss": 0.0325, + "loss": 0.0333, "step": 4610 }, { "epoch": 21.095890410958905, - "grad_norm": 0.15962223708629608, + "grad_norm": 0.18375934660434723, "learning_rate": 1.986953685583823e-05, - "loss": 0.0324, + "loss": 0.0345, "step": 4620 }, { "epoch": 21.141552511415526, - "grad_norm": 0.06648146361112595, + "grad_norm": 0.15804964303970337, "learning_rate": 1.980430528375734e-05, - "loss": 0.0275, + "loss": 0.0291, "step": 4630 }, { "epoch": 21.187214611872147, - "grad_norm": 0.10432185977697372, + "grad_norm": 0.17671050131320953, "learning_rate": 1.9739073711676454e-05, - "loss": 0.0274, + "loss": 0.0316, "step": 4640 }, { "epoch": 21.232876712328768, - "grad_norm": 0.05011922866106033, + "grad_norm": 0.6683818101882935, "learning_rate": 1.9673842139595566e-05, - "loss": 0.0324, + "loss": 0.03, "step": 4650 }, { "epoch": 21.27853881278539, - "grad_norm": 0.07254044711589813, + "grad_norm": 0.16015271842479706, "learning_rate": 1.9608610567514678e-05, - "loss": 0.0271, + "loss": 0.0292, "step": 4660 }, { "epoch": 21.32420091324201, - "grad_norm": 0.279234379529953, + "grad_norm": 0.25762075185775757, "learning_rate": 1.9543378995433794e-05, - "loss": 0.0283, + "loss": 0.0324, "step": 4670 }, { "epoch": 21.36986301369863, - "grad_norm": 0.05222180485725403, + "grad_norm": 0.15417823195457458, "learning_rate": 1.9478147423352906e-05, - "loss": 0.0279, + "loss": 0.0321, "step": 4680 }, { "epoch": 21.41552511415525, - "grad_norm": 0.07844868302345276, + "grad_norm": 0.1617811620235443, "learning_rate": 1.9412915851272018e-05, - "loss": 0.0302, + "loss": 0.0334, "step": 4690 }, { "epoch": 21.461187214611872, - "grad_norm": 0.04424309730529785, + "grad_norm": 0.1719217449426651, "learning_rate": 1.934768427919113e-05, - "loss": 0.0277, + "loss": 0.0294, "step": 4700 }, { "epoch": 21.506849315068493, - "grad_norm": 0.06602602452039719, + "grad_norm": 0.1514827162027359, "learning_rate": 1.9282452707110242e-05, - "loss": 0.0277, + "loss": 0.0304, "step": 4710 }, { "epoch": 21.552511415525114, - "grad_norm": 0.10084933042526245, + "grad_norm": 0.14078788459300995, "learning_rate": 1.9217221135029358e-05, - "loss": 0.0292, + "loss": 0.0302, "step": 4720 }, { "epoch": 21.598173515981735, - "grad_norm": 0.1485004872083664, + "grad_norm": 0.1602380871772766, "learning_rate": 1.915198956294847e-05, - "loss": 0.0312, + "loss": 0.0309, "step": 4730 }, { "epoch": 21.643835616438356, - "grad_norm": 0.1026349812746048, + "grad_norm": 0.23968474566936493, "learning_rate": 1.9086757990867582e-05, - "loss": 0.03, + "loss": 0.031, "step": 4740 }, { "epoch": 21.689497716894977, - "grad_norm": 0.10063963383436203, + "grad_norm": 0.10929176211357117, "learning_rate": 1.9021526418786694e-05, - "loss": 0.0294, + "loss": 0.0293, "step": 4750 }, { "epoch": 21.735159817351597, - "grad_norm": 0.08801602572202682, + "grad_norm": 0.18268796801567078, "learning_rate": 1.8956294846705807e-05, - "loss": 0.0315, + "loss": 0.0319, "step": 4760 }, { "epoch": 21.78082191780822, - "grad_norm": 0.060193344950675964, + "grad_norm": 0.260498583316803, "learning_rate": 1.8891063274624922e-05, - "loss": 0.0274, + "loss": 0.0318, "step": 4770 }, { "epoch": 21.82648401826484, - "grad_norm": 0.08400905132293701, + "grad_norm": 0.18598124384880066, "learning_rate": 1.8825831702544034e-05, - "loss": 0.0307, + "loss": 0.0309, "step": 4780 }, { "epoch": 21.87214611872146, - "grad_norm": 0.07627220451831818, + "grad_norm": 0.14917334914207458, "learning_rate": 1.8760600130463146e-05, - "loss": 0.0268, + "loss": 0.0303, "step": 4790 }, { "epoch": 21.91780821917808, - "grad_norm": 0.11953066289424896, + "grad_norm": 0.17310568690299988, "learning_rate": 1.869536855838226e-05, - "loss": 0.0286, + "loss": 0.0311, "step": 4800 }, { "epoch": 21.963470319634702, - "grad_norm": 0.1060253456234932, + "grad_norm": 0.16899165511131287, "learning_rate": 1.863013698630137e-05, - "loss": 0.0282, + "loss": 0.0302, "step": 4810 }, { "epoch": 22.0, - "eval_bertscore_f1": 0.8792276669167235, - "eval_bleu": 0.7824206736837964, - "eval_loss": 0.029947301372885704, - "eval_rougeL": 0.2971952858857796, - "eval_runtime": 85.058, - "eval_samples_per_second": 17.67, - "eval_steps_per_second": 1.105, + "eval_bertscore_f1": 0.8747094868027363, + "eval_bleu": 0.9393280067525835, + "eval_loss": 0.041989196091890335, + "eval_rougeL": 0.2900498588163605, + "eval_runtime": 65.3354, + "eval_samples_per_second": 23.004, + "eval_steps_per_second": 1.439, "step": 4818 }, { "epoch": 22.009132420091323, - "grad_norm": 0.06057864427566528, + "grad_norm": 0.19090060889720917, "learning_rate": 1.8564905414220486e-05, - "loss": 0.0275, + "loss": 0.0354, "step": 4820 }, { "epoch": 22.054794520547944, - "grad_norm": 0.0509127639234066, + "grad_norm": 0.19969524443149567, "learning_rate": 1.84996738421396e-05, - "loss": 0.0277, + "loss": 0.0299, "step": 4830 }, { "epoch": 22.100456621004565, - "grad_norm": 0.3389517068862915, + "grad_norm": 0.2311638593673706, "learning_rate": 1.843444227005871e-05, - "loss": 0.0297, + "loss": 0.0301, "step": 4840 }, { "epoch": 22.146118721461185, - "grad_norm": 0.10847995430231094, + "grad_norm": 0.1671016663312912, "learning_rate": 1.8369210697977823e-05, - "loss": 0.0292, + "loss": 0.0302, "step": 4850 }, { "epoch": 22.19178082191781, - "grad_norm": 0.06611143052577972, + "grad_norm": 0.14974653720855713, "learning_rate": 1.8303979125896935e-05, - "loss": 0.0288, + "loss": 0.0296, "step": 4860 }, { "epoch": 22.23744292237443, - "grad_norm": 0.19594036042690277, + "grad_norm": 0.20775803923606873, "learning_rate": 1.823874755381605e-05, - "loss": 0.0319, + "loss": 0.0325, "step": 4870 }, { "epoch": 22.28310502283105, - "grad_norm": 0.18287107348442078, + "grad_norm": 0.18768472969532013, "learning_rate": 1.8173515981735163e-05, - "loss": 0.0346, + "loss": 0.0317, "step": 4880 }, { "epoch": 22.328767123287673, - "grad_norm": 0.06755329668521881, + "grad_norm": 0.18841847777366638, "learning_rate": 1.8108284409654275e-05, - "loss": 0.0297, + "loss": 0.0286, "step": 4890 }, { "epoch": 22.374429223744293, - "grad_norm": 0.062369752675294876, + "grad_norm": 0.15577343106269836, "learning_rate": 1.8043052837573387e-05, - "loss": 0.0283, + "loss": 0.0312, "step": 4900 }, { "epoch": 22.420091324200914, - "grad_norm": 0.05487988516688347, + "grad_norm": 0.17433637380599976, "learning_rate": 1.79778212654925e-05, - "loss": 0.0271, + "loss": 0.0303, "step": 4910 }, { "epoch": 22.465753424657535, - "grad_norm": 0.2627151906490326, + "grad_norm": 0.1529913693666458, "learning_rate": 1.7912589693411615e-05, - "loss": 0.0279, + "loss": 0.0295, "step": 4920 }, { "epoch": 22.511415525114156, - "grad_norm": 0.053799696266651154, + "grad_norm": 0.14870087802410126, "learning_rate": 1.7847358121330727e-05, - "loss": 0.0267, + "loss": 0.0316, "step": 4930 }, { "epoch": 22.557077625570777, - "grad_norm": 0.21573284268379211, + "grad_norm": 0.12427645176649094, "learning_rate": 1.778212654924984e-05, - "loss": 0.0293, + "loss": 0.0279, "step": 4940 }, { "epoch": 22.602739726027398, - "grad_norm": 0.07190805673599243, + "grad_norm": 0.2489793747663498, "learning_rate": 1.771689497716895e-05, - "loss": 0.0265, + "loss": 0.032, "step": 4950 }, { "epoch": 22.64840182648402, - "grad_norm": 0.12876766920089722, + "grad_norm": 0.15614229440689087, "learning_rate": 1.7651663405088064e-05, - "loss": 0.0322, + "loss": 0.0303, "step": 4960 }, { "epoch": 22.69406392694064, - "grad_norm": 0.05925724655389786, + "grad_norm": 0.4469321072101593, "learning_rate": 1.758643183300718e-05, - "loss": 0.0287, + "loss": 0.0314, "step": 4970 }, { "epoch": 22.73972602739726, - "grad_norm": 0.24361100792884827, + "grad_norm": 0.123336561024189, "learning_rate": 1.752120026092629e-05, - "loss": 0.0279, + "loss": 0.0303, "step": 4980 }, { "epoch": 22.78538812785388, - "grad_norm": 0.151267409324646, + "grad_norm": 0.1903766691684723, "learning_rate": 1.7455968688845403e-05, - "loss": 0.0268, + "loss": 0.03, "step": 4990 }, { "epoch": 22.831050228310502, - "grad_norm": 0.05790381506085396, + "grad_norm": 0.1910991221666336, "learning_rate": 1.7390737116764516e-05, - "loss": 0.0291, + "loss": 0.0297, "step": 5000 }, { "epoch": 22.876712328767123, - "grad_norm": 0.05819353461265564, + "grad_norm": 0.16593395173549652, "learning_rate": 1.7325505544683628e-05, - "loss": 0.0279, + "loss": 0.0309, "step": 5010 }, { "epoch": 22.922374429223744, - "grad_norm": 0.10005196928977966, + "grad_norm": 0.19862234592437744, "learning_rate": 1.7260273972602743e-05, - "loss": 0.026, + "loss": 0.0294, "step": 5020 }, { "epoch": 22.968036529680365, - "grad_norm": 0.13961489498615265, + "grad_norm": 0.5439543128013611, "learning_rate": 1.7195042400521855e-05, - "loss": 0.0317, + "loss": 0.031, "step": 5030 }, { "epoch": 23.0, - "eval_bertscore_f1": 0.8818914938274417, - "eval_bleu": 0.6471478207401697, - "eval_loss": 0.029684867709875107, - "eval_rougeL": 0.31794502636592814, - "eval_runtime": 86.5126, - "eval_samples_per_second": 17.373, - "eval_steps_per_second": 1.087, + "eval_bertscore_f1": 0.8795151574089776, + "eval_bleu": 0.726659433737333, + "eval_loss": 0.041650936007499695, + "eval_rougeL": 0.3058748502333608, + "eval_runtime": 65.391, + "eval_samples_per_second": 22.985, + "eval_steps_per_second": 1.438, "step": 5037 }, { "epoch": 23.013698630136986, - "grad_norm": 0.04921013489365578, + "grad_norm": 0.16144615411758423, "learning_rate": 1.7129810828440968e-05, - "loss": 0.0301, + "loss": 0.0308, "step": 5040 }, { "epoch": 23.059360730593607, - "grad_norm": 0.11158487945795059, + "grad_norm": 0.21443626284599304, "learning_rate": 1.706457925636008e-05, - "loss": 0.0273, + "loss": 0.0318, "step": 5050 }, { "epoch": 23.105022831050228, - "grad_norm": 0.05320843309164047, + "grad_norm": 0.1895105540752411, "learning_rate": 1.6999347684279192e-05, - "loss": 0.0283, + "loss": 0.0311, "step": 5060 }, { "epoch": 23.15068493150685, - "grad_norm": 0.08403103053569794, + "grad_norm": 0.17296628654003143, "learning_rate": 1.6934116112198308e-05, - "loss": 0.0276, + "loss": 0.0297, "step": 5070 }, { "epoch": 23.19634703196347, - "grad_norm": 0.05202396214008331, + "grad_norm": 0.22210587561130524, "learning_rate": 1.686888454011742e-05, - "loss": 0.0268, + "loss": 0.0286, "step": 5080 }, { "epoch": 23.24200913242009, - "grad_norm": 0.07416887581348419, + "grad_norm": 0.1891639083623886, "learning_rate": 1.6803652968036532e-05, - "loss": 0.0289, + "loss": 0.0301, "step": 5090 }, { "epoch": 23.28767123287671, - "grad_norm": 0.08157260715961456, + "grad_norm": 0.17124980688095093, "learning_rate": 1.6738421395955644e-05, - "loss": 0.0273, + "loss": 0.0321, "step": 5100 }, { "epoch": 23.333333333333332, - "grad_norm": 0.115360789000988, + "grad_norm": 0.16418173909187317, "learning_rate": 1.6673189823874756e-05, - "loss": 0.0302, + "loss": 0.0304, "step": 5110 }, { "epoch": 23.378995433789953, - "grad_norm": 0.09168818593025208, + "grad_norm": 0.17459054291248322, "learning_rate": 1.6607958251793872e-05, - "loss": 0.0296, + "loss": 0.0311, "step": 5120 }, { "epoch": 23.424657534246574, - "grad_norm": 0.06763426214456558, + "grad_norm": 0.15232379734516144, "learning_rate": 1.6542726679712984e-05, - "loss": 0.0284, + "loss": 0.0315, "step": 5130 }, { "epoch": 23.470319634703195, - "grad_norm": 0.06270505487918854, + "grad_norm": 0.25604724884033203, "learning_rate": 1.6477495107632096e-05, - "loss": 0.0284, + "loss": 0.0308, "step": 5140 }, { "epoch": 23.515981735159816, - "grad_norm": 0.08347708731889725, + "grad_norm": 0.3925830125808716, "learning_rate": 1.641226353555121e-05, - "loss": 0.0309, + "loss": 0.0283, "step": 5150 }, { "epoch": 23.561643835616437, - "grad_norm": 0.07440556585788727, + "grad_norm": 0.1935456395149231, "learning_rate": 1.634703196347032e-05, - "loss": 0.0266, + "loss": 0.0276, "step": 5160 }, { "epoch": 23.60730593607306, - "grad_norm": 0.07723133265972137, + "grad_norm": 0.16616645455360413, "learning_rate": 1.6281800391389436e-05, - "loss": 0.0286, + "loss": 0.0309, "step": 5170 }, { "epoch": 23.652968036529682, - "grad_norm": 0.08790794014930725, + "grad_norm": 0.19446122646331787, "learning_rate": 1.6216568819308548e-05, - "loss": 0.0324, + "loss": 0.0291, "step": 5180 }, { "epoch": 23.698630136986303, - "grad_norm": 0.04647298902273178, + "grad_norm": 0.20272910594940186, "learning_rate": 1.615133724722766e-05, - "loss": 0.0287, + "loss": 0.0314, "step": 5190 }, { "epoch": 23.744292237442924, - "grad_norm": 0.07351569831371307, + "grad_norm": 0.21669697761535645, "learning_rate": 1.6086105675146773e-05, - "loss": 0.0278, + "loss": 0.0322, "step": 5200 }, { "epoch": 23.789954337899545, - "grad_norm": 0.055103108286857605, + "grad_norm": 0.19931752979755402, "learning_rate": 1.6020874103065885e-05, - "loss": 0.0282, + "loss": 0.0296, "step": 5210 }, { "epoch": 23.835616438356166, - "grad_norm": 0.12256161123514175, + "grad_norm": 0.20344951748847961, "learning_rate": 1.5955642530985e-05, - "loss": 0.0321, + "loss": 0.0296, "step": 5220 }, { "epoch": 23.881278538812786, - "grad_norm": 0.062129609286785126, + "grad_norm": 0.15517154335975647, "learning_rate": 1.5890410958904112e-05, - "loss": 0.0281, + "loss": 0.0309, "step": 5230 }, { "epoch": 23.926940639269407, - "grad_norm": 0.057793620973825455, + "grad_norm": 0.22952044010162354, "learning_rate": 1.5825179386823225e-05, - "loss": 0.0287, + "loss": 0.0312, "step": 5240 }, { "epoch": 23.972602739726028, - "grad_norm": 0.068938247859478, + "grad_norm": 0.2000703364610672, "learning_rate": 1.5759947814742337e-05, - "loss": 0.0282, + "loss": 0.0321, "step": 5250 }, { "epoch": 24.0, - "eval_bertscore_f1": 0.8786919926375607, - "eval_bleu": 0.6852253219479566, - "eval_loss": 0.02980269491672516, - "eval_rougeL": 0.3053440930561995, - "eval_runtime": 85.9989, - "eval_samples_per_second": 17.477, - "eval_steps_per_second": 1.093, + "eval_bertscore_f1": 0.8752217368927306, + "eval_bleu": 0.931234723225367, + "eval_loss": 0.04182644933462143, + "eval_rougeL": 0.29302986167189593, + "eval_runtime": 64.8994, + "eval_samples_per_second": 23.159, + "eval_steps_per_second": 1.448, "step": 5256 }, { "epoch": 24.01826484018265, - "grad_norm": 0.06383677572011948, + "grad_norm": 0.12788528203964233, "learning_rate": 1.569471624266145e-05, - "loss": 0.027, + "loss": 0.0288, "step": 5260 }, { "epoch": 24.06392694063927, - "grad_norm": 0.26667171716690063, + "grad_norm": 0.1858115792274475, "learning_rate": 1.5629484670580565e-05, - "loss": 0.0288, + "loss": 0.0319, "step": 5270 }, { "epoch": 24.10958904109589, - "grad_norm": 0.0889834463596344, + "grad_norm": 0.20606492459774017, "learning_rate": 1.5564253098499677e-05, - "loss": 0.0287, + "loss": 0.0327, "step": 5280 }, { "epoch": 24.15525114155251, - "grad_norm": 0.06234045326709747, + "grad_norm": 0.1898220181465149, "learning_rate": 1.549902152641879e-05, - "loss": 0.0287, + "loss": 0.0307, "step": 5290 }, { "epoch": 24.200913242009133, - "grad_norm": 0.05488771200180054, + "grad_norm": 0.20800863206386566, "learning_rate": 1.54337899543379e-05, - "loss": 0.0276, + "loss": 0.0316, "step": 5300 }, { "epoch": 24.246575342465754, - "grad_norm": 0.05854570493102074, + "grad_norm": 0.1523219794034958, "learning_rate": 1.5368558382257013e-05, - "loss": 0.0292, + "loss": 0.0298, "step": 5310 }, { "epoch": 24.292237442922374, - "grad_norm": 0.07843585312366486, + "grad_norm": 0.1901863068342209, "learning_rate": 1.530332681017613e-05, - "loss": 0.0262, + "loss": 0.0298, "step": 5320 }, { "epoch": 24.337899543378995, - "grad_norm": 0.1690542995929718, + "grad_norm": 0.212814062833786, "learning_rate": 1.5238095238095241e-05, - "loss": 0.0292, + "loss": 0.0322, "step": 5330 }, { "epoch": 24.383561643835616, - "grad_norm": 0.05105036869645119, + "grad_norm": 0.17524544894695282, "learning_rate": 1.5172863666014353e-05, - "loss": 0.0256, + "loss": 0.0313, "step": 5340 }, { "epoch": 24.429223744292237, - "grad_norm": 0.05323030427098274, + "grad_norm": 0.10430175811052322, "learning_rate": 1.5107632093933465e-05, - "loss": 0.031, + "loss": 0.0292, "step": 5350 }, { "epoch": 24.474885844748858, - "grad_norm": 0.29972654581069946, + "grad_norm": 0.17600584030151367, "learning_rate": 1.504240052185258e-05, - "loss": 0.0295, + "loss": 0.0291, "step": 5360 }, { "epoch": 24.52054794520548, - "grad_norm": 0.13861894607543945, + "grad_norm": 0.18452323973178864, "learning_rate": 1.497716894977169e-05, - "loss": 0.029, + "loss": 0.0285, "step": 5370 }, { "epoch": 24.5662100456621, - "grad_norm": 0.11305614560842514, + "grad_norm": 0.19338224828243256, "learning_rate": 1.4911937377690802e-05, - "loss": 0.0296, + "loss": 0.0309, "step": 5380 }, { "epoch": 24.61187214611872, - "grad_norm": 0.06037652865052223, + "grad_norm": 0.10477393120527267, "learning_rate": 1.4846705805609914e-05, - "loss": 0.0278, + "loss": 0.0275, "step": 5390 }, { "epoch": 24.65753424657534, - "grad_norm": 0.09368506073951721, + "grad_norm": 0.15825144946575165, "learning_rate": 1.4781474233529028e-05, - "loss": 0.0259, + "loss": 0.0307, "step": 5400 }, { "epoch": 24.703196347031962, - "grad_norm": 0.16461175680160522, + "grad_norm": 0.1530427634716034, "learning_rate": 1.471624266144814e-05, - "loss": 0.0288, + "loss": 0.0282, "step": 5410 }, { "epoch": 24.748858447488583, - "grad_norm": 0.1278267502784729, + "grad_norm": 0.17281366884708405, "learning_rate": 1.4651011089367254e-05, - "loss": 0.0303, + "loss": 0.0298, "step": 5420 }, { "epoch": 24.794520547945204, - "grad_norm": 0.0516529381275177, + "grad_norm": 0.22076232731342316, "learning_rate": 1.4585779517286366e-05, - "loss": 0.0296, + "loss": 0.0294, "step": 5430 }, { "epoch": 24.840182648401825, - "grad_norm": 0.05440378189086914, + "grad_norm": 0.16596472263336182, "learning_rate": 1.4520547945205478e-05, - "loss": 0.0285, + "loss": 0.0307, "step": 5440 }, { "epoch": 24.885844748858446, - "grad_norm": 0.053505733609199524, + "grad_norm": 0.22020399570465088, "learning_rate": 1.4455316373124592e-05, - "loss": 0.031, + "loss": 0.0286, "step": 5450 }, { "epoch": 24.931506849315067, - "grad_norm": 0.0745423436164856, + "grad_norm": 0.15291544795036316, "learning_rate": 1.4390084801043704e-05, - "loss": 0.0287, + "loss": 0.0292, "step": 5460 }, { "epoch": 24.977168949771688, - "grad_norm": 0.0573294572532177, + "grad_norm": 0.23781470954418182, "learning_rate": 1.4324853228962818e-05, - "loss": 0.0292, + "loss": 0.0309, "step": 5470 }, { "epoch": 25.0, - "eval_bertscore_f1": 0.882176701418178, - "eval_bleu": 0.6490651628544384, - "eval_loss": 0.02972874790430069, - "eval_rougeL": 0.3165639622967543, - "eval_runtime": 85.5868, - "eval_samples_per_second": 17.561, - "eval_steps_per_second": 1.098, + "eval_bertscore_f1": 0.8788308417567078, + "eval_bleu": 0.7686779949835236, + "eval_loss": 0.04270849749445915, + "eval_rougeL": 0.3022423509007887, + "eval_runtime": 64.9101, + "eval_samples_per_second": 23.155, + "eval_steps_per_second": 1.448, "step": 5475 }, { "epoch": 25.022831050228312, - "grad_norm": 0.057134486734867096, + "grad_norm": 0.20918726921081543, "learning_rate": 1.425962165688193e-05, - "loss": 0.0276, + "loss": 0.0306, "step": 5480 }, { "epoch": 25.068493150684933, - "grad_norm": 0.06873016059398651, + "grad_norm": 0.11463820934295654, "learning_rate": 1.4194390084801042e-05, - "loss": 0.029, + "loss": 0.0299, "step": 5490 }, { "epoch": 25.114155251141554, - "grad_norm": 0.06870689243078232, + "grad_norm": 0.1545298844575882, "learning_rate": 1.4129158512720156e-05, - "loss": 0.0287, + "loss": 0.0311, "step": 5500 }, { "epoch": 25.159817351598175, - "grad_norm": 0.0451415553689003, + "grad_norm": 0.13323558866977692, "learning_rate": 1.4063926940639269e-05, - "loss": 0.028, + "loss": 0.0297, "step": 5510 }, { "epoch": 25.205479452054796, - "grad_norm": 0.051221951842308044, + "grad_norm": 0.16467635333538055, "learning_rate": 1.3998695368558382e-05, - "loss": 0.0263, + "loss": 0.0313, "step": 5520 }, { "epoch": 25.251141552511417, - "grad_norm": 0.4794943928718567, + "grad_norm": 0.19874680042266846, "learning_rate": 1.3933463796477495e-05, - "loss": 0.031, + "loss": 0.0311, "step": 5530 }, { "epoch": 25.296803652968038, - "grad_norm": 0.08168008923530579, + "grad_norm": 0.1181570440530777, "learning_rate": 1.3868232224396607e-05, - "loss": 0.0288, + "loss": 0.0284, "step": 5540 }, { "epoch": 25.34246575342466, - "grad_norm": 0.10532938688993454, + "grad_norm": 0.13484494388103485, "learning_rate": 1.380300065231572e-05, - "loss": 0.028, + "loss": 0.0321, "step": 5550 }, { "epoch": 25.38812785388128, - "grad_norm": 0.06582871824502945, + "grad_norm": 0.17654231190681458, "learning_rate": 1.3737769080234833e-05, - "loss": 0.0282, + "loss": 0.0297, "step": 5560 }, { "epoch": 25.4337899543379, - "grad_norm": 0.07675176113843918, + "grad_norm": 0.3322019875049591, "learning_rate": 1.3672537508153947e-05, - "loss": 0.0275, + "loss": 0.0297, "step": 5570 }, { "epoch": 25.47945205479452, - "grad_norm": 0.25649493932724, + "grad_norm": 0.12533101439476013, "learning_rate": 1.3607305936073059e-05, - "loss": 0.0308, + "loss": 0.0332, "step": 5580 }, { "epoch": 25.525114155251142, - "grad_norm": 0.081456758081913, + "grad_norm": 0.15482525527477264, "learning_rate": 1.3542074363992171e-05, - "loss": 0.027, + "loss": 0.0292, "step": 5590 }, { "epoch": 25.570776255707763, - "grad_norm": 0.14920422434806824, + "grad_norm": 0.16881971061229706, "learning_rate": 1.3476842791911285e-05, - "loss": 0.0275, + "loss": 0.0313, "step": 5600 }, { "epoch": 25.616438356164384, - "grad_norm": 0.08439820259809494, + "grad_norm": 0.17732620239257812, "learning_rate": 1.3411611219830397e-05, - "loss": 0.0269, + "loss": 0.0293, "step": 5610 }, { "epoch": 25.662100456621005, - "grad_norm": 0.06275507062673569, + "grad_norm": 0.184401273727417, "learning_rate": 1.3346379647749511e-05, - "loss": 0.0274, + "loss": 0.0293, "step": 5620 }, { "epoch": 25.707762557077626, - "grad_norm": 0.05262503772974014, + "grad_norm": 0.16654548048973083, "learning_rate": 1.3281148075668623e-05, - "loss": 0.0277, + "loss": 0.0295, "step": 5630 }, { "epoch": 25.753424657534246, - "grad_norm": 0.09741270542144775, + "grad_norm": 0.19625544548034668, "learning_rate": 1.3215916503587735e-05, - "loss": 0.0287, + "loss": 0.0299, "step": 5640 }, { "epoch": 25.799086757990867, - "grad_norm": 0.0535399504005909, + "grad_norm": 0.18322430551052094, "learning_rate": 1.3150684931506849e-05, - "loss": 0.0271, + "loss": 0.0306, "step": 5650 }, { "epoch": 25.84474885844749, - "grad_norm": 0.09080829471349716, + "grad_norm": 0.16067004203796387, "learning_rate": 1.3085453359425961e-05, - "loss": 0.0293, + "loss": 0.0303, "step": 5660 }, { "epoch": 25.89041095890411, - "grad_norm": 0.08309127390384674, + "grad_norm": 0.13349705934524536, "learning_rate": 1.3020221787345075e-05, - "loss": 0.0304, + "loss": 0.0279, "step": 5670 }, { "epoch": 25.93607305936073, - "grad_norm": 0.09403306990861893, + "grad_norm": 0.19681212306022644, "learning_rate": 1.2954990215264187e-05, - "loss": 0.0282, + "loss": 0.0304, "step": 5680 }, { "epoch": 25.98173515981735, - "grad_norm": 0.06915637105703354, + "grad_norm": 0.305829256772995, "learning_rate": 1.28897586431833e-05, - "loss": 0.0288, + "loss": 0.0292, "step": 5690 }, { "epoch": 26.0, - "eval_bertscore_f1": 0.8812559430232781, - "eval_bleu": 0.6023790904677201, - "eval_loss": 0.029696911573410034, - "eval_rougeL": 0.3190547097361511, - "eval_runtime": 85.4112, - "eval_samples_per_second": 17.597, - "eval_steps_per_second": 1.101, + "eval_bertscore_f1": 0.8774845823794306, + "eval_bleu": 0.6514875461109689, + "eval_loss": 0.042685650289058685, + "eval_rougeL": 0.3085123872128879, + "eval_runtime": 65.3563, + "eval_samples_per_second": 22.997, + "eval_steps_per_second": 1.438, "step": 5694 }, { "epoch": 26.027397260273972, - "grad_norm": 0.06425880640745163, + "grad_norm": 0.227409228682518, "learning_rate": 1.2824527071102413e-05, - "loss": 0.0267, + "loss": 0.0299, "step": 5700 }, { "epoch": 26.073059360730593, - "grad_norm": 0.04396981745958328, + "grad_norm": 0.17818889021873474, "learning_rate": 1.2759295499021525e-05, - "loss": 0.0268, + "loss": 0.03, "step": 5710 }, { "epoch": 26.118721461187214, - "grad_norm": 0.05449533835053444, + "grad_norm": 0.13791914284229279, "learning_rate": 1.269406392694064e-05, - "loss": 0.0269, + "loss": 0.029, "step": 5720 }, { "epoch": 26.164383561643834, - "grad_norm": 0.06189217418432236, + "grad_norm": 0.16791503131389618, "learning_rate": 1.2628832354859752e-05, - "loss": 0.0295, + "loss": 0.0308, "step": 5730 }, { "epoch": 26.210045662100455, - "grad_norm": 0.0630607083439827, + "grad_norm": 0.2646624445915222, "learning_rate": 1.2563600782778864e-05, "loss": 0.0303, "step": 5740 }, { "epoch": 26.255707762557076, - "grad_norm": 0.05802557244896889, + "grad_norm": 0.19949132204055786, "learning_rate": 1.249836921069798e-05, - "loss": 0.0268, + "loss": 0.0304, "step": 5750 }, { "epoch": 26.301369863013697, - "grad_norm": 0.045306626707315445, + "grad_norm": 0.14131474494934082, "learning_rate": 1.2433137638617091e-05, - "loss": 0.0288, + "loss": 0.0304, "step": 5760 }, { "epoch": 26.347031963470318, - "grad_norm": 0.06658345460891724, + "grad_norm": 0.13681089878082275, "learning_rate": 1.2367906066536204e-05, - "loss": 0.0274, + "loss": 0.0293, "step": 5770 }, { "epoch": 26.39269406392694, - "grad_norm": 0.06213853880763054, + "grad_norm": 0.18042202293872833, "learning_rate": 1.2302674494455317e-05, - "loss": 0.028, + "loss": 0.0298, "step": 5780 }, { "epoch": 26.438356164383563, - "grad_norm": 0.07253895699977875, + "grad_norm": 0.10717923194169998, "learning_rate": 1.223744292237443e-05, - "loss": 0.0261, + "loss": 0.0275, "step": 5790 }, { "epoch": 26.484018264840184, - "grad_norm": 0.0809069573879242, + "grad_norm": 0.1440982073545456, "learning_rate": 1.2172211350293543e-05, - "loss": 0.0284, + "loss": 0.0313, "step": 5800 }, { "epoch": 26.529680365296805, - "grad_norm": 0.15386025607585907, + "grad_norm": 0.21595457196235657, "learning_rate": 1.2106979778212656e-05, - "loss": 0.0271, + "loss": 0.0306, "step": 5810 }, { "epoch": 26.575342465753426, - "grad_norm": 0.0711253210902214, + "grad_norm": 0.18737435340881348, "learning_rate": 1.2041748206131768e-05, - "loss": 0.027, + "loss": 0.0295, "step": 5820 }, { "epoch": 26.621004566210047, - "grad_norm": 0.06340400129556656, + "grad_norm": 0.14051635563373566, "learning_rate": 1.1976516634050882e-05, - "loss": 0.0286, + "loss": 0.031, "step": 5830 }, { "epoch": 26.666666666666668, - "grad_norm": 0.06583566963672638, + "grad_norm": 0.23766343295574188, "learning_rate": 1.1911285061969994e-05, - "loss": 0.0265, + "loss": 0.0298, "step": 5840 }, { "epoch": 26.71232876712329, - "grad_norm": 0.09953002631664276, + "grad_norm": 0.19277657568454742, "learning_rate": 1.1846053489889108e-05, - "loss": 0.0299, + "loss": 0.0292, "step": 5850 }, { "epoch": 26.75799086757991, - "grad_norm": 0.10960971564054489, + "grad_norm": 0.2688007354736328, "learning_rate": 1.178082191780822e-05, - "loss": 0.0297, + "loss": 0.0304, "step": 5860 }, { "epoch": 26.80365296803653, - "grad_norm": 0.15185241401195526, + "grad_norm": 0.1929878294467926, "learning_rate": 1.1715590345727332e-05, - "loss": 0.029, + "loss": 0.028, "step": 5870 }, { "epoch": 26.84931506849315, - "grad_norm": 0.07303405553102493, + "grad_norm": 0.15954306721687317, "learning_rate": 1.1650358773646446e-05, - "loss": 0.0269, + "loss": 0.0297, "step": 5880 }, { "epoch": 26.894977168949772, - "grad_norm": 0.12658797204494476, + "grad_norm": 0.19489558041095734, "learning_rate": 1.1585127201565558e-05, - "loss": 0.0284, + "loss": 0.0322, "step": 5890 }, { "epoch": 26.940639269406393, - "grad_norm": 0.05162033811211586, + "grad_norm": 0.1502341479063034, "learning_rate": 1.1519895629484672e-05, - "loss": 0.0279, + "loss": 0.0286, "step": 5900 }, { "epoch": 26.986301369863014, - "grad_norm": 0.0728631243109703, + "grad_norm": 0.2791164219379425, "learning_rate": 1.1454664057403784e-05, - "loss": 0.0266, + "loss": 0.0318, "step": 5910 }, { "epoch": 27.0, - "eval_bertscore_f1": 0.8816696117896679, - "eval_bleu": 0.7002372942228009, - "eval_loss": 0.029564756900072098, - "eval_rougeL": 0.3117885832149029, - "eval_runtime": 85.283, - "eval_samples_per_second": 17.624, - "eval_steps_per_second": 1.102, + "eval_bertscore_f1": 0.8776480831944459, + "eval_bleu": 0.6814688964486258, + "eval_loss": 0.0420895554125309, + "eval_rougeL": 0.30520018097920826, + "eval_runtime": 65.2587, + "eval_samples_per_second": 23.031, + "eval_steps_per_second": 1.44, "step": 5913 }, { "epoch": 27.031963470319635, - "grad_norm": 0.05337512493133545, + "grad_norm": 0.17120176553726196, "learning_rate": 1.1389432485322896e-05, - "loss": 0.03, + "loss": 0.0291, "step": 5920 }, { "epoch": 27.077625570776256, - "grad_norm": 0.08450283855199814, + "grad_norm": 0.15285202860832214, "learning_rate": 1.132420091324201e-05, - "loss": 0.028, + "loss": 0.0287, "step": 5930 }, { "epoch": 27.123287671232877, - "grad_norm": 0.15338727831840515, + "grad_norm": 0.15016716718673706, "learning_rate": 1.1258969341161122e-05, - "loss": 0.028, + "loss": 0.0297, "step": 5940 }, { "epoch": 27.168949771689498, - "grad_norm": 0.13320617377758026, + "grad_norm": 0.16641657054424286, "learning_rate": 1.1193737769080236e-05, - "loss": 0.0283, + "loss": 0.0311, "step": 5950 }, { "epoch": 27.21461187214612, - "grad_norm": 0.13354717195034027, + "grad_norm": 0.24323779344558716, "learning_rate": 1.1128506196999348e-05, - "loss": 0.0293, + "loss": 0.0309, "step": 5960 }, { "epoch": 27.26027397260274, - "grad_norm": 0.06282184273004532, + "grad_norm": 0.18576470017433167, "learning_rate": 1.106327462491846e-05, - "loss": 0.0264, + "loss": 0.0285, "step": 5970 }, { "epoch": 27.30593607305936, - "grad_norm": 0.06496313959360123, + "grad_norm": 0.19552282989025116, "learning_rate": 1.0998043052837574e-05, - "loss": 0.0294, + "loss": 0.0281, "step": 5980 }, { "epoch": 27.35159817351598, - "grad_norm": 0.0739155188202858, + "grad_norm": 0.192937433719635, "learning_rate": 1.0932811480756687e-05, - "loss": 0.0289, + "loss": 0.0292, "step": 5990 }, { "epoch": 27.397260273972602, - "grad_norm": 0.07013906538486481, + "grad_norm": 0.19229499995708466, "learning_rate": 1.08675799086758e-05, - "loss": 0.0301, + "loss": 0.0287, "step": 6000 }, { "epoch": 27.442922374429223, - "grad_norm": 0.27129918336868286, + "grad_norm": 0.2223975658416748, "learning_rate": 1.0802348336594913e-05, - "loss": 0.0269, + "loss": 0.0309, "step": 6010 }, { "epoch": 27.488584474885844, - "grad_norm": 0.11588162928819656, + "grad_norm": 0.41968482732772827, "learning_rate": 1.0737116764514025e-05, - "loss": 0.0276, + "loss": 0.0332, "step": 6020 }, { "epoch": 27.534246575342465, - "grad_norm": 0.0485413484275341, + "grad_norm": 0.36680567264556885, "learning_rate": 1.0671885192433139e-05, - "loss": 0.0278, + "loss": 0.0291, "step": 6030 }, { "epoch": 27.579908675799086, - "grad_norm": 0.1332385092973709, + "grad_norm": 0.1873111128807068, "learning_rate": 1.0606653620352251e-05, - "loss": 0.0283, + "loss": 0.0282, "step": 6040 }, { "epoch": 27.625570776255707, - "grad_norm": 0.04625769704580307, + "grad_norm": 0.21136601269245148, "learning_rate": 1.0541422048271365e-05, - "loss": 0.0295, + "loss": 0.0285, "step": 6050 }, { "epoch": 27.671232876712327, - "grad_norm": 0.0804968923330307, + "grad_norm": 0.15826620161533356, "learning_rate": 1.0476190476190477e-05, - "loss": 0.0268, + "loss": 0.0291, "step": 6060 }, { "epoch": 27.71689497716895, - "grad_norm": 0.07371534407138824, + "grad_norm": 0.16135139763355255, "learning_rate": 1.0410958904109589e-05, - "loss": 0.0272, + "loss": 0.0312, "step": 6070 }, { "epoch": 27.76255707762557, - "grad_norm": 0.09114833176136017, + "grad_norm": 0.18225039541721344, "learning_rate": 1.0345727332028703e-05, - "loss": 0.0283, + "loss": 0.03, "step": 6080 }, { "epoch": 27.80821917808219, - "grad_norm": 0.06806255131959915, + "grad_norm": 0.19162273406982422, "learning_rate": 1.0280495759947815e-05, - "loss": 0.027, + "loss": 0.0285, "step": 6090 }, { "epoch": 27.853881278538815, - "grad_norm": 0.07149802148342133, + "grad_norm": 0.2653762102127075, "learning_rate": 1.0215264187866929e-05, - "loss": 0.0273, + "loss": 0.0318, "step": 6100 }, { "epoch": 27.899543378995435, - "grad_norm": 0.059655074030160904, + "grad_norm": 0.258783757686615, "learning_rate": 1.0150032615786041e-05, - "loss": 0.0266, + "loss": 0.0286, "step": 6110 }, { "epoch": 27.945205479452056, - "grad_norm": 0.060154326260089874, + "grad_norm": 0.18329358100891113, "learning_rate": 1.0084801043705153e-05, - "loss": 0.0241, + "loss": 0.0299, "step": 6120 }, { "epoch": 27.990867579908677, - "grad_norm": 0.06297276169061661, + "grad_norm": 0.19685736298561096, "learning_rate": 1.0019569471624267e-05, - "loss": 0.0265, + "loss": 0.0317, "step": 6130 }, { "epoch": 28.0, - "eval_bertscore_f1": 0.8819132992528077, - "eval_bleu": 0.6419990268369562, - "eval_loss": 0.02953849919140339, - "eval_rougeL": 0.31736312067462313, - "eval_runtime": 85.6257, - "eval_samples_per_second": 17.553, - "eval_steps_per_second": 1.098, + "eval_bertscore_f1": 0.8782506199653991, + "eval_bleu": 0.7977746926944412, + "eval_loss": 0.042202867567539215, + "eval_rougeL": 0.2995364849769159, + "eval_runtime": 65.0364, + "eval_samples_per_second": 23.11, + "eval_steps_per_second": 1.445, "step": 6132 }, { "epoch": 28.036529680365298, - "grad_norm": 0.06080065667629242, + "grad_norm": 0.1952570676803589, "learning_rate": 9.95433789954338e-06, - "loss": 0.0281, + "loss": 0.0308, "step": 6140 }, { "epoch": 28.08219178082192, - "grad_norm": 0.08424234390258789, + "grad_norm": 0.14125581085681915, "learning_rate": 9.889106327462493e-06, - "loss": 0.0284, + "loss": 0.0283, "step": 6150 }, { "epoch": 28.12785388127854, - "grad_norm": 0.19627715647220612, + "grad_norm": 0.1538328379392624, "learning_rate": 9.823874755381605e-06, - "loss": 0.0276, + "loss": 0.0307, "step": 6160 }, { "epoch": 28.17351598173516, - "grad_norm": 0.05729695409536362, + "grad_norm": 0.16571789979934692, "learning_rate": 9.758643183300718e-06, - "loss": 0.0273, + "loss": 0.0306, "step": 6170 }, { "epoch": 28.21917808219178, - "grad_norm": 0.056810833513736725, + "grad_norm": 0.2275710552930832, "learning_rate": 9.693411611219831e-06, - "loss": 0.0272, + "loss": 0.0298, "step": 6180 }, { "epoch": 28.264840182648403, - "grad_norm": 0.3247433602809906, + "grad_norm": 0.16344133019447327, "learning_rate": 9.628180039138944e-06, - "loss": 0.0288, + "loss": 0.0292, "step": 6190 }, { "epoch": 28.310502283105023, - "grad_norm": 0.054678067564964294, + "grad_norm": 0.15537413954734802, "learning_rate": 9.562948467058057e-06, - "loss": 0.0284, + "loss": 0.0297, "step": 6200 }, { "epoch": 28.356164383561644, - "grad_norm": 0.12813326716423035, + "grad_norm": 0.17001962661743164, "learning_rate": 9.49771689497717e-06, - "loss": 0.0277, + "loss": 0.0313, "step": 6210 }, { "epoch": 28.401826484018265, - "grad_norm": 0.09990496188402176, + "grad_norm": 0.17238697409629822, "learning_rate": 9.432485322896282e-06, - "loss": 0.0294, + "loss": 0.0303, "step": 6220 }, { "epoch": 28.447488584474886, - "grad_norm": 0.30804169178009033, + "grad_norm": 0.34354132413864136, "learning_rate": 9.367253750815396e-06, - "loss": 0.0274, + "loss": 0.0289, "step": 6230 }, { "epoch": 28.493150684931507, - "grad_norm": 0.0785110592842102, + "grad_norm": 0.10073091834783554, "learning_rate": 9.302022178734508e-06, - "loss": 0.0279, + "loss": 0.0303, "step": 6240 }, { "epoch": 28.538812785388128, - "grad_norm": 0.07497629523277283, + "grad_norm": 0.1798088103532791, "learning_rate": 9.236790606653622e-06, - "loss": 0.0289, + "loss": 0.0283, "step": 6250 }, { "epoch": 28.58447488584475, - "grad_norm": 0.048099491745233536, + "grad_norm": 0.1783922165632248, "learning_rate": 9.171559034572734e-06, - "loss": 0.0265, + "loss": 0.0294, "step": 6260 }, { "epoch": 28.63013698630137, - "grad_norm": 0.074525848031044, + "grad_norm": 0.177333265542984, "learning_rate": 9.106327462491846e-06, - "loss": 0.0268, + "loss": 0.0308, "step": 6270 }, { "epoch": 28.67579908675799, - "grad_norm": 0.0570547953248024, + "grad_norm": 0.1782556176185608, "learning_rate": 9.04109589041096e-06, - "loss": 0.03, + "loss": 0.0313, "step": 6280 }, { "epoch": 28.72146118721461, - "grad_norm": 0.11205938458442688, + "grad_norm": 0.20494292676448822, "learning_rate": 8.975864318330072e-06, - "loss": 0.0259, + "loss": 0.0272, "step": 6290 }, { "epoch": 28.767123287671232, - "grad_norm": 0.07583874464035034, + "grad_norm": 0.32170209288597107, "learning_rate": 8.910632746249186e-06, - "loss": 0.0273, + "loss": 0.0289, "step": 6300 }, { "epoch": 28.812785388127853, - "grad_norm": 0.04650593921542168, + "grad_norm": 0.14453481137752533, "learning_rate": 8.845401174168298e-06, - "loss": 0.0282, + "loss": 0.0288, "step": 6310 }, { "epoch": 28.858447488584474, - "grad_norm": 0.10401676595211029, + "grad_norm": 0.15864479541778564, "learning_rate": 8.78016960208741e-06, - "loss": 0.0281, + "loss": 0.0294, "step": 6320 }, { "epoch": 28.904109589041095, - "grad_norm": 0.10608301311731339, + "grad_norm": 0.1791839748620987, "learning_rate": 8.714938030006524e-06, - "loss": 0.0264, + "loss": 0.029, "step": 6330 }, { "epoch": 28.949771689497716, - "grad_norm": 0.06047683209180832, + "grad_norm": 0.21326766908168793, "learning_rate": 8.649706457925636e-06, - "loss": 0.0287, + "loss": 0.0288, "step": 6340 }, { "epoch": 28.995433789954337, - "grad_norm": 0.0955701544880867, + "grad_norm": 0.18078750371932983, "learning_rate": 8.58447488584475e-06, - "loss": 0.026, + "loss": 0.0284, "step": 6350 }, { "epoch": 29.0, - "eval_bertscore_f1": 0.8819990824320597, - "eval_bleu": 0.6091330252424353, - "eval_loss": 0.029434066265821457, - "eval_rougeL": 0.319823542874944, - "eval_runtime": 85.5987, - "eval_samples_per_second": 17.559, - "eval_steps_per_second": 1.098, + "eval_bertscore_f1": 0.8773477768311085, + "eval_bleu": 0.8007310925138926, + "eval_loss": 0.04212983325123787, + "eval_rougeL": 0.29792348880513014, + "eval_runtime": 64.8542, + "eval_samples_per_second": 23.175, + "eval_steps_per_second": 1.449, "step": 6351 }, { "epoch": 29.041095890410958, - "grad_norm": 0.047558221966028214, + "grad_norm": 0.3029780983924866, "learning_rate": 8.519243313763862e-06, - "loss": 0.0272, + "loss": 0.0278, "step": 6360 }, { "epoch": 29.08675799086758, - "grad_norm": 0.07335501164197922, + "grad_norm": 0.21092386543750763, "learning_rate": 8.454011741682975e-06, - "loss": 0.0274, + "loss": 0.0302, "step": 6370 }, { "epoch": 29.1324200913242, - "grad_norm": 0.06457233428955078, + "grad_norm": 0.20935019850730896, "learning_rate": 8.388780169602088e-06, - "loss": 0.03, + "loss": 0.0283, "step": 6380 }, { "epoch": 29.17808219178082, - "grad_norm": 0.06746174395084381, + "grad_norm": 0.1683349758386612, "learning_rate": 8.3235485975212e-06, - "loss": 0.031, + "loss": 0.0296, "step": 6390 }, { "epoch": 29.22374429223744, - "grad_norm": 0.0505414679646492, + "grad_norm": 0.15146371722221375, "learning_rate": 8.258317025440314e-06, - "loss": 0.026, + "loss": 0.0281, "step": 6400 }, { "epoch": 29.269406392694062, - "grad_norm": 0.0832168459892273, + "grad_norm": 0.1463659703731537, "learning_rate": 8.193085453359427e-06, - "loss": 0.0274, + "loss": 0.0306, "step": 6410 }, { "epoch": 29.315068493150687, - "grad_norm": 0.055076733231544495, + "grad_norm": 0.22994855046272278, "learning_rate": 8.127853881278539e-06, - "loss": 0.0267, + "loss": 0.0326, "step": 6420 }, { "epoch": 29.360730593607308, - "grad_norm": 0.04963900148868561, + "grad_norm": 0.18445008993148804, "learning_rate": 8.062622309197653e-06, - "loss": 0.0267, + "loss": 0.0297, "step": 6430 }, { "epoch": 29.40639269406393, - "grad_norm": 0.06072743609547615, + "grad_norm": 0.18058903515338898, "learning_rate": 7.997390737116765e-06, - "loss": 0.027, + "loss": 0.0302, "step": 6440 }, { "epoch": 29.45205479452055, - "grad_norm": 0.046447575092315674, + "grad_norm": 0.17116139829158783, "learning_rate": 7.932159165035879e-06, - "loss": 0.0252, + "loss": 0.0303, "step": 6450 }, { "epoch": 29.49771689497717, - "grad_norm": 0.08060900121927261, + "grad_norm": 0.18373791873455048, "learning_rate": 7.86692759295499e-06, - "loss": 0.0261, + "loss": 0.0266, "step": 6460 }, { "epoch": 29.54337899543379, - "grad_norm": 0.05984543263912201, + "grad_norm": 0.18657907843589783, "learning_rate": 7.801696020874103e-06, - "loss": 0.026, + "loss": 0.0303, "step": 6470 }, { "epoch": 29.589041095890412, - "grad_norm": 0.044129859656095505, + "grad_norm": 0.17647583782672882, "learning_rate": 7.736464448793217e-06, - "loss": 0.028, + "loss": 0.0277, "step": 6480 }, { "epoch": 29.634703196347033, - "grad_norm": 0.06660878658294678, + "grad_norm": 0.14070016145706177, "learning_rate": 7.671232876712329e-06, - "loss": 0.0281, + "loss": 0.0295, "step": 6490 }, { "epoch": 29.680365296803654, - "grad_norm": 0.0642257034778595, + "grad_norm": 0.1862981617450714, "learning_rate": 7.606001304631442e-06, - "loss": 0.0277, + "loss": 0.0284, "step": 6500 }, { "epoch": 29.726027397260275, - "grad_norm": 0.06701590865850449, + "grad_norm": 0.22975805401802063, "learning_rate": 7.540769732550555e-06, - "loss": 0.0283, + "loss": 0.0292, "step": 6510 }, { "epoch": 29.771689497716896, - "grad_norm": 0.06078553572297096, + "grad_norm": 0.31355002522468567, "learning_rate": 7.475538160469668e-06, - "loss": 0.0285, + "loss": 0.0293, "step": 6520 }, { "epoch": 29.817351598173516, - "grad_norm": 0.07285226136445999, + "grad_norm": 0.18986040353775024, "learning_rate": 7.410306588388781e-06, - "loss": 0.0267, + "loss": 0.0279, "step": 6530 }, { "epoch": 29.863013698630137, - "grad_norm": 0.09485471993684769, + "grad_norm": 0.21212980151176453, "learning_rate": 7.345075016307894e-06, - "loss": 0.028, + "loss": 0.029, "step": 6540 }, { "epoch": 29.908675799086758, - "grad_norm": 0.08331019431352615, + "grad_norm": 0.23085573315620422, "learning_rate": 7.279843444227006e-06, - "loss": 0.028, + "loss": 0.0293, "step": 6550 }, { "epoch": 29.95433789954338, - "grad_norm": 0.06982530653476715, + "grad_norm": 0.19988933205604553, "learning_rate": 7.214611872146119e-06, - "loss": 0.0252, + "loss": 0.0305, "step": 6560 }, { "epoch": 30.0, - "grad_norm": 0.19558538496494293, + "grad_norm": 0.19826553761959076, "learning_rate": 7.149380300065232e-06, - "loss": 0.0297, + "loss": 0.028, "step": 6570 }, { "epoch": 30.0, - "eval_bertscore_f1": 0.8825751674310096, - "eval_bleu": 0.6418448303227442, - "eval_loss": 0.029364319518208504, - "eval_rougeL": 0.3167741253484545, - "eval_runtime": 85.3228, - "eval_samples_per_second": 17.615, - "eval_steps_per_second": 1.102, + "eval_bertscore_f1": 0.8785085355291665, + "eval_bleu": 0.702881238096362, + "eval_loss": 0.04208363965153694, + "eval_rougeL": 0.3061862473643293, + "eval_runtime": 64.8873, + "eval_samples_per_second": 23.163, + "eval_steps_per_second": 1.449, "step": 6570 }, { "epoch": 30.04566210045662, - "grad_norm": 0.25312867760658264, + "grad_norm": 0.14557534456253052, "learning_rate": 7.084148727984345e-06, - "loss": 0.0286, + "loss": 0.0288, "step": 6580 }, { "epoch": 30.091324200913242, - "grad_norm": 0.07719423621892929, + "grad_norm": 0.1587667614221573, "learning_rate": 7.018917155903458e-06, - "loss": 0.0269, + "loss": 0.0296, "step": 6590 }, { "epoch": 30.136986301369863, - "grad_norm": 0.03981988877058029, + "grad_norm": 0.13343170285224915, "learning_rate": 6.9536855838225706e-06, - "loss": 0.0252, + "loss": 0.0289, "step": 6600 }, { "epoch": 30.182648401826484, - "grad_norm": 0.12466205656528473, + "grad_norm": 0.1827743649482727, "learning_rate": 6.8884540117416836e-06, - "loss": 0.0275, + "loss": 0.0289, "step": 6610 }, { "epoch": 30.228310502283104, - "grad_norm": 0.044825151562690735, + "grad_norm": 0.26161623001098633, "learning_rate": 6.823222439660797e-06, - "loss": 0.0269, + "loss": 0.0305, "step": 6620 }, { "epoch": 30.273972602739725, - "grad_norm": 0.11382755637168884, + "grad_norm": 0.1480785757303238, "learning_rate": 6.75799086757991e-06, - "loss": 0.0299, + "loss": 0.0277, "step": 6630 }, { "epoch": 30.319634703196346, - "grad_norm": 0.11189394444227219, + "grad_norm": 0.21986277401447296, "learning_rate": 6.692759295499023e-06, "loss": 0.0301, "step": 6640 }, { "epoch": 30.365296803652967, - "grad_norm": 0.059967365115880966, + "grad_norm": 0.21434344351291656, "learning_rate": 6.627527723418135e-06, - "loss": 0.0273, + "loss": 0.0307, "step": 6650 }, { "epoch": 30.410958904109588, - "grad_norm": 0.04897484555840492, + "grad_norm": 0.1664654165506363, "learning_rate": 6.562296151337248e-06, - "loss": 0.0266, + "loss": 0.0296, "step": 6660 }, { "epoch": 30.45662100456621, - "grad_norm": 0.16777561604976654, + "grad_norm": 0.21170803904533386, "learning_rate": 6.497064579256361e-06, - "loss": 0.0282, + "loss": 0.0281, "step": 6670 }, { "epoch": 30.50228310502283, - "grad_norm": 0.106645368039608, + "grad_norm": 0.2334418147802353, "learning_rate": 6.431833007175474e-06, - "loss": 0.0289, + "loss": 0.0295, "step": 6680 }, { "epoch": 30.54794520547945, - "grad_norm": 0.05931256338953972, + "grad_norm": 0.1512773334980011, "learning_rate": 6.366601435094587e-06, - "loss": 0.0266, + "loss": 0.0293, "step": 6690 }, { "epoch": 30.59360730593607, - "grad_norm": 0.0640476867556572, + "grad_norm": 0.1553216278553009, "learning_rate": 6.301369863013699e-06, - "loss": 0.028, + "loss": 0.0292, "step": 6700 }, { "epoch": 30.639269406392692, - "grad_norm": 0.05507276952266693, + "grad_norm": 0.14663270115852356, "learning_rate": 6.236138290932811e-06, - "loss": 0.0272, + "loss": 0.0277, "step": 6710 }, { "epoch": 30.684931506849313, - "grad_norm": 0.07414695620536804, + "grad_norm": 0.22883833944797516, "learning_rate": 6.170906718851924e-06, - "loss": 0.0286, + "loss": 0.0294, "step": 6720 }, { "epoch": 30.730593607305934, - "grad_norm": 0.08643855154514313, + "grad_norm": 0.19154661893844604, "learning_rate": 6.105675146771037e-06, - "loss": 0.0303, + "loss": 0.0275, "step": 6730 }, { "epoch": 30.77625570776256, - "grad_norm": 0.052528686821460724, + "grad_norm": 0.1744658201932907, "learning_rate": 6.04044357469015e-06, - "loss": 0.0279, + "loss": 0.0282, "step": 6740 }, { "epoch": 30.82191780821918, - "grad_norm": 0.0555759035050869, + "grad_norm": 0.13991741836071014, "learning_rate": 5.975212002609263e-06, - "loss": 0.0266, + "loss": 0.0301, "step": 6750 }, { "epoch": 30.8675799086758, - "grad_norm": 0.046230562031269073, + "grad_norm": 0.2106420397758484, "learning_rate": 5.9099804305283755e-06, - "loss": 0.0274, + "loss": 0.0297, "step": 6760 }, { "epoch": 30.91324200913242, - "grad_norm": 0.07497312128543854, + "grad_norm": 0.1574612706899643, "learning_rate": 5.8447488584474885e-06, - "loss": 0.027, + "loss": 0.0291, "step": 6770 }, { "epoch": 30.958904109589042, - "grad_norm": 0.11832709610462189, + "grad_norm": 0.2075018733739853, "learning_rate": 5.7795172863666015e-06, - "loss": 0.0266, + "loss": 0.0268, "step": 6780 }, { "epoch": 31.0, - "eval_bertscore_f1": 0.8814896751623985, - "eval_bleu": 0.58698815562408, - "eval_loss": 0.029345136135816574, - "eval_rougeL": 0.3197356893416895, - "eval_runtime": 85.6812, - "eval_samples_per_second": 17.542, - "eval_steps_per_second": 1.097, + "eval_bertscore_f1": 0.8759105591399624, + "eval_bleu": 0.6233413223757481, + "eval_loss": 0.04237150773406029, + "eval_rougeL": 0.3068191295790029, + "eval_runtime": 65.4719, + "eval_samples_per_second": 22.956, + "eval_steps_per_second": 1.436, "step": 6789 }, { "epoch": 31.004566210045663, - "grad_norm": 0.06090688705444336, + "grad_norm": 0.2037985920906067, "learning_rate": 5.7142857142857145e-06, - "loss": 0.0272, + "loss": 0.03, "step": 6790 }, { "epoch": 31.050228310502284, - "grad_norm": 0.11053823679685593, + "grad_norm": 0.2403508424758911, "learning_rate": 5.6490541422048275e-06, - "loss": 0.0274, + "loss": 0.0287, "step": 6800 }, { "epoch": 31.095890410958905, - "grad_norm": 0.06031995266675949, + "grad_norm": 0.16700613498687744, "learning_rate": 5.58382257012394e-06, - "loss": 0.0259, + "loss": 0.0305, "step": 6810 }, { "epoch": 31.141552511415526, - "grad_norm": 0.18411608040332794, + "grad_norm": 0.18683423101902008, "learning_rate": 5.518590998043053e-06, - "loss": 0.0249, + "loss": 0.0297, "step": 6820 }, { "epoch": 31.187214611872147, - "grad_norm": 0.066913902759552, + "grad_norm": 0.1736307293176651, "learning_rate": 5.453359425962166e-06, - "loss": 0.0288, + "loss": 0.0287, "step": 6830 }, { "epoch": 31.232876712328768, - "grad_norm": 0.14979858696460724, + "grad_norm": 0.4329126477241516, "learning_rate": 5.388127853881279e-06, - "loss": 0.0256, + "loss": 0.0286, "step": 6840 }, { "epoch": 31.27853881278539, - "grad_norm": 0.1862526834011078, + "grad_norm": 0.19322291016578674, "learning_rate": 5.322896281800392e-06, - "loss": 0.0259, + "loss": 0.0299, "step": 6850 }, { "epoch": 31.32420091324201, - "grad_norm": 0.20405951142311096, + "grad_norm": 0.12365171313285828, "learning_rate": 5.257664709719504e-06, - "loss": 0.0273, + "loss": 0.0288, "step": 6860 }, { "epoch": 31.36986301369863, - "grad_norm": 0.09556996822357178, + "grad_norm": 0.2190866321325302, "learning_rate": 5.192433137638617e-06, - "loss": 0.0284, + "loss": 0.0287, "step": 6870 }, { "epoch": 31.41552511415525, - "grad_norm": 0.0899861678481102, + "grad_norm": 0.1890123039484024, "learning_rate": 5.12720156555773e-06, - "loss": 0.0283, + "loss": 0.0292, "step": 6880 }, { "epoch": 31.461187214611872, - "grad_norm": 0.09562750160694122, + "grad_norm": 0.21521024405956268, "learning_rate": 5.061969993476843e-06, - "loss": 0.0301, + "loss": 0.0286, "step": 6890 }, { "epoch": 31.506849315068493, - "grad_norm": 0.05665372312068939, + "grad_norm": 0.15392209589481354, "learning_rate": 4.996738421395956e-06, - "loss": 0.0296, + "loss": 0.0279, "step": 6900 }, { "epoch": 31.552511415525114, - "grad_norm": 0.05309437960386276, + "grad_norm": 0.1591772735118866, "learning_rate": 4.931506849315068e-06, - "loss": 0.0266, + "loss": 0.0304, "step": 6910 }, { "epoch": 31.598173515981735, - "grad_norm": 0.0609840489923954, + "grad_norm": 0.18570680916309357, "learning_rate": 4.866275277234181e-06, - "loss": 0.0278, + "loss": 0.0292, "step": 6920 }, { "epoch": 31.643835616438356, - "grad_norm": 0.2213747799396515, + "grad_norm": 0.21021844446659088, "learning_rate": 4.801043705153294e-06, - "loss": 0.0285, + "loss": 0.03, "step": 6930 }, { "epoch": 31.689497716894977, - "grad_norm": 0.2735954225063324, + "grad_norm": 0.11242254078388214, "learning_rate": 4.735812133072407e-06, - "loss": 0.0286, + "loss": 0.0284, "step": 6940 }, { "epoch": 31.735159817351597, - "grad_norm": 0.07147183269262314, + "grad_norm": 0.12586183845996857, "learning_rate": 4.67058056099152e-06, - "loss": 0.0301, + "loss": 0.0315, "step": 6950 }, { "epoch": 31.78082191780822, - "grad_norm": 0.05247138813138008, + "grad_norm": 0.2376343458890915, "learning_rate": 4.6053489889106324e-06, - "loss": 0.0255, + "loss": 0.0268, "step": 6960 }, { "epoch": 31.82648401826484, - "grad_norm": 0.04454132914543152, + "grad_norm": 0.10612351447343826, "learning_rate": 4.5401174168297455e-06, - "loss": 0.0269, + "loss": 0.0274, "step": 6970 }, { "epoch": 31.87214611872146, - "grad_norm": 0.06428872048854828, + "grad_norm": 0.16219161450862885, "learning_rate": 4.4748858447488585e-06, - "loss": 0.029, + "loss": 0.0291, "step": 6980 }, { "epoch": 31.91780821917808, - "grad_norm": 0.1623128354549408, + "grad_norm": 0.22465045750141144, "learning_rate": 4.4096542726679715e-06, - "loss": 0.0271, + "loss": 0.0293, "step": 6990 }, { "epoch": 31.963470319634702, - "grad_norm": 0.13374125957489014, + "grad_norm": 0.17077922821044922, "learning_rate": 4.3444227005870845e-06, - "loss": 0.0296, + "loss": 0.0278, "step": 7000 }, { "epoch": 32.0, - "eval_bertscore_f1": 0.882116102847114, - "eval_bleu": 0.6198143411594959, - "eval_loss": 0.029304716736078262, - "eval_rougeL": 0.31779224232169023, - "eval_runtime": 85.5713, - "eval_samples_per_second": 17.564, - "eval_steps_per_second": 1.098, + "eval_bertscore_f1": 0.8782741467078685, + "eval_bleu": 0.7519832650589673, + "eval_loss": 0.04253660514950752, + "eval_rougeL": 0.30201395092264444, + "eval_runtime": 64.8813, + "eval_samples_per_second": 23.165, + "eval_steps_per_second": 1.449, "step": 7008 }, { "epoch": 32.009132420091326, - "grad_norm": 0.09440213441848755, + "grad_norm": 0.1892678290605545, "learning_rate": 4.2791911285061975e-06, - "loss": 0.0285, + "loss": 0.0277, "step": 7010 }, { "epoch": 32.054794520547944, - "grad_norm": 0.06098993867635727, + "grad_norm": 0.22672104835510254, "learning_rate": 4.21395955642531e-06, - "loss": 0.027, + "loss": 0.0303, "step": 7020 }, { "epoch": 32.10045662100457, - "grad_norm": 0.04748637229204178, + "grad_norm": 0.12975597381591797, "learning_rate": 4.148727984344423e-06, - "loss": 0.0269, + "loss": 0.0281, "step": 7030 }, { "epoch": 32.146118721461185, - "grad_norm": 0.049264486879110336, + "grad_norm": 0.17574161291122437, "learning_rate": 4.083496412263536e-06, - "loss": 0.0273, + "loss": 0.0301, "step": 7040 }, { "epoch": 32.19178082191781, - "grad_norm": 0.0713597983121872, + "grad_norm": 0.2801176607608795, "learning_rate": 4.018264840182649e-06, - "loss": 0.0274, + "loss": 0.0278, "step": 7050 }, { "epoch": 32.23744292237443, - "grad_norm": 0.06906388700008392, + "grad_norm": 0.17608405649662018, "learning_rate": 3.953033268101762e-06, - "loss": 0.0288, + "loss": 0.0287, "step": 7060 }, { "epoch": 32.28310502283105, - "grad_norm": 0.1591597944498062, + "grad_norm": 0.2439558506011963, "learning_rate": 3.887801696020874e-06, - "loss": 0.0269, + "loss": 0.0288, "step": 7070 }, { "epoch": 32.32876712328767, - "grad_norm": 0.04023635759949684, + "grad_norm": 0.1893412321805954, "learning_rate": 3.822570123939987e-06, - "loss": 0.0279, + "loss": 0.0297, "step": 7080 }, { "epoch": 32.37442922374429, - "grad_norm": 0.0428457073867321, + "grad_norm": 0.1842028945684433, "learning_rate": 3.7573385518591e-06, - "loss": 0.0274, + "loss": 0.0286, "step": 7090 }, { "epoch": 32.42009132420091, - "grad_norm": 0.09162160009145737, + "grad_norm": 0.14635466039180756, "learning_rate": 3.6921069797782126e-06, - "loss": 0.0265, + "loss": 0.0268, "step": 7100 }, { "epoch": 32.465753424657535, - "grad_norm": 0.05572756007313728, + "grad_norm": 0.16718840599060059, "learning_rate": 3.6268754076973256e-06, - "loss": 0.0274, + "loss": 0.0294, "step": 7110 }, { "epoch": 32.51141552511415, - "grad_norm": 0.05541946738958359, + "grad_norm": 0.19221429526805878, "learning_rate": 3.5616438356164386e-06, - "loss": 0.027, + "loss": 0.0285, "step": 7120 }, { "epoch": 32.55707762557078, - "grad_norm": 0.10358795523643494, + "grad_norm": 0.15354396402835846, "learning_rate": 3.496412263535551e-06, - "loss": 0.0265, + "loss": 0.0284, "step": 7130 }, { "epoch": 32.602739726027394, - "grad_norm": 0.10496276617050171, + "grad_norm": 0.15389499068260193, "learning_rate": 3.4311806914546642e-06, - "loss": 0.0278, + "loss": 0.028, "step": 7140 }, { "epoch": 32.64840182648402, - "grad_norm": 0.049546003341674805, + "grad_norm": 0.15369181334972382, "learning_rate": 3.365949119373777e-06, - "loss": 0.0268, + "loss": 0.0301, "step": 7150 }, { "epoch": 32.694063926940636, - "grad_norm": 0.04895747825503349, + "grad_norm": 0.21850866079330444, "learning_rate": 3.30071754729289e-06, - "loss": 0.0288, + "loss": 0.0294, "step": 7160 }, { "epoch": 32.73972602739726, - "grad_norm": 0.046649519354104996, + "grad_norm": 0.1314105987548828, "learning_rate": 3.235485975212003e-06, - "loss": 0.0267, + "loss": 0.0295, "step": 7170 }, { "epoch": 32.78538812785388, - "grad_norm": 0.060476336628198624, + "grad_norm": 0.15347546339035034, "learning_rate": 3.1702544031311154e-06, - "loss": 0.0275, + "loss": 0.0284, "step": 7180 }, { "epoch": 32.8310502283105, - "grad_norm": 0.08833310008049011, + "grad_norm": 0.191708505153656, "learning_rate": 3.1050228310502285e-06, - "loss": 0.0282, + "loss": 0.0297, "step": 7190 }, { "epoch": 32.87671232876713, - "grad_norm": 0.08543556928634644, + "grad_norm": 0.21553701162338257, "learning_rate": 3.0397912589693415e-06, - "loss": 0.0272, + "loss": 0.0277, "step": 7200 }, { "epoch": 32.922374429223744, - "grad_norm": 0.09319938719272614, + "grad_norm": 0.1383250206708908, "learning_rate": 2.974559686888454e-06, - "loss": 0.0254, + "loss": 0.0292, "step": 7210 }, { "epoch": 32.96803652968037, - "grad_norm": 0.07596831768751144, + "grad_norm": 0.1352762132883072, "learning_rate": 2.909328114807567e-06, - "loss": 0.0267, + "loss": 0.0285, "step": 7220 }, { "epoch": 33.0, - "eval_bertscore_f1": 0.8826242640108881, - "eval_bleu": 0.6506840160853707, - "eval_loss": 0.02929467149078846, - "eval_rougeL": 0.3175517644987171, - "eval_runtime": 85.9677, - "eval_samples_per_second": 17.483, - "eval_steps_per_second": 1.093, + "eval_bertscore_f1": 0.8765576011485762, + "eval_bleu": 0.6868938752836411, + "eval_loss": 0.0425349660217762, + "eval_rougeL": 0.3051064502374255, + "eval_runtime": 65.0494, + "eval_samples_per_second": 23.106, + "eval_steps_per_second": 1.445, "step": 7227 }, { "epoch": 33.013698630136986, - "grad_norm": 0.2628081440925598, + "grad_norm": 0.2675996422767639, "learning_rate": 2.8440965427266797e-06, - "loss": 0.0288, + "loss": 0.028, "step": 7230 }, { "epoch": 33.05936073059361, - "grad_norm": 0.05208074674010277, + "grad_norm": 0.19094286859035492, "learning_rate": 2.7788649706457927e-06, - "loss": 0.026, + "loss": 0.0282, "step": 7240 }, { "epoch": 33.10502283105023, - "grad_norm": 0.0972835123538971, + "grad_norm": 0.18227194249629974, "learning_rate": 2.7136333985649057e-06, - "loss": 0.0266, + "loss": 0.0286, "step": 7250 }, { "epoch": 33.15068493150685, - "grad_norm": 0.07271518558263779, + "grad_norm": 0.1970142275094986, "learning_rate": 2.6484018264840183e-06, - "loss": 0.028, + "loss": 0.0261, "step": 7260 }, { "epoch": 33.19634703196347, - "grad_norm": 0.196941077709198, + "grad_norm": 0.14779959619045258, "learning_rate": 2.5831702544031313e-06, - "loss": 0.027, + "loss": 0.0296, "step": 7270 }, { "epoch": 33.242009132420094, - "grad_norm": 0.07205737382173538, + "grad_norm": 0.1774580031633377, "learning_rate": 2.517938682322244e-06, - "loss": 0.0279, + "loss": 0.031, "step": 7280 }, { "epoch": 33.28767123287671, - "grad_norm": 0.06259785592556, + "grad_norm": 0.20665834844112396, "learning_rate": 2.452707110241357e-06, - "loss": 0.0266, + "loss": 0.028, "step": 7290 }, { "epoch": 33.333333333333336, - "grad_norm": 0.047566816210746765, + "grad_norm": 0.17283357679843903, "learning_rate": 2.38747553816047e-06, - "loss": 0.0262, + "loss": 0.0276, "step": 7300 }, { "epoch": 33.37899543378995, - "grad_norm": 0.06657677888870239, + "grad_norm": 0.14525634050369263, "learning_rate": 2.3222439660795826e-06, - "loss": 0.0269, + "loss": 0.0281, "step": 7310 }, { "epoch": 33.42465753424658, - "grad_norm": 0.13266189396381378, + "grad_norm": 0.16242118179798126, "learning_rate": 2.2570123939986956e-06, - "loss": 0.0282, + "loss": 0.0283, "step": 7320 }, { "epoch": 33.470319634703195, - "grad_norm": 0.08919128775596619, + "grad_norm": 0.20483170449733734, "learning_rate": 2.191780821917808e-06, - "loss": 0.0297, + "loss": 0.029, "step": 7330 }, { "epoch": 33.51598173515982, - "grad_norm": 0.05524025857448578, + "grad_norm": 0.18850266933441162, "learning_rate": 2.126549249836921e-06, - "loss": 0.029, + "loss": 0.0312, "step": 7340 }, { "epoch": 33.56164383561644, - "grad_norm": 0.0752720758318901, + "grad_norm": 0.24169522523880005, "learning_rate": 2.0613176777560342e-06, - "loss": 0.0278, + "loss": 0.0292, "step": 7350 }, { "epoch": 33.60730593607306, - "grad_norm": 0.06446948647499084, + "grad_norm": 0.1359410136938095, "learning_rate": 1.996086105675147e-06, "loss": 0.0279, "step": 7360 }, { "epoch": 33.65296803652968, - "grad_norm": 0.1158689633011818, + "grad_norm": 0.16504965722560883, "learning_rate": 1.93085453359426e-06, - "loss": 0.0273, + "loss": 0.0276, "step": 7370 }, { "epoch": 33.6986301369863, - "grad_norm": 0.05515358969569206, + "grad_norm": 0.12197738885879517, "learning_rate": 1.8656229615133726e-06, - "loss": 0.0276, + "loss": 0.028, "step": 7380 }, { "epoch": 33.74429223744292, - "grad_norm": 0.13118913769721985, + "grad_norm": 0.1623210310935974, "learning_rate": 1.8003913894324854e-06, - "loss": 0.0253, + "loss": 0.0295, "step": 7390 }, { "epoch": 33.789954337899545, - "grad_norm": 0.0866023376584053, + "grad_norm": 0.20221665501594543, "learning_rate": 1.7351598173515982e-06, - "loss": 0.0269, + "loss": 0.0265, "step": 7400 }, { "epoch": 33.83561643835616, - "grad_norm": 0.05457647144794464, + "grad_norm": 0.19585196673870087, "learning_rate": 1.669928245270711e-06, - "loss": 0.0274, + "loss": 0.0292, "step": 7410 }, { "epoch": 33.881278538812786, - "grad_norm": 0.1843695044517517, + "grad_norm": 0.2633768320083618, "learning_rate": 1.604696673189824e-06, - "loss": 0.0281, + "loss": 0.0293, "step": 7420 }, { "epoch": 33.926940639269404, - "grad_norm": 0.054692674428224564, + "grad_norm": 0.1686674803495407, "learning_rate": 1.5394651011089369e-06, - "loss": 0.0301, + "loss": 0.0298, "step": 7430 }, { "epoch": 33.97260273972603, - "grad_norm": 0.05386902019381523, + "grad_norm": 0.14225494861602783, "learning_rate": 1.4742335290280497e-06, - "loss": 0.0263, + "loss": 0.0296, "step": 7440 }, { "epoch": 34.0, - "eval_bertscore_f1": 0.8824349555743985, - "eval_bleu": 0.6426832403462529, - "eval_loss": 0.029273033142089844, - "eval_rougeL": 0.31879717895642, - "eval_runtime": 85.7948, - "eval_samples_per_second": 17.519, - "eval_steps_per_second": 1.096, + "eval_bertscore_f1": 0.8776290599933404, + "eval_bleu": 0.7360000570941625, + "eval_loss": 0.04238814115524292, + "eval_rougeL": 0.3019882967022193, + "eval_runtime": 65.4669, + "eval_samples_per_second": 22.958, + "eval_steps_per_second": 1.436, "step": 7446 }, { "epoch": 34.018264840182646, - "grad_norm": 0.059233132749795914, + "grad_norm": 0.1806912124156952, "learning_rate": 1.4090019569471625e-06, - "loss": 0.0275, + "loss": 0.027, "step": 7450 }, { "epoch": 34.06392694063927, - "grad_norm": 0.05460565164685249, + "grad_norm": 0.14808008074760437, "learning_rate": 1.3437703848662755e-06, - "loss": 0.0268, + "loss": 0.0292, "step": 7460 }, { "epoch": 34.10958904109589, - "grad_norm": 0.05039376765489578, + "grad_norm": 0.1517217606306076, "learning_rate": 1.2785388127853883e-06, - "loss": 0.027, + "loss": 0.0296, "step": 7470 }, { "epoch": 34.15525114155251, - "grad_norm": 0.061115965247154236, + "grad_norm": 0.24545730650424957, "learning_rate": 1.2133072407045011e-06, - "loss": 0.0271, + "loss": 0.0277, "step": 7480 }, { "epoch": 34.20091324200913, - "grad_norm": 0.13192027807235718, + "grad_norm": 0.1936875879764557, "learning_rate": 1.148075668623614e-06, - "loss": 0.0273, + "loss": 0.028, "step": 7490 }, { "epoch": 34.24657534246575, - "grad_norm": 0.08353226631879807, + "grad_norm": 0.16787278652191162, "learning_rate": 1.0828440965427267e-06, - "loss": 0.0258, + "loss": 0.0287, "step": 7500 }, { "epoch": 34.29223744292237, - "grad_norm": 0.060591306537389755, + "grad_norm": 0.15620262920856476, "learning_rate": 1.0176125244618398e-06, - "loss": 0.0264, + "loss": 0.0281, "step": 7510 }, { "epoch": 34.337899543378995, - "grad_norm": 0.1371307224035263, + "grad_norm": 0.20259173214435577, "learning_rate": 9.523809523809526e-07, - "loss": 0.0296, + "loss": 0.0293, "step": 7520 }, { "epoch": 34.38356164383562, - "grad_norm": 0.07051808387041092, + "grad_norm": 0.22473689913749695, "learning_rate": 8.871493803000652e-07, - "loss": 0.0265, + "loss": 0.0289, "step": 7530 }, { "epoch": 34.42922374429224, - "grad_norm": 0.0630018338561058, + "grad_norm": 0.1960284262895584, "learning_rate": 8.219178082191781e-07, - "loss": 0.0283, + "loss": 0.0285, "step": 7540 }, { "epoch": 34.47488584474886, - "grad_norm": 0.05780971795320511, + "grad_norm": 0.18783798813819885, "learning_rate": 7.56686236138291e-07, - "loss": 0.0265, + "loss": 0.0282, "step": 7550 }, { "epoch": 34.52054794520548, - "grad_norm": 0.09374094754457474, + "grad_norm": 0.22345979511737823, "learning_rate": 6.914546640574038e-07, - "loss": 0.0267, + "loss": 0.0283, "step": 7560 }, { "epoch": 34.5662100456621, - "grad_norm": 0.06963168829679489, + "grad_norm": 0.1570662409067154, "learning_rate": 6.262230919765167e-07, - "loss": 0.0287, + "loss": 0.028, "step": 7570 }, { "epoch": 34.61187214611872, - "grad_norm": 0.06316248327493668, + "grad_norm": 0.1383802741765976, "learning_rate": 5.609915198956295e-07, - "loss": 0.0265, + "loss": 0.0296, "step": 7580 }, { "epoch": 34.657534246575345, - "grad_norm": 0.09635402262210846, + "grad_norm": 0.1836177259683609, "learning_rate": 4.957599478147424e-07, - "loss": 0.0276, + "loss": 0.0273, "step": 7590 }, { "epoch": 34.70319634703196, - "grad_norm": 0.20677725970745087, + "grad_norm": 0.3105820417404175, "learning_rate": 4.305283757338552e-07, - "loss": 0.0271, + "loss": 0.0285, "step": 7600 }, { "epoch": 34.74885844748859, - "grad_norm": 0.22448600828647614, + "grad_norm": 0.23880144953727722, "learning_rate": 3.6529680365296803e-07, - "loss": 0.0277, + "loss": 0.0284, "step": 7610 }, { "epoch": 34.794520547945204, - "grad_norm": 0.05154965817928314, + "grad_norm": 0.20711283385753632, "learning_rate": 3.000652315720809e-07, - "loss": 0.0276, + "loss": 0.0272, "step": 7620 }, { "epoch": 34.84018264840183, - "grad_norm": 0.0522591657936573, + "grad_norm": 0.1828579604625702, "learning_rate": 2.3483365949119375e-07, - "loss": 0.0277, + "loss": 0.031, "step": 7630 }, { "epoch": 34.885844748858446, - "grad_norm": 0.07666347175836563, + "grad_norm": 0.15802130103111267, "learning_rate": 1.696020874103066e-07, - "loss": 0.0278, + "loss": 0.0266, "step": 7640 }, { "epoch": 34.93150684931507, - "grad_norm": 0.055545128881931305, + "grad_norm": 0.19433271884918213, "learning_rate": 1.0437051532941944e-07, - "loss": 0.0276, + "loss": 0.0281, "step": 7650 }, { "epoch": 34.97716894977169, - "grad_norm": 0.12518319487571716, + "grad_norm": 0.15197888016700745, "learning_rate": 3.9138943248532294e-08, - "loss": 0.0284, + "loss": 0.0279, "step": 7660 }, { "epoch": 35.0, - "eval_bertscore_f1": 0.8826152413429138, - "eval_bleu": 0.6509786399199852, - "eval_loss": 0.02927256189286709, - "eval_rougeL": 0.3174815619181566, - "eval_runtime": 86.2786, - "eval_samples_per_second": 17.42, - "eval_steps_per_second": 1.089, + "eval_bertscore_f1": 0.8782449891308666, + "eval_bleu": 0.7455200374263401, + "eval_loss": 0.04245344549417496, + "eval_rougeL": 0.30427156803355015, + "eval_runtime": 65.2903, + "eval_samples_per_second": 23.02, + "eval_steps_per_second": 1.44, "step": 7665 } ], @@ -5774,7 +5774,7 @@ "attributes": {} } }, - "total_flos": 1.737006710980608e+16, + "total_flos": 7469843241369600.0, "train_batch_size": 16, "trial_name": null, "trial_params": null