{
"best_metric": 44.91524109269023,
"best_model_checkpoint": "/workspace/llm-storage/output/llama-3B-Base/checkpoint-14000",
"epoch": 4.999958097632517,
"eval_steps": 1000,
"global_step": 14915,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006704378797402053,
"grad_norm": 6.649622440338135,
"learning_rate": 6.702412868632708e-07,
"loss": 7.6699,
"step": 20
},
{
"epoch": 0.013408757594804106,
"grad_norm": 5.821969985961914,
"learning_rate": 1.3404825737265416e-06,
"loss": 7.6967,
"step": 40
},
{
"epoch": 0.02011313639220616,
"grad_norm": 7.613153457641602,
"learning_rate": 2.0107238605898126e-06,
"loss": 7.68,
"step": 60
},
{
"epoch": 0.02681751518960821,
"grad_norm": 7.550271511077881,
"learning_rate": 2.680965147453083e-06,
"loss": 7.2033,
"step": 80
},
{
"epoch": 0.03352189398701027,
"grad_norm": 5.453164100646973,
"learning_rate": 3.351206434316354e-06,
"loss": 6.5851,
"step": 100
},
{
"epoch": 0.04022627278441232,
"grad_norm": 6.310723304748535,
"learning_rate": 4.021447721179625e-06,
"loss": 5.9701,
"step": 120
},
{
"epoch": 0.04693065158181437,
"grad_norm": 5.457154750823975,
"learning_rate": 4.691689008042896e-06,
"loss": 5.5137,
"step": 140
},
{
"epoch": 0.05363503037921642,
"grad_norm": 5.355574131011963,
"learning_rate": 5.361930294906166e-06,
"loss": 5.125,
"step": 160
},
{
"epoch": 0.06033940917661848,
"grad_norm": 5.482090473175049,
"learning_rate": 6.032171581769437e-06,
"loss": 5.171,
"step": 180
},
{
"epoch": 0.06704378797402054,
"grad_norm": 6.155514717102051,
"learning_rate": 6.702412868632708e-06,
"loss": 4.9362,
"step": 200
},
{
"epoch": 0.07374816677142258,
"grad_norm": 5.355989933013916,
"learning_rate": 7.372654155495978e-06,
"loss": 4.9172,
"step": 220
},
{
"epoch": 0.08045254556882464,
"grad_norm": 5.774538993835449,
"learning_rate": 8.04289544235925e-06,
"loss": 4.8096,
"step": 240
},
{
"epoch": 0.0871569243662267,
"grad_norm": 5.671195030212402,
"learning_rate": 8.71313672922252e-06,
"loss": 4.8034,
"step": 260
},
{
"epoch": 0.09386130316362874,
"grad_norm": 6.051366806030273,
"learning_rate": 9.383378016085791e-06,
"loss": 4.7359,
"step": 280
},
{
"epoch": 0.1005656819610308,
"grad_norm": 6.272449493408203,
"learning_rate": 1.0053619302949062e-05,
"loss": 4.6895,
"step": 300
},
{
"epoch": 0.10727006075843284,
"grad_norm": 6.4355669021606445,
"learning_rate": 1.0723860589812333e-05,
"loss": 4.5595,
"step": 320
},
{
"epoch": 0.1139744395558349,
"grad_norm": 8.054056167602539,
"learning_rate": 1.1394101876675605e-05,
"loss": 4.4998,
"step": 340
},
{
"epoch": 0.12067881835323696,
"grad_norm": 6.529457092285156,
"learning_rate": 1.2064343163538874e-05,
"loss": 4.5933,
"step": 360
},
{
"epoch": 0.127383197150639,
"grad_norm": 6.745616912841797,
"learning_rate": 1.2734584450402146e-05,
"loss": 4.5153,
"step": 380
},
{
"epoch": 0.13408757594804108,
"grad_norm": 6.701639175415039,
"learning_rate": 1.3404825737265417e-05,
"loss": 4.5117,
"step": 400
},
{
"epoch": 0.14079195474544312,
"grad_norm": 7.223796844482422,
"learning_rate": 1.4075067024128689e-05,
"loss": 4.5095,
"step": 420
},
{
"epoch": 0.14749633354284516,
"grad_norm": 6.9919891357421875,
"learning_rate": 1.4745308310991956e-05,
"loss": 4.504,
"step": 440
},
{
"epoch": 0.15420071234024724,
"grad_norm": 6.506685256958008,
"learning_rate": 1.5415549597855227e-05,
"loss": 4.3921,
"step": 460
},
{
"epoch": 0.16090509113764928,
"grad_norm": 6.946916103363037,
"learning_rate": 1.60857908847185e-05,
"loss": 4.4041,
"step": 480
},
{
"epoch": 0.16760946993505133,
"grad_norm": 6.313327789306641,
"learning_rate": 1.675603217158177e-05,
"loss": 4.3512,
"step": 500
},
{
"epoch": 0.1743138487324534,
"grad_norm": 7.055306911468506,
"learning_rate": 1.742627345844504e-05,
"loss": 4.3819,
"step": 520
},
{
"epoch": 0.18101822752985544,
"grad_norm": 6.881680965423584,
"learning_rate": 1.8096514745308312e-05,
"loss": 4.2658,
"step": 540
},
{
"epoch": 0.18772260632725749,
"grad_norm": 6.35658073425293,
"learning_rate": 1.8766756032171583e-05,
"loss": 4.1747,
"step": 560
},
{
"epoch": 0.19442698512465956,
"grad_norm": 7.282151222229004,
"learning_rate": 1.9436997319034853e-05,
"loss": 4.392,
"step": 580
},
{
"epoch": 0.2011313639220616,
"grad_norm": 7.252215385437012,
"learning_rate": 2.0107238605898124e-05,
"loss": 4.2709,
"step": 600
},
{
"epoch": 0.20783574271946365,
"grad_norm": 7.683035850524902,
"learning_rate": 2.0777479892761395e-05,
"loss": 4.1557,
"step": 620
},
{
"epoch": 0.2145401215168657,
"grad_norm": 7.786757469177246,
"learning_rate": 2.1447721179624665e-05,
"loss": 4.2957,
"step": 640
},
{
"epoch": 0.22124450031426776,
"grad_norm": 7.157670497894287,
"learning_rate": 2.211796246648794e-05,
"loss": 4.1558,
"step": 660
},
{
"epoch": 0.2279488791116698,
"grad_norm": 6.556661128997803,
"learning_rate": 2.278820375335121e-05,
"loss": 4.219,
"step": 680
},
{
"epoch": 0.23465325790907185,
"grad_norm": 7.472171306610107,
"learning_rate": 2.3458445040214477e-05,
"loss": 4.0597,
"step": 700
},
{
"epoch": 0.24135763670647392,
"grad_norm": 6.373785018920898,
"learning_rate": 2.4128686327077747e-05,
"loss": 4.0991,
"step": 720
},
{
"epoch": 0.24806201550387597,
"grad_norm": 7.932548999786377,
"learning_rate": 2.479892761394102e-05,
"loss": 4.2809,
"step": 740
},
{
"epoch": 0.254766394301278,
"grad_norm": 6.911128520965576,
"learning_rate": 2.5469168900804292e-05,
"loss": 4.107,
"step": 760
},
{
"epoch": 0.26147077309868005,
"grad_norm": 6.957625865936279,
"learning_rate": 2.6139410187667563e-05,
"loss": 4.2279,
"step": 780
},
{
"epoch": 0.26817515189608215,
"grad_norm": 6.987204551696777,
"learning_rate": 2.6809651474530833e-05,
"loss": 4.0514,
"step": 800
},
{
"epoch": 0.2748795306934842,
"grad_norm": 6.837357997894287,
"learning_rate": 2.7479892761394104e-05,
"loss": 4.2328,
"step": 820
},
{
"epoch": 0.28158390949088624,
"grad_norm": 7.49316930770874,
"learning_rate": 2.8150134048257378e-05,
"loss": 4.1407,
"step": 840
},
{
"epoch": 0.2882882882882883,
"grad_norm": 6.795347690582275,
"learning_rate": 2.8820375335120648e-05,
"loss": 4.0602,
"step": 860
},
{
"epoch": 0.29499266708569033,
"grad_norm": 6.921504020690918,
"learning_rate": 2.9490616621983912e-05,
"loss": 4.0738,
"step": 880
},
{
"epoch": 0.3016970458830924,
"grad_norm": 6.316340923309326,
"learning_rate": 3.0160857908847186e-05,
"loss": 3.9296,
"step": 900
},
{
"epoch": 0.3084014246804945,
"grad_norm": 6.55310583114624,
"learning_rate": 3.083109919571045e-05,
"loss": 4.0316,
"step": 920
},
{
"epoch": 0.3151058034778965,
"grad_norm": 5.821455001831055,
"learning_rate": 3.1501340482573724e-05,
"loss": 4.0779,
"step": 940
},
{
"epoch": 0.32181018227529856,
"grad_norm": 7.125136852264404,
"learning_rate": 3.2171581769437e-05,
"loss": 4.0213,
"step": 960
},
{
"epoch": 0.3285145610727006,
"grad_norm": 6.5631794929504395,
"learning_rate": 3.284182305630027e-05,
"loss": 4.0513,
"step": 980
},
{
"epoch": 0.33521893987010265,
"grad_norm": 6.145415782928467,
"learning_rate": 3.351206434316354e-05,
"loss": 3.9952,
"step": 1000
},
{
"epoch": 0.33521893987010265,
"eval_bleu_greedy": 42.15344877780225,
"eval_loss": 0.5149964690208435,
"eval_runtime": 264.7887,
"eval_samples_per_second": 0.378,
"eval_steps_per_second": 0.378,
"step": 1000
},
{
"epoch": 0.3419233186675047,
"grad_norm": 6.734564304351807,
"learning_rate": 3.418230563002681e-05,
"loss": 4.048,
"step": 1020
},
{
"epoch": 0.3486276974649068,
"grad_norm": 6.978208541870117,
"learning_rate": 3.485254691689008e-05,
"loss": 4.0318,
"step": 1040
},
{
"epoch": 0.35533207626230884,
"grad_norm": 5.949651718139648,
"learning_rate": 3.5522788203753354e-05,
"loss": 3.9334,
"step": 1060
},
{
"epoch": 0.3620364550597109,
"grad_norm": 5.890812397003174,
"learning_rate": 3.6193029490616625e-05,
"loss": 3.9546,
"step": 1080
},
{
"epoch": 0.3687408338571129,
"grad_norm": 6.559944152832031,
"learning_rate": 3.6863270777479895e-05,
"loss": 4.0145,
"step": 1100
},
{
"epoch": 0.37544521265451497,
"grad_norm": 6.098083972930908,
"learning_rate": 3.7533512064343166e-05,
"loss": 3.9196,
"step": 1120
},
{
"epoch": 0.382149591451917,
"grad_norm": 6.418199062347412,
"learning_rate": 3.8203753351206436e-05,
"loss": 3.9634,
"step": 1140
},
{
"epoch": 0.3888539702493191,
"grad_norm": 6.399519920349121,
"learning_rate": 3.887399463806971e-05,
"loss": 3.8874,
"step": 1160
},
{
"epoch": 0.39555834904672116,
"grad_norm": 6.154569149017334,
"learning_rate": 3.954423592493298e-05,
"loss": 3.8637,
"step": 1180
},
{
"epoch": 0.4022627278441232,
"grad_norm": 5.589611530303955,
"learning_rate": 4.021447721179625e-05,
"loss": 3.9136,
"step": 1200
},
{
"epoch": 0.40896710664152525,
"grad_norm": 6.433633327484131,
"learning_rate": 4.088471849865952e-05,
"loss": 3.9185,
"step": 1220
},
{
"epoch": 0.4156714854389273,
"grad_norm": 5.366619110107422,
"learning_rate": 4.155495978552279e-05,
"loss": 3.9046,
"step": 1240
},
{
"epoch": 0.42237586423632933,
"grad_norm": 5.7498884201049805,
"learning_rate": 4.222520107238606e-05,
"loss": 3.8305,
"step": 1260
},
{
"epoch": 0.4290802430337314,
"grad_norm": 6.001119613647461,
"learning_rate": 4.289544235924933e-05,
"loss": 3.8793,
"step": 1280
},
{
"epoch": 0.4357846218311335,
"grad_norm": 5.743397235870361,
"learning_rate": 4.35656836461126e-05,
"loss": 3.8746,
"step": 1300
},
{
"epoch": 0.4424890006285355,
"grad_norm": 6.727258205413818,
"learning_rate": 4.423592493297588e-05,
"loss": 3.7943,
"step": 1320
},
{
"epoch": 0.44919337942593757,
"grad_norm": 5.452583312988281,
"learning_rate": 4.490616621983915e-05,
"loss": 3.8095,
"step": 1340
},
{
"epoch": 0.4558977582233396,
"grad_norm": 4.872672080993652,
"learning_rate": 4.557640750670242e-05,
"loss": 3.7152,
"step": 1360
},
{
"epoch": 0.46260213702074165,
"grad_norm": 5.5072174072265625,
"learning_rate": 4.624664879356568e-05,
"loss": 3.7729,
"step": 1380
},
{
"epoch": 0.4693065158181437,
"grad_norm": 6.116482257843018,
"learning_rate": 4.6916890080428954e-05,
"loss": 3.8463,
"step": 1400
},
{
"epoch": 0.4760108946155458,
"grad_norm": 5.880556583404541,
"learning_rate": 4.7587131367292224e-05,
"loss": 3.8045,
"step": 1420
},
{
"epoch": 0.48271527341294784,
"grad_norm": 5.663512229919434,
"learning_rate": 4.8257372654155495e-05,
"loss": 3.7668,
"step": 1440
},
{
"epoch": 0.4894196522103499,
"grad_norm": 5.742367744445801,
"learning_rate": 4.8927613941018765e-05,
"loss": 3.7065,
"step": 1460
},
{
"epoch": 0.49612403100775193,
"grad_norm": 5.253595352172852,
"learning_rate": 4.959785522788204e-05,
"loss": 3.7373,
"step": 1480
},
{
"epoch": 0.502828409805154,
"grad_norm": 5.620617866516113,
"learning_rate": 4.997020040229457e-05,
"loss": 3.766,
"step": 1500
},
{
"epoch": 0.509532788602556,
"grad_norm": 5.477814197540283,
"learning_rate": 4.989570140803099e-05,
"loss": 3.8301,
"step": 1520
},
{
"epoch": 0.5162371673999581,
"grad_norm": 5.931734561920166,
"learning_rate": 4.982120241376742e-05,
"loss": 3.7464,
"step": 1540
},
{
"epoch": 0.5229415461973601,
"grad_norm": 4.500419616699219,
"learning_rate": 4.974670341950384e-05,
"loss": 3.7418,
"step": 1560
},
{
"epoch": 0.5296459249947622,
"grad_norm": 5.702874660491943,
"learning_rate": 4.967220442524026e-05,
"loss": 3.7451,
"step": 1580
},
{
"epoch": 0.5363503037921643,
"grad_norm": 6.070137023925781,
"learning_rate": 4.959770543097668e-05,
"loss": 3.6802,
"step": 1600
},
{
"epoch": 0.5430546825895664,
"grad_norm": 5.067490100860596,
"learning_rate": 4.95232064367131e-05,
"loss": 3.7898,
"step": 1620
},
{
"epoch": 0.5497590613869684,
"grad_norm": 6.116696834564209,
"learning_rate": 4.944870744244953e-05,
"loss": 3.6728,
"step": 1640
},
{
"epoch": 0.5564634401843704,
"grad_norm": 5.9352827072143555,
"learning_rate": 4.937420844818596e-05,
"loss": 3.7049,
"step": 1660
},
{
"epoch": 0.5631678189817725,
"grad_norm": 4.784963130950928,
"learning_rate": 4.929970945392238e-05,
"loss": 3.5477,
"step": 1680
},
{
"epoch": 0.5698721977791745,
"grad_norm": 5.402501106262207,
"learning_rate": 4.92252104596588e-05,
"loss": 3.6234,
"step": 1700
},
{
"epoch": 0.5765765765765766,
"grad_norm": 5.293265342712402,
"learning_rate": 4.915071146539522e-05,
"loss": 3.5493,
"step": 1720
},
{
"epoch": 0.5832809553739786,
"grad_norm": 5.232183933258057,
"learning_rate": 4.907621247113165e-05,
"loss": 3.5725,
"step": 1740
},
{
"epoch": 0.5899853341713807,
"grad_norm": 5.71828031539917,
"learning_rate": 4.900171347686807e-05,
"loss": 3.6631,
"step": 1760
},
{
"epoch": 0.5966897129687827,
"grad_norm": 5.855662822723389,
"learning_rate": 4.892721448260449e-05,
"loss": 3.6428,
"step": 1780
},
{
"epoch": 0.6033940917661847,
"grad_norm": 5.2928290367126465,
"learning_rate": 4.885271548834091e-05,
"loss": 3.5862,
"step": 1800
},
{
"epoch": 0.6100984705635868,
"grad_norm": 5.530043125152588,
"learning_rate": 4.877821649407733e-05,
"loss": 3.5894,
"step": 1820
},
{
"epoch": 0.616802849360989,
"grad_norm": 5.8700079917907715,
"learning_rate": 4.870371749981376e-05,
"loss": 3.4918,
"step": 1840
},
{
"epoch": 0.623507228158391,
"grad_norm": 4.9393510818481445,
"learning_rate": 4.862921850555018e-05,
"loss": 3.5439,
"step": 1860
},
{
"epoch": 0.630211606955793,
"grad_norm": 5.189883232116699,
"learning_rate": 4.85547195112866e-05,
"loss": 3.6312,
"step": 1880
},
{
"epoch": 0.6369159857531951,
"grad_norm": 4.674147605895996,
"learning_rate": 4.848022051702302e-05,
"loss": 3.5674,
"step": 1900
},
{
"epoch": 0.6436203645505971,
"grad_norm": 5.3367919921875,
"learning_rate": 4.840572152275945e-05,
"loss": 3.5407,
"step": 1920
},
{
"epoch": 0.6503247433479992,
"grad_norm": 5.333773612976074,
"learning_rate": 4.833122252849587e-05,
"loss": 3.5916,
"step": 1940
},
{
"epoch": 0.6570291221454012,
"grad_norm": 5.1199235916137695,
"learning_rate": 4.825672353423229e-05,
"loss": 3.5203,
"step": 1960
},
{
"epoch": 0.6637335009428033,
"grad_norm": 5.012689590454102,
"learning_rate": 4.818222453996871e-05,
"loss": 3.6189,
"step": 1980
},
{
"epoch": 0.6704378797402053,
"grad_norm": 4.8355231285095215,
"learning_rate": 4.810772554570513e-05,
"loss": 3.4858,
"step": 2000
},
{
"epoch": 0.6704378797402053,
"eval_bleu_greedy": 42.772572026733215,
"eval_loss": 0.44412651658058167,
"eval_runtime": 257.8686,
"eval_samples_per_second": 0.388,
"eval_steps_per_second": 0.388,
"step": 2000
},
{
"epoch": 0.6771422585376073,
"grad_norm": 5.045526504516602,
"learning_rate": 4.803322655144156e-05,
"loss": 3.4094,
"step": 2020
},
{
"epoch": 0.6838466373350094,
"grad_norm": 4.954864501953125,
"learning_rate": 4.795872755717798e-05,
"loss": 3.5093,
"step": 2040
},
{
"epoch": 0.6905510161324114,
"grad_norm": 4.870156288146973,
"learning_rate": 4.78842285629144e-05,
"loss": 3.4555,
"step": 2060
},
{
"epoch": 0.6972553949298136,
"grad_norm": 5.512898921966553,
"learning_rate": 4.780972956865082e-05,
"loss": 3.3533,
"step": 2080
},
{
"epoch": 0.7039597737272156,
"grad_norm": 5.5289788246154785,
"learning_rate": 4.773523057438724e-05,
"loss": 3.4053,
"step": 2100
},
{
"epoch": 0.7106641525246177,
"grad_norm": 4.736262798309326,
"learning_rate": 4.766073158012367e-05,
"loss": 3.4689,
"step": 2120
},
{
"epoch": 0.7173685313220197,
"grad_norm": 5.111291885375977,
"learning_rate": 4.758623258586009e-05,
"loss": 3.4979,
"step": 2140
},
{
"epoch": 0.7240729101194218,
"grad_norm": 5.018069267272949,
"learning_rate": 4.751173359159651e-05,
"loss": 3.3954,
"step": 2160
},
{
"epoch": 0.7307772889168238,
"grad_norm": 4.47459077835083,
"learning_rate": 4.7437234597332934e-05,
"loss": 3.4213,
"step": 2180
},
{
"epoch": 0.7374816677142259,
"grad_norm": 4.8868279457092285,
"learning_rate": 4.736273560306936e-05,
"loss": 3.5459,
"step": 2200
},
{
"epoch": 0.7441860465116279,
"grad_norm": 5.392606735229492,
"learning_rate": 4.728823660880579e-05,
"loss": 3.4798,
"step": 2220
},
{
"epoch": 0.7508904253090299,
"grad_norm": 4.868163108825684,
"learning_rate": 4.721373761454221e-05,
"loss": 3.3849,
"step": 2240
},
{
"epoch": 0.757594804106432,
"grad_norm": 5.640343189239502,
"learning_rate": 4.713923862027863e-05,
"loss": 3.4752,
"step": 2260
},
{
"epoch": 0.764299182903834,
"grad_norm": 5.552671432495117,
"learning_rate": 4.706473962601505e-05,
"loss": 3.3831,
"step": 2280
},
{
"epoch": 0.7710035617012361,
"grad_norm": 4.874811172485352,
"learning_rate": 4.699024063175147e-05,
"loss": 3.3214,
"step": 2300
},
{
"epoch": 0.7777079404986382,
"grad_norm": 5.65765380859375,
"learning_rate": 4.69157416374879e-05,
"loss": 3.3833,
"step": 2320
},
{
"epoch": 0.7844123192960403,
"grad_norm": 5.0529351234436035,
"learning_rate": 4.684124264322432e-05,
"loss": 3.3898,
"step": 2340
},
{
"epoch": 0.7911166980934423,
"grad_norm": 4.97359561920166,
"learning_rate": 4.676674364896074e-05,
"loss": 3.3449,
"step": 2360
},
{
"epoch": 0.7978210768908444,
"grad_norm": 5.133802890777588,
"learning_rate": 4.669224465469716e-05,
"loss": 3.3804,
"step": 2380
},
{
"epoch": 0.8045254556882464,
"grad_norm": 5.423835754394531,
"learning_rate": 4.661774566043359e-05,
"loss": 3.3506,
"step": 2400
},
{
"epoch": 0.8112298344856484,
"grad_norm": 4.704217910766602,
"learning_rate": 4.654324666617001e-05,
"loss": 3.3602,
"step": 2420
},
{
"epoch": 0.8179342132830505,
"grad_norm": 5.2804365158081055,
"learning_rate": 4.646874767190643e-05,
"loss": 3.2892,
"step": 2440
},
{
"epoch": 0.8246385920804525,
"grad_norm": 5.329574108123779,
"learning_rate": 4.639424867764285e-05,
"loss": 3.4255,
"step": 2460
},
{
"epoch": 0.8313429708778546,
"grad_norm": 5.078803062438965,
"learning_rate": 4.6319749683379274e-05,
"loss": 3.2791,
"step": 2480
},
{
"epoch": 0.8380473496752566,
"grad_norm": 5.376737117767334,
"learning_rate": 4.62452506891157e-05,
"loss": 3.2312,
"step": 2500
},
{
"epoch": 0.8447517284726587,
"grad_norm": 4.886073589324951,
"learning_rate": 4.617075169485212e-05,
"loss": 3.2618,
"step": 2520
},
{
"epoch": 0.8514561072700607,
"grad_norm": 4.772372722625732,
"learning_rate": 4.609625270058854e-05,
"loss": 3.3566,
"step": 2540
},
{
"epoch": 0.8581604860674628,
"grad_norm": 5.242844581604004,
"learning_rate": 4.6021753706324964e-05,
"loss": 3.1969,
"step": 2560
},
{
"epoch": 0.8648648648648649,
"grad_norm": 5.419410228729248,
"learning_rate": 4.5947254712061385e-05,
"loss": 3.2977,
"step": 2580
},
{
"epoch": 0.871569243662267,
"grad_norm": 5.235103130340576,
"learning_rate": 4.587275571779781e-05,
"loss": 3.2264,
"step": 2600
},
{
"epoch": 0.878273622459669,
"grad_norm": 6.972829341888428,
"learning_rate": 4.5798256723534234e-05,
"loss": 3.2103,
"step": 2620
},
{
"epoch": 0.884978001257071,
"grad_norm": 5.612443923950195,
"learning_rate": 4.5723757729270654e-05,
"loss": 3.2591,
"step": 2640
},
{
"epoch": 0.8916823800544731,
"grad_norm": 5.727964401245117,
"learning_rate": 4.5649258735007075e-05,
"loss": 3.3569,
"step": 2660
},
{
"epoch": 0.8983867588518751,
"grad_norm": 5.101806640625,
"learning_rate": 4.55747597407435e-05,
"loss": 3.2256,
"step": 2680
},
{
"epoch": 0.9050911376492772,
"grad_norm": 4.5348358154296875,
"learning_rate": 4.5500260746479924e-05,
"loss": 3.3395,
"step": 2700
},
{
"epoch": 0.9117955164466792,
"grad_norm": 4.911803722381592,
"learning_rate": 4.5425761752216345e-05,
"loss": 3.3203,
"step": 2720
},
{
"epoch": 0.9184998952440813,
"grad_norm": 5.24350643157959,
"learning_rate": 4.5351262757952766e-05,
"loss": 3.2867,
"step": 2740
},
{
"epoch": 0.9252042740414833,
"grad_norm": 5.0574493408203125,
"learning_rate": 4.527676376368919e-05,
"loss": 3.1712,
"step": 2760
},
{
"epoch": 0.9319086528388854,
"grad_norm": 4.724992752075195,
"learning_rate": 4.5202264769425614e-05,
"loss": 3.3225,
"step": 2780
},
{
"epoch": 0.9386130316362874,
"grad_norm": 5.762344837188721,
"learning_rate": 4.512776577516204e-05,
"loss": 3.277,
"step": 2800
},
{
"epoch": 0.9453174104336896,
"grad_norm": 5.392731666564941,
"learning_rate": 4.505326678089846e-05,
"loss": 3.3034,
"step": 2820
},
{
"epoch": 0.9520217892310916,
"grad_norm": 4.567736625671387,
"learning_rate": 4.4978767786634884e-05,
"loss": 3.2544,
"step": 2840
},
{
"epoch": 0.9587261680284936,
"grad_norm": 4.565167427062988,
"learning_rate": 4.4904268792371304e-05,
"loss": 3.2303,
"step": 2860
},
{
"epoch": 0.9654305468258957,
"grad_norm": 4.804454326629639,
"learning_rate": 4.482976979810773e-05,
"loss": 3.1671,
"step": 2880
},
{
"epoch": 0.9721349256232977,
"grad_norm": 4.879951477050781,
"learning_rate": 4.475527080384415e-05,
"loss": 3.1655,
"step": 2900
},
{
"epoch": 0.9788393044206998,
"grad_norm": 5.604247570037842,
"learning_rate": 4.4680771809580574e-05,
"loss": 3.2109,
"step": 2920
},
{
"epoch": 0.9855436832181018,
"grad_norm": 5.372274875640869,
"learning_rate": 4.4606272815316995e-05,
"loss": 3.1978,
"step": 2940
},
{
"epoch": 0.9922480620155039,
"grad_norm": 5.262199878692627,
"learning_rate": 4.4531773821053416e-05,
"loss": 3.1311,
"step": 2960
},
{
"epoch": 0.9989524408129059,
"grad_norm": 4.829686641693115,
"learning_rate": 4.445727482678984e-05,
"loss": 3.1546,
"step": 2980
},
{
"epoch": 1.0053635030379215,
"grad_norm": 4.690941333770752,
"learning_rate": 4.4382775832526264e-05,
"loss": 2.9199,
"step": 3000
},
{
"epoch": 1.0053635030379215,
"eval_bleu_greedy": 42.24584689498649,
"eval_loss": 0.4063108563423157,
"eval_runtime": 176.1437,
"eval_samples_per_second": 0.568,
"eval_steps_per_second": 0.568,
"step": 3000
},
{
"epoch": 1.0120678818353237,
"grad_norm": 5.380918025970459,
"learning_rate": 4.4308276838262685e-05,
"loss": 2.9888,
"step": 3020
},
{
"epoch": 1.0187722606327259,
"grad_norm": 4.866960048675537,
"learning_rate": 4.4233777843999106e-05,
"loss": 3.0593,
"step": 3040
},
{
"epoch": 1.0254766394301278,
"grad_norm": 5.36842155456543,
"learning_rate": 4.4159278849735534e-05,
"loss": 3.0062,
"step": 3060
},
{
"epoch": 1.03218101822753,
"grad_norm": 5.316088676452637,
"learning_rate": 4.4084779855471954e-05,
"loss": 3.0334,
"step": 3080
},
{
"epoch": 1.0388853970249319,
"grad_norm": 5.1895012855529785,
"learning_rate": 4.4010280861208375e-05,
"loss": 2.9251,
"step": 3100
},
{
"epoch": 1.045589775822334,
"grad_norm": 4.736208438873291,
"learning_rate": 4.3935781866944796e-05,
"loss": 2.9938,
"step": 3120
},
{
"epoch": 1.052294154619736,
"grad_norm": 5.429833889007568,
"learning_rate": 4.386128287268122e-05,
"loss": 3.0089,
"step": 3140
},
{
"epoch": 1.0589985334171381,
"grad_norm": 4.854675769805908,
"learning_rate": 4.3786783878417645e-05,
"loss": 3.0657,
"step": 3160
},
{
"epoch": 1.06570291221454,
"grad_norm": 4.766360759735107,
"learning_rate": 4.3712284884154066e-05,
"loss": 2.9854,
"step": 3180
},
{
"epoch": 1.0724072910119422,
"grad_norm": 5.70284366607666,
"learning_rate": 4.3637785889890487e-05,
"loss": 3.0657,
"step": 3200
},
{
"epoch": 1.0791116698093441,
"grad_norm": 5.729391574859619,
"learning_rate": 4.356328689562691e-05,
"loss": 2.9516,
"step": 3220
},
{
"epoch": 1.0858160486067463,
"grad_norm": 5.109742164611816,
"learning_rate": 4.348878790136333e-05,
"loss": 3.0221,
"step": 3240
},
{
"epoch": 1.0925204274041482,
"grad_norm": 5.132044315338135,
"learning_rate": 4.3414288907099756e-05,
"loss": 3.0308,
"step": 3260
},
{
"epoch": 1.0992248062015504,
"grad_norm": 4.71274995803833,
"learning_rate": 4.333978991283618e-05,
"loss": 2.9777,
"step": 3280
},
{
"epoch": 1.1059291849989525,
"grad_norm": 5.162365436553955,
"learning_rate": 4.3265290918572605e-05,
"loss": 3.0288,
"step": 3300
},
{
"epoch": 1.1126335637963545,
"grad_norm": 4.953509330749512,
"learning_rate": 4.3190791924309025e-05,
"loss": 3.1481,
"step": 3320
},
{
"epoch": 1.1193379425937566,
"grad_norm": 4.702221870422363,
"learning_rate": 4.3116292930045446e-05,
"loss": 3.0185,
"step": 3340
},
{
"epoch": 1.1260423213911586,
"grad_norm": 4.8475446701049805,
"learning_rate": 4.3041793935781874e-05,
"loss": 2.9887,
"step": 3360
},
{
"epoch": 1.1327467001885607,
"grad_norm": 5.558666706085205,
"learning_rate": 4.2967294941518295e-05,
"loss": 3.0207,
"step": 3380
},
{
"epoch": 1.1394510789859627,
"grad_norm": 5.5212602615356445,
"learning_rate": 4.2892795947254716e-05,
"loss": 3.0179,
"step": 3400
},
{
"epoch": 1.1461554577833648,
"grad_norm": 5.226992607116699,
"learning_rate": 4.2818296952991137e-05,
"loss": 3.0295,
"step": 3420
},
{
"epoch": 1.1528598365807667,
"grad_norm": 5.354259014129639,
"learning_rate": 4.274379795872756e-05,
"loss": 2.9132,
"step": 3440
},
{
"epoch": 1.159564215378169,
"grad_norm": 4.869040489196777,
"learning_rate": 4.2669298964463985e-05,
"loss": 2.9472,
"step": 3460
},
{
"epoch": 1.1662685941755708,
"grad_norm": 7.068531036376953,
"learning_rate": 4.2594799970200406e-05,
"loss": 2.915,
"step": 3480
},
{
"epoch": 1.172972972972973,
"grad_norm": 5.257974147796631,
"learning_rate": 4.252030097593683e-05,
"loss": 2.9875,
"step": 3500
},
{
"epoch": 1.179677351770375,
"grad_norm": 5.5318474769592285,
"learning_rate": 4.244580198167325e-05,
"loss": 2.9196,
"step": 3520
},
{
"epoch": 1.186381730567777,
"grad_norm": 5.792961597442627,
"learning_rate": 4.2371302987409675e-05,
"loss": 2.9411,
"step": 3540
},
{
"epoch": 1.1930861093651792,
"grad_norm": 5.819127559661865,
"learning_rate": 4.2296803993146096e-05,
"loss": 3.0377,
"step": 3560
},
{
"epoch": 1.1997904881625812,
"grad_norm": 5.471036911010742,
"learning_rate": 4.222230499888252e-05,
"loss": 2.9347,
"step": 3580
},
{
"epoch": 1.2064948669599833,
"grad_norm": 5.375743865966797,
"learning_rate": 4.214780600461894e-05,
"loss": 2.9063,
"step": 3600
},
{
"epoch": 1.2131992457573852,
"grad_norm": 5.005084991455078,
"learning_rate": 4.207330701035536e-05,
"loss": 2.9835,
"step": 3620
},
{
"epoch": 1.2199036245547874,
"grad_norm": 4.6130475997924805,
"learning_rate": 4.1998808016091787e-05,
"loss": 2.9734,
"step": 3640
},
{
"epoch": 1.2266080033521893,
"grad_norm": 4.87117338180542,
"learning_rate": 4.192430902182821e-05,
"loss": 2.873,
"step": 3660
},
{
"epoch": 1.2333123821495915,
"grad_norm": 5.72168493270874,
"learning_rate": 4.184981002756463e-05,
"loss": 2.8803,
"step": 3680
},
{
"epoch": 1.2400167609469934,
"grad_norm": 5.60111665725708,
"learning_rate": 4.177531103330105e-05,
"loss": 2.9545,
"step": 3700
},
{
"epoch": 1.2467211397443956,
"grad_norm": 5.848781108856201,
"learning_rate": 4.170081203903747e-05,
"loss": 2.8407,
"step": 3720
},
{
"epoch": 1.2534255185417975,
"grad_norm": 5.473219394683838,
"learning_rate": 4.16263130447739e-05,
"loss": 2.957,
"step": 3740
},
{
"epoch": 1.2601298973391997,
"grad_norm": 4.950867652893066,
"learning_rate": 4.155181405051032e-05,
"loss": 2.9443,
"step": 3760
},
{
"epoch": 1.2668342761366018,
"grad_norm": 5.851542949676514,
"learning_rate": 4.147731505624674e-05,
"loss": 2.9224,
"step": 3780
},
{
"epoch": 1.2735386549340038,
"grad_norm": 5.347169876098633,
"learning_rate": 4.140281606198316e-05,
"loss": 2.9604,
"step": 3800
},
{
"epoch": 1.2802430337314057,
"grad_norm": 4.820839881896973,
"learning_rate": 4.132831706771959e-05,
"loss": 2.9367,
"step": 3820
},
{
"epoch": 1.2869474125288078,
"grad_norm": 4.836258411407471,
"learning_rate": 4.125381807345601e-05,
"loss": 2.9184,
"step": 3840
},
{
"epoch": 1.29365179132621,
"grad_norm": 4.943812847137451,
"learning_rate": 4.1179319079192437e-05,
"loss": 2.935,
"step": 3860
},
{
"epoch": 1.300356170123612,
"grad_norm": 6.155868053436279,
"learning_rate": 4.110482008492886e-05,
"loss": 2.9132,
"step": 3880
},
{
"epoch": 1.307060548921014,
"grad_norm": 5.259906768798828,
"learning_rate": 4.103032109066528e-05,
"loss": 2.8939,
"step": 3900
},
{
"epoch": 1.313764927718416,
"grad_norm": 5.179457664489746,
"learning_rate": 4.09558220964017e-05,
"loss": 2.9348,
"step": 3920
},
{
"epoch": 1.3204693065158182,
"grad_norm": 4.546389579772949,
"learning_rate": 4.088132310213813e-05,
"loss": 2.8441,
"step": 3940
},
{
"epoch": 1.32717368531322,
"grad_norm": 5.706070423126221,
"learning_rate": 4.080682410787455e-05,
"loss": 3.0033,
"step": 3960
},
{
"epoch": 1.3338780641106223,
"grad_norm": 6.009608268737793,
"learning_rate": 4.073232511361097e-05,
"loss": 2.8735,
"step": 3980
},
{
"epoch": 1.3405824429080244,
"grad_norm": 4.916238784790039,
"learning_rate": 4.065782611934739e-05,
"loss": 2.9641,
"step": 4000
},
{
"epoch": 1.3405824429080244,
"eval_bleu_greedy": 42.46623138458086,
"eval_loss": 0.3836318850517273,
"eval_runtime": 102.4476,
"eval_samples_per_second": 0.976,
"eval_steps_per_second": 0.976,
"step": 4000
},
{
"epoch": 1.3472868217054264,
"grad_norm": 4.503624439239502,
"learning_rate": 4.058332712508382e-05,
"loss": 2.9549,
"step": 4020
},
{
"epoch": 1.3539912005028283,
"grad_norm": 4.97296142578125,
"learning_rate": 4.050882813082024e-05,
"loss": 2.8055,
"step": 4040
},
{
"epoch": 1.3606955793002304,
"grad_norm": 5.414029598236084,
"learning_rate": 4.043432913655666e-05,
"loss": 2.9634,
"step": 4060
},
{
"epoch": 1.3673999580976326,
"grad_norm": 5.9259033203125,
"learning_rate": 4.035983014229308e-05,
"loss": 2.8464,
"step": 4080
},
{
"epoch": 1.3741043368950345,
"grad_norm": 5.309682846069336,
"learning_rate": 4.02853311480295e-05,
"loss": 2.82,
"step": 4100
},
{
"epoch": 1.3808087156924367,
"grad_norm": 5.620110034942627,
"learning_rate": 4.021083215376593e-05,
"loss": 2.8368,
"step": 4120
},
{
"epoch": 1.3875130944898386,
"grad_norm": 5.198604583740234,
"learning_rate": 4.013633315950235e-05,
"loss": 2.8964,
"step": 4140
},
{
"epoch": 1.3942174732872408,
"grad_norm": 5.072934150695801,
"learning_rate": 4.006183416523877e-05,
"loss": 2.8942,
"step": 4160
},
{
"epoch": 1.4009218520846427,
"grad_norm": 4.95306921005249,
"learning_rate": 3.998733517097519e-05,
"loss": 2.838,
"step": 4180
},
{
"epoch": 1.4076262308820449,
"grad_norm": 4.760760307312012,
"learning_rate": 3.991283617671162e-05,
"loss": 2.8592,
"step": 4200
},
{
"epoch": 1.414330609679447,
"grad_norm": 4.957187652587891,
"learning_rate": 3.983833718244804e-05,
"loss": 2.904,
"step": 4220
},
{
"epoch": 1.421034988476849,
"grad_norm": 4.402778625488281,
"learning_rate": 3.976383818818446e-05,
"loss": 2.7385,
"step": 4240
},
{
"epoch": 1.4277393672742509,
"grad_norm": 5.658669471740723,
"learning_rate": 3.968933919392088e-05,
"loss": 2.8683,
"step": 4260
},
{
"epoch": 1.434443746071653,
"grad_norm": 4.498071670532227,
"learning_rate": 3.96148401996573e-05,
"loss": 2.9447,
"step": 4280
},
{
"epoch": 1.4411481248690552,
"grad_norm": 5.859282970428467,
"learning_rate": 3.954034120539373e-05,
"loss": 2.7831,
"step": 4300
},
{
"epoch": 1.4478525036664571,
"grad_norm": 4.9684062004089355,
"learning_rate": 3.946584221113015e-05,
"loss": 2.8563,
"step": 4320
},
{
"epoch": 1.4545568824638593,
"grad_norm": 4.819912910461426,
"learning_rate": 3.939134321686657e-05,
"loss": 2.8287,
"step": 4340
},
{
"epoch": 1.4612612612612612,
"grad_norm": 5.544245719909668,
"learning_rate": 3.931684422260299e-05,
"loss": 2.8593,
"step": 4360
},
{
"epoch": 1.4679656400586634,
"grad_norm": 5.144392013549805,
"learning_rate": 3.924234522833941e-05,
"loss": 2.925,
"step": 4380
},
{
"epoch": 1.4746700188560653,
"grad_norm": 5.183608531951904,
"learning_rate": 3.916784623407585e-05,
"loss": 2.842,
"step": 4400
},
{
"epoch": 1.4813743976534675,
"grad_norm": 5.042162895202637,
"learning_rate": 3.909334723981227e-05,
"loss": 2.7355,
"step": 4420
},
{
"epoch": 1.4880787764508696,
"grad_norm": 4.931463241577148,
"learning_rate": 3.901884824554869e-05,
"loss": 2.8361,
"step": 4440
},
{
"epoch": 1.4947831552482715,
"grad_norm": 5.274477481842041,
"learning_rate": 3.894434925128511e-05,
"loss": 2.8082,
"step": 4460
},
{
"epoch": 1.5014875340456735,
"grad_norm": 5.249208450317383,
"learning_rate": 3.886985025702153e-05,
"loss": 2.8168,
"step": 4480
},
{
"epoch": 1.5081919128430756,
"grad_norm": 4.666344165802002,
"learning_rate": 3.879535126275796e-05,
"loss": 2.7648,
"step": 4500
},
{
"epoch": 1.5148962916404778,
"grad_norm": 5.37315559387207,
"learning_rate": 3.872085226849438e-05,
"loss": 2.9321,
"step": 4520
},
{
"epoch": 1.5216006704378797,
"grad_norm": 6.209960460662842,
"learning_rate": 3.86463532742308e-05,
"loss": 2.8484,
"step": 4540
},
{
"epoch": 1.5283050492352817,
"grad_norm": 4.266842365264893,
"learning_rate": 3.857185427996722e-05,
"loss": 2.8629,
"step": 4560
},
{
"epoch": 1.5350094280326838,
"grad_norm": 4.92935848236084,
"learning_rate": 3.849735528570364e-05,
"loss": 2.8121,
"step": 4580
},
{
"epoch": 1.541713806830086,
"grad_norm": 5.50532865524292,
"learning_rate": 3.842285629144007e-05,
"loss": 2.8396,
"step": 4600
},
{
"epoch": 1.548418185627488,
"grad_norm": 4.804236888885498,
"learning_rate": 3.834835729717649e-05,
"loss": 2.7826,
"step": 4620
},
{
"epoch": 1.55512256442489,
"grad_norm": 4.995475769042969,
"learning_rate": 3.827385830291291e-05,
"loss": 2.8538,
"step": 4640
},
{
"epoch": 1.5618269432222922,
"grad_norm": 4.848133087158203,
"learning_rate": 3.819935930864933e-05,
"loss": 2.8618,
"step": 4660
},
{
"epoch": 1.5685313220196941,
"grad_norm": 4.338679790496826,
"learning_rate": 3.812486031438576e-05,
"loss": 2.7829,
"step": 4680
},
{
"epoch": 1.575235700817096,
"grad_norm": 4.795759201049805,
"learning_rate": 3.805036132012218e-05,
"loss": 2.7286,
"step": 4700
},
{
"epoch": 1.5819400796144982,
"grad_norm": 5.834278106689453,
"learning_rate": 3.79758623258586e-05,
"loss": 2.7273,
"step": 4720
},
{
"epoch": 1.5886444584119004,
"grad_norm": 4.841015338897705,
"learning_rate": 3.790136333159502e-05,
"loss": 2.8229,
"step": 4740
},
{
"epoch": 1.5953488372093023,
"grad_norm": 5.714345932006836,
"learning_rate": 3.7826864337331444e-05,
"loss": 2.7513,
"step": 4760
},
{
"epoch": 1.6020532160067043,
"grad_norm": 4.734280586242676,
"learning_rate": 3.775236534306787e-05,
"loss": 2.8036,
"step": 4780
},
{
"epoch": 1.6087575948041064,
"grad_norm": 5.643955707550049,
"learning_rate": 3.767786634880429e-05,
"loss": 2.7828,
"step": 4800
},
{
"epoch": 1.6154619736015086,
"grad_norm": 5.758233070373535,
"learning_rate": 3.7603367354540713e-05,
"loss": 2.868,
"step": 4820
},
{
"epoch": 1.6221663523989105,
"grad_norm": 4.5780110359191895,
"learning_rate": 3.7528868360277134e-05,
"loss": 2.7409,
"step": 4840
},
{
"epoch": 1.6288707311963124,
"grad_norm": 4.986692428588867,
"learning_rate": 3.745436936601356e-05,
"loss": 2.765,
"step": 4860
},
{
"epoch": 1.6355751099937148,
"grad_norm": 4.5255913734436035,
"learning_rate": 3.737987037174998e-05,
"loss": 2.7851,
"step": 4880
},
{
"epoch": 1.6422794887911167,
"grad_norm": 4.560822486877441,
"learning_rate": 3.7305371377486404e-05,
"loss": 2.7158,
"step": 4900
},
{
"epoch": 1.6489838675885187,
"grad_norm": 5.068883419036865,
"learning_rate": 3.7230872383222825e-05,
"loss": 2.6394,
"step": 4920
},
{
"epoch": 1.6556882463859208,
"grad_norm": 5.706743240356445,
"learning_rate": 3.715637338895925e-05,
"loss": 2.6872,
"step": 4940
},
{
"epoch": 1.662392625183323,
"grad_norm": 5.257516860961914,
"learning_rate": 3.708187439469567e-05,
"loss": 2.7565,
"step": 4960
},
{
"epoch": 1.669097003980725,
"grad_norm": 5.708479881286621,
"learning_rate": 3.70073754004321e-05,
"loss": 2.7354,
"step": 4980
},
{
"epoch": 1.6758013827781268,
"grad_norm": 5.4913482666015625,
"learning_rate": 3.693287640616852e-05,
"loss": 2.7493,
"step": 5000
},
{
"epoch": 1.6758013827781268,
"eval_bleu_greedy": 43.18356706938715,
"eval_loss": 0.3662695586681366,
"eval_runtime": 57.826,
"eval_samples_per_second": 1.729,
"eval_steps_per_second": 1.729,
"step": 5000
},
{
"epoch": 1.682505761575529,
"grad_norm": 5.242050647735596,
"learning_rate": 3.685837741190494e-05,
"loss": 2.8011,
"step": 5020
},
{
"epoch": 1.6892101403729312,
"grad_norm": 5.231212615966797,
"learning_rate": 3.6783878417641363e-05,
"loss": 2.8461,
"step": 5040
},
{
"epoch": 1.695914519170333,
"grad_norm": 4.671517848968506,
"learning_rate": 3.6709379423377784e-05,
"loss": 2.7855,
"step": 5060
},
{
"epoch": 1.702618897967735,
"grad_norm": 6.603331565856934,
"learning_rate": 3.663488042911421e-05,
"loss": 2.7619,
"step": 5080
},
{
"epoch": 1.7093232767651372,
"grad_norm": 5.420257568359375,
"learning_rate": 3.656038143485063e-05,
"loss": 2.8161,
"step": 5100
},
{
"epoch": 1.7160276555625393,
"grad_norm": 6.228781700134277,
"learning_rate": 3.6485882440587054e-05,
"loss": 2.8839,
"step": 5120
},
{
"epoch": 1.7227320343599413,
"grad_norm": 5.307265758514404,
"learning_rate": 3.6411383446323475e-05,
"loss": 2.7141,
"step": 5140
},
{
"epoch": 1.7294364131573434,
"grad_norm": 5.219127178192139,
"learning_rate": 3.63368844520599e-05,
"loss": 2.6867,
"step": 5160
},
{
"epoch": 1.7361407919547456,
"grad_norm": 7.216719150543213,
"learning_rate": 3.626238545779632e-05,
"loss": 2.8654,
"step": 5180
},
{
"epoch": 1.7428451707521475,
"grad_norm": 5.994204998016357,
"learning_rate": 3.6187886463532744e-05,
"loss": 2.7267,
"step": 5200
},
{
"epoch": 1.7495495495495494,
"grad_norm": 5.58418607711792,
"learning_rate": 3.6113387469269165e-05,
"loss": 2.809,
"step": 5220
},
{
"epoch": 1.7562539283469516,
"grad_norm": 5.072906970977783,
"learning_rate": 3.6038888475005586e-05,
"loss": 2.7781,
"step": 5240
},
{
"epoch": 1.7629583071443538,
"grad_norm": 5.0261335372924805,
"learning_rate": 3.5964389480742013e-05,
"loss": 2.7554,
"step": 5260
},
{
"epoch": 1.7696626859417557,
"grad_norm": 5.358790874481201,
"learning_rate": 3.5889890486478434e-05,
"loss": 2.8101,
"step": 5280
},
{
"epoch": 1.7763670647391576,
"grad_norm": 5.563721656799316,
"learning_rate": 3.5815391492214855e-05,
"loss": 2.8028,
"step": 5300
},
{
"epoch": 1.7830714435365598,
"grad_norm": 4.921150207519531,
"learning_rate": 3.5740892497951276e-05,
"loss": 2.7203,
"step": 5320
},
{
"epoch": 1.789775822333962,
"grad_norm": 5.121066093444824,
"learning_rate": 3.5666393503687704e-05,
"loss": 2.8034,
"step": 5340
},
{
"epoch": 1.7964802011313639,
"grad_norm": 5.708595275878906,
"learning_rate": 3.5591894509424125e-05,
"loss": 2.6728,
"step": 5360
},
{
"epoch": 1.803184579928766,
"grad_norm": 6.127959728240967,
"learning_rate": 3.5517395515160546e-05,
"loss": 2.8047,
"step": 5380
},
{
"epoch": 1.8098889587261682,
"grad_norm": 4.8620405197143555,
"learning_rate": 3.5442896520896966e-05,
"loss": 2.6712,
"step": 5400
},
{
"epoch": 1.81659333752357,
"grad_norm": 5.093926906585693,
"learning_rate": 3.536839752663339e-05,
"loss": 2.7286,
"step": 5420
},
{
"epoch": 1.823297716320972,
"grad_norm": 5.522273063659668,
"learning_rate": 3.5293898532369815e-05,
"loss": 2.7907,
"step": 5440
},
{
"epoch": 1.8300020951183742,
"grad_norm": 4.671688556671143,
"learning_rate": 3.5219399538106236e-05,
"loss": 2.7235,
"step": 5460
},
{
"epoch": 1.8367064739157763,
"grad_norm": 4.80985689163208,
"learning_rate": 3.514490054384266e-05,
"loss": 2.7607,
"step": 5480
},
{
"epoch": 1.8434108527131783,
"grad_norm": 5.098107814788818,
"learning_rate": 3.5070401549579084e-05,
"loss": 2.6983,
"step": 5500
},
{
"epoch": 1.8501152315105802,
"grad_norm": 5.1361236572265625,
"learning_rate": 3.4995902555315505e-05,
"loss": 2.6083,
"step": 5520
},
{
"epoch": 1.8568196103079824,
"grad_norm": 5.15921688079834,
"learning_rate": 3.492140356105193e-05,
"loss": 2.7306,
"step": 5540
},
{
"epoch": 1.8635239891053845,
"grad_norm": 5.7626519203186035,
"learning_rate": 3.4846904566788354e-05,
"loss": 2.774,
"step": 5560
},
{
"epoch": 1.8702283679027865,
"grad_norm": 5.519023895263672,
"learning_rate": 3.4772405572524775e-05,
"loss": 2.7246,
"step": 5580
},
{
"epoch": 1.8769327467001886,
"grad_norm": 5.080531120300293,
"learning_rate": 3.4697906578261196e-05,
"loss": 2.8704,
"step": 5600
},
{
"epoch": 1.8836371254975908,
"grad_norm": 4.591396808624268,
"learning_rate": 3.4623407583997616e-05,
"loss": 2.7437,
"step": 5620
},
{
"epoch": 1.8903415042949927,
"grad_norm": 5.403082847595215,
"learning_rate": 3.4548908589734044e-05,
"loss": 2.6397,
"step": 5640
},
{
"epoch": 1.8970458830923946,
"grad_norm": 6.168285369873047,
"learning_rate": 3.4474409595470465e-05,
"loss": 2.8026,
"step": 5660
},
{
"epoch": 1.9037502618897968,
"grad_norm": 6.647582530975342,
"learning_rate": 3.4399910601206886e-05,
"loss": 2.6709,
"step": 5680
},
{
"epoch": 1.910454640687199,
"grad_norm": 5.180938720703125,
"learning_rate": 3.432541160694331e-05,
"loss": 2.794,
"step": 5700
},
{
"epoch": 1.9171590194846009,
"grad_norm": 5.552131175994873,
"learning_rate": 3.425091261267973e-05,
"loss": 2.7171,
"step": 5720
},
{
"epoch": 1.9238633982820028,
"grad_norm": 5.482553958892822,
"learning_rate": 3.4176413618416155e-05,
"loss": 2.7021,
"step": 5740
},
{
"epoch": 1.930567777079405,
"grad_norm": 4.879262447357178,
"learning_rate": 3.4101914624152576e-05,
"loss": 2.5908,
"step": 5760
},
{
"epoch": 1.9372721558768071,
"grad_norm": 5.676559925079346,
"learning_rate": 3.4027415629889e-05,
"loss": 2.7314,
"step": 5780
},
{
"epoch": 1.943976534674209,
"grad_norm": 5.349039077758789,
"learning_rate": 3.395291663562542e-05,
"loss": 2.7559,
"step": 5800
},
{
"epoch": 1.950680913471611,
"grad_norm": 4.996124744415283,
"learning_rate": 3.3878417641361846e-05,
"loss": 2.696,
"step": 5820
},
{
"epoch": 1.9573852922690131,
"grad_norm": 5.283078193664551,
"learning_rate": 3.3803918647098266e-05,
"loss": 2.7735,
"step": 5840
},
{
"epoch": 1.9640896710664153,
"grad_norm": 5.360789775848389,
"learning_rate": 3.372941965283469e-05,
"loss": 2.6893,
"step": 5860
},
{
"epoch": 1.9707940498638172,
"grad_norm": 4.45306921005249,
"learning_rate": 3.365492065857111e-05,
"loss": 2.7067,
"step": 5880
},
{
"epoch": 1.9774984286612194,
"grad_norm": 5.298338890075684,
"learning_rate": 3.358042166430753e-05,
"loss": 2.7175,
"step": 5900
},
{
"epoch": 1.9842028074586215,
"grad_norm": 5.592753887176514,
"learning_rate": 3.350592267004396e-05,
"loss": 2.6956,
"step": 5920
},
{
"epoch": 1.9909071862560235,
"grad_norm": 4.717761993408203,
"learning_rate": 3.343142367578038e-05,
"loss": 2.8405,
"step": 5940
},
{
"epoch": 1.9976115650534254,
"grad_norm": 5.031712532043457,
"learning_rate": 3.33569246815168e-05,
"loss": 2.7726,
"step": 5960
},
{
"epoch": 2.004022627278441,
"grad_norm": 5.590005397796631,
"learning_rate": 3.328242568725322e-05,
"loss": 2.3678,
"step": 5980
},
{
"epoch": 2.010727006075843,
"grad_norm": 5.272172451019287,
"learning_rate": 3.320792669298965e-05,
"loss": 2.4877,
"step": 6000
},
{
"epoch": 2.010727006075843,
"eval_bleu_greedy": 43.32802392418832,
"eval_loss": 0.3537669777870178,
"eval_runtime": 89.2292,
"eval_samples_per_second": 1.121,
"eval_steps_per_second": 1.121,
"step": 6000
},
{
"epoch": 2.0174313848732455,
"grad_norm": 5.440276622772217,
"learning_rate": 3.313342769872607e-05,
"loss": 2.5165,
"step": 6020
},
{
"epoch": 2.0241357636706474,
"grad_norm": 4.8560709953308105,
"learning_rate": 3.3058928704462496e-05,
"loss": 2.4584,
"step": 6040
},
{
"epoch": 2.0308401424680493,
"grad_norm": 4.232001781463623,
"learning_rate": 3.2984429710198916e-05,
"loss": 2.5005,
"step": 6060
},
{
"epoch": 2.0375445212654517,
"grad_norm": 4.847684383392334,
"learning_rate": 3.290993071593534e-05,
"loss": 2.5064,
"step": 6080
},
{
"epoch": 2.0442489000628536,
"grad_norm": 4.831601619720459,
"learning_rate": 3.283543172167176e-05,
"loss": 2.5224,
"step": 6100
},
{
"epoch": 2.0509532788602556,
"grad_norm": 4.691544055938721,
"learning_rate": 3.2760932727408186e-05,
"loss": 2.5174,
"step": 6120
},
{
"epoch": 2.0576576576576575,
"grad_norm": 4.6900248527526855,
"learning_rate": 3.268643373314461e-05,
"loss": 2.4071,
"step": 6140
},
{
"epoch": 2.06436203645506,
"grad_norm": 4.56058931350708,
"learning_rate": 3.261193473888103e-05,
"loss": 2.4984,
"step": 6160
},
{
"epoch": 2.071066415252462,
"grad_norm": 4.817535877227783,
"learning_rate": 3.253743574461745e-05,
"loss": 2.5249,
"step": 6180
},
{
"epoch": 2.0777707940498638,
"grad_norm": 4.966011047363281,
"learning_rate": 3.2462936750353876e-05,
"loss": 2.4622,
"step": 6200
},
{
"epoch": 2.0844751728472657,
"grad_norm": 5.804232120513916,
"learning_rate": 3.23884377560903e-05,
"loss": 2.4868,
"step": 6220
},
{
"epoch": 2.091179551644668,
"grad_norm": 4.816470623016357,
"learning_rate": 3.231393876182672e-05,
"loss": 2.3994,
"step": 6240
},
{
"epoch": 2.09788393044207,
"grad_norm": 5.60403299331665,
"learning_rate": 3.223943976756314e-05,
"loss": 2.511,
"step": 6260
},
{
"epoch": 2.104588309239472,
"grad_norm": 4.696137428283691,
"learning_rate": 3.216494077329956e-05,
"loss": 2.4767,
"step": 6280
},
{
"epoch": 2.1112926880368743,
"grad_norm": 4.613682270050049,
"learning_rate": 3.209044177903599e-05,
"loss": 2.3844,
"step": 6300
},
{
"epoch": 2.1179970668342762,
"grad_norm": 4.697776794433594,
"learning_rate": 3.201594278477241e-05,
"loss": 2.4177,
"step": 6320
},
{
"epoch": 2.124701445631678,
"grad_norm": 5.78491735458374,
"learning_rate": 3.194144379050883e-05,
"loss": 2.4688,
"step": 6340
},
{
"epoch": 2.13140582442908,
"grad_norm": 5.266840934753418,
"learning_rate": 3.186694479624525e-05,
"loss": 2.5086,
"step": 6360
},
{
"epoch": 2.1381102032264825,
"grad_norm": 6.113589286804199,
"learning_rate": 3.179244580198167e-05,
"loss": 2.4037,
"step": 6380
},
{
"epoch": 2.1448145820238844,
"grad_norm": 5.600616455078125,
"learning_rate": 3.17179468077181e-05,
"loss": 2.5343,
"step": 6400
},
{
"epoch": 2.1515189608212864,
"grad_norm": 5.7312774658203125,
"learning_rate": 3.164344781345452e-05,
"loss": 2.4902,
"step": 6420
},
{
"epoch": 2.1582233396186883,
"grad_norm": 5.237963676452637,
"learning_rate": 3.156894881919094e-05,
"loss": 2.5499,
"step": 6440
},
{
"epoch": 2.1649277184160907,
"grad_norm": 5.6732916831970215,
"learning_rate": 3.149444982492736e-05,
"loss": 2.4206,
"step": 6460
},
{
"epoch": 2.1716320972134926,
"grad_norm": 5.351031303405762,
"learning_rate": 3.141995083066379e-05,
"loss": 2.4755,
"step": 6480
},
{
"epoch": 2.1783364760108945,
"grad_norm": 5.774845123291016,
"learning_rate": 3.134545183640021e-05,
"loss": 2.405,
"step": 6500
},
{
"epoch": 2.1850408548082965,
"grad_norm": 6.55280876159668,
"learning_rate": 3.127095284213663e-05,
"loss": 2.5264,
"step": 6520
},
{
"epoch": 2.191745233605699,
"grad_norm": 5.785919666290283,
"learning_rate": 3.119645384787305e-05,
"loss": 2.4565,
"step": 6540
},
{
"epoch": 2.1984496124031008,
"grad_norm": 5.330859184265137,
"learning_rate": 3.112195485360947e-05,
"loss": 2.4884,
"step": 6560
},
{
"epoch": 2.2051539912005027,
"grad_norm": 5.32330322265625,
"learning_rate": 3.10474558593459e-05,
"loss": 2.4773,
"step": 6580
},
{
"epoch": 2.211858369997905,
"grad_norm": 6.741830825805664,
"learning_rate": 3.097295686508233e-05,
"loss": 2.424,
"step": 6600
},
{
"epoch": 2.218562748795307,
"grad_norm": 4.638687610626221,
"learning_rate": 3.089845787081875e-05,
"loss": 2.41,
"step": 6620
},
{
"epoch": 2.225267127592709,
"grad_norm": 5.136049270629883,
"learning_rate": 3.082395887655517e-05,
"loss": 2.4351,
"step": 6640
},
{
"epoch": 2.231971506390111,
"grad_norm": 5.4034528732299805,
"learning_rate": 3.074945988229159e-05,
"loss": 2.4679,
"step": 6660
},
{
"epoch": 2.2386758851875133,
"grad_norm": 5.091638088226318,
"learning_rate": 3.067496088802802e-05,
"loss": 2.5217,
"step": 6680
},
{
"epoch": 2.245380263984915,
"grad_norm": 4.8217668533325195,
"learning_rate": 3.060046189376444e-05,
"loss": 2.4203,
"step": 6700
},
{
"epoch": 2.252084642782317,
"grad_norm": 5.1358113288879395,
"learning_rate": 3.052596289950086e-05,
"loss": 2.4624,
"step": 6720
},
{
"epoch": 2.2587890215797195,
"grad_norm": 5.394964218139648,
"learning_rate": 3.045146390523728e-05,
"loss": 2.3956,
"step": 6740
},
{
"epoch": 2.2654934003771214,
"grad_norm": 5.930358409881592,
"learning_rate": 3.0376964910973705e-05,
"loss": 2.3428,
"step": 6760
},
{
"epoch": 2.2721977791745234,
"grad_norm": 5.13174295425415,
"learning_rate": 3.0302465916710126e-05,
"loss": 2.4459,
"step": 6780
},
{
"epoch": 2.2789021579719253,
"grad_norm": 5.4948248863220215,
"learning_rate": 3.022796692244655e-05,
"loss": 2.4351,
"step": 6800
},
{
"epoch": 2.2856065367693272,
"grad_norm": 5.601160049438477,
"learning_rate": 3.015346792818297e-05,
"loss": 2.4617,
"step": 6820
},
{
"epoch": 2.2923109155667296,
"grad_norm": 5.33076810836792,
"learning_rate": 3.0078968933919395e-05,
"loss": 2.435,
"step": 6840
},
{
"epoch": 2.2990152943641315,
"grad_norm": 5.344810485839844,
"learning_rate": 3.0004469939655816e-05,
"loss": 2.4458,
"step": 6860
},
{
"epoch": 2.3057196731615335,
"grad_norm": 5.304954528808594,
"learning_rate": 2.9929970945392237e-05,
"loss": 2.4774,
"step": 6880
},
{
"epoch": 2.312424051958936,
"grad_norm": 5.514897346496582,
"learning_rate": 2.985547195112866e-05,
"loss": 2.5046,
"step": 6900
},
{
"epoch": 2.319128430756338,
"grad_norm": 5.057281494140625,
"learning_rate": 2.9780972956865082e-05,
"loss": 2.4099,
"step": 6920
},
{
"epoch": 2.3258328095537397,
"grad_norm": 5.301017761230469,
"learning_rate": 2.9706473962601506e-05,
"loss": 2.4959,
"step": 6940
},
{
"epoch": 2.3325371883511417,
"grad_norm": 5.437694549560547,
"learning_rate": 2.9631974968337927e-05,
"loss": 2.4614,
"step": 6960
},
{
"epoch": 2.339241567148544,
"grad_norm": 4.668396949768066,
"learning_rate": 2.955747597407435e-05,
"loss": 2.4309,
"step": 6980
},
{
"epoch": 2.345945945945946,
"grad_norm": 5.610180854797363,
"learning_rate": 2.9482976979810772e-05,
"loss": 2.4053,
"step": 7000
},
{
"epoch": 2.345945945945946,
"eval_bleu_greedy": 43.68322019190237,
"eval_loss": 0.34515950083732605,
"eval_runtime": 129.474,
"eval_samples_per_second": 0.772,
"eval_steps_per_second": 0.772,
"step": 7000
},
{
"epoch": 2.352650324743348,
"grad_norm": 5.010399341583252,
"learning_rate": 2.9408477985547193e-05,
"loss": 2.4909,
"step": 7020
},
{
"epoch": 2.35935470354075,
"grad_norm": 5.367332935333252,
"learning_rate": 2.9333978991283618e-05,
"loss": 2.4067,
"step": 7040
},
{
"epoch": 2.366059082338152,
"grad_norm": 6.149716854095459,
"learning_rate": 2.925947999702004e-05,
"loss": 2.3804,
"step": 7060
},
{
"epoch": 2.372763461135554,
"grad_norm": 5.402404308319092,
"learning_rate": 2.9184981002756463e-05,
"loss": 2.5418,
"step": 7080
},
{
"epoch": 2.379467839932956,
"grad_norm": 5.056270599365234,
"learning_rate": 2.9110482008492884e-05,
"loss": 2.426,
"step": 7100
},
{
"epoch": 2.3861722187303585,
"grad_norm": 5.246522426605225,
"learning_rate": 2.9035983014229308e-05,
"loss": 2.402,
"step": 7120
},
{
"epoch": 2.3928765975277604,
"grad_norm": 5.366527557373047,
"learning_rate": 2.8961484019965735e-05,
"loss": 2.4275,
"step": 7140
},
{
"epoch": 2.3995809763251623,
"grad_norm": 5.639255523681641,
"learning_rate": 2.8886985025702156e-05,
"loss": 2.4606,
"step": 7160
},
{
"epoch": 2.4062853551225643,
"grad_norm": 4.962663650512695,
"learning_rate": 2.881248603143858e-05,
"loss": 2.4537,
"step": 7180
},
{
"epoch": 2.4129897339199666,
"grad_norm": 5.757133483886719,
"learning_rate": 2.8737987037175e-05,
"loss": 2.4827,
"step": 7200
},
{
"epoch": 2.4196941127173686,
"grad_norm": 5.6149516105651855,
"learning_rate": 2.8663488042911422e-05,
"loss": 2.457,
"step": 7220
},
{
"epoch": 2.4263984915147705,
"grad_norm": 5.717596054077148,
"learning_rate": 2.8588989048647847e-05,
"loss": 2.4372,
"step": 7240
},
{
"epoch": 2.4331028703121724,
"grad_norm": 6.159657001495361,
"learning_rate": 2.8514490054384268e-05,
"loss": 2.4332,
"step": 7260
},
{
"epoch": 2.439807249109575,
"grad_norm": 5.708389759063721,
"learning_rate": 2.8439991060120692e-05,
"loss": 2.3862,
"step": 7280
},
{
"epoch": 2.4465116279069767,
"grad_norm": 5.569727420806885,
"learning_rate": 2.8365492065857113e-05,
"loss": 2.4465,
"step": 7300
},
{
"epoch": 2.4532160067043787,
"grad_norm": 5.431128025054932,
"learning_rate": 2.8290993071593537e-05,
"loss": 2.4079,
"step": 7320
},
{
"epoch": 2.459920385501781,
"grad_norm": 5.69559907913208,
"learning_rate": 2.8216494077329958e-05,
"loss": 2.4039,
"step": 7340
},
{
"epoch": 2.466624764299183,
"grad_norm": 5.6564226150512695,
"learning_rate": 2.814199508306638e-05,
"loss": 2.5032,
"step": 7360
},
{
"epoch": 2.473329143096585,
"grad_norm": 5.606060028076172,
"learning_rate": 2.8067496088802803e-05,
"loss": 2.2959,
"step": 7380
},
{
"epoch": 2.480033521893987,
"grad_norm": 5.51323127746582,
"learning_rate": 2.7992997094539224e-05,
"loss": 2.3528,
"step": 7400
},
{
"epoch": 2.4867379006913892,
"grad_norm": 4.964369773864746,
"learning_rate": 2.7918498100275648e-05,
"loss": 2.4204,
"step": 7420
},
{
"epoch": 2.493442279488791,
"grad_norm": 4.708135604858398,
"learning_rate": 2.784399910601207e-05,
"loss": 2.4221,
"step": 7440
},
{
"epoch": 2.500146658286193,
"grad_norm": 4.968412399291992,
"learning_rate": 2.7769500111748493e-05,
"loss": 2.4488,
"step": 7460
},
{
"epoch": 2.506851037083595,
"grad_norm": 4.854182243347168,
"learning_rate": 2.7695001117484914e-05,
"loss": 2.412,
"step": 7480
},
{
"epoch": 2.5135554158809974,
"grad_norm": 5.069591999053955,
"learning_rate": 2.762050212322134e-05,
"loss": 2.4355,
"step": 7500
},
{
"epoch": 2.5202597946783993,
"grad_norm": 5.306612014770508,
"learning_rate": 2.754600312895776e-05,
"loss": 2.4293,
"step": 7520
},
{
"epoch": 2.5269641734758013,
"grad_norm": 5.377747058868408,
"learning_rate": 2.747150413469418e-05,
"loss": 2.5042,
"step": 7540
},
{
"epoch": 2.5336685522732036,
"grad_norm": 5.272029399871826,
"learning_rate": 2.7397005140430604e-05,
"loss": 2.4412,
"step": 7560
},
{
"epoch": 2.5403729310706056,
"grad_norm": 6.27547550201416,
"learning_rate": 2.7322506146167025e-05,
"loss": 2.4129,
"step": 7580
},
{
"epoch": 2.5470773098680075,
"grad_norm": 4.881494045257568,
"learning_rate": 2.724800715190345e-05,
"loss": 2.4177,
"step": 7600
},
{
"epoch": 2.5537816886654094,
"grad_norm": 4.519489288330078,
"learning_rate": 2.717350815763987e-05,
"loss": 2.486,
"step": 7620
},
{
"epoch": 2.5604860674628114,
"grad_norm": 5.494065284729004,
"learning_rate": 2.7099009163376295e-05,
"loss": 2.3461,
"step": 7640
},
{
"epoch": 2.5671904462602138,
"grad_norm": 5.3841776847839355,
"learning_rate": 2.7024510169112716e-05,
"loss": 2.4056,
"step": 7660
},
{
"epoch": 2.5738948250576157,
"grad_norm": 4.913102626800537,
"learning_rate": 2.6950011174849143e-05,
"loss": 2.3517,
"step": 7680
},
{
"epoch": 2.5805992038550176,
"grad_norm": 5.246600151062012,
"learning_rate": 2.6875512180585564e-05,
"loss": 2.4435,
"step": 7700
},
{
"epoch": 2.58730358265242,
"grad_norm": 5.883936882019043,
"learning_rate": 2.680101318632199e-05,
"loss": 2.4236,
"step": 7720
},
{
"epoch": 2.594007961449822,
"grad_norm": 4.634848117828369,
"learning_rate": 2.672651419205841e-05,
"loss": 2.4566,
"step": 7740
},
{
"epoch": 2.600712340247224,
"grad_norm": 5.388699531555176,
"learning_rate": 2.6652015197794834e-05,
"loss": 2.4541,
"step": 7760
},
{
"epoch": 2.6074167190446262,
"grad_norm": 5.410739421844482,
"learning_rate": 2.6577516203531254e-05,
"loss": 2.4242,
"step": 7780
},
{
"epoch": 2.614121097842028,
"grad_norm": 6.787940979003906,
"learning_rate": 2.650301720926768e-05,
"loss": 2.4763,
"step": 7800
},
{
"epoch": 2.62082547663943,
"grad_norm": 5.818297863006592,
"learning_rate": 2.64285182150041e-05,
"loss": 2.4732,
"step": 7820
},
{
"epoch": 2.627529855436832,
"grad_norm": 5.29996395111084,
"learning_rate": 2.6354019220740524e-05,
"loss": 2.4218,
"step": 7840
},
{
"epoch": 2.634234234234234,
"grad_norm": 5.816316604614258,
"learning_rate": 2.6279520226476945e-05,
"loss": 2.4662,
"step": 7860
},
{
"epoch": 2.6409386130316364,
"grad_norm": 6.0321149826049805,
"learning_rate": 2.6205021232213366e-05,
"loss": 2.437,
"step": 7880
},
{
"epoch": 2.6476429918290383,
"grad_norm": 5.97547721862793,
"learning_rate": 2.613052223794979e-05,
"loss": 2.3648,
"step": 7900
},
{
"epoch": 2.65434737062644,
"grad_norm": 5.269493579864502,
"learning_rate": 2.605602324368621e-05,
"loss": 2.4529,
"step": 7920
},
{
"epoch": 2.6610517494238426,
"grad_norm": 5.6759114265441895,
"learning_rate": 2.5981524249422635e-05,
"loss": 2.347,
"step": 7940
},
{
"epoch": 2.6677561282212445,
"grad_norm": 5.74458122253418,
"learning_rate": 2.5907025255159056e-05,
"loss": 2.3639,
"step": 7960
},
{
"epoch": 2.6744605070186465,
"grad_norm": 6.335291385650635,
"learning_rate": 2.583252626089548e-05,
"loss": 2.2973,
"step": 7980
},
{
"epoch": 2.681164885816049,
"grad_norm": 4.999330520629883,
"learning_rate": 2.57580272666319e-05,
"loss": 2.4947,
"step": 8000
},
{
"epoch": 2.681164885816049,
"eval_bleu_greedy": 43.85960445724608,
"eval_loss": 0.3381543755531311,
"eval_runtime": 116.2511,
"eval_samples_per_second": 0.86,
"eval_steps_per_second": 0.86,
"step": 8000
},
{
"epoch": 2.6878692646134508,
"grad_norm": 5.183805465698242,
"learning_rate": 2.5683528272368322e-05,
"loss": 2.4735,
"step": 8020
},
{
"epoch": 2.6945736434108527,
"grad_norm": 5.173423767089844,
"learning_rate": 2.5609029278104746e-05,
"loss": 2.4424,
"step": 8040
},
{
"epoch": 2.7012780222082546,
"grad_norm": 5.538326740264893,
"learning_rate": 2.5534530283841167e-05,
"loss": 2.4517,
"step": 8060
},
{
"epoch": 2.7079824010056566,
"grad_norm": 5.611026287078857,
"learning_rate": 2.546003128957759e-05,
"loss": 2.4325,
"step": 8080
},
{
"epoch": 2.714686779803059,
"grad_norm": 5.152528762817383,
"learning_rate": 2.5385532295314012e-05,
"loss": 2.3573,
"step": 8100
},
{
"epoch": 2.721391158600461,
"grad_norm": 5.090909481048584,
"learning_rate": 2.5311033301050437e-05,
"loss": 2.3834,
"step": 8120
},
{
"epoch": 2.728095537397863,
"grad_norm": 5.87515926361084,
"learning_rate": 2.5236534306786857e-05,
"loss": 2.4277,
"step": 8140
},
{
"epoch": 2.734799916195265,
"grad_norm": 5.354443073272705,
"learning_rate": 2.516203531252328e-05,
"loss": 2.4928,
"step": 8160
},
{
"epoch": 2.741504294992667,
"grad_norm": 5.228540897369385,
"learning_rate": 2.5087536318259703e-05,
"loss": 2.4103,
"step": 8180
},
{
"epoch": 2.748208673790069,
"grad_norm": 5.605003356933594,
"learning_rate": 2.5013037323996123e-05,
"loss": 2.4431,
"step": 8200
},
{
"epoch": 2.7549130525874714,
"grad_norm": 5.046622276306152,
"learning_rate": 2.493853832973255e-05,
"loss": 2.3604,
"step": 8220
},
{
"epoch": 2.7616174313848734,
"grad_norm": 5.582128047943115,
"learning_rate": 2.4864039335468972e-05,
"loss": 2.3736,
"step": 8240
},
{
"epoch": 2.7683218101822753,
"grad_norm": 5.479374408721924,
"learning_rate": 2.4789540341205393e-05,
"loss": 2.3829,
"step": 8260
},
{
"epoch": 2.7750261889796772,
"grad_norm": 5.341858386993408,
"learning_rate": 2.4715041346941817e-05,
"loss": 2.391,
"step": 8280
},
{
"epoch": 2.781730567777079,
"grad_norm": 5.183146953582764,
"learning_rate": 2.4640542352678238e-05,
"loss": 2.4453,
"step": 8300
},
{
"epoch": 2.7884349465744815,
"grad_norm": 5.269819736480713,
"learning_rate": 2.4566043358414662e-05,
"loss": 2.393,
"step": 8320
},
{
"epoch": 2.7951393253718835,
"grad_norm": 5.160492420196533,
"learning_rate": 2.4491544364151083e-05,
"loss": 2.3292,
"step": 8340
},
{
"epoch": 2.8018437041692854,
"grad_norm": 5.365363121032715,
"learning_rate": 2.4417045369887507e-05,
"loss": 2.423,
"step": 8360
},
{
"epoch": 2.808548082966688,
"grad_norm": 5.038601398468018,
"learning_rate": 2.4342546375623932e-05,
"loss": 2.4559,
"step": 8380
},
{
"epoch": 2.8152524617640897,
"grad_norm": 6.426731109619141,
"learning_rate": 2.4268047381360353e-05,
"loss": 2.4339,
"step": 8400
},
{
"epoch": 2.8219568405614917,
"grad_norm": 5.634438991546631,
"learning_rate": 2.4193548387096777e-05,
"loss": 2.4238,
"step": 8420
},
{
"epoch": 2.828661219358894,
"grad_norm": 5.305586338043213,
"learning_rate": 2.4119049392833198e-05,
"loss": 2.3275,
"step": 8440
},
{
"epoch": 2.835365598156296,
"grad_norm": 5.70265531539917,
"learning_rate": 2.4044550398569622e-05,
"loss": 2.3728,
"step": 8460
},
{
"epoch": 2.842069976953698,
"grad_norm": 5.732183933258057,
"learning_rate": 2.3970051404306043e-05,
"loss": 2.4455,
"step": 8480
},
{
"epoch": 2.8487743557511,
"grad_norm": 5.569748401641846,
"learning_rate": 2.3895552410042464e-05,
"loss": 2.4491,
"step": 8500
},
{
"epoch": 2.8554787345485018,
"grad_norm": 6.472934246063232,
"learning_rate": 2.3821053415778888e-05,
"loss": 2.4048,
"step": 8520
},
{
"epoch": 2.862183113345904,
"grad_norm": 6.311745643615723,
"learning_rate": 2.374655442151531e-05,
"loss": 2.4798,
"step": 8540
},
{
"epoch": 2.868887492143306,
"grad_norm": 5.956141948699951,
"learning_rate": 2.3672055427251733e-05,
"loss": 2.4407,
"step": 8560
},
{
"epoch": 2.875591870940708,
"grad_norm": 5.066039085388184,
"learning_rate": 2.3597556432988154e-05,
"loss": 2.4024,
"step": 8580
},
{
"epoch": 2.8822962497381104,
"grad_norm": 5.680649757385254,
"learning_rate": 2.352305743872458e-05,
"loss": 2.3765,
"step": 8600
},
{
"epoch": 2.8890006285355123,
"grad_norm": 6.591916561126709,
"learning_rate": 2.3448558444461e-05,
"loss": 2.3552,
"step": 8620
},
{
"epoch": 2.8957050073329142,
"grad_norm": 5.818574905395508,
"learning_rate": 2.3374059450197424e-05,
"loss": 2.4488,
"step": 8640
},
{
"epoch": 2.9024093861303166,
"grad_norm": 5.694536209106445,
"learning_rate": 2.3299560455933848e-05,
"loss": 2.3762,
"step": 8660
},
{
"epoch": 2.9091137649277186,
"grad_norm": 4.8993659019470215,
"learning_rate": 2.322506146167027e-05,
"loss": 2.3233,
"step": 8680
},
{
"epoch": 2.9158181437251205,
"grad_norm": 5.326479911804199,
"learning_rate": 2.3150562467406693e-05,
"loss": 2.2993,
"step": 8700
},
{
"epoch": 2.9225225225225224,
"grad_norm": 5.418346405029297,
"learning_rate": 2.3076063473143114e-05,
"loss": 2.3986,
"step": 8720
},
{
"epoch": 2.9292269013199244,
"grad_norm": 5.561238765716553,
"learning_rate": 2.3001564478879538e-05,
"loss": 2.3875,
"step": 8740
},
{
"epoch": 2.9359312801173267,
"grad_norm": 5.399714946746826,
"learning_rate": 2.292706548461596e-05,
"loss": 2.4019,
"step": 8760
},
{
"epoch": 2.9426356589147287,
"grad_norm": 5.176342964172363,
"learning_rate": 2.285256649035238e-05,
"loss": 2.3222,
"step": 8780
},
{
"epoch": 2.9493400377121306,
"grad_norm": 4.745908260345459,
"learning_rate": 2.2778067496088804e-05,
"loss": 2.4057,
"step": 8800
},
{
"epoch": 2.956044416509533,
"grad_norm": 6.640355587005615,
"learning_rate": 2.2703568501825225e-05,
"loss": 2.3954,
"step": 8820
},
{
"epoch": 2.962748795306935,
"grad_norm": 6.0387749671936035,
"learning_rate": 2.262906950756165e-05,
"loss": 2.2882,
"step": 8840
},
{
"epoch": 2.969453174104337,
"grad_norm": 6.5941596031188965,
"learning_rate": 2.255457051329807e-05,
"loss": 2.3585,
"step": 8860
},
{
"epoch": 2.9761575529017392,
"grad_norm": 6.242068767547607,
"learning_rate": 2.2480071519034494e-05,
"loss": 2.3699,
"step": 8880
},
{
"epoch": 2.982861931699141,
"grad_norm": 6.592676162719727,
"learning_rate": 2.2405572524770915e-05,
"loss": 2.376,
"step": 8900
},
{
"epoch": 2.989566310496543,
"grad_norm": 5.625716209411621,
"learning_rate": 2.233107353050734e-05,
"loss": 2.321,
"step": 8920
},
{
"epoch": 2.996270689293945,
"grad_norm": 5.320464134216309,
"learning_rate": 2.2256574536243764e-05,
"loss": 2.3896,
"step": 8940
},
{
"epoch": 3.0026817515189608,
"grad_norm": 4.8960418701171875,
"learning_rate": 2.2182075541980185e-05,
"loss": 2.1192,
"step": 8960
},
{
"epoch": 3.0093861303163627,
"grad_norm": 5.314412593841553,
"learning_rate": 2.210757654771661e-05,
"loss": 2.1751,
"step": 8980
},
{
"epoch": 3.016090509113765,
"grad_norm": 5.510196208953857,
"learning_rate": 2.203307755345303e-05,
"loss": 2.2691,
"step": 9000
},
{
"epoch": 3.016090509113765,
"eval_bleu_greedy": 43.740830458991255,
"eval_loss": 0.33234962821006775,
"eval_runtime": 141.0542,
"eval_samples_per_second": 0.709,
"eval_steps_per_second": 0.709,
"step": 9000
},
{
"epoch": 3.022794887911167,
"grad_norm": 5.984165191650391,
"learning_rate": 2.195857855918945e-05,
"loss": 2.1367,
"step": 9020
},
{
"epoch": 3.029499266708569,
"grad_norm": 5.3130669593811035,
"learning_rate": 2.1884079564925875e-05,
"loss": 2.2206,
"step": 9040
},
{
"epoch": 3.036203645505971,
"grad_norm": 5.565290927886963,
"learning_rate": 2.1809580570662296e-05,
"loss": 2.1031,
"step": 9060
},
{
"epoch": 3.0429080243033733,
"grad_norm": 5.757679462432861,
"learning_rate": 2.173508157639872e-05,
"loss": 2.2013,
"step": 9080
},
{
"epoch": 3.049612403100775,
"grad_norm": 5.874185085296631,
"learning_rate": 2.166058258213514e-05,
"loss": 2.118,
"step": 9100
},
{
"epoch": 3.056316781898177,
"grad_norm": 5.614380836486816,
"learning_rate": 2.1586083587871565e-05,
"loss": 2.1824,
"step": 9120
},
{
"epoch": 3.0630211606955795,
"grad_norm": 5.548777103424072,
"learning_rate": 2.1511584593607986e-05,
"loss": 2.1007,
"step": 9140
},
{
"epoch": 3.0697255394929814,
"grad_norm": 5.060155868530273,
"learning_rate": 2.1437085599344407e-05,
"loss": 2.0839,
"step": 9160
},
{
"epoch": 3.0764299182903834,
"grad_norm": 5.213876724243164,
"learning_rate": 2.1362586605080835e-05,
"loss": 2.1763,
"step": 9180
},
{
"epoch": 3.0831342970877853,
"grad_norm": 5.595554351806641,
"learning_rate": 2.1288087610817256e-05,
"loss": 2.2138,
"step": 9200
},
{
"epoch": 3.0898386758851877,
"grad_norm": 6.205844402313232,
"learning_rate": 2.121358861655368e-05,
"loss": 2.1493,
"step": 9220
},
{
"epoch": 3.0965430546825896,
"grad_norm": 5.1306352615356445,
"learning_rate": 2.11390896222901e-05,
"loss": 2.2229,
"step": 9240
},
{
"epoch": 3.1032474334799915,
"grad_norm": 6.070255756378174,
"learning_rate": 2.106459062802652e-05,
"loss": 2.1823,
"step": 9260
},
{
"epoch": 3.1099518122773935,
"grad_norm": 5.469337463378906,
"learning_rate": 2.0990091633762946e-05,
"loss": 2.1498,
"step": 9280
},
{
"epoch": 3.116656191074796,
"grad_norm": 5.587215900421143,
"learning_rate": 2.0915592639499367e-05,
"loss": 2.2351,
"step": 9300
},
{
"epoch": 3.123360569872198,
"grad_norm": 5.845519065856934,
"learning_rate": 2.084109364523579e-05,
"loss": 2.1333,
"step": 9320
},
{
"epoch": 3.1300649486695997,
"grad_norm": 5.8296732902526855,
"learning_rate": 2.0766594650972212e-05,
"loss": 2.2287,
"step": 9340
},
{
"epoch": 3.136769327467002,
"grad_norm": 5.801497459411621,
"learning_rate": 2.0692095656708636e-05,
"loss": 2.2238,
"step": 9360
},
{
"epoch": 3.143473706264404,
"grad_norm": 6.177136421203613,
"learning_rate": 2.0617596662445057e-05,
"loss": 2.1735,
"step": 9380
},
{
"epoch": 3.150178085061806,
"grad_norm": 6.680226802825928,
"learning_rate": 2.0543097668181478e-05,
"loss": 2.1529,
"step": 9400
},
{
"epoch": 3.156882463859208,
"grad_norm": 6.010555267333984,
"learning_rate": 2.0468598673917902e-05,
"loss": 2.1644,
"step": 9420
},
{
"epoch": 3.1635868426566103,
"grad_norm": 5.7896528244018555,
"learning_rate": 2.0394099679654323e-05,
"loss": 2.1184,
"step": 9440
},
{
"epoch": 3.170291221454012,
"grad_norm": 5.442397594451904,
"learning_rate": 2.031960068539075e-05,
"loss": 2.1438,
"step": 9460
},
{
"epoch": 3.176995600251414,
"grad_norm": 6.730420112609863,
"learning_rate": 2.024510169112717e-05,
"loss": 2.2004,
"step": 9480
},
{
"epoch": 3.183699979048816,
"grad_norm": 5.808672904968262,
"learning_rate": 2.0170602696863593e-05,
"loss": 2.1597,
"step": 9500
},
{
"epoch": 3.1904043578462185,
"grad_norm": 5.694986820220947,
"learning_rate": 2.0096103702600017e-05,
"loss": 2.2067,
"step": 9520
},
{
"epoch": 3.1971087366436204,
"grad_norm": 5.049539089202881,
"learning_rate": 2.0021604708336438e-05,
"loss": 2.2071,
"step": 9540
},
{
"epoch": 3.2038131154410223,
"grad_norm": 5.884223937988281,
"learning_rate": 1.9947105714072862e-05,
"loss": 2.1081,
"step": 9560
},
{
"epoch": 3.2105174942384247,
"grad_norm": 6.397696495056152,
"learning_rate": 1.9872606719809283e-05,
"loss": 2.1475,
"step": 9580
},
{
"epoch": 3.2172218730358266,
"grad_norm": 6.474588394165039,
"learning_rate": 1.9798107725545707e-05,
"loss": 2.0934,
"step": 9600
},
{
"epoch": 3.2239262518332286,
"grad_norm": 5.678287506103516,
"learning_rate": 1.9723608731282128e-05,
"loss": 2.1236,
"step": 9620
},
{
"epoch": 3.2306306306306305,
"grad_norm": 5.606823921203613,
"learning_rate": 1.9649109737018552e-05,
"loss": 2.2182,
"step": 9640
},
{
"epoch": 3.237335009428033,
"grad_norm": 5.8254499435424805,
"learning_rate": 1.9574610742754973e-05,
"loss": 2.1691,
"step": 9660
},
{
"epoch": 3.244039388225435,
"grad_norm": 5.576842784881592,
"learning_rate": 1.9500111748491394e-05,
"loss": 2.1609,
"step": 9680
},
{
"epoch": 3.2507437670228367,
"grad_norm": 5.685579776763916,
"learning_rate": 1.9425612754227818e-05,
"loss": 2.1802,
"step": 9700
},
{
"epoch": 3.2574481458202387,
"grad_norm": 7.00796365737915,
"learning_rate": 1.935111375996424e-05,
"loss": 2.1535,
"step": 9720
},
{
"epoch": 3.264152524617641,
"grad_norm": 5.360428333282471,
"learning_rate": 1.9276614765700667e-05,
"loss": 2.1896,
"step": 9740
},
{
"epoch": 3.270856903415043,
"grad_norm": 5.5369696617126465,
"learning_rate": 1.9202115771437088e-05,
"loss": 2.114,
"step": 9760
},
{
"epoch": 3.277561282212445,
"grad_norm": 6.027918815612793,
"learning_rate": 1.912761677717351e-05,
"loss": 2.1165,
"step": 9780
},
{
"epoch": 3.2842656610098473,
"grad_norm": 6.253023624420166,
"learning_rate": 1.9053117782909933e-05,
"loss": 2.132,
"step": 9800
},
{
"epoch": 3.2909700398072492,
"grad_norm": 5.557729721069336,
"learning_rate": 1.8978618788646354e-05,
"loss": 2.098,
"step": 9820
},
{
"epoch": 3.297674418604651,
"grad_norm": 5.2757344245910645,
"learning_rate": 1.8904119794382778e-05,
"loss": 2.1631,
"step": 9840
},
{
"epoch": 3.304378797402053,
"grad_norm": 6.308342456817627,
"learning_rate": 1.88296208001192e-05,
"loss": 2.1348,
"step": 9860
},
{
"epoch": 3.311083176199455,
"grad_norm": 5.6540913581848145,
"learning_rate": 1.8755121805855623e-05,
"loss": 2.1916,
"step": 9880
},
{
"epoch": 3.3177875549968574,
"grad_norm": 5.817383289337158,
"learning_rate": 1.8680622811592044e-05,
"loss": 2.1844,
"step": 9900
},
{
"epoch": 3.3244919337942593,
"grad_norm": 6.088453769683838,
"learning_rate": 1.8606123817328465e-05,
"loss": 2.1049,
"step": 9920
},
{
"epoch": 3.3311963125916613,
"grad_norm": 5.837181091308594,
"learning_rate": 1.853162482306489e-05,
"loss": 2.1038,
"step": 9940
},
{
"epoch": 3.3379006913890636,
"grad_norm": 5.060699939727783,
"learning_rate": 1.845712582880131e-05,
"loss": 2.0959,
"step": 9960
},
{
"epoch": 3.3446050701864656,
"grad_norm": 5.951335430145264,
"learning_rate": 1.8382626834537734e-05,
"loss": 2.1495,
"step": 9980
},
{
"epoch": 3.3513094489838675,
"grad_norm": 5.952776908874512,
"learning_rate": 1.830812784027416e-05,
"loss": 2.124,
"step": 10000
},
{
"epoch": 3.3513094489838675,
"eval_bleu_greedy": 44.34903037092752,
"eval_loss": 0.3302690088748932,
"eval_runtime": 118.2606,
"eval_samples_per_second": 0.846,
"eval_steps_per_second": 0.846,
"step": 10000
},
{
"epoch": 3.35801382778127,
"grad_norm": 5.622983455657959,
"learning_rate": 1.823362884601058e-05,
"loss": 2.1254,
"step": 10020
},
{
"epoch": 3.364718206578672,
"grad_norm": 5.697426795959473,
"learning_rate": 1.8159129851747004e-05,
"loss": 2.147,
"step": 10040
},
{
"epoch": 3.3714225853760738,
"grad_norm": 5.682360649108887,
"learning_rate": 1.8084630857483425e-05,
"loss": 2.1331,
"step": 10060
},
{
"epoch": 3.3781269641734757,
"grad_norm": 6.510776519775391,
"learning_rate": 1.801013186321985e-05,
"loss": 2.2537,
"step": 10080
},
{
"epoch": 3.3848313429708776,
"grad_norm": 6.060893535614014,
"learning_rate": 1.793563286895627e-05,
"loss": 2.1469,
"step": 10100
},
{
"epoch": 3.39153572176828,
"grad_norm": 6.059525012969971,
"learning_rate": 1.7861133874692694e-05,
"loss": 2.1388,
"step": 10120
},
{
"epoch": 3.398240100565682,
"grad_norm": 6.2327117919921875,
"learning_rate": 1.7786634880429115e-05,
"loss": 2.118,
"step": 10140
},
{
"epoch": 3.404944479363084,
"grad_norm": 5.623783588409424,
"learning_rate": 1.7712135886165536e-05,
"loss": 2.1141,
"step": 10160
},
{
"epoch": 3.4116488581604862,
"grad_norm": 6.822098255157471,
"learning_rate": 1.763763689190196e-05,
"loss": 2.1523,
"step": 10180
},
{
"epoch": 3.418353236957888,
"grad_norm": 5.375659942626953,
"learning_rate": 1.756313789763838e-05,
"loss": 2.1232,
"step": 10200
},
{
"epoch": 3.42505761575529,
"grad_norm": 6.811357498168945,
"learning_rate": 1.7488638903374805e-05,
"loss": 2.1598,
"step": 10220
},
{
"epoch": 3.431761994552692,
"grad_norm": 5.709009647369385,
"learning_rate": 1.7414139909111226e-05,
"loss": 2.1571,
"step": 10240
},
{
"epoch": 3.4384663733500944,
"grad_norm": 6.078985214233398,
"learning_rate": 1.733964091484765e-05,
"loss": 2.1624,
"step": 10260
},
{
"epoch": 3.4451707521474964,
"grad_norm": 5.974235534667969,
"learning_rate": 1.7265141920584075e-05,
"loss": 2.179,
"step": 10280
},
{
"epoch": 3.4518751309448983,
"grad_norm": 6.026799201965332,
"learning_rate": 1.7190642926320496e-05,
"loss": 2.1788,
"step": 10300
},
{
"epoch": 3.4585795097423,
"grad_norm": 6.061217308044434,
"learning_rate": 1.711614393205692e-05,
"loss": 2.1601,
"step": 10320
},
{
"epoch": 3.4652838885397026,
"grad_norm": 6.014716625213623,
"learning_rate": 1.704164493779334e-05,
"loss": 2.1726,
"step": 10340
},
{
"epoch": 3.4719882673371045,
"grad_norm": 5.476884365081787,
"learning_rate": 1.6967145943529765e-05,
"loss": 2.1277,
"step": 10360
},
{
"epoch": 3.4786926461345065,
"grad_norm": 6.806579113006592,
"learning_rate": 1.6892646949266186e-05,
"loss": 2.1742,
"step": 10380
},
{
"epoch": 3.485397024931909,
"grad_norm": 6.386503219604492,
"learning_rate": 1.6818147955002607e-05,
"loss": 2.093,
"step": 10400
},
{
"epoch": 3.4921014037293108,
"grad_norm": 5.830183982849121,
"learning_rate": 1.674364896073903e-05,
"loss": 2.2129,
"step": 10420
},
{
"epoch": 3.4988057825267127,
"grad_norm": 7.050870895385742,
"learning_rate": 1.6669149966475452e-05,
"loss": 2.0302,
"step": 10440
},
{
"epoch": 3.505510161324115,
"grad_norm": 6.339367866516113,
"learning_rate": 1.6594650972211876e-05,
"loss": 2.2095,
"step": 10460
},
{
"epoch": 3.512214540121517,
"grad_norm": 6.269477844238281,
"learning_rate": 1.6520151977948297e-05,
"loss": 2.1547,
"step": 10480
},
{
"epoch": 3.518918918918919,
"grad_norm": 5.818455219268799,
"learning_rate": 1.644565298368472e-05,
"loss": 2.1763,
"step": 10500
},
{
"epoch": 3.525623297716321,
"grad_norm": 5.608326435089111,
"learning_rate": 1.6371153989421142e-05,
"loss": 2.1408,
"step": 10520
},
{
"epoch": 3.532327676513723,
"grad_norm": 5.302171230316162,
"learning_rate": 1.6296654995157566e-05,
"loss": 2.1537,
"step": 10540
},
{
"epoch": 3.539032055311125,
"grad_norm": 5.502221584320068,
"learning_rate": 1.622215600089399e-05,
"loss": 2.0883,
"step": 10560
},
{
"epoch": 3.545736434108527,
"grad_norm": 6.126551628112793,
"learning_rate": 1.614765700663041e-05,
"loss": 2.1461,
"step": 10580
},
{
"epoch": 3.552440812905929,
"grad_norm": 6.817451000213623,
"learning_rate": 1.6073158012366836e-05,
"loss": 2.2614,
"step": 10600
},
{
"epoch": 3.5591451917033314,
"grad_norm": 6.151493072509766,
"learning_rate": 1.5998659018103257e-05,
"loss": 2.2219,
"step": 10620
},
{
"epoch": 3.5658495705007334,
"grad_norm": 5.696464538574219,
"learning_rate": 1.592416002383968e-05,
"loss": 2.0755,
"step": 10640
},
{
"epoch": 3.5725539492981353,
"grad_norm": 5.986370086669922,
"learning_rate": 1.5849661029576102e-05,
"loss": 2.1614,
"step": 10660
},
{
"epoch": 3.5792583280955372,
"grad_norm": 5.665823936462402,
"learning_rate": 1.5775162035312523e-05,
"loss": 2.1477,
"step": 10680
},
{
"epoch": 3.5859627068929396,
"grad_norm": 6.486339092254639,
"learning_rate": 1.5700663041048947e-05,
"loss": 2.1662,
"step": 10700
},
{
"epoch": 3.5926670856903415,
"grad_norm": 6.461730003356934,
"learning_rate": 1.5626164046785368e-05,
"loss": 2.2309,
"step": 10720
},
{
"epoch": 3.5993714644877435,
"grad_norm": 5.702849864959717,
"learning_rate": 1.5551665052521792e-05,
"loss": 2.0836,
"step": 10740
},
{
"epoch": 3.6060758432851454,
"grad_norm": 5.847084999084473,
"learning_rate": 1.5477166058258213e-05,
"loss": 2.1142,
"step": 10760
},
{
"epoch": 3.612780222082548,
"grad_norm": 6.496880531311035,
"learning_rate": 1.5402667063994637e-05,
"loss": 2.22,
"step": 10780
},
{
"epoch": 3.6194846008799497,
"grad_norm": 5.130465507507324,
"learning_rate": 1.5328168069731058e-05,
"loss": 2.1727,
"step": 10800
},
{
"epoch": 3.6261889796773517,
"grad_norm": 5.424370288848877,
"learning_rate": 1.5253669075467482e-05,
"loss": 2.1294,
"step": 10820
},
{
"epoch": 3.632893358474754,
"grad_norm": 5.883772373199463,
"learning_rate": 1.5179170081203905e-05,
"loss": 2.2141,
"step": 10840
},
{
"epoch": 3.639597737272156,
"grad_norm": 6.7028374671936035,
"learning_rate": 1.5104671086940328e-05,
"loss": 2.1481,
"step": 10860
},
{
"epoch": 3.646302116069558,
"grad_norm": 5.938512325286865,
"learning_rate": 1.503017209267675e-05,
"loss": 2.0623,
"step": 10880
},
{
"epoch": 3.65300649486696,
"grad_norm": 6.206230640411377,
"learning_rate": 1.4955673098413173e-05,
"loss": 2.1146,
"step": 10900
},
{
"epoch": 3.6597108736643618,
"grad_norm": 5.823507308959961,
"learning_rate": 1.4881174104149595e-05,
"loss": 2.2025,
"step": 10920
},
{
"epoch": 3.666415252461764,
"grad_norm": 6.715165138244629,
"learning_rate": 1.4806675109886018e-05,
"loss": 2.1292,
"step": 10940
},
{
"epoch": 3.673119631259166,
"grad_norm": 5.429907321929932,
"learning_rate": 1.473217611562244e-05,
"loss": 2.0976,
"step": 10960
},
{
"epoch": 3.679824010056568,
"grad_norm": 6.245195388793945,
"learning_rate": 1.4657677121358861e-05,
"loss": 2.086,
"step": 10980
},
{
"epoch": 3.6865283888539704,
"grad_norm": 5.78788948059082,
"learning_rate": 1.4583178127095284e-05,
"loss": 2.1993,
"step": 11000
},
{
"epoch": 3.6865283888539704,
"eval_bleu_greedy": 44.647707129353364,
"eval_loss": 0.3233819603919983,
"eval_runtime": 93.9426,
"eval_samples_per_second": 1.064,
"eval_steps_per_second": 1.064,
"step": 11000
},
{
"epoch": 3.6942384244709827,
"grad_norm": 6.311983108520508,
"learning_rate": 1.4508679132831707e-05,
"loss": 1.9803,
"step": 11020
},
{
"epoch": 3.7009428032683847,
"grad_norm": 5.580496788024902,
"learning_rate": 1.4434180138568129e-05,
"loss": 1.9703,
"step": 11040
},
{
"epoch": 3.7076471820657866,
"grad_norm": 5.837003707885742,
"learning_rate": 1.4359681144304552e-05,
"loss": 1.9426,
"step": 11060
},
{
"epoch": 3.7143515608631885,
"grad_norm": 5.717148780822754,
"learning_rate": 1.4285182150040974e-05,
"loss": 1.9986,
"step": 11080
},
{
"epoch": 3.721055939660591,
"grad_norm": 6.6829023361206055,
"learning_rate": 1.4210683155777399e-05,
"loss": 1.9822,
"step": 11100
},
{
"epoch": 3.727760318457993,
"grad_norm": 6.537654876708984,
"learning_rate": 1.4136184161513821e-05,
"loss": 1.9483,
"step": 11120
},
{
"epoch": 3.7344646972553948,
"grad_norm": 6.249990940093994,
"learning_rate": 1.4061685167250244e-05,
"loss": 2.0025,
"step": 11140
},
{
"epoch": 3.741169076052797,
"grad_norm": 5.1023736000061035,
"learning_rate": 1.3987186172986666e-05,
"loss": 2.0612,
"step": 11160
},
{
"epoch": 3.747873454850199,
"grad_norm": 7.344115257263184,
"learning_rate": 1.3912687178723089e-05,
"loss": 1.9921,
"step": 11180
},
{
"epoch": 3.754577833647601,
"grad_norm": 6.085274696350098,
"learning_rate": 1.3838188184459511e-05,
"loss": 2.0035,
"step": 11200
},
{
"epoch": 3.7612822124450034,
"grad_norm": 6.330926418304443,
"learning_rate": 1.3763689190195932e-05,
"loss": 1.9726,
"step": 11220
},
{
"epoch": 3.7679865912424053,
"grad_norm": 6.293400287628174,
"learning_rate": 1.3689190195932355e-05,
"loss": 2.0186,
"step": 11240
},
{
"epoch": 3.7746909700398072,
"grad_norm": 5.887099266052246,
"learning_rate": 1.3614691201668777e-05,
"loss": 1.9584,
"step": 11260
},
{
"epoch": 3.781395348837209,
"grad_norm": 7.026653289794922,
"learning_rate": 1.35401922074052e-05,
"loss": 1.9965,
"step": 11280
},
{
"epoch": 3.788099727634611,
"grad_norm": 5.680566787719727,
"learning_rate": 1.3465693213141623e-05,
"loss": 1.9619,
"step": 11300
},
{
"epoch": 3.7948041064320135,
"grad_norm": 6.181577682495117,
"learning_rate": 1.3391194218878045e-05,
"loss": 1.932,
"step": 11320
},
{
"epoch": 3.8015084852294154,
"grad_norm": 5.5085015296936035,
"learning_rate": 1.3316695224614468e-05,
"loss": 1.9788,
"step": 11340
},
{
"epoch": 3.8082128640268174,
"grad_norm": 5.90690279006958,
"learning_rate": 1.324219623035089e-05,
"loss": 2.0387,
"step": 11360
},
{
"epoch": 3.8149172428242197,
"grad_norm": 5.351406097412109,
"learning_rate": 1.3167697236087315e-05,
"loss": 2.0106,
"step": 11380
},
{
"epoch": 3.8216216216216217,
"grad_norm": 6.080597400665283,
"learning_rate": 1.3093198241823737e-05,
"loss": 1.9646,
"step": 11400
},
{
"epoch": 3.8283260004190236,
"grad_norm": 6.259003639221191,
"learning_rate": 1.301869924756016e-05,
"loss": 2.0344,
"step": 11420
},
{
"epoch": 3.835030379216426,
"grad_norm": 5.44300651550293,
"learning_rate": 1.2944200253296582e-05,
"loss": 1.8795,
"step": 11440
},
{
"epoch": 3.841734758013828,
"grad_norm": 4.950876235961914,
"learning_rate": 1.2869701259033005e-05,
"loss": 1.9399,
"step": 11460
},
{
"epoch": 3.84843913681123,
"grad_norm": 5.528983116149902,
"learning_rate": 1.2795202264769426e-05,
"loss": 1.9733,
"step": 11480
},
{
"epoch": 3.8551435156086318,
"grad_norm": 6.489850044250488,
"learning_rate": 1.2720703270505848e-05,
"loss": 1.9987,
"step": 11500
},
{
"epoch": 3.8618478944060337,
"grad_norm": 6.303378105163574,
"learning_rate": 1.2646204276242271e-05,
"loss": 1.9656,
"step": 11520
},
{
"epoch": 3.868552273203436,
"grad_norm": 6.525257587432861,
"learning_rate": 1.2571705281978693e-05,
"loss": 1.9482,
"step": 11540
},
{
"epoch": 3.875256652000838,
"grad_norm": 6.339199066162109,
"learning_rate": 1.2497206287715116e-05,
"loss": 1.9149,
"step": 11560
},
{
"epoch": 3.88196103079824,
"grad_norm": 5.968883037567139,
"learning_rate": 1.242270729345154e-05,
"loss": 1.89,
"step": 11580
},
{
"epoch": 3.8886654095956423,
"grad_norm": 5.6695990562438965,
"learning_rate": 1.2348208299187961e-05,
"loss": 2.0011,
"step": 11600
},
{
"epoch": 3.8953697883930443,
"grad_norm": 5.211636066436768,
"learning_rate": 1.2273709304924384e-05,
"loss": 1.9736,
"step": 11620
},
{
"epoch": 3.902074167190446,
"grad_norm": 6.887118816375732,
"learning_rate": 1.2199210310660806e-05,
"loss": 2.0432,
"step": 11640
},
{
"epoch": 3.9087785459878486,
"grad_norm": 5.855321407318115,
"learning_rate": 1.2124711316397229e-05,
"loss": 2.0008,
"step": 11660
},
{
"epoch": 3.9154829247852505,
"grad_norm": 5.927514553070068,
"learning_rate": 1.2050212322133651e-05,
"loss": 1.9975,
"step": 11680
},
{
"epoch": 3.9221873035826524,
"grad_norm": 5.483107089996338,
"learning_rate": 1.1975713327870074e-05,
"loss": 1.9729,
"step": 11700
},
{
"epoch": 3.9288916823800544,
"grad_norm": 6.38045072555542,
"learning_rate": 1.1901214333606497e-05,
"loss": 1.957,
"step": 11720
},
{
"epoch": 3.9355960611774563,
"grad_norm": 5.1667561531066895,
"learning_rate": 1.182671533934292e-05,
"loss": 1.9953,
"step": 11740
},
{
"epoch": 3.9423004399748587,
"grad_norm": 6.497701168060303,
"learning_rate": 1.1752216345079342e-05,
"loss": 2.0336,
"step": 11760
},
{
"epoch": 3.9490048187722606,
"grad_norm": 6.4357194900512695,
"learning_rate": 1.1677717350815764e-05,
"loss": 1.9825,
"step": 11780
},
{
"epoch": 3.9557091975696625,
"grad_norm": 5.45937442779541,
"learning_rate": 1.1603218356552187e-05,
"loss": 2.0056,
"step": 11800
},
{
"epoch": 3.962413576367065,
"grad_norm": 6.294889450073242,
"learning_rate": 1.152871936228861e-05,
"loss": 1.9627,
"step": 11820
},
{
"epoch": 3.969117955164467,
"grad_norm": 6.30310583114624,
"learning_rate": 1.1454220368025032e-05,
"loss": 1.9793,
"step": 11840
},
{
"epoch": 3.975822333961869,
"grad_norm": 5.265941143035889,
"learning_rate": 1.1379721373761455e-05,
"loss": 1.9109,
"step": 11860
},
{
"epoch": 3.982526712759271,
"grad_norm": 6.546343803405762,
"learning_rate": 1.1305222379497877e-05,
"loss": 1.9913,
"step": 11880
},
{
"epoch": 3.989231091556673,
"grad_norm": 5.486214637756348,
"learning_rate": 1.12307233852343e-05,
"loss": 1.917,
"step": 11900
},
{
"epoch": 3.995935470354075,
"grad_norm": 6.186803817749023,
"learning_rate": 1.1156224390970722e-05,
"loss": 1.9161,
"step": 11920
},
{
"epoch": 4.002681751518961,
"grad_norm": 6.181544780731201,
"learning_rate": 1.1081725396707145e-05,
"loss": 2.0623,
"step": 11940
},
{
"epoch": 4.009386130316363,
"grad_norm": 6.881202220916748,
"learning_rate": 1.1007226402443568e-05,
"loss": 1.9415,
"step": 11960
},
{
"epoch": 4.016090509113765,
"grad_norm": 5.8216633796691895,
"learning_rate": 1.093272740817999e-05,
"loss": 1.9856,
"step": 11980
},
{
"epoch": 4.022794887911167,
"grad_norm": 7.12168025970459,
"learning_rate": 1.0858228413916413e-05,
"loss": 1.923,
"step": 12000
},
{
"epoch": 4.022794887911167,
"eval_bleu_greedy": 44.50439408731819,
"eval_loss": 0.32926422357559204,
"eval_runtime": 73.5162,
"eval_samples_per_second": 1.36,
"eval_steps_per_second": 1.36,
"step": 12000
},
{
"epoch": 4.029499266708569,
"grad_norm": 6.934213161468506,
"learning_rate": 1.0783729419652835e-05,
"loss": 1.9524,
"step": 12020
},
{
"epoch": 4.036203645505971,
"grad_norm": 5.604892730712891,
"learning_rate": 1.0709230425389258e-05,
"loss": 1.899,
"step": 12040
},
{
"epoch": 4.042908024303373,
"grad_norm": 6.461178302764893,
"learning_rate": 1.063473143112568e-05,
"loss": 1.9747,
"step": 12060
},
{
"epoch": 4.049612403100776,
"grad_norm": 5.3131866455078125,
"learning_rate": 1.0560232436862103e-05,
"loss": 1.8441,
"step": 12080
},
{
"epoch": 4.056316781898177,
"grad_norm": 6.133450508117676,
"learning_rate": 1.0485733442598526e-05,
"loss": 1.9745,
"step": 12100
},
{
"epoch": 4.0630211606955795,
"grad_norm": 6.431405067443848,
"learning_rate": 1.0411234448334948e-05,
"loss": 2.0148,
"step": 12120
},
{
"epoch": 4.069725539492981,
"grad_norm": 6.80940055847168,
"learning_rate": 1.033673545407137e-05,
"loss": 2.0231,
"step": 12140
},
{
"epoch": 4.076429918290383,
"grad_norm": 6.070991516113281,
"learning_rate": 1.0262236459807793e-05,
"loss": 1.8877,
"step": 12160
},
{
"epoch": 4.083134297087786,
"grad_norm": 6.459563255310059,
"learning_rate": 1.0187737465544216e-05,
"loss": 1.9614,
"step": 12180
},
{
"epoch": 4.089838675885187,
"grad_norm": 6.669441223144531,
"learning_rate": 1.0113238471280638e-05,
"loss": 1.9879,
"step": 12200
},
{
"epoch": 4.09654305468259,
"grad_norm": 6.027960300445557,
"learning_rate": 1.0038739477017061e-05,
"loss": 2.0168,
"step": 12220
},
{
"epoch": 4.103247433479992,
"grad_norm": 6.358060359954834,
"learning_rate": 9.964240482753482e-06,
"loss": 1.9143,
"step": 12240
},
{
"epoch": 4.1099518122773935,
"grad_norm": 7.689465045928955,
"learning_rate": 9.889741488489906e-06,
"loss": 1.9383,
"step": 12260
},
{
"epoch": 4.116656191074796,
"grad_norm": 6.243340969085693,
"learning_rate": 9.815242494226329e-06,
"loss": 1.9484,
"step": 12280
},
{
"epoch": 4.123360569872197,
"grad_norm": 6.634798049926758,
"learning_rate": 9.740743499962751e-06,
"loss": 1.9055,
"step": 12300
},
{
"epoch": 4.1300649486696,
"grad_norm": 6.069347858428955,
"learning_rate": 9.666244505699174e-06,
"loss": 1.9314,
"step": 12320
},
{
"epoch": 4.136769327467002,
"grad_norm": 5.5364251136779785,
"learning_rate": 9.591745511435596e-06,
"loss": 1.9819,
"step": 12340
},
{
"epoch": 4.143473706264404,
"grad_norm": 7.139957904815674,
"learning_rate": 9.517246517172019e-06,
"loss": 2.0314,
"step": 12360
},
{
"epoch": 4.150178085061806,
"grad_norm": 6.039831161499023,
"learning_rate": 9.44274752290844e-06,
"loss": 1.8655,
"step": 12380
},
{
"epoch": 4.156882463859208,
"grad_norm": 6.120654582977295,
"learning_rate": 9.368248528644864e-06,
"loss": 1.9809,
"step": 12400
},
{
"epoch": 4.16358684265661,
"grad_norm": 6.08513879776001,
"learning_rate": 9.293749534381287e-06,
"loss": 1.963,
"step": 12420
},
{
"epoch": 4.170291221454012,
"grad_norm": 6.9406208992004395,
"learning_rate": 9.21925054011771e-06,
"loss": 1.9999,
"step": 12440
},
{
"epoch": 4.176995600251415,
"grad_norm": 5.910630226135254,
"learning_rate": 9.144751545854132e-06,
"loss": 1.9606,
"step": 12460
},
{
"epoch": 4.183699979048816,
"grad_norm": 6.108793258666992,
"learning_rate": 9.070252551590554e-06,
"loss": 1.9267,
"step": 12480
},
{
"epoch": 4.1904043578462185,
"grad_norm": 6.7554497718811035,
"learning_rate": 8.995753557326975e-06,
"loss": 1.9704,
"step": 12500
},
{
"epoch": 4.19710873664362,
"grad_norm": 5.927764415740967,
"learning_rate": 8.921254563063398e-06,
"loss": 1.9711,
"step": 12520
},
{
"epoch": 4.203813115441022,
"grad_norm": 5.403862476348877,
"learning_rate": 8.846755568799822e-06,
"loss": 1.9401,
"step": 12540
},
{
"epoch": 4.210517494238425,
"grad_norm": 6.7553181648254395,
"learning_rate": 8.772256574536245e-06,
"loss": 1.9349,
"step": 12560
},
{
"epoch": 4.217221873035826,
"grad_norm": 5.846068382263184,
"learning_rate": 8.697757580272667e-06,
"loss": 1.9993,
"step": 12580
},
{
"epoch": 4.223926251833229,
"grad_norm": 5.979813575744629,
"learning_rate": 8.62325858600909e-06,
"loss": 2.0041,
"step": 12600
},
{
"epoch": 4.230630630630631,
"grad_norm": 6.5849080085754395,
"learning_rate": 8.54875959174551e-06,
"loss": 1.9836,
"step": 12620
},
{
"epoch": 4.237335009428032,
"grad_norm": 6.310152530670166,
"learning_rate": 8.474260597481933e-06,
"loss": 1.9863,
"step": 12640
},
{
"epoch": 4.244039388225435,
"grad_norm": 5.853137493133545,
"learning_rate": 8.399761603218358e-06,
"loss": 1.9395,
"step": 12660
},
{
"epoch": 4.250743767022837,
"grad_norm": 6.041353225708008,
"learning_rate": 8.32526260895478e-06,
"loss": 1.9732,
"step": 12680
},
{
"epoch": 4.257448145820239,
"grad_norm": 5.885549545288086,
"learning_rate": 8.250763614691203e-06,
"loss": 1.9788,
"step": 12700
},
{
"epoch": 4.264152524617641,
"grad_norm": 6.728966236114502,
"learning_rate": 8.176264620427625e-06,
"loss": 2.0255,
"step": 12720
},
{
"epoch": 4.2708569034150425,
"grad_norm": 6.438037395477295,
"learning_rate": 8.101765626164046e-06,
"loss": 1.964,
"step": 12740
},
{
"epoch": 4.277561282212445,
"grad_norm": 6.513815879821777,
"learning_rate": 8.027266631900469e-06,
"loss": 1.9188,
"step": 12760
},
{
"epoch": 4.284265661009847,
"grad_norm": 5.92940616607666,
"learning_rate": 7.952767637636891e-06,
"loss": 1.9952,
"step": 12780
},
{
"epoch": 4.290970039807249,
"grad_norm": 5.621717929840088,
"learning_rate": 7.878268643373316e-06,
"loss": 1.944,
"step": 12800
},
{
"epoch": 4.297674418604651,
"grad_norm": 5.906243324279785,
"learning_rate": 7.803769649109738e-06,
"loss": 1.8989,
"step": 12820
},
{
"epoch": 4.3043787974020535,
"grad_norm": 6.153092861175537,
"learning_rate": 7.72927065484616e-06,
"loss": 1.9672,
"step": 12840
},
{
"epoch": 4.311083176199455,
"grad_norm": 6.292876720428467,
"learning_rate": 7.654771660582582e-06,
"loss": 1.9418,
"step": 12860
},
{
"epoch": 4.317787554996857,
"grad_norm": 6.2637858390808105,
"learning_rate": 7.580272666319005e-06,
"loss": 1.9032,
"step": 12880
},
{
"epoch": 4.32449193379426,
"grad_norm": 6.278430461883545,
"learning_rate": 7.505773672055427e-06,
"loss": 2.0032,
"step": 12900
},
{
"epoch": 4.331196312591661,
"grad_norm": 6.218298435211182,
"learning_rate": 7.4312746777918494e-06,
"loss": 1.9331,
"step": 12920
},
{
"epoch": 4.337900691389064,
"grad_norm": 5.950978755950928,
"learning_rate": 7.356775683528274e-06,
"loss": 1.9175,
"step": 12940
},
{
"epoch": 4.344605070186465,
"grad_norm": 6.4616780281066895,
"learning_rate": 7.2822766892646954e-06,
"loss": 1.9171,
"step": 12960
},
{
"epoch": 4.3513094489838675,
"grad_norm": 6.791792869567871,
"learning_rate": 7.207777695001118e-06,
"loss": 1.9694,
"step": 12980
},
{
"epoch": 4.35801382778127,
"grad_norm": 6.647165775299072,
"learning_rate": 7.133278700737541e-06,
"loss": 1.9414,
"step": 13000
},
{
"epoch": 4.35801382778127,
"eval_bleu_greedy": 44.55894868872612,
"eval_loss": 0.3250446319580078,
"eval_runtime": 38.3365,
"eval_samples_per_second": 2.608,
"eval_steps_per_second": 2.608,
"step": 13000
},
{
"epoch": 4.364718206578671,
"grad_norm": 6.127755165100098,
"learning_rate": 7.058779706473963e-06,
"loss": 1.907,
"step": 13020
},
{
"epoch": 4.371422585376074,
"grad_norm": 7.295255184173584,
"learning_rate": 6.984280712210385e-06,
"loss": 1.9099,
"step": 13040
},
{
"epoch": 4.378126964173476,
"grad_norm": 5.896840572357178,
"learning_rate": 6.9097817179468075e-06,
"loss": 1.9675,
"step": 13060
},
{
"epoch": 4.384831342970878,
"grad_norm": 6.170024394989014,
"learning_rate": 6.835282723683231e-06,
"loss": 1.9138,
"step": 13080
},
{
"epoch": 4.39153572176828,
"grad_norm": 7.63161039352417,
"learning_rate": 6.7607837294196535e-06,
"loss": 1.9458,
"step": 13100
},
{
"epoch": 4.398240100565682,
"grad_norm": 6.906017780303955,
"learning_rate": 6.686284735156076e-06,
"loss": 1.9417,
"step": 13120
},
{
"epoch": 4.404944479363084,
"grad_norm": 6.07291316986084,
"learning_rate": 6.611785740892499e-06,
"loss": 1.9937,
"step": 13140
},
{
"epoch": 4.411648858160486,
"grad_norm": 7.198663234710693,
"learning_rate": 6.53728674662892e-06,
"loss": 1.9158,
"step": 13160
},
{
"epoch": 4.418353236957888,
"grad_norm": 6.324075222015381,
"learning_rate": 6.462787752365343e-06,
"loss": 1.9194,
"step": 13180
},
{
"epoch": 4.42505761575529,
"grad_norm": 5.920898914337158,
"learning_rate": 6.3882887581017655e-06,
"loss": 2.0104,
"step": 13200
},
{
"epoch": 4.4317619945526925,
"grad_norm": 5.628123760223389,
"learning_rate": 6.313789763838189e-06,
"loss": 1.9472,
"step": 13220
},
{
"epoch": 4.438466373350094,
"grad_norm": 7.292685031890869,
"learning_rate": 6.239290769574611e-06,
"loss": 1.9488,
"step": 13240
},
{
"epoch": 4.445170752147496,
"grad_norm": 6.090283393859863,
"learning_rate": 6.164791775311034e-06,
"loss": 1.9137,
"step": 13260
},
{
"epoch": 4.451875130944899,
"grad_norm": 6.441646099090576,
"learning_rate": 6.090292781047456e-06,
"loss": 1.9346,
"step": 13280
},
{
"epoch": 4.4585795097423,
"grad_norm": 5.901956081390381,
"learning_rate": 6.015793786783878e-06,
"loss": 1.9465,
"step": 13300
},
{
"epoch": 4.465283888539703,
"grad_norm": 6.282461643218994,
"learning_rate": 5.941294792520302e-06,
"loss": 1.9368,
"step": 13320
},
{
"epoch": 4.471988267337105,
"grad_norm": 5.921506404876709,
"learning_rate": 5.8667957982567235e-06,
"loss": 1.9144,
"step": 13340
},
{
"epoch": 4.4786926461345065,
"grad_norm": 6.149923801422119,
"learning_rate": 5.792296803993146e-06,
"loss": 1.9946,
"step": 13360
},
{
"epoch": 4.485397024931909,
"grad_norm": 5.411886692047119,
"learning_rate": 5.7177978097295695e-06,
"loss": 1.9656,
"step": 13380
},
{
"epoch": 4.49210140372931,
"grad_norm": 6.29611349105835,
"learning_rate": 5.643298815465991e-06,
"loss": 1.9522,
"step": 13400
},
{
"epoch": 4.498805782526713,
"grad_norm": 6.302646160125732,
"learning_rate": 5.568799821202414e-06,
"loss": 2.028,
"step": 13420
},
{
"epoch": 4.505510161324115,
"grad_norm": 6.083780765533447,
"learning_rate": 5.494300826938836e-06,
"loss": 1.9707,
"step": 13440
},
{
"epoch": 4.512214540121517,
"grad_norm": 5.8760857582092285,
"learning_rate": 5.419801832675259e-06,
"loss": 1.9808,
"step": 13460
},
{
"epoch": 4.518918918918919,
"grad_norm": 6.291037559509277,
"learning_rate": 5.3453028384116815e-06,
"loss": 1.9806,
"step": 13480
},
{
"epoch": 4.525623297716321,
"grad_norm": 6.295835494995117,
"learning_rate": 5.270803844148104e-06,
"loss": 1.9802,
"step": 13500
},
{
"epoch": 4.532327676513723,
"grad_norm": 6.249302387237549,
"learning_rate": 5.196304849884527e-06,
"loss": 1.9503,
"step": 13520
},
{
"epoch": 4.539032055311125,
"grad_norm": 6.210274696350098,
"learning_rate": 5.121805855620949e-06,
"loss": 1.945,
"step": 13540
},
{
"epoch": 4.545736434108527,
"grad_norm": 6.666885852813721,
"learning_rate": 5.047306861357372e-06,
"loss": 1.906,
"step": 13560
},
{
"epoch": 4.552440812905929,
"grad_norm": 6.02937650680542,
"learning_rate": 4.972807867093794e-06,
"loss": 1.8988,
"step": 13580
},
{
"epoch": 4.559145191703331,
"grad_norm": 5.890073776245117,
"learning_rate": 4.898308872830217e-06,
"loss": 2.0141,
"step": 13600
},
{
"epoch": 4.565849570500733,
"grad_norm": 6.179807662963867,
"learning_rate": 4.8238098785666396e-06,
"loss": 1.8731,
"step": 13620
},
{
"epoch": 4.572553949298135,
"grad_norm": 6.25577974319458,
"learning_rate": 4.749310884303062e-06,
"loss": 1.881,
"step": 13640
},
{
"epoch": 4.579258328095538,
"grad_norm": 5.9553303718566895,
"learning_rate": 4.674811890039485e-06,
"loss": 1.9032,
"step": 13660
},
{
"epoch": 4.585962706892939,
"grad_norm": 6.600682735443115,
"learning_rate": 4.600312895775907e-06,
"loss": 2.0216,
"step": 13680
},
{
"epoch": 4.5926670856903415,
"grad_norm": 5.493414878845215,
"learning_rate": 4.52581390151233e-06,
"loss": 1.9494,
"step": 13700
},
{
"epoch": 4.599371464487744,
"grad_norm": 6.232935905456543,
"learning_rate": 4.4513149072487524e-06,
"loss": 1.9676,
"step": 13720
},
{
"epoch": 4.606075843285145,
"grad_norm": 5.859748840332031,
"learning_rate": 4.376815912985175e-06,
"loss": 1.904,
"step": 13740
},
{
"epoch": 4.612780222082548,
"grad_norm": 6.198875427246094,
"learning_rate": 4.3023169187215976e-06,
"loss": 1.9837,
"step": 13760
},
{
"epoch": 4.61948460087995,
"grad_norm": 5.675384521484375,
"learning_rate": 4.22781792445802e-06,
"loss": 1.9275,
"step": 13780
},
{
"epoch": 4.626188979677352,
"grad_norm": 6.3175129890441895,
"learning_rate": 4.153318930194443e-06,
"loss": 1.9237,
"step": 13800
},
{
"epoch": 4.632893358474754,
"grad_norm": 7.661358833312988,
"learning_rate": 4.078819935930865e-06,
"loss": 2.0447,
"step": 13820
},
{
"epoch": 4.6395977372721555,
"grad_norm": 6.920125484466553,
"learning_rate": 4.004320941667288e-06,
"loss": 1.9915,
"step": 13840
},
{
"epoch": 4.646302116069558,
"grad_norm": 5.844318866729736,
"learning_rate": 3.9298219474037105e-06,
"loss": 1.9599,
"step": 13860
},
{
"epoch": 4.65300649486696,
"grad_norm": 6.021313667297363,
"learning_rate": 3.855322953140133e-06,
"loss": 1.985,
"step": 13880
},
{
"epoch": 4.659710873664362,
"grad_norm": 6.7081298828125,
"learning_rate": 3.780823958876555e-06,
"loss": 1.9395,
"step": 13900
},
{
"epoch": 4.666415252461764,
"grad_norm": 6.496410846710205,
"learning_rate": 3.7063249646129778e-06,
"loss": 1.9911,
"step": 13920
},
{
"epoch": 4.6731196312591665,
"grad_norm": 6.401267051696777,
"learning_rate": 3.6318259703494007e-06,
"loss": 1.9315,
"step": 13940
},
{
"epoch": 4.679824010056568,
"grad_norm": 6.458618640899658,
"learning_rate": 3.557326976085823e-06,
"loss": 1.9749,
"step": 13960
},
{
"epoch": 4.68652838885397,
"grad_norm": 6.759514331817627,
"learning_rate": 3.4828279818222455e-06,
"loss": 2.0216,
"step": 13980
},
{
"epoch": 4.693232767651372,
"grad_norm": 6.3571953773498535,
"learning_rate": 3.4083289875586685e-06,
"loss": 1.912,
"step": 14000
},
{
"epoch": 4.693232767651372,
"eval_bleu_greedy": 44.91524109269023,
"eval_loss": 0.32178983092308044,
"eval_runtime": 58.0406,
"eval_samples_per_second": 1.723,
"eval_steps_per_second": 1.723,
"step": 14000
},
{
"epoch": 4.699937146448774,
"grad_norm": 6.1466593742370605,
"learning_rate": 3.3338299932950906e-06,
"loss": 1.9918,
"step": 14020
},
{
"epoch": 4.706641525246177,
"grad_norm": 6.885775089263916,
"learning_rate": 3.259330999031513e-06,
"loss": 1.9707,
"step": 14040
},
{
"epoch": 4.713345904043578,
"grad_norm": 6.186959743499756,
"learning_rate": 3.1848320047679354e-06,
"loss": 1.9067,
"step": 14060
},
{
"epoch": 4.7200502828409805,
"grad_norm": 6.915486812591553,
"learning_rate": 3.1103330105043583e-06,
"loss": 1.9584,
"step": 14080
},
{
"epoch": 4.726754661638383,
"grad_norm": 6.508053302764893,
"learning_rate": 3.035834016240781e-06,
"loss": 1.9481,
"step": 14100
},
{
"epoch": 4.733459040435784,
"grad_norm": 5.979186058044434,
"learning_rate": 2.9613350219772035e-06,
"loss": 1.9316,
"step": 14120
},
{
"epoch": 4.740163419233187,
"grad_norm": 6.08104944229126,
"learning_rate": 2.886836027713626e-06,
"loss": 1.9287,
"step": 14140
},
{
"epoch": 4.746867798030589,
"grad_norm": 6.154195308685303,
"learning_rate": 2.8123370334500486e-06,
"loss": 1.935,
"step": 14160
},
{
"epoch": 4.753572176827991,
"grad_norm": 6.8738508224487305,
"learning_rate": 2.737838039186471e-06,
"loss": 1.9573,
"step": 14180
},
{
"epoch": 4.760276555625393,
"grad_norm": 6.409038066864014,
"learning_rate": 2.663339044922894e-06,
"loss": 1.868,
"step": 14200
},
{
"epoch": 4.766980934422795,
"grad_norm": 6.662640571594238,
"learning_rate": 2.5888400506593164e-06,
"loss": 1.9635,
"step": 14220
},
{
"epoch": 4.773685313220197,
"grad_norm": 6.879613876342773,
"learning_rate": 2.5143410563957385e-06,
"loss": 1.9151,
"step": 14240
},
{
"epoch": 4.780389692017599,
"grad_norm": 6.439156532287598,
"learning_rate": 2.4398420621321615e-06,
"loss": 1.991,
"step": 14260
},
{
"epoch": 4.787094070815001,
"grad_norm": 5.503256797790527,
"learning_rate": 2.3653430678685837e-06,
"loss": 1.8901,
"step": 14280
},
{
"epoch": 4.793798449612403,
"grad_norm": 6.81868839263916,
"learning_rate": 2.2908440736050062e-06,
"loss": 1.9397,
"step": 14300
},
{
"epoch": 4.8005028284098055,
"grad_norm": 6.4171857833862305,
"learning_rate": 2.2163450793414292e-06,
"loss": 1.9112,
"step": 14320
},
{
"epoch": 4.807207207207207,
"grad_norm": 7.29513692855835,
"learning_rate": 2.1418460850778514e-06,
"loss": 1.9406,
"step": 14340
},
{
"epoch": 4.813911586004609,
"grad_norm": 6.337085723876953,
"learning_rate": 2.0673470908142744e-06,
"loss": 1.9374,
"step": 14360
},
{
"epoch": 4.820615964802011,
"grad_norm": 6.306519508361816,
"learning_rate": 1.9928480965506965e-06,
"loss": 1.9493,
"step": 14380
},
{
"epoch": 4.827320343599413,
"grad_norm": 5.782825469970703,
"learning_rate": 1.918349102287119e-06,
"loss": 1.886,
"step": 14400
},
{
"epoch": 4.834024722396816,
"grad_norm": 6.214239597320557,
"learning_rate": 1.843850108023542e-06,
"loss": 1.9407,
"step": 14420
},
{
"epoch": 4.840729101194217,
"grad_norm": 6.3050336837768555,
"learning_rate": 1.7693511137599643e-06,
"loss": 2.0325,
"step": 14440
},
{
"epoch": 4.847433479991619,
"grad_norm": 5.516683101654053,
"learning_rate": 1.694852119496387e-06,
"loss": 1.9344,
"step": 14460
},
{
"epoch": 4.854137858789022,
"grad_norm": 6.436117172241211,
"learning_rate": 1.6203531252328094e-06,
"loss": 1.9626,
"step": 14480
},
{
"epoch": 4.860842237586423,
"grad_norm": 6.287826061248779,
"learning_rate": 1.545854130969232e-06,
"loss": 1.9281,
"step": 14500
},
{
"epoch": 4.867546616383826,
"grad_norm": 5.765741348266602,
"learning_rate": 1.4713551367056546e-06,
"loss": 1.9629,
"step": 14520
},
{
"epoch": 4.874250995181228,
"grad_norm": 7.784135818481445,
"learning_rate": 1.3968561424420771e-06,
"loss": 1.9842,
"step": 14540
},
{
"epoch": 4.8809553739786296,
"grad_norm": 6.396987438201904,
"learning_rate": 1.3223571481784995e-06,
"loss": 1.9481,
"step": 14560
},
{
"epoch": 4.887659752776032,
"grad_norm": 6.332607746124268,
"learning_rate": 1.247858153914922e-06,
"loss": 1.9444,
"step": 14580
},
{
"epoch": 4.894364131573434,
"grad_norm": 7.538281440734863,
"learning_rate": 1.1733591596513449e-06,
"loss": 1.9695,
"step": 14600
},
{
"epoch": 4.901068510370836,
"grad_norm": 6.990569591522217,
"learning_rate": 1.0988601653877674e-06,
"loss": 1.9596,
"step": 14620
},
{
"epoch": 4.907772889168238,
"grad_norm": 7.1072998046875,
"learning_rate": 1.0243611711241898e-06,
"loss": 1.9351,
"step": 14640
},
{
"epoch": 4.9144772679656406,
"grad_norm": 6.343371391296387,
"learning_rate": 9.498621768606124e-07,
"loss": 1.862,
"step": 14660
},
{
"epoch": 4.921181646763042,
"grad_norm": 6.640133857727051,
"learning_rate": 8.75363182597035e-07,
"loss": 1.9533,
"step": 14680
},
{
"epoch": 4.927886025560444,
"grad_norm": 6.360694408416748,
"learning_rate": 8.008641883334574e-07,
"loss": 1.9342,
"step": 14700
},
{
"epoch": 4.934590404357846,
"grad_norm": 5.88399076461792,
"learning_rate": 7.263651940698801e-07,
"loss": 1.9337,
"step": 14720
},
{
"epoch": 4.941294783155248,
"grad_norm": 6.079113006591797,
"learning_rate": 6.518661998063027e-07,
"loss": 1.8845,
"step": 14740
},
{
"epoch": 4.947999161952651,
"grad_norm": 6.270729064941406,
"learning_rate": 5.773672055427253e-07,
"loss": 1.9492,
"step": 14760
},
{
"epoch": 4.954703540750052,
"grad_norm": 6.091942310333252,
"learning_rate": 5.028682112791477e-07,
"loss": 1.9806,
"step": 14780
},
{
"epoch": 4.9614079195474545,
"grad_norm": 7.632548809051514,
"learning_rate": 4.2836921701557035e-07,
"loss": 1.969,
"step": 14800
},
{
"epoch": 4.968112298344856,
"grad_norm": 6.508310794830322,
"learning_rate": 3.538702227519929e-07,
"loss": 1.946,
"step": 14820
},
{
"epoch": 4.974816677142258,
"grad_norm": 5.643795967102051,
"learning_rate": 2.793712284884154e-07,
"loss": 1.966,
"step": 14840
},
{
"epoch": 4.981521055939661,
"grad_norm": 5.909745693206787,
"learning_rate": 2.0487223422483797e-07,
"loss": 2.0078,
"step": 14860
},
{
"epoch": 4.988225434737062,
"grad_norm": 6.114766597747803,
"learning_rate": 1.3037323996126055e-07,
"loss": 1.9257,
"step": 14880
},
{
"epoch": 4.994929813534465,
"grad_norm": 6.140356063842773,
"learning_rate": 5.587424569768308e-08,
"loss": 1.9642,
"step": 14900
}
],
"logging_steps": 20,
"max_steps": 14915,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.1016910755245588e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}