{
"best_metric": 3.0770967925659036,
"best_model_checkpoint": "/workspace/llm-storage/output/qwen-3B/checkpoint-1000",
"epoch": 4.999958097632517,
"eval_steps": 1000,
"global_step": 14915,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006704378797402053,
"grad_norm": 11.061790466308594,
"learning_rate": 6.702412868632708e-07,
"loss": 11.096,
"step": 20
},
{
"epoch": 0.013408757594804106,
"grad_norm": 10.477948188781738,
"learning_rate": 1.3404825737265416e-06,
"loss": 11.0987,
"step": 40
},
{
"epoch": 0.02011313639220616,
"grad_norm": 12.07636547088623,
"learning_rate": 2.0107238605898126e-06,
"loss": 11.0549,
"step": 60
},
{
"epoch": 0.02681751518960821,
"grad_norm": 11.286941528320312,
"learning_rate": 2.680965147453083e-06,
"loss": 10.3064,
"step": 80
},
{
"epoch": 0.03352189398701027,
"grad_norm": 6.030429840087891,
"learning_rate": 3.351206434316354e-06,
"loss": 9.5768,
"step": 100
},
{
"epoch": 0.04022627278441232,
"grad_norm": 8.383974075317383,
"learning_rate": 4.021447721179625e-06,
"loss": 8.5186,
"step": 120
},
{
"epoch": 0.04693065158181437,
"grad_norm": 5.529025554656982,
"learning_rate": 4.691689008042896e-06,
"loss": 7.5261,
"step": 140
},
{
"epoch": 0.05363503037921642,
"grad_norm": 3.8176066875457764,
"learning_rate": 5.361930294906166e-06,
"loss": 7.0029,
"step": 160
},
{
"epoch": 0.06033940917661848,
"grad_norm": 3.5050182342529297,
"learning_rate": 6.032171581769437e-06,
"loss": 6.9204,
"step": 180
},
{
"epoch": 0.06704378797402054,
"grad_norm": 4.123743534088135,
"learning_rate": 6.702412868632708e-06,
"loss": 6.6679,
"step": 200
},
{
"epoch": 0.07374816677142258,
"grad_norm": 3.595418691635132,
"learning_rate": 7.372654155495978e-06,
"loss": 6.6241,
"step": 220
},
{
"epoch": 0.08045254556882464,
"grad_norm": 4.242733001708984,
"learning_rate": 8.04289544235925e-06,
"loss": 6.4657,
"step": 240
},
{
"epoch": 0.0871569243662267,
"grad_norm": 4.478307247161865,
"learning_rate": 8.71313672922252e-06,
"loss": 6.4337,
"step": 260
},
{
"epoch": 0.09386130316362874,
"grad_norm": 4.557178497314453,
"learning_rate": 9.383378016085791e-06,
"loss": 6.3413,
"step": 280
},
{
"epoch": 0.1005656819610308,
"grad_norm": 4.335766792297363,
"learning_rate": 1.0053619302949062e-05,
"loss": 6.4344,
"step": 300
},
{
"epoch": 0.10727006075843284,
"grad_norm": 4.590956687927246,
"learning_rate": 1.0723860589812333e-05,
"loss": 6.3337,
"step": 320
},
{
"epoch": 0.1139744395558349,
"grad_norm": 4.8770060539245605,
"learning_rate": 1.1394101876675605e-05,
"loss": 6.2143,
"step": 340
},
{
"epoch": 0.12067881835323696,
"grad_norm": 4.7008891105651855,
"learning_rate": 1.2064343163538874e-05,
"loss": 6.2898,
"step": 360
},
{
"epoch": 0.127383197150639,
"grad_norm": 4.890043258666992,
"learning_rate": 1.2734584450402146e-05,
"loss": 6.1775,
"step": 380
},
{
"epoch": 0.13408757594804108,
"grad_norm": 5.183588981628418,
"learning_rate": 1.3404825737265417e-05,
"loss": 6.1675,
"step": 400
},
{
"epoch": 0.14079195474544312,
"grad_norm": 5.2846527099609375,
"learning_rate": 1.4075067024128689e-05,
"loss": 6.2091,
"step": 420
},
{
"epoch": 0.14749633354284516,
"grad_norm": 5.238739490509033,
"learning_rate": 1.4745308310991956e-05,
"loss": 6.1628,
"step": 440
},
{
"epoch": 0.15420071234024724,
"grad_norm": 5.562626838684082,
"learning_rate": 1.5415549597855227e-05,
"loss": 6.0092,
"step": 460
},
{
"epoch": 0.16090509113764928,
"grad_norm": 6.380126476287842,
"learning_rate": 1.60857908847185e-05,
"loss": 6.0497,
"step": 480
},
{
"epoch": 0.16760946993505133,
"grad_norm": 5.533380031585693,
"learning_rate": 1.675603217158177e-05,
"loss": 5.9952,
"step": 500
},
{
"epoch": 0.1743138487324534,
"grad_norm": 6.026157379150391,
"learning_rate": 1.742627345844504e-05,
"loss": 6.0327,
"step": 520
},
{
"epoch": 0.18101822752985544,
"grad_norm": 5.79816198348999,
"learning_rate": 1.8096514745308312e-05,
"loss": 5.8172,
"step": 540
},
{
"epoch": 0.18772260632725749,
"grad_norm": 6.133901119232178,
"learning_rate": 1.8766756032171583e-05,
"loss": 5.7605,
"step": 560
},
{
"epoch": 0.19442698512465956,
"grad_norm": 7.115331649780273,
"learning_rate": 1.9436997319034853e-05,
"loss": 5.9287,
"step": 580
},
{
"epoch": 0.2011313639220616,
"grad_norm": 6.978466033935547,
"learning_rate": 2.0107238605898124e-05,
"loss": 5.7846,
"step": 600
},
{
"epoch": 0.20783574271946365,
"grad_norm": 7.083895206451416,
"learning_rate": 2.0777479892761395e-05,
"loss": 5.6818,
"step": 620
},
{
"epoch": 0.2145401215168657,
"grad_norm": 7.947028636932373,
"learning_rate": 2.1447721179624665e-05,
"loss": 5.7934,
"step": 640
},
{
"epoch": 0.22124450031426776,
"grad_norm": 6.990066051483154,
"learning_rate": 2.211796246648794e-05,
"loss": 5.6991,
"step": 660
},
{
"epoch": 0.2279488791116698,
"grad_norm": 6.842931747436523,
"learning_rate": 2.278820375335121e-05,
"loss": 5.75,
"step": 680
},
{
"epoch": 0.23465325790907185,
"grad_norm": 6.710008144378662,
"learning_rate": 2.3458445040214477e-05,
"loss": 5.5526,
"step": 700
},
{
"epoch": 0.24135763670647392,
"grad_norm": 6.721392631530762,
"learning_rate": 2.4128686327077747e-05,
"loss": 5.6124,
"step": 720
},
{
"epoch": 0.24806201550387597,
"grad_norm": 7.801576614379883,
"learning_rate": 2.479892761394102e-05,
"loss": 5.6865,
"step": 740
},
{
"epoch": 0.254766394301278,
"grad_norm": 7.230539798736572,
"learning_rate": 2.5469168900804292e-05,
"loss": 5.5821,
"step": 760
},
{
"epoch": 0.26147077309868005,
"grad_norm": 7.283995151519775,
"learning_rate": 2.6139410187667563e-05,
"loss": 5.6686,
"step": 780
},
{
"epoch": 0.26817515189608215,
"grad_norm": 7.756102085113525,
"learning_rate": 2.6809651474530833e-05,
"loss": 5.4761,
"step": 800
},
{
"epoch": 0.2748795306934842,
"grad_norm": 7.6954569816589355,
"learning_rate": 2.7479892761394104e-05,
"loss": 5.6806,
"step": 820
},
{
"epoch": 0.28158390949088624,
"grad_norm": 8.64757251739502,
"learning_rate": 2.8150134048257378e-05,
"loss": 5.525,
"step": 840
},
{
"epoch": 0.2882882882882883,
"grad_norm": 7.394837379455566,
"learning_rate": 2.8820375335120648e-05,
"loss": 5.4771,
"step": 860
},
{
"epoch": 0.29499266708569033,
"grad_norm": 8.593236923217773,
"learning_rate": 2.9490616621983912e-05,
"loss": 5.5333,
"step": 880
},
{
"epoch": 0.3016970458830924,
"grad_norm": 7.424787998199463,
"learning_rate": 3.0160857908847186e-05,
"loss": 5.2982,
"step": 900
},
{
"epoch": 0.3084014246804945,
"grad_norm": 7.988162994384766,
"learning_rate": 3.083109919571045e-05,
"loss": 5.4675,
"step": 920
},
{
"epoch": 0.3151058034778965,
"grad_norm": 7.624905586242676,
"learning_rate": 3.1501340482573724e-05,
"loss": 5.4906,
"step": 940
},
{
"epoch": 0.32181018227529856,
"grad_norm": 8.334522247314453,
"learning_rate": 3.2171581769437e-05,
"loss": 5.4069,
"step": 960
},
{
"epoch": 0.3285145610727006,
"grad_norm": 8.150382041931152,
"learning_rate": 3.284182305630027e-05,
"loss": 5.4312,
"step": 980
},
{
"epoch": 0.33521893987010265,
"grad_norm": 8.183965682983398,
"learning_rate": 3.351206434316354e-05,
"loss": 5.4012,
"step": 1000
},
{
"epoch": 0.33521893987010265,
"eval_bleu_greedy": 3.0770967925659036,
"eval_loss": 0.693783700466156,
"eval_runtime": 309.6004,
"eval_samples_per_second": 0.323,
"eval_steps_per_second": 0.323,
"step": 1000
},
{
"epoch": 0.3419233186675047,
"grad_norm": 8.621966361999512,
"learning_rate": 3.418230563002681e-05,
"loss": 5.3954,
"step": 1020
},
{
"epoch": 0.3486276974649068,
"grad_norm": 8.160908699035645,
"learning_rate": 3.485254691689008e-05,
"loss": 5.4285,
"step": 1040
},
{
"epoch": 0.35533207626230884,
"grad_norm": 7.480456352233887,
"learning_rate": 3.5522788203753354e-05,
"loss": 5.3167,
"step": 1060
},
{
"epoch": 0.3620364550597109,
"grad_norm": 7.508689880371094,
"learning_rate": 3.6193029490616625e-05,
"loss": 5.3439,
"step": 1080
},
{
"epoch": 0.3687408338571129,
"grad_norm": 9.118748664855957,
"learning_rate": 3.6863270777479895e-05,
"loss": 5.3604,
"step": 1100
},
{
"epoch": 0.37544521265451497,
"grad_norm": 7.6000471115112305,
"learning_rate": 3.7533512064343166e-05,
"loss": 5.2976,
"step": 1120
},
{
"epoch": 0.382149591451917,
"grad_norm": 7.776809215545654,
"learning_rate": 3.8203753351206436e-05,
"loss": 5.3177,
"step": 1140
},
{
"epoch": 0.3888539702493191,
"grad_norm": 8.50612735748291,
"learning_rate": 3.887399463806971e-05,
"loss": 5.2408,
"step": 1160
},
{
"epoch": 0.39555834904672116,
"grad_norm": 7.958391189575195,
"learning_rate": 3.954423592493298e-05,
"loss": 5.2248,
"step": 1180
},
{
"epoch": 0.4022627278441232,
"grad_norm": 7.7386579513549805,
"learning_rate": 4.021447721179625e-05,
"loss": 5.2663,
"step": 1200
},
{
"epoch": 0.40896710664152525,
"grad_norm": 8.172608375549316,
"learning_rate": 4.088471849865952e-05,
"loss": 5.2612,
"step": 1220
},
{
"epoch": 0.4156714854389273,
"grad_norm": 7.354376792907715,
"learning_rate": 4.155495978552279e-05,
"loss": 5.2672,
"step": 1240
},
{
"epoch": 0.42237586423632933,
"grad_norm": 7.837838649749756,
"learning_rate": 4.222520107238606e-05,
"loss": 5.1707,
"step": 1260
},
{
"epoch": 0.4290802430337314,
"grad_norm": 8.173263549804688,
"learning_rate": 4.289544235924933e-05,
"loss": 5.2387,
"step": 1280
},
{
"epoch": 0.4357846218311335,
"grad_norm": 7.122191905975342,
"learning_rate": 4.35656836461126e-05,
"loss": 5.2604,
"step": 1300
},
{
"epoch": 0.4424890006285355,
"grad_norm": 7.6141839027404785,
"learning_rate": 4.423592493297588e-05,
"loss": 5.1689,
"step": 1320
},
{
"epoch": 0.44919337942593757,
"grad_norm": 7.241093635559082,
"learning_rate": 4.490616621983915e-05,
"loss": 5.1451,
"step": 1340
},
{
"epoch": 0.4558977582233396,
"grad_norm": 6.968513011932373,
"learning_rate": 4.557640750670242e-05,
"loss": 5.0216,
"step": 1360
},
{
"epoch": 0.46260213702074165,
"grad_norm": 7.734285354614258,
"learning_rate": 4.624664879356568e-05,
"loss": 5.0932,
"step": 1380
},
{
"epoch": 0.4693065158181437,
"grad_norm": 8.277946472167969,
"learning_rate": 4.6916890080428954e-05,
"loss": 5.1977,
"step": 1400
},
{
"epoch": 0.4760108946155458,
"grad_norm": 7.437044620513916,
"learning_rate": 4.7587131367292224e-05,
"loss": 5.1254,
"step": 1420
},
{
"epoch": 0.48271527341294784,
"grad_norm": 7.227113723754883,
"learning_rate": 4.8257372654155495e-05,
"loss": 5.0943,
"step": 1440
},
{
"epoch": 0.4894196522103499,
"grad_norm": 7.8769683837890625,
"learning_rate": 4.8927613941018765e-05,
"loss": 5.0513,
"step": 1460
},
{
"epoch": 0.49612403100775193,
"grad_norm": 6.573411464691162,
"learning_rate": 4.959785522788204e-05,
"loss": 5.045,
"step": 1480
},
{
"epoch": 0.502828409805154,
"grad_norm": 6.970616817474365,
"learning_rate": 4.997020040229457e-05,
"loss": 5.0549,
"step": 1500
},
{
"epoch": 0.509532788602556,
"grad_norm": 7.552504539489746,
"learning_rate": 4.989570140803099e-05,
"loss": 5.1787,
"step": 1520
},
{
"epoch": 0.5162371673999581,
"grad_norm": 6.920768737792969,
"learning_rate": 4.982120241376742e-05,
"loss": 5.0916,
"step": 1540
},
{
"epoch": 0.5229415461973601,
"grad_norm": 6.61656379699707,
"learning_rate": 4.974670341950384e-05,
"loss": 5.0528,
"step": 1560
},
{
"epoch": 0.5296459249947622,
"grad_norm": 7.405433654785156,
"learning_rate": 4.967220442524026e-05,
"loss": 5.0544,
"step": 1580
},
{
"epoch": 0.5363503037921643,
"grad_norm": 7.384308815002441,
"learning_rate": 4.959770543097668e-05,
"loss": 5.0058,
"step": 1600
},
{
"epoch": 0.5430546825895664,
"grad_norm": 6.147129058837891,
"learning_rate": 4.95232064367131e-05,
"loss": 5.1453,
"step": 1620
},
{
"epoch": 0.5497590613869684,
"grad_norm": 6.856501579284668,
"learning_rate": 4.944870744244953e-05,
"loss": 4.997,
"step": 1640
},
{
"epoch": 0.5564634401843704,
"grad_norm": 7.677363395690918,
"learning_rate": 4.937420844818596e-05,
"loss": 5.0445,
"step": 1660
},
{
"epoch": 0.5631678189817725,
"grad_norm": 6.1870269775390625,
"learning_rate": 4.929970945392238e-05,
"loss": 4.8786,
"step": 1680
},
{
"epoch": 0.5698721977791745,
"grad_norm": 6.6285529136657715,
"learning_rate": 4.92252104596588e-05,
"loss": 4.9182,
"step": 1700
},
{
"epoch": 0.5765765765765766,
"grad_norm": 6.762671947479248,
"learning_rate": 4.915071146539522e-05,
"loss": 4.836,
"step": 1720
},
{
"epoch": 0.5832809553739786,
"grad_norm": 6.840793132781982,
"learning_rate": 4.907621247113165e-05,
"loss": 4.9259,
"step": 1740
},
{
"epoch": 0.5899853341713807,
"grad_norm": 7.508916854858398,
"learning_rate": 4.900171347686807e-05,
"loss": 4.9955,
"step": 1760
},
{
"epoch": 0.5966897129687827,
"grad_norm": 7.2800374031066895,
"learning_rate": 4.892721448260449e-05,
"loss": 4.9515,
"step": 1780
},
{
"epoch": 0.6033940917661847,
"grad_norm": 6.398910999298096,
"learning_rate": 4.885271548834091e-05,
"loss": 4.9002,
"step": 1800
},
{
"epoch": 0.6100984705635868,
"grad_norm": 7.054329872131348,
"learning_rate": 4.877821649407733e-05,
"loss": 4.8998,
"step": 1820
},
{
"epoch": 0.616802849360989,
"grad_norm": 6.903358459472656,
"learning_rate": 4.870371749981376e-05,
"loss": 4.8227,
"step": 1840
},
{
"epoch": 0.623507228158391,
"grad_norm": 6.382834434509277,
"learning_rate": 4.862921850555018e-05,
"loss": 4.8448,
"step": 1860
},
{
"epoch": 0.630211606955793,
"grad_norm": 6.244606018066406,
"learning_rate": 4.85547195112866e-05,
"loss": 4.9048,
"step": 1880
},
{
"epoch": 0.6369159857531951,
"grad_norm": 6.7048115730285645,
"learning_rate": 4.848022051702302e-05,
"loss": 4.8862,
"step": 1900
},
{
"epoch": 0.6436203645505971,
"grad_norm": 6.761898994445801,
"learning_rate": 4.840572152275945e-05,
"loss": 4.836,
"step": 1920
},
{
"epoch": 0.6503247433479992,
"grad_norm": 6.694396495819092,
"learning_rate": 4.833122252849587e-05,
"loss": 4.922,
"step": 1940
},
{
"epoch": 0.6570291221454012,
"grad_norm": 7.083889961242676,
"learning_rate": 4.825672353423229e-05,
"loss": 4.801,
"step": 1960
},
{
"epoch": 0.6637335009428033,
"grad_norm": 6.358527183532715,
"learning_rate": 4.818222453996871e-05,
"loss": 4.9146,
"step": 1980
},
{
"epoch": 0.6704378797402053,
"grad_norm": 6.129880428314209,
"learning_rate": 4.810772554570513e-05,
"loss": 4.7806,
"step": 2000
},
{
"epoch": 0.6704378797402053,
"eval_bleu_greedy": 1.635969783116869,
"eval_loss": 0.6162992715835571,
"eval_runtime": 98.9607,
"eval_samples_per_second": 1.011,
"eval_steps_per_second": 1.011,
"step": 2000
},
{
"epoch": 0.6771422585376073,
"grad_norm": 6.157505989074707,
"learning_rate": 4.803322655144156e-05,
"loss": 4.6963,
"step": 2020
},
{
"epoch": 0.6838466373350094,
"grad_norm": 6.576890468597412,
"learning_rate": 4.795872755717798e-05,
"loss": 4.7945,
"step": 2040
},
{
"epoch": 0.6905510161324114,
"grad_norm": 6.158898830413818,
"learning_rate": 4.78842285629144e-05,
"loss": 4.7629,
"step": 2060
},
{
"epoch": 0.6972553949298136,
"grad_norm": 7.410290241241455,
"learning_rate": 4.780972956865082e-05,
"loss": 4.6563,
"step": 2080
},
{
"epoch": 0.7039597737272156,
"grad_norm": 6.731761455535889,
"learning_rate": 4.773523057438724e-05,
"loss": 4.6947,
"step": 2100
},
{
"epoch": 0.7106641525246177,
"grad_norm": 6.34529447555542,
"learning_rate": 4.766073158012367e-05,
"loss": 4.7754,
"step": 2120
},
{
"epoch": 0.7173685313220197,
"grad_norm": 6.288251876831055,
"learning_rate": 4.758623258586009e-05,
"loss": 4.7656,
"step": 2140
},
{
"epoch": 0.7240729101194218,
"grad_norm": 6.207250595092773,
"learning_rate": 4.751173359159651e-05,
"loss": 4.6873,
"step": 2160
},
{
"epoch": 0.7307772889168238,
"grad_norm": 6.283239841461182,
"learning_rate": 4.7437234597332934e-05,
"loss": 4.6951,
"step": 2180
},
{
"epoch": 0.7374816677142259,
"grad_norm": 6.399360179901123,
"learning_rate": 4.736273560306936e-05,
"loss": 4.8608,
"step": 2200
},
{
"epoch": 0.7441860465116279,
"grad_norm": 7.108467102050781,
"learning_rate": 4.728823660880579e-05,
"loss": 4.7696,
"step": 2220
},
{
"epoch": 0.7508904253090299,
"grad_norm": 6.39521598815918,
"learning_rate": 4.721373761454221e-05,
"loss": 4.6725,
"step": 2240
},
{
"epoch": 0.757594804106432,
"grad_norm": 7.189324855804443,
"learning_rate": 4.713923862027863e-05,
"loss": 4.7705,
"step": 2260
},
{
"epoch": 0.764299182903834,
"grad_norm": 6.980968952178955,
"learning_rate": 4.706473962601505e-05,
"loss": 4.6747,
"step": 2280
},
{
"epoch": 0.7710035617012361,
"grad_norm": 5.775346755981445,
"learning_rate": 4.699024063175147e-05,
"loss": 4.5941,
"step": 2300
},
{
"epoch": 0.7777079404986382,
"grad_norm": 6.348450183868408,
"learning_rate": 4.69157416374879e-05,
"loss": 4.6419,
"step": 2320
},
{
"epoch": 0.7844123192960403,
"grad_norm": 6.2754340171813965,
"learning_rate": 4.684124264322432e-05,
"loss": 4.6844,
"step": 2340
},
{
"epoch": 0.7911166980934423,
"grad_norm": 6.037561893463135,
"learning_rate": 4.676674364896074e-05,
"loss": 4.6424,
"step": 2360
},
{
"epoch": 0.7978210768908444,
"grad_norm": 6.3136372566223145,
"learning_rate": 4.669224465469716e-05,
"loss": 4.6775,
"step": 2380
},
{
"epoch": 0.8045254556882464,
"grad_norm": 6.91141939163208,
"learning_rate": 4.661774566043359e-05,
"loss": 4.6124,
"step": 2400
},
{
"epoch": 0.8112298344856484,
"grad_norm": 5.787718772888184,
"learning_rate": 4.654324666617001e-05,
"loss": 4.6595,
"step": 2420
},
{
"epoch": 0.8179342132830505,
"grad_norm": 6.696752548217773,
"learning_rate": 4.646874767190643e-05,
"loss": 4.563,
"step": 2440
},
{
"epoch": 0.8246385920804525,
"grad_norm": 6.550769805908203,
"learning_rate": 4.639424867764285e-05,
"loss": 4.7101,
"step": 2460
},
{
"epoch": 0.8313429708778546,
"grad_norm": 6.5647969245910645,
"learning_rate": 4.6319749683379274e-05,
"loss": 4.5763,
"step": 2480
},
{
"epoch": 0.8380473496752566,
"grad_norm": 6.71086311340332,
"learning_rate": 4.62452506891157e-05,
"loss": 4.4918,
"step": 2500
},
{
"epoch": 0.8447517284726587,
"grad_norm": 6.1139445304870605,
"learning_rate": 4.617075169485212e-05,
"loss": 4.5095,
"step": 2520
},
{
"epoch": 0.8514561072700607,
"grad_norm": 5.594122886657715,
"learning_rate": 4.609625270058854e-05,
"loss": 4.6359,
"step": 2540
},
{
"epoch": 0.8581604860674628,
"grad_norm": 6.769913196563721,
"learning_rate": 4.6021753706324964e-05,
"loss": 4.4938,
"step": 2560
},
{
"epoch": 0.8648648648648649,
"grad_norm": 6.663547039031982,
"learning_rate": 4.5947254712061385e-05,
"loss": 4.5619,
"step": 2580
},
{
"epoch": 0.871569243662267,
"grad_norm": 6.266171455383301,
"learning_rate": 4.587275571779781e-05,
"loss": 4.4406,
"step": 2600
},
{
"epoch": 0.878273622459669,
"grad_norm": 6.513619422912598,
"learning_rate": 4.5798256723534234e-05,
"loss": 4.498,
"step": 2620
},
{
"epoch": 0.884978001257071,
"grad_norm": 6.088827133178711,
"learning_rate": 4.5723757729270654e-05,
"loss": 4.5242,
"step": 2640
},
{
"epoch": 0.8916823800544731,
"grad_norm": 6.1904296875,
"learning_rate": 4.5649258735007075e-05,
"loss": 4.5976,
"step": 2660
},
{
"epoch": 0.8983867588518751,
"grad_norm": 6.429610252380371,
"learning_rate": 4.55747597407435e-05,
"loss": 4.4584,
"step": 2680
},
{
"epoch": 0.9050911376492772,
"grad_norm": 5.907393932342529,
"learning_rate": 4.5500260746479924e-05,
"loss": 4.5814,
"step": 2700
},
{
"epoch": 0.9117955164466792,
"grad_norm": 6.102148056030273,
"learning_rate": 4.5425761752216345e-05,
"loss": 4.5913,
"step": 2720
},
{
"epoch": 0.9184998952440813,
"grad_norm": 6.3552327156066895,
"learning_rate": 4.5351262757952766e-05,
"loss": 4.5327,
"step": 2740
},
{
"epoch": 0.9252042740414833,
"grad_norm": 5.990479946136475,
"learning_rate": 4.527676376368919e-05,
"loss": 4.4025,
"step": 2760
},
{
"epoch": 0.9319086528388854,
"grad_norm": 5.946578502655029,
"learning_rate": 4.5202264769425614e-05,
"loss": 4.5777,
"step": 2780
},
{
"epoch": 0.9386130316362874,
"grad_norm": 6.422057151794434,
"learning_rate": 4.512776577516204e-05,
"loss": 4.4951,
"step": 2800
},
{
"epoch": 0.9453174104336896,
"grad_norm": 6.144739151000977,
"learning_rate": 4.505326678089846e-05,
"loss": 4.5586,
"step": 2820
},
{
"epoch": 0.9520217892310916,
"grad_norm": 6.011499404907227,
"learning_rate": 4.4978767786634884e-05,
"loss": 4.4695,
"step": 2840
},
{
"epoch": 0.9587261680284936,
"grad_norm": 5.852478504180908,
"learning_rate": 4.4904268792371304e-05,
"loss": 4.4765,
"step": 2860
},
{
"epoch": 0.9654305468258957,
"grad_norm": 5.951258182525635,
"learning_rate": 4.482976979810773e-05,
"loss": 4.4017,
"step": 2880
},
{
"epoch": 0.9721349256232977,
"grad_norm": 6.046126842498779,
"learning_rate": 4.475527080384415e-05,
"loss": 4.4065,
"step": 2900
},
{
"epoch": 0.9788393044206998,
"grad_norm": 7.13279390335083,
"learning_rate": 4.4680771809580574e-05,
"loss": 4.4527,
"step": 2920
},
{
"epoch": 0.9855436832181018,
"grad_norm": 6.364200115203857,
"learning_rate": 4.4606272815316995e-05,
"loss": 4.4392,
"step": 2940
},
{
"epoch": 0.9922480620155039,
"grad_norm": 6.487414360046387,
"learning_rate": 4.4531773821053416e-05,
"loss": 4.3799,
"step": 2960
},
{
"epoch": 0.9989524408129059,
"grad_norm": 6.20075798034668,
"learning_rate": 4.445727482678984e-05,
"loss": 4.3814,
"step": 2980
},
{
"epoch": 1.0056987219777918,
"grad_norm": 6.238715171813965,
"learning_rate": 4.4382775832526264e-05,
"loss": 4.3533,
"step": 3000
},
{
"epoch": 1.0056987219777918,
"eval_bleu_greedy": 2.12858213528201,
"eval_loss": 0.568386971950531,
"eval_runtime": 434.1939,
"eval_samples_per_second": 0.23,
"eval_steps_per_second": 0.23,
"step": 3000
},
{
"epoch": 1.012403100775194,
"grad_norm": 6.458903789520264,
"learning_rate": 4.4308276838262685e-05,
"loss": 4.218,
"step": 3020
},
{
"epoch": 1.0191074795725958,
"grad_norm": 6.330122947692871,
"learning_rate": 4.4233777843999106e-05,
"loss": 4.3083,
"step": 3040
},
{
"epoch": 1.025811858369998,
"grad_norm": 6.430805683135986,
"learning_rate": 4.4159278849735534e-05,
"loss": 4.2772,
"step": 3060
},
{
"epoch": 1.0325162371674,
"grad_norm": 6.592049598693848,
"learning_rate": 4.4084779855471954e-05,
"loss": 4.3208,
"step": 3080
},
{
"epoch": 1.039220615964802,
"grad_norm": 6.5312299728393555,
"learning_rate": 4.4010280861208375e-05,
"loss": 4.1853,
"step": 3100
},
{
"epoch": 1.045924994762204,
"grad_norm": 5.972381591796875,
"learning_rate": 4.3935781866944796e-05,
"loss": 4.2346,
"step": 3120
},
{
"epoch": 1.0526293735596062,
"grad_norm": 5.913834095001221,
"learning_rate": 4.386128287268122e-05,
"loss": 4.2687,
"step": 3140
},
{
"epoch": 1.059333752357008,
"grad_norm": 6.429443836212158,
"learning_rate": 4.3786783878417645e-05,
"loss": 4.33,
"step": 3160
},
{
"epoch": 1.0660381311544103,
"grad_norm": 6.044195175170898,
"learning_rate": 4.3712284884154066e-05,
"loss": 4.2251,
"step": 3180
},
{
"epoch": 1.0727425099518122,
"grad_norm": 6.010583877563477,
"learning_rate": 4.3637785889890487e-05,
"loss": 4.3161,
"step": 3200
},
{
"epoch": 1.0794468887492144,
"grad_norm": 5.625052452087402,
"learning_rate": 4.356328689562691e-05,
"loss": 4.2069,
"step": 3220
},
{
"epoch": 1.0861512675466163,
"grad_norm": 6.308145999908447,
"learning_rate": 4.348878790136333e-05,
"loss": 4.2812,
"step": 3240
},
{
"epoch": 1.0928556463440184,
"grad_norm": 6.858571529388428,
"learning_rate": 4.3414288907099756e-05,
"loss": 4.2798,
"step": 3260
},
{
"epoch": 1.0995600251414206,
"grad_norm": 6.711289882659912,
"learning_rate": 4.333978991283618e-05,
"loss": 4.2203,
"step": 3280
},
{
"epoch": 1.1062644039388225,
"grad_norm": 6.270653247833252,
"learning_rate": 4.3265290918572605e-05,
"loss": 4.259,
"step": 3300
},
{
"epoch": 1.1129687827362247,
"grad_norm": 5.929893493652344,
"learning_rate": 4.3190791924309025e-05,
"loss": 4.3751,
"step": 3320
},
{
"epoch": 1.1196731615336266,
"grad_norm": 5.837188720703125,
"learning_rate": 4.3116292930045446e-05,
"loss": 4.2712,
"step": 3340
},
{
"epoch": 1.1263775403310288,
"grad_norm": 6.670574188232422,
"learning_rate": 4.3041793935781874e-05,
"loss": 4.2151,
"step": 3360
},
{
"epoch": 1.1330819191284307,
"grad_norm": 6.452718734741211,
"learning_rate": 4.2967294941518295e-05,
"loss": 4.321,
"step": 3380
},
{
"epoch": 1.1397862979258329,
"grad_norm": 6.4428391456604,
"learning_rate": 4.2892795947254716e-05,
"loss": 4.246,
"step": 3400
},
{
"epoch": 1.1464906767232348,
"grad_norm": 6.246615409851074,
"learning_rate": 4.2818296952991137e-05,
"loss": 4.2763,
"step": 3420
},
{
"epoch": 1.153195055520637,
"grad_norm": 6.092718124389648,
"learning_rate": 4.274379795872756e-05,
"loss": 4.1313,
"step": 3440
},
{
"epoch": 1.1598994343180389,
"grad_norm": 5.333466529846191,
"learning_rate": 4.2669298964463985e-05,
"loss": 4.2007,
"step": 3460
},
{
"epoch": 1.166603813115441,
"grad_norm": 7.08294677734375,
"learning_rate": 4.2594799970200406e-05,
"loss": 4.1708,
"step": 3480
},
{
"epoch": 1.173308191912843,
"grad_norm": 6.408305644989014,
"learning_rate": 4.252030097593683e-05,
"loss": 4.2438,
"step": 3500
},
{
"epoch": 1.1800125707102451,
"grad_norm": 5.942695140838623,
"learning_rate": 4.244580198167325e-05,
"loss": 4.1205,
"step": 3520
},
{
"epoch": 1.1867169495076473,
"grad_norm": 6.69981050491333,
"learning_rate": 4.2371302987409675e-05,
"loss": 4.1979,
"step": 3540
},
{
"epoch": 1.1934213283050492,
"grad_norm": 7.073486804962158,
"learning_rate": 4.2296803993146096e-05,
"loss": 4.286,
"step": 3560
},
{
"epoch": 1.2001257071024514,
"grad_norm": 6.129693031311035,
"learning_rate": 4.222230499888252e-05,
"loss": 4.1252,
"step": 3580
},
{
"epoch": 1.2068300858998533,
"grad_norm": 6.761497497558594,
"learning_rate": 4.214780600461894e-05,
"loss": 4.16,
"step": 3600
},
{
"epoch": 1.2135344646972555,
"grad_norm": 6.681031703948975,
"learning_rate": 4.207330701035536e-05,
"loss": 4.239,
"step": 3620
},
{
"epoch": 1.2202388434946574,
"grad_norm": 6.160006523132324,
"learning_rate": 4.1998808016091787e-05,
"loss": 4.1435,
"step": 3640
},
{
"epoch": 1.2269432222920595,
"grad_norm": 6.183200359344482,
"learning_rate": 4.192430902182821e-05,
"loss": 4.1166,
"step": 3660
},
{
"epoch": 1.2336476010894615,
"grad_norm": 5.975028991699219,
"learning_rate": 4.184981002756463e-05,
"loss": 4.0858,
"step": 3680
},
{
"epoch": 1.2403519798868636,
"grad_norm": 6.827803134918213,
"learning_rate": 4.177531103330105e-05,
"loss": 4.1952,
"step": 3700
},
{
"epoch": 1.2470563586842656,
"grad_norm": 6.478833198547363,
"learning_rate": 4.170081203903747e-05,
"loss": 4.059,
"step": 3720
},
{
"epoch": 1.2537607374816677,
"grad_norm": 6.197700500488281,
"learning_rate": 4.16263130447739e-05,
"loss": 4.2003,
"step": 3740
},
{
"epoch": 1.2604651162790699,
"grad_norm": 5.54361629486084,
"learning_rate": 4.155181405051032e-05,
"loss": 4.1438,
"step": 3760
},
{
"epoch": 1.2671694950764718,
"grad_norm": 5.62382698059082,
"learning_rate": 4.147731505624674e-05,
"loss": 4.154,
"step": 3780
},
{
"epoch": 1.2738738738738737,
"grad_norm": 5.418813228607178,
"learning_rate": 4.140281606198316e-05,
"loss": 4.1561,
"step": 3800
},
{
"epoch": 1.280578252671276,
"grad_norm": 5.975061893463135,
"learning_rate": 4.132831706771959e-05,
"loss": 4.178,
"step": 3820
},
{
"epoch": 1.287282631468678,
"grad_norm": 6.231929302215576,
"learning_rate": 4.125381807345601e-05,
"loss": 4.1253,
"step": 3840
},
{
"epoch": 1.29398701026608,
"grad_norm": 6.092617034912109,
"learning_rate": 4.1179319079192437e-05,
"loss": 4.1438,
"step": 3860
},
{
"epoch": 1.3006913890634821,
"grad_norm": 6.094106674194336,
"learning_rate": 4.110482008492886e-05,
"loss": 4.1144,
"step": 3880
},
{
"epoch": 1.307395767860884,
"grad_norm": 6.208296298980713,
"learning_rate": 4.103032109066528e-05,
"loss": 4.1352,
"step": 3900
},
{
"epoch": 1.3141001466582862,
"grad_norm": 5.595242500305176,
"learning_rate": 4.09558220964017e-05,
"loss": 4.1146,
"step": 3920
},
{
"epoch": 1.3208045254556882,
"grad_norm": 6.050002098083496,
"learning_rate": 4.088132310213813e-05,
"loss": 4.0787,
"step": 3940
},
{
"epoch": 1.3275089042530903,
"grad_norm": 5.937078952789307,
"learning_rate": 4.080682410787455e-05,
"loss": 4.2249,
"step": 3960
},
{
"epoch": 1.3342132830504925,
"grad_norm": 6.359293460845947,
"learning_rate": 4.073232511361097e-05,
"loss": 4.0592,
"step": 3980
},
{
"epoch": 1.3409176618478944,
"grad_norm": 5.486398696899414,
"learning_rate": 4.065782611934739e-05,
"loss": 4.183,
"step": 4000
},
{
"epoch": 1.3409176618478944,
"eval_bleu_greedy": 1.8924225876289293,
"eval_loss": 0.5379119515419006,
"eval_runtime": 41.2638,
"eval_samples_per_second": 2.423,
"eval_steps_per_second": 2.423,
"step": 4000
},
{
"epoch": 1.3476220406452963,
"grad_norm": 6.733095169067383,
"learning_rate": 4.058332712508382e-05,
"loss": 4.1826,
"step": 4020
},
{
"epoch": 1.3543264194426985,
"grad_norm": 6.984947681427002,
"learning_rate": 4.050882813082024e-05,
"loss": 4.0244,
"step": 4040
},
{
"epoch": 1.3610307982401006,
"grad_norm": 6.321763515472412,
"learning_rate": 4.043432913655666e-05,
"loss": 4.1419,
"step": 4060
},
{
"epoch": 1.3677351770375026,
"grad_norm": 6.014941692352295,
"learning_rate": 4.035983014229308e-05,
"loss": 4.0579,
"step": 4080
},
{
"epoch": 1.3744395558349047,
"grad_norm": 6.782519340515137,
"learning_rate": 4.02853311480295e-05,
"loss": 4.037,
"step": 4100
},
{
"epoch": 1.3811439346323067,
"grad_norm": 5.457937717437744,
"learning_rate": 4.021083215376593e-05,
"loss": 4.0483,
"step": 4120
},
{
"epoch": 1.3878483134297088,
"grad_norm": 6.121335983276367,
"learning_rate": 4.013633315950235e-05,
"loss": 4.0982,
"step": 4140
},
{
"epoch": 1.3945526922271108,
"grad_norm": 6.334305763244629,
"learning_rate": 4.006183416523877e-05,
"loss": 4.106,
"step": 4160
},
{
"epoch": 1.401257071024513,
"grad_norm": 6.201812744140625,
"learning_rate": 3.998733517097519e-05,
"loss": 4.0633,
"step": 4180
},
{
"epoch": 1.407961449821915,
"grad_norm": 6.2243828773498535,
"learning_rate": 3.991283617671162e-05,
"loss": 4.0215,
"step": 4200
},
{
"epoch": 1.414665828619317,
"grad_norm": 6.266222953796387,
"learning_rate": 3.983833718244804e-05,
"loss": 4.1322,
"step": 4220
},
{
"epoch": 1.421370207416719,
"grad_norm": 5.890945911407471,
"learning_rate": 3.976383818818446e-05,
"loss": 3.8993,
"step": 4240
},
{
"epoch": 1.428074586214121,
"grad_norm": 5.7960991859436035,
"learning_rate": 3.968933919392088e-05,
"loss": 4.1042,
"step": 4260
},
{
"epoch": 1.4347789650115232,
"grad_norm": 6.632575988769531,
"learning_rate": 3.96148401996573e-05,
"loss": 4.1236,
"step": 4280
},
{
"epoch": 1.4414833438089252,
"grad_norm": 6.313004493713379,
"learning_rate": 3.954034120539373e-05,
"loss": 4.0445,
"step": 4300
},
{
"epoch": 1.4481877226063273,
"grad_norm": 6.819790840148926,
"learning_rate": 3.946584221113015e-05,
"loss": 4.0814,
"step": 4320
},
{
"epoch": 1.4548921014037293,
"grad_norm": 5.719134330749512,
"learning_rate": 3.939134321686657e-05,
"loss": 3.9592,
"step": 4340
},
{
"epoch": 1.4615964802011314,
"grad_norm": 6.105227470397949,
"learning_rate": 3.931684422260299e-05,
"loss": 4.0809,
"step": 4360
},
{
"epoch": 1.4683008589985334,
"grad_norm": 6.068193435668945,
"learning_rate": 3.924234522833941e-05,
"loss": 4.1412,
"step": 4380
},
{
"epoch": 1.4750052377959355,
"grad_norm": 5.725421905517578,
"learning_rate": 3.916784623407585e-05,
"loss": 4.0711,
"step": 4400
},
{
"epoch": 1.4817096165933374,
"grad_norm": 6.790429592132568,
"learning_rate": 3.909334723981227e-05,
"loss": 3.9287,
"step": 4420
},
{
"epoch": 1.4884139953907396,
"grad_norm": 6.1765971183776855,
"learning_rate": 3.901884824554869e-05,
"loss": 4.0219,
"step": 4440
},
{
"epoch": 1.4951183741881415,
"grad_norm": 5.821228981018066,
"learning_rate": 3.894434925128511e-05,
"loss": 3.9816,
"step": 4460
},
{
"epoch": 1.5018227529855437,
"grad_norm": 6.144356727600098,
"learning_rate": 3.886985025702153e-05,
"loss": 3.9802,
"step": 4480
},
{
"epoch": 1.5085271317829458,
"grad_norm": 5.687018871307373,
"learning_rate": 3.879535126275796e-05,
"loss": 3.9389,
"step": 4500
},
{
"epoch": 1.5152315105803478,
"grad_norm": 5.2722673416137695,
"learning_rate": 3.872085226849438e-05,
"loss": 4.115,
"step": 4520
},
{
"epoch": 1.5219358893777497,
"grad_norm": 5.904730796813965,
"learning_rate": 3.86463532742308e-05,
"loss": 4.0509,
"step": 4540
},
{
"epoch": 1.5286402681751519,
"grad_norm": 6.75799560546875,
"learning_rate": 3.857185427996722e-05,
"loss": 4.074,
"step": 4560
},
{
"epoch": 1.535344646972554,
"grad_norm": 6.9323039054870605,
"learning_rate": 3.849735528570364e-05,
"loss": 3.9566,
"step": 4580
},
{
"epoch": 1.542049025769956,
"grad_norm": 6.0153398513793945,
"learning_rate": 3.842285629144007e-05,
"loss": 4.0422,
"step": 4600
},
{
"epoch": 1.548753404567358,
"grad_norm": 6.283784866333008,
"learning_rate": 3.834835729717649e-05,
"loss": 3.9645,
"step": 4620
},
{
"epoch": 1.5554577833647603,
"grad_norm": 6.145251274108887,
"learning_rate": 3.827385830291291e-05,
"loss": 4.0442,
"step": 4640
},
{
"epoch": 1.5621621621621622,
"grad_norm": 5.55891227722168,
"learning_rate": 3.819935930864933e-05,
"loss": 4.0023,
"step": 4660
},
{
"epoch": 1.5688665409595641,
"grad_norm": 6.398717880249023,
"learning_rate": 3.812486031438576e-05,
"loss": 3.9904,
"step": 4680
},
{
"epoch": 1.5755709197569663,
"grad_norm": 7.079937934875488,
"learning_rate": 3.805036132012218e-05,
"loss": 3.851,
"step": 4700
},
{
"epoch": 1.5822752985543684,
"grad_norm": 5.542669773101807,
"learning_rate": 3.79758623258586e-05,
"loss": 3.9195,
"step": 4720
},
{
"epoch": 1.5889796773517704,
"grad_norm": 6.840109348297119,
"learning_rate": 3.790136333159502e-05,
"loss": 4.0468,
"step": 4740
},
{
"epoch": 1.5956840561491723,
"grad_norm": 5.582828044891357,
"learning_rate": 3.7826864337331444e-05,
"loss": 3.897,
"step": 4760
},
{
"epoch": 1.6023884349465745,
"grad_norm": 5.687999725341797,
"learning_rate": 3.775236534306787e-05,
"loss": 4.0053,
"step": 4780
},
{
"epoch": 1.6090928137439766,
"grad_norm": 5.822837829589844,
"learning_rate": 3.767786634880429e-05,
"loss": 3.9547,
"step": 4800
},
{
"epoch": 1.6157971925413785,
"grad_norm": 6.070379257202148,
"learning_rate": 3.7603367354540713e-05,
"loss": 4.0644,
"step": 4820
},
{
"epoch": 1.6225015713387805,
"grad_norm": 5.518836975097656,
"learning_rate": 3.7528868360277134e-05,
"loss": 3.8831,
"step": 4840
},
{
"epoch": 1.6292059501361829,
"grad_norm": 5.762600898742676,
"learning_rate": 3.745436936601356e-05,
"loss": 3.9588,
"step": 4860
},
{
"epoch": 1.6359103289335848,
"grad_norm": 6.048323154449463,
"learning_rate": 3.737987037174998e-05,
"loss": 3.9825,
"step": 4880
},
{
"epoch": 1.6426147077309867,
"grad_norm": 5.4770097732543945,
"learning_rate": 3.7305371377486404e-05,
"loss": 3.8577,
"step": 4900
},
{
"epoch": 1.6493190865283889,
"grad_norm": 6.0199294090271,
"learning_rate": 3.7230872383222825e-05,
"loss": 3.8402,
"step": 4920
},
{
"epoch": 1.656023465325791,
"grad_norm": 5.508368492126465,
"learning_rate": 3.715637338895925e-05,
"loss": 3.8506,
"step": 4940
},
{
"epoch": 1.662727844123193,
"grad_norm": 5.987868309020996,
"learning_rate": 3.708187439469567e-05,
"loss": 3.928,
"step": 4960
},
{
"epoch": 1.669432222920595,
"grad_norm": 6.022989749908447,
"learning_rate": 3.70073754004321e-05,
"loss": 3.9483,
"step": 4980
},
{
"epoch": 1.676136601717997,
"grad_norm": 5.780736446380615,
"learning_rate": 3.693287640616852e-05,
"loss": 3.9055,
"step": 5000
},
{
"epoch": 1.676136601717997,
"eval_bleu_greedy": 2.074504503599086,
"eval_loss": 0.5163004398345947,
"eval_runtime": 114.7725,
"eval_samples_per_second": 0.871,
"eval_steps_per_second": 0.871,
"step": 5000
},
{
"epoch": 1.6828409805153992,
"grad_norm": 7.780221462249756,
"learning_rate": 3.685837741190494e-05,
"loss": 4.0413,
"step": 5020
},
{
"epoch": 1.6895453593128011,
"grad_norm": 5.654071807861328,
"learning_rate": 3.6783878417641363e-05,
"loss": 3.9454,
"step": 5040
},
{
"epoch": 1.696249738110203,
"grad_norm": 5.763638019561768,
"learning_rate": 3.6709379423377784e-05,
"loss": 3.9884,
"step": 5060
},
{
"epoch": 1.7029541169076052,
"grad_norm": 5.78656005859375,
"learning_rate": 3.663488042911421e-05,
"loss": 3.9436,
"step": 5080
},
{
"epoch": 1.7096584957050074,
"grad_norm": 6.413984775543213,
"learning_rate": 3.656038143485063e-05,
"loss": 3.9803,
"step": 5100
},
{
"epoch": 1.7163628745024093,
"grad_norm": 5.727552890777588,
"learning_rate": 3.6485882440587054e-05,
"loss": 4.0424,
"step": 5120
},
{
"epoch": 1.7230672532998115,
"grad_norm": 5.366096496582031,
"learning_rate": 3.6411383446323475e-05,
"loss": 3.9212,
"step": 5140
},
{
"epoch": 1.7297716320972136,
"grad_norm": 5.877246856689453,
"learning_rate": 3.63368844520599e-05,
"loss": 3.9313,
"step": 5160
},
{
"epoch": 1.7364760108946156,
"grad_norm": 4.906258583068848,
"learning_rate": 3.626238545779632e-05,
"loss": 3.9713,
"step": 5180
},
{
"epoch": 1.7431803896920175,
"grad_norm": 5.745492458343506,
"learning_rate": 3.6187886463532744e-05,
"loss": 3.9141,
"step": 5200
},
{
"epoch": 1.7498847684894197,
"grad_norm": 5.654531002044678,
"learning_rate": 3.6113387469269165e-05,
"loss": 3.9644,
"step": 5220
},
{
"epoch": 1.7565891472868218,
"grad_norm": 5.877029895782471,
"learning_rate": 3.6038888475005586e-05,
"loss": 3.9801,
"step": 5240
},
{
"epoch": 1.7632935260842237,
"grad_norm": 6.766676425933838,
"learning_rate": 3.5964389480742013e-05,
"loss": 3.9611,
"step": 5260
},
{
"epoch": 1.7699979048816257,
"grad_norm": 6.007946968078613,
"learning_rate": 3.5889890486478434e-05,
"loss": 3.9596,
"step": 5280
},
{
"epoch": 1.7767022836790278,
"grad_norm": 5.436508655548096,
"learning_rate": 3.5815391492214855e-05,
"loss": 3.9234,
"step": 5300
},
{
"epoch": 1.78340666247643,
"grad_norm": 6.275641441345215,
"learning_rate": 3.5740892497951276e-05,
"loss": 3.9003,
"step": 5320
},
{
"epoch": 1.790111041273832,
"grad_norm": 5.461209774017334,
"learning_rate": 3.5666393503687704e-05,
"loss": 3.9583,
"step": 5340
},
{
"epoch": 1.796815420071234,
"grad_norm": 5.981624603271484,
"learning_rate": 3.5591894509424125e-05,
"loss": 3.8287,
"step": 5360
},
{
"epoch": 1.8035197988686362,
"grad_norm": 5.947275638580322,
"learning_rate": 3.5517395515160546e-05,
"loss": 3.9774,
"step": 5380
},
{
"epoch": 1.8102241776660382,
"grad_norm": 6.036500930786133,
"learning_rate": 3.5442896520896966e-05,
"loss": 3.8426,
"step": 5400
},
{
"epoch": 1.81692855646344,
"grad_norm": 5.644313335418701,
"learning_rate": 3.536839752663339e-05,
"loss": 3.8918,
"step": 5420
},
{
"epoch": 1.8236329352608422,
"grad_norm": 6.368311882019043,
"learning_rate": 3.5293898532369815e-05,
"loss": 3.9701,
"step": 5440
},
{
"epoch": 1.8303373140582444,
"grad_norm": 5.9047932624816895,
"learning_rate": 3.5219399538106236e-05,
"loss": 3.878,
"step": 5460
},
{
"epoch": 1.8370416928556463,
"grad_norm": 6.0813679695129395,
"learning_rate": 3.514490054384266e-05,
"loss": 3.9292,
"step": 5480
},
{
"epoch": 1.8437460716530483,
"grad_norm": 6.120319843292236,
"learning_rate": 3.5070401549579084e-05,
"loss": 3.8474,
"step": 5500
},
{
"epoch": 1.8504504504504504,
"grad_norm": 6.296043395996094,
"learning_rate": 3.4995902555315505e-05,
"loss": 3.7354,
"step": 5520
},
{
"epoch": 1.8571548292478526,
"grad_norm": 5.409097671508789,
"learning_rate": 3.492140356105193e-05,
"loss": 3.8795,
"step": 5540
},
{
"epoch": 1.8638592080452545,
"grad_norm": 6.105241775512695,
"learning_rate": 3.4846904566788354e-05,
"loss": 3.9765,
"step": 5560
},
{
"epoch": 1.8705635868426564,
"grad_norm": 5.878379821777344,
"learning_rate": 3.4772405572524775e-05,
"loss": 3.8718,
"step": 5580
},
{
"epoch": 1.8772679656400588,
"grad_norm": 5.730438709259033,
"learning_rate": 3.4697906578261196e-05,
"loss": 4.0157,
"step": 5600
},
{
"epoch": 1.8839723444374608,
"grad_norm": 5.375248432159424,
"learning_rate": 3.4623407583997616e-05,
"loss": 3.9336,
"step": 5620
},
{
"epoch": 1.8906767232348627,
"grad_norm": 6.077249526977539,
"learning_rate": 3.4548908589734044e-05,
"loss": 3.8112,
"step": 5640
},
{
"epoch": 1.8973811020322648,
"grad_norm": 5.51649808883667,
"learning_rate": 3.4474409595470465e-05,
"loss": 3.9677,
"step": 5660
},
{
"epoch": 1.904085480829667,
"grad_norm": 5.96297025680542,
"learning_rate": 3.4399910601206886e-05,
"loss": 3.8401,
"step": 5680
},
{
"epoch": 1.910789859627069,
"grad_norm": 5.845096588134766,
"learning_rate": 3.432541160694331e-05,
"loss": 3.9445,
"step": 5700
},
{
"epoch": 1.9174942384244709,
"grad_norm": 6.597667694091797,
"learning_rate": 3.425091261267973e-05,
"loss": 3.8767,
"step": 5720
},
{
"epoch": 1.924198617221873,
"grad_norm": 5.085957050323486,
"learning_rate": 3.4176413618416155e-05,
"loss": 3.816,
"step": 5740
},
{
"epoch": 1.9309029960192752,
"grad_norm": 5.354710578918457,
"learning_rate": 3.4101914624152576e-05,
"loss": 3.7329,
"step": 5760
},
{
"epoch": 1.937607374816677,
"grad_norm": 6.152263641357422,
"learning_rate": 3.4027415629889e-05,
"loss": 3.9058,
"step": 5780
},
{
"epoch": 1.944311753614079,
"grad_norm": 5.678866863250732,
"learning_rate": 3.395291663562542e-05,
"loss": 3.9123,
"step": 5800
},
{
"epoch": 1.9510161324114812,
"grad_norm": 5.211181640625,
"learning_rate": 3.3878417641361846e-05,
"loss": 3.857,
"step": 5820
},
{
"epoch": 1.9577205112088834,
"grad_norm": 5.752172470092773,
"learning_rate": 3.3803918647098266e-05,
"loss": 3.9512,
"step": 5840
},
{
"epoch": 1.9644248900062853,
"grad_norm": 5.982390403747559,
"learning_rate": 3.372941965283469e-05,
"loss": 3.8322,
"step": 5860
},
{
"epoch": 1.9711292688036874,
"grad_norm": 6.09535551071167,
"learning_rate": 3.365492065857111e-05,
"loss": 3.8806,
"step": 5880
},
{
"epoch": 1.9778336476010896,
"grad_norm": 6.2229905128479,
"learning_rate": 3.358042166430753e-05,
"loss": 3.885,
"step": 5900
},
{
"epoch": 1.9845380263984915,
"grad_norm": 5.936634540557861,
"learning_rate": 3.350592267004396e-05,
"loss": 3.8126,
"step": 5920
},
{
"epoch": 1.9912424051958935,
"grad_norm": 5.78571081161499,
"learning_rate": 3.343142367578038e-05,
"loss": 4.0106,
"step": 5940
},
{
"epoch": 1.9979467839932956,
"grad_norm": 6.465760231018066,
"learning_rate": 3.33569246815168e-05,
"loss": 3.8896,
"step": 5960
},
{
"epoch": 2.004357846218311,
"grad_norm": 6.171356201171875,
"learning_rate": 3.328242568725322e-05,
"loss": 3.4701,
"step": 5980
},
{
"epoch": 2.0110622250157135,
"grad_norm": 5.609477519989014,
"learning_rate": 3.320792669298965e-05,
"loss": 3.6326,
"step": 6000
},
{
"epoch": 2.0110622250157135,
"eval_bleu_greedy": 2.0660877864138794,
"eval_loss": 0.5018166899681091,
"eval_runtime": 118.8592,
"eval_samples_per_second": 0.841,
"eval_steps_per_second": 0.841,
"step": 6000
},
{
"epoch": 2.0177666038131155,
"grad_norm": 5.845204830169678,
"learning_rate": 3.313342769872607e-05,
"loss": 3.7089,
"step": 6020
},
{
"epoch": 2.0244709826105174,
"grad_norm": 6.158261775970459,
"learning_rate": 3.3058928704462496e-05,
"loss": 3.6354,
"step": 6040
},
{
"epoch": 2.0311753614079198,
"grad_norm": 5.758568286895752,
"learning_rate": 3.2984429710198916e-05,
"loss": 3.6658,
"step": 6060
},
{
"epoch": 2.0378797402053217,
"grad_norm": 6.4133381843566895,
"learning_rate": 3.290993071593534e-05,
"loss": 3.716,
"step": 6080
},
{
"epoch": 2.0445841190027236,
"grad_norm": 5.51917028427124,
"learning_rate": 3.283543172167176e-05,
"loss": 3.6775,
"step": 6100
},
{
"epoch": 2.0512884978001256,
"grad_norm": 5.898686408996582,
"learning_rate": 3.2760932727408186e-05,
"loss": 3.7088,
"step": 6120
},
{
"epoch": 2.057992876597528,
"grad_norm": 6.7731852531433105,
"learning_rate": 3.268643373314461e-05,
"loss": 3.5724,
"step": 6140
},
{
"epoch": 2.06469725539493,
"grad_norm": 5.695714950561523,
"learning_rate": 3.261193473888103e-05,
"loss": 3.6688,
"step": 6160
},
{
"epoch": 2.071401634192332,
"grad_norm": 6.517350196838379,
"learning_rate": 3.253743574461745e-05,
"loss": 3.7519,
"step": 6180
},
{
"epoch": 2.0781060129897337,
"grad_norm": 5.876154899597168,
"learning_rate": 3.2462936750353876e-05,
"loss": 3.6403,
"step": 6200
},
{
"epoch": 2.084810391787136,
"grad_norm": 6.117770671844482,
"learning_rate": 3.23884377560903e-05,
"loss": 3.636,
"step": 6220
},
{
"epoch": 2.091514770584538,
"grad_norm": 5.719681262969971,
"learning_rate": 3.231393876182672e-05,
"loss": 3.5995,
"step": 6240
},
{
"epoch": 2.09821914938194,
"grad_norm": 6.031946182250977,
"learning_rate": 3.223943976756314e-05,
"loss": 3.6692,
"step": 6260
},
{
"epoch": 2.104923528179342,
"grad_norm": 6.5091023445129395,
"learning_rate": 3.216494077329956e-05,
"loss": 3.6672,
"step": 6280
},
{
"epoch": 2.1116279069767443,
"grad_norm": 6.04213285446167,
"learning_rate": 3.209044177903599e-05,
"loss": 3.5401,
"step": 6300
},
{
"epoch": 2.1183322857741462,
"grad_norm": 6.8394036293029785,
"learning_rate": 3.201594278477241e-05,
"loss": 3.6274,
"step": 6320
},
{
"epoch": 2.125036664571548,
"grad_norm": 5.81780481338501,
"learning_rate": 3.194144379050883e-05,
"loss": 3.6487,
"step": 6340
},
{
"epoch": 2.1317410433689505,
"grad_norm": 6.043388366699219,
"learning_rate": 3.186694479624525e-05,
"loss": 3.6658,
"step": 6360
},
{
"epoch": 2.1384454221663525,
"grad_norm": 6.0370378494262695,
"learning_rate": 3.179244580198167e-05,
"loss": 3.5821,
"step": 6380
},
{
"epoch": 2.1451498009637544,
"grad_norm": 5.804340362548828,
"learning_rate": 3.17179468077181e-05,
"loss": 3.7046,
"step": 6400
},
{
"epoch": 2.1518541797611563,
"grad_norm": 5.964964866638184,
"learning_rate": 3.164344781345452e-05,
"loss": 3.6777,
"step": 6420
},
{
"epoch": 2.1585585585585587,
"grad_norm": 6.0428853034973145,
"learning_rate": 3.156894881919094e-05,
"loss": 3.7281,
"step": 6440
},
{
"epoch": 2.1652629373559606,
"grad_norm": 5.866547584533691,
"learning_rate": 3.149444982492736e-05,
"loss": 3.5978,
"step": 6460
},
{
"epoch": 2.1719673161533626,
"grad_norm": 6.283875465393066,
"learning_rate": 3.141995083066379e-05,
"loss": 3.6787,
"step": 6480
},
{
"epoch": 2.1786716949507645,
"grad_norm": 5.971242427825928,
"learning_rate": 3.134545183640021e-05,
"loss": 3.5794,
"step": 6500
},
{
"epoch": 2.185376073748167,
"grad_norm": 6.014956474304199,
"learning_rate": 3.127095284213663e-05,
"loss": 3.6924,
"step": 6520
},
{
"epoch": 2.192080452545569,
"grad_norm": 5.171935558319092,
"learning_rate": 3.119645384787305e-05,
"loss": 3.6373,
"step": 6540
},
{
"epoch": 2.1987848313429708,
"grad_norm": 6.373608112335205,
"learning_rate": 3.112195485360947e-05,
"loss": 3.6745,
"step": 6560
},
{
"epoch": 2.205489210140373,
"grad_norm": 5.605614185333252,
"learning_rate": 3.10474558593459e-05,
"loss": 3.6497,
"step": 6580
},
{
"epoch": 2.212193588937775,
"grad_norm": 5.9374613761901855,
"learning_rate": 3.097295686508233e-05,
"loss": 3.5979,
"step": 6600
},
{
"epoch": 2.218897967735177,
"grad_norm": 5.818667411804199,
"learning_rate": 3.089845787081875e-05,
"loss": 3.5593,
"step": 6620
},
{
"epoch": 2.225602346532579,
"grad_norm": 5.608858585357666,
"learning_rate": 3.082395887655517e-05,
"loss": 3.5811,
"step": 6640
},
{
"epoch": 2.2323067253299813,
"grad_norm": 6.370382785797119,
"learning_rate": 3.074945988229159e-05,
"loss": 3.6893,
"step": 6660
},
{
"epoch": 2.2390111041273832,
"grad_norm": 5.760286331176758,
"learning_rate": 3.067496088802802e-05,
"loss": 3.6869,
"step": 6680
},
{
"epoch": 2.245715482924785,
"grad_norm": 5.964378833770752,
"learning_rate": 3.060046189376444e-05,
"loss": 3.5856,
"step": 6700
},
{
"epoch": 2.252419861722187,
"grad_norm": 5.7680439949035645,
"learning_rate": 3.052596289950086e-05,
"loss": 3.6309,
"step": 6720
},
{
"epoch": 2.2591242405195895,
"grad_norm": 6.063139915466309,
"learning_rate": 3.045146390523728e-05,
"loss": 3.542,
"step": 6740
},
{
"epoch": 2.2658286193169914,
"grad_norm": 7.324517250061035,
"learning_rate": 3.0376964910973705e-05,
"loss": 3.5469,
"step": 6760
},
{
"epoch": 2.2725329981143934,
"grad_norm": 5.549790859222412,
"learning_rate": 3.0302465916710126e-05,
"loss": 3.6051,
"step": 6780
},
{
"epoch": 2.2792373769117953,
"grad_norm": 5.7682929039001465,
"learning_rate": 3.022796692244655e-05,
"loss": 3.5874,
"step": 6800
},
{
"epoch": 2.2859417557091977,
"grad_norm": 6.120064735412598,
"learning_rate": 3.015346792818297e-05,
"loss": 3.6209,
"step": 6820
},
{
"epoch": 2.2926461345065996,
"grad_norm": 5.814151287078857,
"learning_rate": 3.0078968933919395e-05,
"loss": 3.5889,
"step": 6840
},
{
"epoch": 2.2993505133040015,
"grad_norm": 6.4021077156066895,
"learning_rate": 3.0004469939655816e-05,
"loss": 3.6508,
"step": 6860
},
{
"epoch": 2.306054892101404,
"grad_norm": 6.688700199127197,
"learning_rate": 2.9929970945392237e-05,
"loss": 3.6598,
"step": 6880
},
{
"epoch": 2.312759270898806,
"grad_norm": 5.971013069152832,
"learning_rate": 2.985547195112866e-05,
"loss": 3.6805,
"step": 6900
},
{
"epoch": 2.3194636496962078,
"grad_norm": 5.857511520385742,
"learning_rate": 2.9780972956865082e-05,
"loss": 3.5774,
"step": 6920
},
{
"epoch": 2.3261680284936097,
"grad_norm": 5.832746505737305,
"learning_rate": 2.9706473962601506e-05,
"loss": 3.6995,
"step": 6940
},
{
"epoch": 2.332872407291012,
"grad_norm": 6.846590042114258,
"learning_rate": 2.9631974968337927e-05,
"loss": 3.6018,
"step": 6960
},
{
"epoch": 2.339576786088414,
"grad_norm": 5.618002891540527,
"learning_rate": 2.955747597407435e-05,
"loss": 3.5473,
"step": 6980
},
{
"epoch": 2.346281164885816,
"grad_norm": 5.3922319412231445,
"learning_rate": 2.9482976979810772e-05,
"loss": 3.5538,
"step": 7000
},
{
"epoch": 2.346281164885816,
"eval_bleu_greedy": 2.167759735525418,
"eval_loss": 0.4932926893234253,
"eval_runtime": 337.6724,
"eval_samples_per_second": 0.296,
"eval_steps_per_second": 0.296,
"step": 7000
},
{
"epoch": 2.352985543683218,
"grad_norm": 5.728118896484375,
"learning_rate": 2.9408477985547193e-05,
"loss": 3.637,
"step": 7020
},
{
"epoch": 2.3596899224806203,
"grad_norm": 5.794277191162109,
"learning_rate": 2.9333978991283618e-05,
"loss": 3.5977,
"step": 7040
},
{
"epoch": 2.366394301278022,
"grad_norm": 6.4884419441223145,
"learning_rate": 2.925947999702004e-05,
"loss": 3.5448,
"step": 7060
},
{
"epoch": 2.373098680075424,
"grad_norm": 7.127490520477295,
"learning_rate": 2.9184981002756463e-05,
"loss": 3.7117,
"step": 7080
},
{
"epoch": 2.3798030588728265,
"grad_norm": 5.834691047668457,
"learning_rate": 2.9110482008492884e-05,
"loss": 3.5537,
"step": 7100
},
{
"epoch": 2.3865074376702284,
"grad_norm": 5.9557671546936035,
"learning_rate": 2.9035983014229308e-05,
"loss": 3.5724,
"step": 7120
},
{
"epoch": 2.3932118164676304,
"grad_norm": 6.775606632232666,
"learning_rate": 2.8961484019965735e-05,
"loss": 3.5767,
"step": 7140
},
{
"epoch": 2.3999161952650323,
"grad_norm": 5.350723743438721,
"learning_rate": 2.8886985025702156e-05,
"loss": 3.6228,
"step": 7160
},
{
"epoch": 2.4066205740624347,
"grad_norm": 6.00510835647583,
"learning_rate": 2.881248603143858e-05,
"loss": 3.6664,
"step": 7180
},
{
"epoch": 2.4133249528598366,
"grad_norm": 6.175734519958496,
"learning_rate": 2.8737987037175e-05,
"loss": 3.6428,
"step": 7200
},
{
"epoch": 2.4200293316572385,
"grad_norm": 6.390973091125488,
"learning_rate": 2.8663488042911422e-05,
"loss": 3.5884,
"step": 7220
},
{
"epoch": 2.4267337104546405,
"grad_norm": 6.262541770935059,
"learning_rate": 2.8588989048647847e-05,
"loss": 3.5878,
"step": 7240
},
{
"epoch": 2.433438089252043,
"grad_norm": 5.385353088378906,
"learning_rate": 2.8514490054384268e-05,
"loss": 3.5796,
"step": 7260
},
{
"epoch": 2.440142468049445,
"grad_norm": 5.796669006347656,
"learning_rate": 2.8439991060120692e-05,
"loss": 3.5707,
"step": 7280
},
{
"epoch": 2.4468468468468467,
"grad_norm": 6.3658857345581055,
"learning_rate": 2.8365492065857113e-05,
"loss": 3.6012,
"step": 7300
},
{
"epoch": 2.453551225644249,
"grad_norm": 5.848957538604736,
"learning_rate": 2.8290993071593537e-05,
"loss": 3.5894,
"step": 7320
},
{
"epoch": 2.460255604441651,
"grad_norm": 6.327582836151123,
"learning_rate": 2.8216494077329958e-05,
"loss": 3.5888,
"step": 7340
},
{
"epoch": 2.466959983239053,
"grad_norm": 6.102633476257324,
"learning_rate": 2.814199508306638e-05,
"loss": 3.6819,
"step": 7360
},
{
"epoch": 2.473664362036455,
"grad_norm": 5.988522529602051,
"learning_rate": 2.8067496088802803e-05,
"loss": 3.4606,
"step": 7380
},
{
"epoch": 2.4803687408338573,
"grad_norm": 5.952702522277832,
"learning_rate": 2.7992997094539224e-05,
"loss": 3.5249,
"step": 7400
},
{
"epoch": 2.487073119631259,
"grad_norm": 6.563230514526367,
"learning_rate": 2.7918498100275648e-05,
"loss": 3.5629,
"step": 7420
},
{
"epoch": 2.493777498428661,
"grad_norm": 5.4130988121032715,
"learning_rate": 2.784399910601207e-05,
"loss": 3.5616,
"step": 7440
},
{
"epoch": 2.500481877226063,
"grad_norm": 6.370597839355469,
"learning_rate": 2.7769500111748493e-05,
"loss": 3.6258,
"step": 7460
},
{
"epoch": 2.5071862560234655,
"grad_norm": 5.860075950622559,
"learning_rate": 2.7695001117484914e-05,
"loss": 3.584,
"step": 7480
},
{
"epoch": 2.5138906348208674,
"grad_norm": 5.848262786865234,
"learning_rate": 2.762050212322134e-05,
"loss": 3.5515,
"step": 7500
},
{
"epoch": 2.5205950136182693,
"grad_norm": 6.018378257751465,
"learning_rate": 2.754600312895776e-05,
"loss": 3.6001,
"step": 7520
},
{
"epoch": 2.5272993924156717,
"grad_norm": 5.92679500579834,
"learning_rate": 2.747150413469418e-05,
"loss": 3.6455,
"step": 7540
},
{
"epoch": 2.5340037712130736,
"grad_norm": 6.104831695556641,
"learning_rate": 2.7397005140430604e-05,
"loss": 3.5881,
"step": 7560
},
{
"epoch": 2.5407081500104756,
"grad_norm": 5.604018211364746,
"learning_rate": 2.7322506146167025e-05,
"loss": 3.5521,
"step": 7580
},
{
"epoch": 2.5474125288078775,
"grad_norm": 6.820720195770264,
"learning_rate": 2.724800715190345e-05,
"loss": 3.5984,
"step": 7600
},
{
"epoch": 2.5541169076052794,
"grad_norm": 5.802369117736816,
"learning_rate": 2.717350815763987e-05,
"loss": 3.6231,
"step": 7620
},
{
"epoch": 2.560821286402682,
"grad_norm": 6.830519676208496,
"learning_rate": 2.7099009163376295e-05,
"loss": 3.513,
"step": 7640
},
{
"epoch": 2.5675256652000837,
"grad_norm": 5.891795635223389,
"learning_rate": 2.7024510169112716e-05,
"loss": 3.4983,
"step": 7660
},
{
"epoch": 2.5742300439974857,
"grad_norm": 5.775413513183594,
"learning_rate": 2.6950011174849143e-05,
"loss": 3.5026,
"step": 7680
},
{
"epoch": 2.580934422794888,
"grad_norm": 6.1186442375183105,
"learning_rate": 2.6875512180585564e-05,
"loss": 3.5892,
"step": 7700
},
{
"epoch": 2.58763880159229,
"grad_norm": 6.562758445739746,
"learning_rate": 2.680101318632199e-05,
"loss": 3.5703,
"step": 7720
},
{
"epoch": 2.594343180389692,
"grad_norm": 6.671054840087891,
"learning_rate": 2.672651419205841e-05,
"loss": 3.6545,
"step": 7740
},
{
"epoch": 2.6010475591870943,
"grad_norm": 6.263803005218506,
"learning_rate": 2.6652015197794834e-05,
"loss": 3.6269,
"step": 7760
},
{
"epoch": 2.6077519379844962,
"grad_norm": 6.635150909423828,
"learning_rate": 2.6577516203531254e-05,
"loss": 3.5478,
"step": 7780
},
{
"epoch": 2.614456316781898,
"grad_norm": 6.699692726135254,
"learning_rate": 2.650301720926768e-05,
"loss": 3.6548,
"step": 7800
},
{
"epoch": 2.6211606955793,
"grad_norm": 5.610607624053955,
"learning_rate": 2.64285182150041e-05,
"loss": 3.6276,
"step": 7820
},
{
"epoch": 2.627865074376702,
"grad_norm": 6.077248573303223,
"learning_rate": 2.6354019220740524e-05,
"loss": 3.5808,
"step": 7840
},
{
"epoch": 2.6345694531741044,
"grad_norm": 6.732864856719971,
"learning_rate": 2.6279520226476945e-05,
"loss": 3.6394,
"step": 7860
},
{
"epoch": 2.6412738319715063,
"grad_norm": 7.764287948608398,
"learning_rate": 2.6205021232213366e-05,
"loss": 3.6177,
"step": 7880
},
{
"epoch": 2.6479782107689083,
"grad_norm": 5.52256441116333,
"learning_rate": 2.613052223794979e-05,
"loss": 3.5052,
"step": 7900
},
{
"epoch": 2.6546825895663106,
"grad_norm": 5.835344314575195,
"learning_rate": 2.605602324368621e-05,
"loss": 3.5943,
"step": 7920
},
{
"epoch": 2.6613869683637126,
"grad_norm": 6.355226993560791,
"learning_rate": 2.5981524249422635e-05,
"loss": 3.512,
"step": 7940
},
{
"epoch": 2.6680913471611145,
"grad_norm": 5.310232639312744,
"learning_rate": 2.5907025255159056e-05,
"loss": 3.4958,
"step": 7960
},
{
"epoch": 2.674795725958517,
"grad_norm": 6.357884883880615,
"learning_rate": 2.583252626089548e-05,
"loss": 3.4715,
"step": 7980
},
{
"epoch": 2.681500104755919,
"grad_norm": 5.544501304626465,
"learning_rate": 2.57580272666319e-05,
"loss": 3.6412,
"step": 8000
},
{
"epoch": 2.681500104755919,
"eval_bleu_greedy": 2.0426177231502667,
"eval_loss": 0.478807270526886,
"eval_runtime": 461.9761,
"eval_samples_per_second": 0.216,
"eval_steps_per_second": 0.216,
"step": 8000
},
{
"epoch": 2.6882044835533208,
"grad_norm": 6.2651214599609375,
"learning_rate": 2.5683528272368322e-05,
"loss": 3.6418,
"step": 8020
},
{
"epoch": 2.6949088623507227,
"grad_norm": 5.576724529266357,
"learning_rate": 2.5609029278104746e-05,
"loss": 3.57,
"step": 8040
},
{
"epoch": 2.7016132411481246,
"grad_norm": 6.44381856918335,
"learning_rate": 2.5534530283841167e-05,
"loss": 3.6091,
"step": 8060
},
{
"epoch": 2.708317619945527,
"grad_norm": 6.368646621704102,
"learning_rate": 2.546003128957759e-05,
"loss": 3.6117,
"step": 8080
},
{
"epoch": 2.715021998742929,
"grad_norm": 6.0201096534729,
"learning_rate": 2.5385532295314012e-05,
"loss": 3.4642,
"step": 8100
},
{
"epoch": 2.721726377540331,
"grad_norm": 6.197525501251221,
"learning_rate": 2.5311033301050437e-05,
"loss": 3.5663,
"step": 8120
},
{
"epoch": 2.7284307563377332,
"grad_norm": 7.0478434562683105,
"learning_rate": 2.5236534306786857e-05,
"loss": 3.5866,
"step": 8140
},
{
"epoch": 2.735135135135135,
"grad_norm": 5.948623180389404,
"learning_rate": 2.516203531252328e-05,
"loss": 3.594,
"step": 8160
},
{
"epoch": 2.741839513932537,
"grad_norm": 6.0779266357421875,
"learning_rate": 2.5087536318259703e-05,
"loss": 3.5592,
"step": 8180
},
{
"epoch": 2.7485438927299395,
"grad_norm": 6.036412715911865,
"learning_rate": 2.5013037323996123e-05,
"loss": 3.6267,
"step": 8200
},
{
"epoch": 2.7552482715273414,
"grad_norm": 5.411278247833252,
"learning_rate": 2.493853832973255e-05,
"loss": 3.4901,
"step": 8220
},
{
"epoch": 2.7619526503247434,
"grad_norm": 5.945597171783447,
"learning_rate": 2.4864039335468972e-05,
"loss": 3.5108,
"step": 8240
},
{
"epoch": 2.7686570291221453,
"grad_norm": 5.927489280700684,
"learning_rate": 2.4789540341205393e-05,
"loss": 3.523,
"step": 8260
},
{
"epoch": 2.775361407919547,
"grad_norm": 5.989095211029053,
"learning_rate": 2.4715041346941817e-05,
"loss": 3.5256,
"step": 8280
},
{
"epoch": 2.7820657867169496,
"grad_norm": 5.67732572555542,
"learning_rate": 2.4640542352678238e-05,
"loss": 3.5598,
"step": 8300
},
{
"epoch": 2.7887701655143515,
"grad_norm": 5.954450607299805,
"learning_rate": 2.4566043358414662e-05,
"loss": 3.5808,
"step": 8320
},
{
"epoch": 2.7954745443117535,
"grad_norm": 6.370481967926025,
"learning_rate": 2.4491544364151083e-05,
"loss": 3.4797,
"step": 8340
},
{
"epoch": 2.802178923109156,
"grad_norm": 6.09319543838501,
"learning_rate": 2.4417045369887507e-05,
"loss": 3.5653,
"step": 8360
},
{
"epoch": 2.8088833019065578,
"grad_norm": 6.781850337982178,
"learning_rate": 2.4342546375623932e-05,
"loss": 3.5961,
"step": 8380
},
{
"epoch": 2.8155876807039597,
"grad_norm": 6.419500350952148,
"learning_rate": 2.4268047381360353e-05,
"loss": 3.5896,
"step": 8400
},
{
"epoch": 2.822292059501362,
"grad_norm": 6.156778335571289,
"learning_rate": 2.4193548387096777e-05,
"loss": 3.5622,
"step": 8420
},
{
"epoch": 2.828996438298764,
"grad_norm": 6.792672157287598,
"learning_rate": 2.4119049392833198e-05,
"loss": 3.5262,
"step": 8440
},
{
"epoch": 2.835700817096166,
"grad_norm": 6.010193347930908,
"learning_rate": 2.4044550398569622e-05,
"loss": 3.5115,
"step": 8460
},
{
"epoch": 2.842405195893568,
"grad_norm": 5.631977081298828,
"learning_rate": 2.3970051404306043e-05,
"loss": 3.5809,
"step": 8480
},
{
"epoch": 2.84910957469097,
"grad_norm": 5.957998275756836,
"learning_rate": 2.3895552410042464e-05,
"loss": 3.5782,
"step": 8500
},
{
"epoch": 2.855813953488372,
"grad_norm": 6.206627368927002,
"learning_rate": 2.3821053415778888e-05,
"loss": 3.5551,
"step": 8520
},
{
"epoch": 2.862518332285774,
"grad_norm": 5.459038257598877,
"learning_rate": 2.374655442151531e-05,
"loss": 3.6324,
"step": 8540
},
{
"epoch": 2.869222711083176,
"grad_norm": 6.841930866241455,
"learning_rate": 2.3672055427251733e-05,
"loss": 3.6122,
"step": 8560
},
{
"epoch": 2.8759270898805784,
"grad_norm": 6.351479530334473,
"learning_rate": 2.3597556432988154e-05,
"loss": 3.5087,
"step": 8580
},
{
"epoch": 2.8826314686779804,
"grad_norm": 5.920718193054199,
"learning_rate": 2.352305743872458e-05,
"loss": 3.5419,
"step": 8600
},
{
"epoch": 2.8893358474753823,
"grad_norm": 6.254413604736328,
"learning_rate": 2.3448558444461e-05,
"loss": 3.475,
"step": 8620
},
{
"epoch": 2.8960402262727847,
"grad_norm": 6.233896732330322,
"learning_rate": 2.3374059450197424e-05,
"loss": 3.6237,
"step": 8640
},
{
"epoch": 2.9027446050701866,
"grad_norm": 6.401550769805908,
"learning_rate": 2.3299560455933848e-05,
"loss": 3.5505,
"step": 8660
},
{
"epoch": 2.9094489838675885,
"grad_norm": 6.0176591873168945,
"learning_rate": 2.322506146167027e-05,
"loss": 3.4655,
"step": 8680
},
{
"epoch": 2.9161533626649905,
"grad_norm": 6.907371520996094,
"learning_rate": 2.3150562467406693e-05,
"loss": 3.4332,
"step": 8700
},
{
"epoch": 2.9228577414623924,
"grad_norm": 6.5138444900512695,
"learning_rate": 2.3076063473143114e-05,
"loss": 3.5849,
"step": 8720
},
{
"epoch": 2.929562120259795,
"grad_norm": 7.2710347175598145,
"learning_rate": 2.3001564478879538e-05,
"loss": 3.5402,
"step": 8740
},
{
"epoch": 2.9362664990571967,
"grad_norm": 6.0399909019470215,
"learning_rate": 2.292706548461596e-05,
"loss": 3.5121,
"step": 8760
},
{
"epoch": 2.9429708778545987,
"grad_norm": 6.308010578155518,
"learning_rate": 2.285256649035238e-05,
"loss": 3.473,
"step": 8780
},
{
"epoch": 2.949675256652001,
"grad_norm": 5.680022716522217,
"learning_rate": 2.2778067496088804e-05,
"loss": 3.5039,
"step": 8800
},
{
"epoch": 2.956379635449403,
"grad_norm": 5.785823345184326,
"learning_rate": 2.2703568501825225e-05,
"loss": 3.5442,
"step": 8820
},
{
"epoch": 2.963084014246805,
"grad_norm": 5.894392967224121,
"learning_rate": 2.262906950756165e-05,
"loss": 3.4217,
"step": 8840
},
{
"epoch": 2.969788393044207,
"grad_norm": 5.803259372711182,
"learning_rate": 2.255457051329807e-05,
"loss": 3.5274,
"step": 8860
},
{
"epoch": 2.976492771841609,
"grad_norm": 6.49872350692749,
"learning_rate": 2.2480071519034494e-05,
"loss": 3.5022,
"step": 8880
},
{
"epoch": 2.983197150639011,
"grad_norm": 6.164760112762451,
"learning_rate": 2.2405572524770915e-05,
"loss": 3.5333,
"step": 8900
},
{
"epoch": 2.989901529436413,
"grad_norm": 6.127744674682617,
"learning_rate": 2.233107353050734e-05,
"loss": 3.4646,
"step": 8920
},
{
"epoch": 2.996605908233815,
"grad_norm": 6.783234119415283,
"learning_rate": 2.2256574536243764e-05,
"loss": 3.5176,
"step": 8940
},
{
"epoch": 3.0030169704588308,
"grad_norm": 5.963592052459717,
"learning_rate": 2.2182075541980185e-05,
"loss": 3.202,
"step": 8960
},
{
"epoch": 3.009721349256233,
"grad_norm": 6.0857744216918945,
"learning_rate": 2.210757654771661e-05,
"loss": 3.3323,
"step": 8980
},
{
"epoch": 3.016425728053635,
"grad_norm": 6.808197975158691,
"learning_rate": 2.203307755345303e-05,
"loss": 3.4583,
"step": 9000
},
{
"epoch": 3.016425728053635,
"eval_bleu_greedy": 2.4234819018656997,
"eval_loss": 0.4750092625617981,
"eval_runtime": 445.5917,
"eval_samples_per_second": 0.224,
"eval_steps_per_second": 0.224,
"step": 9000
},
{
"epoch": 3.023130106851037,
"grad_norm": 6.107368469238281,
"learning_rate": 2.195857855918945e-05,
"loss": 3.2516,
"step": 9020
},
{
"epoch": 3.029834485648439,
"grad_norm": 6.768011093139648,
"learning_rate": 2.1884079564925875e-05,
"loss": 3.3924,
"step": 9040
},
{
"epoch": 3.0365388644458413,
"grad_norm": 6.5038743019104,
"learning_rate": 2.1809580570662296e-05,
"loss": 3.2324,
"step": 9060
},
{
"epoch": 3.0432432432432432,
"grad_norm": 5.936684608459473,
"learning_rate": 2.173508157639872e-05,
"loss": 3.3328,
"step": 9080
},
{
"epoch": 3.049947622040645,
"grad_norm": 6.226217746734619,
"learning_rate": 2.166058258213514e-05,
"loss": 3.2819,
"step": 9100
},
{
"epoch": 3.0566520008380476,
"grad_norm": 5.835100173950195,
"learning_rate": 2.1586083587871565e-05,
"loss": 3.3229,
"step": 9120
},
{
"epoch": 3.0633563796354495,
"grad_norm": 6.959074020385742,
"learning_rate": 2.1511584593607986e-05,
"loss": 3.2994,
"step": 9140
},
{
"epoch": 3.0700607584328514,
"grad_norm": 6.586850643157959,
"learning_rate": 2.1437085599344407e-05,
"loss": 3.2011,
"step": 9160
},
{
"epoch": 3.0767651372302534,
"grad_norm": 5.986579418182373,
"learning_rate": 2.1362586605080835e-05,
"loss": 3.3602,
"step": 9180
},
{
"epoch": 3.0834695160276557,
"grad_norm": 5.427093029022217,
"learning_rate": 2.1288087610817256e-05,
"loss": 3.3568,
"step": 9200
},
{
"epoch": 3.0901738948250577,
"grad_norm": 6.213650226593018,
"learning_rate": 2.121358861655368e-05,
"loss": 3.2927,
"step": 9220
},
{
"epoch": 3.0968782736224596,
"grad_norm": 6.2159342765808105,
"learning_rate": 2.11390896222901e-05,
"loss": 3.3695,
"step": 9240
},
{
"epoch": 3.1035826524198615,
"grad_norm": 6.4243597984313965,
"learning_rate": 2.106459062802652e-05,
"loss": 3.3794,
"step": 9260
},
{
"epoch": 3.110287031217264,
"grad_norm": 6.737236499786377,
"learning_rate": 2.0990091633762946e-05,
"loss": 3.3333,
"step": 9280
},
{
"epoch": 3.116991410014666,
"grad_norm": 6.42462158203125,
"learning_rate": 2.0915592639499367e-05,
"loss": 3.3944,
"step": 9300
},
{
"epoch": 3.1236957888120678,
"grad_norm": 6.075654029846191,
"learning_rate": 2.084109364523579e-05,
"loss": 3.3037,
"step": 9320
},
{
"epoch": 3.13040016760947,
"grad_norm": 5.77776575088501,
"learning_rate": 2.0766594650972212e-05,
"loss": 3.4129,
"step": 9340
},
{
"epoch": 3.137104546406872,
"grad_norm": 6.13924503326416,
"learning_rate": 2.0692095656708636e-05,
"loss": 3.3723,
"step": 9360
},
{
"epoch": 3.143808925204274,
"grad_norm": 6.142735481262207,
"learning_rate": 2.0617596662445057e-05,
"loss": 3.3321,
"step": 9380
},
{
"epoch": 3.150513304001676,
"grad_norm": 6.672779083251953,
"learning_rate": 2.0543097668181478e-05,
"loss": 3.3234,
"step": 9400
},
{
"epoch": 3.1572176827990783,
"grad_norm": 6.145503520965576,
"learning_rate": 2.0468598673917902e-05,
"loss": 3.2741,
"step": 9420
},
{
"epoch": 3.1639220615964803,
"grad_norm": 6.716073513031006,
"learning_rate": 2.0394099679654323e-05,
"loss": 3.2791,
"step": 9440
},
{
"epoch": 3.170626440393882,
"grad_norm": 6.335756778717041,
"learning_rate": 2.031960068539075e-05,
"loss": 3.2895,
"step": 9460
},
{
"epoch": 3.177330819191284,
"grad_norm": 7.066572189331055,
"learning_rate": 2.024510169112717e-05,
"loss": 3.3522,
"step": 9480
},
{
"epoch": 3.1840351979886865,
"grad_norm": 6.07637357711792,
"learning_rate": 2.0170602696863593e-05,
"loss": 3.2799,
"step": 9500
},
{
"epoch": 3.1907395767860884,
"grad_norm": 6.036308288574219,
"learning_rate": 2.0096103702600017e-05,
"loss": 3.3748,
"step": 9520
},
{
"epoch": 3.1974439555834904,
"grad_norm": 6.354751110076904,
"learning_rate": 2.0021604708336438e-05,
"loss": 3.3692,
"step": 9540
},
{
"epoch": 3.2041483343808927,
"grad_norm": 5.642934322357178,
"learning_rate": 1.9947105714072862e-05,
"loss": 3.2563,
"step": 9560
},
{
"epoch": 3.2108527131782947,
"grad_norm": 6.8418803215026855,
"learning_rate": 1.9872606719809283e-05,
"loss": 3.3056,
"step": 9580
},
{
"epoch": 3.2175570919756966,
"grad_norm": 5.866108417510986,
"learning_rate": 1.9798107725545707e-05,
"loss": 3.2476,
"step": 9600
},
{
"epoch": 3.2242614707730985,
"grad_norm": 5.497636795043945,
"learning_rate": 1.9723608731282128e-05,
"loss": 3.2634,
"step": 9620
},
{
"epoch": 3.230965849570501,
"grad_norm": 6.034823894500732,
"learning_rate": 1.9649109737018552e-05,
"loss": 3.4055,
"step": 9640
},
{
"epoch": 3.237670228367903,
"grad_norm": 7.224872589111328,
"learning_rate": 1.9574610742754973e-05,
"loss": 3.3118,
"step": 9660
},
{
"epoch": 3.244374607165305,
"grad_norm": 6.321878433227539,
"learning_rate": 1.9500111748491394e-05,
"loss": 3.3225,
"step": 9680
},
{
"epoch": 3.2510789859627067,
"grad_norm": 6.296338081359863,
"learning_rate": 1.9425612754227818e-05,
"loss": 3.349,
"step": 9700
},
{
"epoch": 3.257783364760109,
"grad_norm": 6.3536505699157715,
"learning_rate": 1.935111375996424e-05,
"loss": 3.3247,
"step": 9720
},
{
"epoch": 3.264487743557511,
"grad_norm": 5.711906433105469,
"learning_rate": 1.9276614765700667e-05,
"loss": 3.3193,
"step": 9740
},
{
"epoch": 3.271192122354913,
"grad_norm": 6.689239978790283,
"learning_rate": 1.9202115771437088e-05,
"loss": 3.272,
"step": 9760
},
{
"epoch": 3.2778965011523153,
"grad_norm": 6.301712989807129,
"learning_rate": 1.912761677717351e-05,
"loss": 3.2867,
"step": 9780
},
{
"epoch": 3.2846008799497173,
"grad_norm": 6.167557239532471,
"learning_rate": 1.9053117782909933e-05,
"loss": 3.3041,
"step": 9800
},
{
"epoch": 3.291305258747119,
"grad_norm": 6.17465353012085,
"learning_rate": 1.8978618788646354e-05,
"loss": 3.2369,
"step": 9820
},
{
"epoch": 3.298009637544521,
"grad_norm": 6.496537208557129,
"learning_rate": 1.8904119794382778e-05,
"loss": 3.3375,
"step": 9840
},
{
"epoch": 3.304714016341923,
"grad_norm": 6.527161598205566,
"learning_rate": 1.88296208001192e-05,
"loss": 3.3039,
"step": 9860
},
{
"epoch": 3.3114183951393255,
"grad_norm": 7.209779262542725,
"learning_rate": 1.8755121805855623e-05,
"loss": 3.3638,
"step": 9880
},
{
"epoch": 3.3181227739367274,
"grad_norm": 6.651127815246582,
"learning_rate": 1.8680622811592044e-05,
"loss": 3.3062,
"step": 9900
},
{
"epoch": 3.3248271527341293,
"grad_norm": 5.988480567932129,
"learning_rate": 1.8606123817328465e-05,
"loss": 3.2656,
"step": 9920
},
{
"epoch": 3.3315315315315317,
"grad_norm": 6.463028907775879,
"learning_rate": 1.853162482306489e-05,
"loss": 3.2631,
"step": 9940
},
{
"epoch": 3.3382359103289336,
"grad_norm": 5.910898685455322,
"learning_rate": 1.845712582880131e-05,
"loss": 3.2458,
"step": 9960
},
{
"epoch": 3.3449402891263356,
"grad_norm": 6.237380027770996,
"learning_rate": 1.8382626834537734e-05,
"loss": 3.32,
"step": 9980
},
{
"epoch": 3.3516446679237375,
"grad_norm": 6.681026458740234,
"learning_rate": 1.830812784027416e-05,
"loss": 3.2698,
"step": 10000
},
{
"epoch": 3.3516446679237375,
"eval_bleu_greedy": 2.2580959043747226,
"eval_loss": 0.46980682015419006,
"eval_runtime": 206.5153,
"eval_samples_per_second": 0.484,
"eval_steps_per_second": 0.484,
"step": 10000
},
{
"epoch": 3.35901948460088,
"grad_norm": 6.202412128448486,
"learning_rate": 1.823362884601058e-05,
"loss": 3.2913,
"step": 10020
},
{
"epoch": 3.365723863398282,
"grad_norm": 7.057358741760254,
"learning_rate": 1.8159129851747004e-05,
"loss": 3.2247,
"step": 10040
},
{
"epoch": 3.372428242195684,
"grad_norm": 6.253905773162842,
"learning_rate": 1.8084630857483425e-05,
"loss": 3.2799,
"step": 10060
},
{
"epoch": 3.379132620993086,
"grad_norm": 5.532748222351074,
"learning_rate": 1.801013186321985e-05,
"loss": 3.2562,
"step": 10080
},
{
"epoch": 3.385836999790488,
"grad_norm": 5.992211818695068,
"learning_rate": 1.793563286895627e-05,
"loss": 3.3089,
"step": 10100
},
{
"epoch": 3.3925413785878904,
"grad_norm": 5.832738876342773,
"learning_rate": 1.7861133874692694e-05,
"loss": 3.2162,
"step": 10120
},
{
"epoch": 3.3992457573852923,
"grad_norm": 7.200679779052734,
"learning_rate": 1.7786634880429115e-05,
"loss": 3.2683,
"step": 10140
},
{
"epoch": 3.4059501361826943,
"grad_norm": 5.809505939483643,
"learning_rate": 1.7712135886165536e-05,
"loss": 3.298,
"step": 10160
},
{
"epoch": 3.412654514980096,
"grad_norm": 6.071516513824463,
"learning_rate": 1.763763689190196e-05,
"loss": 3.289,
"step": 10180
},
{
"epoch": 3.4193588937774986,
"grad_norm": 7.4007744789123535,
"learning_rate": 1.756313789763838e-05,
"loss": 3.2438,
"step": 10200
},
{
"epoch": 3.4260632725749005,
"grad_norm": 5.917469501495361,
"learning_rate": 1.7488638903374805e-05,
"loss": 3.2932,
"step": 10220
},
{
"epoch": 3.4327676513723024,
"grad_norm": 6.20914363861084,
"learning_rate": 1.7414139909111226e-05,
"loss": 3.2827,
"step": 10240
},
{
"epoch": 3.4394720301697044,
"grad_norm": 6.218352794647217,
"learning_rate": 1.733964091484765e-05,
"loss": 3.2885,
"step": 10260
},
{
"epoch": 3.4461764089671068,
"grad_norm": 6.699190616607666,
"learning_rate": 1.7265141920584075e-05,
"loss": 3.2237,
"step": 10280
},
{
"epoch": 3.4528807877645087,
"grad_norm": 5.649641990661621,
"learning_rate": 1.7190642926320496e-05,
"loss": 3.1781,
"step": 10300
},
{
"epoch": 3.4595851665619106,
"grad_norm": 6.218810558319092,
"learning_rate": 1.711614393205692e-05,
"loss": 3.1712,
"step": 10320
},
{
"epoch": 3.466289545359313,
"grad_norm": 7.203590393066406,
"learning_rate": 1.704164493779334e-05,
"loss": 3.2399,
"step": 10340
},
{
"epoch": 3.472993924156715,
"grad_norm": 6.820786952972412,
"learning_rate": 1.6967145943529765e-05,
"loss": 3.2389,
"step": 10360
},
{
"epoch": 3.479698302954117,
"grad_norm": 6.1062798500061035,
"learning_rate": 1.6892646949266186e-05,
"loss": 3.2507,
"step": 10380
},
{
"epoch": 3.486402681751519,
"grad_norm": 6.02709436416626,
"learning_rate": 1.6818147955002607e-05,
"loss": 3.2714,
"step": 10400
},
{
"epoch": 3.493107060548921,
"grad_norm": 6.64369010925293,
"learning_rate": 1.674364896073903e-05,
"loss": 3.2319,
"step": 10420
},
{
"epoch": 3.499811439346323,
"grad_norm": 5.996265888214111,
"learning_rate": 1.6669149966475452e-05,
"loss": 3.221,
"step": 10440
},
{
"epoch": 3.506515818143725,
"grad_norm": 6.68301248550415,
"learning_rate": 1.6594650972211876e-05,
"loss": 3.2714,
"step": 10460
},
{
"epoch": 3.513220196941127,
"grad_norm": 5.8294878005981445,
"learning_rate": 1.6520151977948297e-05,
"loss": 3.2909,
"step": 10480
},
{
"epoch": 3.5199245757385293,
"grad_norm": 6.585033893585205,
"learning_rate": 1.644565298368472e-05,
"loss": 3.2091,
"step": 10500
},
{
"epoch": 3.5266289545359313,
"grad_norm": 6.6064934730529785,
"learning_rate": 1.6371153989421142e-05,
"loss": 3.2033,
"step": 10520
},
{
"epoch": 3.533333333333333,
"grad_norm": 6.687121868133545,
"learning_rate": 1.6296654995157566e-05,
"loss": 3.2532,
"step": 10540
},
{
"epoch": 3.5400377121307356,
"grad_norm": 7.002409934997559,
"learning_rate": 1.622215600089399e-05,
"loss": 3.2823,
"step": 10560
},
{
"epoch": 3.5467420909281375,
"grad_norm": 6.823233127593994,
"learning_rate": 1.614765700663041e-05,
"loss": 3.3175,
"step": 10580
},
{
"epoch": 3.5534464697255395,
"grad_norm": 5.88700532913208,
"learning_rate": 1.6073158012366836e-05,
"loss": 3.2122,
"step": 10600
},
{
"epoch": 3.5601508485229414,
"grad_norm": 6.105057716369629,
"learning_rate": 1.5998659018103257e-05,
"loss": 3.1478,
"step": 10620
},
{
"epoch": 3.5668552273203433,
"grad_norm": 6.8328022956848145,
"learning_rate": 1.592416002383968e-05,
"loss": 3.2097,
"step": 10640
},
{
"epoch": 3.5735596061177457,
"grad_norm": 6.577600002288818,
"learning_rate": 1.5849661029576102e-05,
"loss": 3.1664,
"step": 10660
},
{
"epoch": 3.5802639849151476,
"grad_norm": 7.270109176635742,
"learning_rate": 1.5775162035312523e-05,
"loss": 3.1901,
"step": 10680
},
{
"epoch": 3.5869683637125496,
"grad_norm": 6.4257683753967285,
"learning_rate": 1.5700663041048947e-05,
"loss": 3.226,
"step": 10700
},
{
"epoch": 3.593672742509952,
"grad_norm": 5.963393688201904,
"learning_rate": 1.5626164046785368e-05,
"loss": 3.2249,
"step": 10720
},
{
"epoch": 3.600377121307354,
"grad_norm": 6.7239766120910645,
"learning_rate": 1.5551665052521792e-05,
"loss": 3.2575,
"step": 10740
},
{
"epoch": 3.607081500104756,
"grad_norm": 6.665550231933594,
"learning_rate": 1.5477166058258213e-05,
"loss": 3.2503,
"step": 10760
},
{
"epoch": 3.613785878902158,
"grad_norm": 5.777255058288574,
"learning_rate": 1.5402667063994637e-05,
"loss": 3.2054,
"step": 10780
},
{
"epoch": 3.62049025769956,
"grad_norm": 6.491949558258057,
"learning_rate": 1.5328168069731058e-05,
"loss": 3.1708,
"step": 10800
},
{
"epoch": 3.627194636496962,
"grad_norm": 6.6252665519714355,
"learning_rate": 1.5253669075467482e-05,
"loss": 3.2418,
"step": 10820
},
{
"epoch": 3.633899015294364,
"grad_norm": 6.817688941955566,
"learning_rate": 1.5179170081203905e-05,
"loss": 3.259,
"step": 10840
},
{
"epoch": 3.640603394091766,
"grad_norm": 6.675405025482178,
"learning_rate": 1.5104671086940328e-05,
"loss": 3.218,
"step": 10860
},
{
"epoch": 3.6473077728891683,
"grad_norm": 6.145236015319824,
"learning_rate": 1.503017209267675e-05,
"loss": 3.2562,
"step": 10880
},
{
"epoch": 3.6540121516865702,
"grad_norm": 6.741012096405029,
"learning_rate": 1.4955673098413173e-05,
"loss": 3.2195,
"step": 10900
},
{
"epoch": 3.660716530483972,
"grad_norm": 6.519981861114502,
"learning_rate": 1.4881174104149595e-05,
"loss": 3.2343,
"step": 10920
},
{
"epoch": 3.6674209092813745,
"grad_norm": 5.3957648277282715,
"learning_rate": 1.4806675109886018e-05,
"loss": 3.2527,
"step": 10940
},
{
"epoch": 3.6741252880787765,
"grad_norm": 7.068119525909424,
"learning_rate": 1.473217611562244e-05,
"loss": 3.1907,
"step": 10960
},
{
"epoch": 3.6808296668761784,
"grad_norm": 6.537194728851318,
"learning_rate": 1.4657677121358861e-05,
"loss": 3.1773,
"step": 10980
},
{
"epoch": 3.687534045673581,
"grad_norm": 6.715285778045654,
"learning_rate": 1.4583178127095284e-05,
"loss": 3.2272,
"step": 11000
},
{
"epoch": 3.687534045673581,
"eval_bleu_greedy": 2.3251290926774186,
"eval_loss": 0.46657735109329224,
"eval_runtime": 464.198,
"eval_samples_per_second": 0.215,
"eval_steps_per_second": 0.215,
"step": 11000
},
{
"epoch": 3.6942384244709827,
"grad_norm": 6.894411087036133,
"learning_rate": 1.4508679132831707e-05,
"loss": 3.1783,
"step": 11020
},
{
"epoch": 3.7009428032683847,
"grad_norm": 6.030000686645508,
"learning_rate": 1.4434180138568129e-05,
"loss": 3.2033,
"step": 11040
},
{
"epoch": 3.7076471820657866,
"grad_norm": 6.902273654937744,
"learning_rate": 1.4359681144304552e-05,
"loss": 3.1508,
"step": 11060
},
{
"epoch": 3.7143515608631885,
"grad_norm": 6.1826934814453125,
"learning_rate": 1.4285182150040974e-05,
"loss": 3.2302,
"step": 11080
},
{
"epoch": 3.721055939660591,
"grad_norm": 7.401595592498779,
"learning_rate": 1.4210683155777399e-05,
"loss": 3.2139,
"step": 11100
},
{
"epoch": 3.727760318457993,
"grad_norm": 6.83165168762207,
"learning_rate": 1.4136184161513821e-05,
"loss": 3.1857,
"step": 11120
},
{
"epoch": 3.7344646972553948,
"grad_norm": 6.816708087921143,
"learning_rate": 1.4061685167250244e-05,
"loss": 3.2377,
"step": 11140
},
{
"epoch": 3.741169076052797,
"grad_norm": 5.8378987312316895,
"learning_rate": 1.3987186172986666e-05,
"loss": 3.2827,
"step": 11160
},
{
"epoch": 3.747873454850199,
"grad_norm": 7.595831394195557,
"learning_rate": 1.3912687178723089e-05,
"loss": 3.237,
"step": 11180
},
{
"epoch": 3.754577833647601,
"grad_norm": 6.578879356384277,
"learning_rate": 1.3838188184459511e-05,
"loss": 3.2416,
"step": 11200
},
{
"epoch": 3.7612822124450034,
"grad_norm": 5.939640522003174,
"learning_rate": 1.3763689190195932e-05,
"loss": 3.1734,
"step": 11220
},
{
"epoch": 3.7679865912424053,
"grad_norm": 6.304750442504883,
"learning_rate": 1.3689190195932355e-05,
"loss": 3.2416,
"step": 11240
},
{
"epoch": 3.7746909700398072,
"grad_norm": 6.177415370941162,
"learning_rate": 1.3614691201668777e-05,
"loss": 3.1675,
"step": 11260
},
{
"epoch": 3.781395348837209,
"grad_norm": 6.993617057800293,
"learning_rate": 1.35401922074052e-05,
"loss": 3.2162,
"step": 11280
},
{
"epoch": 3.788099727634611,
"grad_norm": 6.1251726150512695,
"learning_rate": 1.3465693213141623e-05,
"loss": 3.2232,
"step": 11300
},
{
"epoch": 3.7948041064320135,
"grad_norm": 6.485012054443359,
"learning_rate": 1.3391194218878045e-05,
"loss": 3.1591,
"step": 11320
},
{
"epoch": 3.8015084852294154,
"grad_norm": 6.347079277038574,
"learning_rate": 1.3316695224614468e-05,
"loss": 3.1745,
"step": 11340
},
{
"epoch": 3.8082128640268174,
"grad_norm": 7.0095744132995605,
"learning_rate": 1.324219623035089e-05,
"loss": 3.2463,
"step": 11360
},
{
"epoch": 3.8149172428242197,
"grad_norm": 6.158694267272949,
"learning_rate": 1.3167697236087315e-05,
"loss": 3.223,
"step": 11380
},
{
"epoch": 3.8216216216216217,
"grad_norm": 7.0430827140808105,
"learning_rate": 1.3093198241823737e-05,
"loss": 3.1774,
"step": 11400
},
{
"epoch": 3.8283260004190236,
"grad_norm": 6.411921501159668,
"learning_rate": 1.301869924756016e-05,
"loss": 3.2934,
"step": 11420
},
{
"epoch": 3.835030379216426,
"grad_norm": 6.355661392211914,
"learning_rate": 1.2944200253296582e-05,
"loss": 3.129,
"step": 11440
},
{
"epoch": 3.841734758013828,
"grad_norm": 5.618327617645264,
"learning_rate": 1.2869701259033005e-05,
"loss": 3.138,
"step": 11460
},
{
"epoch": 3.84843913681123,
"grad_norm": 6.159928321838379,
"learning_rate": 1.2795202264769426e-05,
"loss": 3.2041,
"step": 11480
},
{
"epoch": 3.8551435156086318,
"grad_norm": 7.234489917755127,
"learning_rate": 1.2720703270505848e-05,
"loss": 3.2447,
"step": 11500
},
{
"epoch": 3.8618478944060337,
"grad_norm": 6.748493671417236,
"learning_rate": 1.2646204276242271e-05,
"loss": 3.2015,
"step": 11520
},
{
"epoch": 3.868552273203436,
"grad_norm": 6.751996040344238,
"learning_rate": 1.2571705281978693e-05,
"loss": 3.1563,
"step": 11540
},
{
"epoch": 3.875256652000838,
"grad_norm": 6.8070783615112305,
"learning_rate": 1.2497206287715116e-05,
"loss": 3.1416,
"step": 11560
},
{
"epoch": 3.88196103079824,
"grad_norm": 6.117493152618408,
"learning_rate": 1.242270729345154e-05,
"loss": 3.1295,
"step": 11580
},
{
"epoch": 3.8886654095956423,
"grad_norm": 6.02462100982666,
"learning_rate": 1.2348208299187961e-05,
"loss": 3.2225,
"step": 11600
},
{
"epoch": 3.8953697883930443,
"grad_norm": 5.968542575836182,
"learning_rate": 1.2273709304924384e-05,
"loss": 3.175,
"step": 11620
},
{
"epoch": 3.902074167190446,
"grad_norm": 7.16673469543457,
"learning_rate": 1.2199210310660806e-05,
"loss": 3.2582,
"step": 11640
},
{
"epoch": 3.9087785459878486,
"grad_norm": 6.56205415725708,
"learning_rate": 1.2124711316397229e-05,
"loss": 3.2285,
"step": 11660
},
{
"epoch": 3.9154829247852505,
"grad_norm": 6.418534755706787,
"learning_rate": 1.2050212322133651e-05,
"loss": 3.2023,
"step": 11680
},
{
"epoch": 3.9221873035826524,
"grad_norm": 6.226400852203369,
"learning_rate": 1.1975713327870074e-05,
"loss": 3.1821,
"step": 11700
},
{
"epoch": 3.9288916823800544,
"grad_norm": 6.837843894958496,
"learning_rate": 1.1901214333606497e-05,
"loss": 3.1677,
"step": 11720
},
{
"epoch": 3.9355960611774563,
"grad_norm": 5.950616836547852,
"learning_rate": 1.182671533934292e-05,
"loss": 3.1856,
"step": 11740
},
{
"epoch": 3.9423004399748587,
"grad_norm": 6.373692035675049,
"learning_rate": 1.1752216345079342e-05,
"loss": 3.2501,
"step": 11760
},
{
"epoch": 3.9490048187722606,
"grad_norm": 6.721376895904541,
"learning_rate": 1.1677717350815764e-05,
"loss": 3.2044,
"step": 11780
},
{
"epoch": 3.9557091975696625,
"grad_norm": 6.181844711303711,
"learning_rate": 1.1603218356552187e-05,
"loss": 3.1938,
"step": 11800
},
{
"epoch": 3.962413576367065,
"grad_norm": 6.3947577476501465,
"learning_rate": 1.152871936228861e-05,
"loss": 3.1984,
"step": 11820
},
{
"epoch": 3.969117955164467,
"grad_norm": 6.491850852966309,
"learning_rate": 1.1454220368025032e-05,
"loss": 3.1825,
"step": 11840
},
{
"epoch": 3.975822333961869,
"grad_norm": 6.11356782913208,
"learning_rate": 1.1379721373761455e-05,
"loss": 3.1174,
"step": 11860
},
{
"epoch": 3.982526712759271,
"grad_norm": 7.591030597686768,
"learning_rate": 1.1305222379497877e-05,
"loss": 3.2322,
"step": 11880
},
{
"epoch": 3.989231091556673,
"grad_norm": 5.719244956970215,
"learning_rate": 1.12307233852343e-05,
"loss": 3.1449,
"step": 11900
},
{
"epoch": 3.995935470354075,
"grad_norm": 6.756486892700195,
"learning_rate": 1.1156224390970722e-05,
"loss": 3.1296,
"step": 11920
},
{
"epoch": 4.002681751518961,
"grad_norm": 6.4487528800964355,
"learning_rate": 1.1081725396707145e-05,
"loss": 3.259,
"step": 11940
},
{
"epoch": 4.009386130316363,
"grad_norm": 6.847874641418457,
"learning_rate": 1.1007226402443568e-05,
"loss": 3.1175,
"step": 11960
},
{
"epoch": 4.016090509113765,
"grad_norm": 6.221479892730713,
"learning_rate": 1.093272740817999e-05,
"loss": 3.1395,
"step": 11980
},
{
"epoch": 4.022794887911167,
"grad_norm": 7.619890213012695,
"learning_rate": 1.0858228413916413e-05,
"loss": 3.0701,
"step": 12000
},
{
"epoch": 4.022794887911167,
"eval_bleu_greedy": 2.2056340612259895,
"eval_loss": 0.4672054350376129,
"eval_runtime": 519.4009,
"eval_samples_per_second": 0.193,
"eval_steps_per_second": 0.193,
"step": 12000
},
{
"epoch": 4.029499266708569,
"grad_norm": 6.872377872467041,
"learning_rate": 1.0783729419652835e-05,
"loss": 3.1056,
"step": 12020
},
{
"epoch": 4.036203645505971,
"grad_norm": 6.781915664672852,
"learning_rate": 1.0709230425389258e-05,
"loss": 3.0394,
"step": 12040
},
{
"epoch": 4.042908024303373,
"grad_norm": 6.772296905517578,
"learning_rate": 1.063473143112568e-05,
"loss": 3.1418,
"step": 12060
},
{
"epoch": 4.049612403100776,
"grad_norm": 5.7719011306762695,
"learning_rate": 1.0560232436862103e-05,
"loss": 2.9814,
"step": 12080
},
{
"epoch": 4.056316781898177,
"grad_norm": 6.217945575714111,
"learning_rate": 1.0485733442598526e-05,
"loss": 3.1063,
"step": 12100
},
{
"epoch": 4.0630211606955795,
"grad_norm": 7.190448760986328,
"learning_rate": 1.0411234448334948e-05,
"loss": 3.1758,
"step": 12120
},
{
"epoch": 4.069725539492981,
"grad_norm": 7.15440559387207,
"learning_rate": 1.033673545407137e-05,
"loss": 3.1914,
"step": 12140
},
{
"epoch": 4.076429918290383,
"grad_norm": 6.738652229309082,
"learning_rate": 1.0262236459807793e-05,
"loss": 2.9985,
"step": 12160
},
{
"epoch": 4.083134297087786,
"grad_norm": 6.6931962966918945,
"learning_rate": 1.0187737465544216e-05,
"loss": 3.1319,
"step": 12180
},
{
"epoch": 4.089838675885187,
"grad_norm": 7.118000507354736,
"learning_rate": 1.0113238471280638e-05,
"loss": 3.1749,
"step": 12200
},
{
"epoch": 4.09654305468259,
"grad_norm": 6.880594730377197,
"learning_rate": 1.0038739477017061e-05,
"loss": 3.151,
"step": 12220
},
{
"epoch": 4.103247433479992,
"grad_norm": 6.61147403717041,
"learning_rate": 9.964240482753482e-06,
"loss": 3.0578,
"step": 12240
},
{
"epoch": 4.1099518122773935,
"grad_norm": 8.498011589050293,
"learning_rate": 9.889741488489906e-06,
"loss": 3.0902,
"step": 12260
},
{
"epoch": 4.116656191074796,
"grad_norm": 7.078530788421631,
"learning_rate": 9.815242494226329e-06,
"loss": 3.1134,
"step": 12280
},
{
"epoch": 4.123360569872197,
"grad_norm": 7.098989486694336,
"learning_rate": 9.740743499962751e-06,
"loss": 3.0835,
"step": 12300
},
{
"epoch": 4.1300649486696,
"grad_norm": 6.889624118804932,
"learning_rate": 9.666244505699174e-06,
"loss": 3.0836,
"step": 12320
},
{
"epoch": 4.136769327467002,
"grad_norm": 6.000241279602051,
"learning_rate": 9.591745511435596e-06,
"loss": 3.1416,
"step": 12340
},
{
"epoch": 4.143473706264404,
"grad_norm": 7.567933559417725,
"learning_rate": 9.517246517172019e-06,
"loss": 3.2006,
"step": 12360
},
{
"epoch": 4.150178085061806,
"grad_norm": 6.100574493408203,
"learning_rate": 9.44274752290844e-06,
"loss": 3.0368,
"step": 12380
},
{
"epoch": 4.156882463859208,
"grad_norm": 6.519239902496338,
"learning_rate": 9.368248528644864e-06,
"loss": 3.1715,
"step": 12400
},
{
"epoch": 4.16358684265661,
"grad_norm": 7.143859386444092,
"learning_rate": 9.293749534381287e-06,
"loss": 3.1236,
"step": 12420
},
{
"epoch": 4.170291221454012,
"grad_norm": 7.759309768676758,
"learning_rate": 9.21925054011771e-06,
"loss": 3.1434,
"step": 12440
},
{
"epoch": 4.176995600251415,
"grad_norm": 6.477195739746094,
"learning_rate": 9.144751545854132e-06,
"loss": 3.1094,
"step": 12460
},
{
"epoch": 4.183699979048816,
"grad_norm": 6.8250813484191895,
"learning_rate": 9.070252551590554e-06,
"loss": 3.0832,
"step": 12480
},
{
"epoch": 4.1904043578462185,
"grad_norm": 6.915088176727295,
"learning_rate": 8.995753557326975e-06,
"loss": 3.1025,
"step": 12500
},
{
"epoch": 4.19710873664362,
"grad_norm": 6.240241527557373,
"learning_rate": 8.921254563063398e-06,
"loss": 3.1142,
"step": 12520
},
{
"epoch": 4.203813115441022,
"grad_norm": 5.7193522453308105,
"learning_rate": 8.846755568799822e-06,
"loss": 3.0886,
"step": 12540
},
{
"epoch": 4.210517494238425,
"grad_norm": 6.799840927124023,
"learning_rate": 8.772256574536245e-06,
"loss": 3.0755,
"step": 12560
},
{
"epoch": 4.217221873035826,
"grad_norm": 6.537818908691406,
"learning_rate": 8.697757580272667e-06,
"loss": 3.1854,
"step": 12580
},
{
"epoch": 4.223926251833229,
"grad_norm": 6.453887939453125,
"learning_rate": 8.62325858600909e-06,
"loss": 3.1353,
"step": 12600
},
{
"epoch": 4.230630630630631,
"grad_norm": 6.519958019256592,
"learning_rate": 8.54875959174551e-06,
"loss": 3.0728,
"step": 12620
},
{
"epoch": 4.237335009428032,
"grad_norm": 6.916313648223877,
"learning_rate": 8.474260597481933e-06,
"loss": 3.1365,
"step": 12640
},
{
"epoch": 4.244039388225435,
"grad_norm": 6.080234527587891,
"learning_rate": 8.399761603218358e-06,
"loss": 3.1026,
"step": 12660
},
{
"epoch": 4.250743767022837,
"grad_norm": 6.199918746948242,
"learning_rate": 8.32526260895478e-06,
"loss": 3.1174,
"step": 12680
},
{
"epoch": 4.257448145820239,
"grad_norm": 6.077798843383789,
"learning_rate": 8.250763614691203e-06,
"loss": 3.1353,
"step": 12700
},
{
"epoch": 4.264152524617641,
"grad_norm": 7.003963947296143,
"learning_rate": 8.176264620427625e-06,
"loss": 3.1721,
"step": 12720
},
{
"epoch": 4.2708569034150425,
"grad_norm": 7.276467323303223,
"learning_rate": 8.101765626164046e-06,
"loss": 3.1403,
"step": 12740
},
{
"epoch": 4.277561282212445,
"grad_norm": 6.939758777618408,
"learning_rate": 8.027266631900469e-06,
"loss": 3.084,
"step": 12760
},
{
"epoch": 4.284265661009847,
"grad_norm": 6.425601959228516,
"learning_rate": 7.952767637636891e-06,
"loss": 3.1602,
"step": 12780
},
{
"epoch": 4.290970039807249,
"grad_norm": 6.354540824890137,
"learning_rate": 7.878268643373316e-06,
"loss": 3.0745,
"step": 12800
},
{
"epoch": 4.297674418604651,
"grad_norm": 6.399191856384277,
"learning_rate": 7.803769649109738e-06,
"loss": 3.0546,
"step": 12820
},
{
"epoch": 4.3043787974020535,
"grad_norm": 7.009991645812988,
"learning_rate": 7.72927065484616e-06,
"loss": 3.1064,
"step": 12840
},
{
"epoch": 4.311083176199455,
"grad_norm": 6.549511432647705,
"learning_rate": 7.654771660582582e-06,
"loss": 3.119,
"step": 12860
},
{
"epoch": 4.317787554996857,
"grad_norm": 6.825671672821045,
"learning_rate": 7.580272666319005e-06,
"loss": 3.0588,
"step": 12880
},
{
"epoch": 4.32449193379426,
"grad_norm": 6.320077896118164,
"learning_rate": 7.505773672055427e-06,
"loss": 3.1293,
"step": 12900
},
{
"epoch": 4.331196312591661,
"grad_norm": 6.614448070526123,
"learning_rate": 7.4312746777918494e-06,
"loss": 3.0686,
"step": 12920
},
{
"epoch": 4.337900691389064,
"grad_norm": 6.515445709228516,
"learning_rate": 7.356775683528274e-06,
"loss": 3.0706,
"step": 12940
},
{
"epoch": 4.344605070186465,
"grad_norm": 7.301309585571289,
"learning_rate": 7.2822766892646954e-06,
"loss": 3.0647,
"step": 12960
},
{
"epoch": 4.3513094489838675,
"grad_norm": 7.263702869415283,
"learning_rate": 7.207777695001118e-06,
"loss": 3.1394,
"step": 12980
},
{
"epoch": 4.35801382778127,
"grad_norm": 6.944880962371826,
"learning_rate": 7.133278700737541e-06,
"loss": 3.1024,
"step": 13000
},
{
"epoch": 4.35801382778127,
"eval_bleu_greedy": 2.3552626321962884,
"eval_loss": 0.4624796211719513,
"eval_runtime": 534.5676,
"eval_samples_per_second": 0.187,
"eval_steps_per_second": 0.187,
"step": 13000
},
{
"epoch": 4.364718206578671,
"grad_norm": 6.3444414138793945,
"learning_rate": 7.058779706473963e-06,
"loss": 3.0579,
"step": 13020
},
{
"epoch": 4.371422585376074,
"grad_norm": 7.333340167999268,
"learning_rate": 6.984280712210385e-06,
"loss": 3.0466,
"step": 13040
},
{
"epoch": 4.378126964173476,
"grad_norm": 7.396731376647949,
"learning_rate": 6.9097817179468075e-06,
"loss": 3.0913,
"step": 13060
},
{
"epoch": 4.384831342970878,
"grad_norm": 7.228068828582764,
"learning_rate": 6.835282723683231e-06,
"loss": 3.0371,
"step": 13080
},
{
"epoch": 4.39153572176828,
"grad_norm": 7.033063888549805,
"learning_rate": 6.7607837294196535e-06,
"loss": 3.1139,
"step": 13100
},
{
"epoch": 4.398240100565682,
"grad_norm": 7.278767108917236,
"learning_rate": 6.686284735156076e-06,
"loss": 3.0944,
"step": 13120
},
{
"epoch": 4.404944479363084,
"grad_norm": 7.024720191955566,
"learning_rate": 6.611785740892499e-06,
"loss": 3.1348,
"step": 13140
},
{
"epoch": 4.411648858160486,
"grad_norm": 7.331086158752441,
"learning_rate": 6.53728674662892e-06,
"loss": 3.0433,
"step": 13160
},
{
"epoch": 4.418353236957888,
"grad_norm": 6.916625499725342,
"learning_rate": 6.462787752365343e-06,
"loss": 3.0803,
"step": 13180
},
{
"epoch": 4.42505761575529,
"grad_norm": 6.226632595062256,
"learning_rate": 6.3882887581017655e-06,
"loss": 3.1699,
"step": 13200
},
{
"epoch": 4.4317619945526925,
"grad_norm": 6.50321626663208,
"learning_rate": 6.313789763838189e-06,
"loss": 3.0663,
"step": 13220
},
{
"epoch": 4.438466373350094,
"grad_norm": 7.217479705810547,
"learning_rate": 6.239290769574611e-06,
"loss": 3.0969,
"step": 13240
},
{
"epoch": 4.445170752147496,
"grad_norm": 7.183232307434082,
"learning_rate": 6.164791775311034e-06,
"loss": 3.0768,
"step": 13260
},
{
"epoch": 4.451875130944899,
"grad_norm": 6.6673102378845215,
"learning_rate": 6.090292781047456e-06,
"loss": 3.1062,
"step": 13280
},
{
"epoch": 4.4585795097423,
"grad_norm": 7.2146525382995605,
"learning_rate": 6.015793786783878e-06,
"loss": 3.1092,
"step": 13300
},
{
"epoch": 4.465283888539703,
"grad_norm": 7.235128402709961,
"learning_rate": 5.941294792520302e-06,
"loss": 3.0743,
"step": 13320
},
{
"epoch": 4.471988267337105,
"grad_norm": 6.275545597076416,
"learning_rate": 5.8667957982567235e-06,
"loss": 3.0743,
"step": 13340
},
{
"epoch": 4.4786926461345065,
"grad_norm": 6.9439473152160645,
"learning_rate": 5.792296803993146e-06,
"loss": 3.1373,
"step": 13360
},
{
"epoch": 4.485397024931909,
"grad_norm": 6.065330982208252,
"learning_rate": 5.7177978097295695e-06,
"loss": 3.1196,
"step": 13380
},
{
"epoch": 4.49210140372931,
"grad_norm": 6.784725666046143,
"learning_rate": 5.643298815465991e-06,
"loss": 3.1024,
"step": 13400
},
{
"epoch": 4.498805782526713,
"grad_norm": 6.992110729217529,
"learning_rate": 5.568799821202414e-06,
"loss": 3.2035,
"step": 13420
},
{
"epoch": 4.505510161324115,
"grad_norm": 6.317196846008301,
"learning_rate": 5.494300826938836e-06,
"loss": 3.1222,
"step": 13440
},
{
"epoch": 4.512214540121517,
"grad_norm": 6.293644428253174,
"learning_rate": 5.419801832675259e-06,
"loss": 3.1413,
"step": 13460
},
{
"epoch": 4.518918918918919,
"grad_norm": 7.206140518188477,
"learning_rate": 5.3453028384116815e-06,
"loss": 3.125,
"step": 13480
},
{
"epoch": 4.525623297716321,
"grad_norm": 6.770303726196289,
"learning_rate": 5.270803844148104e-06,
"loss": 3.1346,
"step": 13500
},
{
"epoch": 4.532327676513723,
"grad_norm": 7.2229485511779785,
"learning_rate": 5.196304849884527e-06,
"loss": 3.1146,
"step": 13520
},
{
"epoch": 4.539032055311125,
"grad_norm": 7.110487461090088,
"learning_rate": 5.121805855620949e-06,
"loss": 3.1114,
"step": 13540
},
{
"epoch": 4.545736434108527,
"grad_norm": 7.351033687591553,
"learning_rate": 5.047306861357372e-06,
"loss": 3.0575,
"step": 13560
},
{
"epoch": 4.552440812905929,
"grad_norm": 6.950778484344482,
"learning_rate": 4.972807867093794e-06,
"loss": 3.0641,
"step": 13580
},
{
"epoch": 4.559145191703331,
"grad_norm": 6.550965785980225,
"learning_rate": 4.898308872830217e-06,
"loss": 3.1761,
"step": 13600
},
{
"epoch": 4.565849570500733,
"grad_norm": 6.709011554718018,
"learning_rate": 4.8238098785666396e-06,
"loss": 2.9988,
"step": 13620
},
{
"epoch": 4.572553949298135,
"grad_norm": 6.507779598236084,
"learning_rate": 4.749310884303062e-06,
"loss": 3.016,
"step": 13640
},
{
"epoch": 4.579258328095538,
"grad_norm": 6.363673210144043,
"learning_rate": 4.674811890039485e-06,
"loss": 3.0496,
"step": 13660
},
{
"epoch": 4.585962706892939,
"grad_norm": 6.965389251708984,
"learning_rate": 4.600312895775907e-06,
"loss": 3.1518,
"step": 13680
},
{
"epoch": 4.5926670856903415,
"grad_norm": 6.091116905212402,
"learning_rate": 4.52581390151233e-06,
"loss": 3.0881,
"step": 13700
},
{
"epoch": 4.599371464487744,
"grad_norm": 7.049524784088135,
"learning_rate": 4.4513149072487524e-06,
"loss": 3.1202,
"step": 13720
},
{
"epoch": 4.606075843285145,
"grad_norm": 6.323545932769775,
"learning_rate": 4.376815912985175e-06,
"loss": 3.0242,
"step": 13740
},
{
"epoch": 4.612780222082548,
"grad_norm": 7.295837879180908,
"learning_rate": 4.3023169187215976e-06,
"loss": 3.1243,
"step": 13760
},
{
"epoch": 4.61948460087995,
"grad_norm": 6.582053184509277,
"learning_rate": 4.22781792445802e-06,
"loss": 3.0687,
"step": 13780
},
{
"epoch": 4.626188979677352,
"grad_norm": 6.175601959228516,
"learning_rate": 4.153318930194443e-06,
"loss": 3.0427,
"step": 13800
},
{
"epoch": 4.632893358474754,
"grad_norm": 7.662842273712158,
"learning_rate": 4.078819935930865e-06,
"loss": 3.1979,
"step": 13820
},
{
"epoch": 4.6395977372721555,
"grad_norm": 7.036664009094238,
"learning_rate": 4.004320941667288e-06,
"loss": 3.1559,
"step": 13840
},
{
"epoch": 4.646302116069558,
"grad_norm": 5.965688228607178,
"learning_rate": 3.9298219474037105e-06,
"loss": 3.1257,
"step": 13860
},
{
"epoch": 4.65300649486696,
"grad_norm": 6.378177165985107,
"learning_rate": 3.855322953140133e-06,
"loss": 3.1514,
"step": 13880
},
{
"epoch": 4.659710873664362,
"grad_norm": 6.670738220214844,
"learning_rate": 3.780823958876555e-06,
"loss": 3.0992,
"step": 13900
},
{
"epoch": 4.666415252461764,
"grad_norm": 6.76698112487793,
"learning_rate": 3.7063249646129778e-06,
"loss": 3.1566,
"step": 13920
},
{
"epoch": 4.6731196312591665,
"grad_norm": 6.328171253204346,
"learning_rate": 3.6318259703494007e-06,
"loss": 3.0974,
"step": 13940
},
{
"epoch": 4.679824010056568,
"grad_norm": 7.151896953582764,
"learning_rate": 3.557326976085823e-06,
"loss": 3.1234,
"step": 13960
},
{
"epoch": 4.68652838885397,
"grad_norm": 6.95003080368042,
"learning_rate": 3.4828279818222455e-06,
"loss": 3.156,
"step": 13980
},
{
"epoch": 4.693232767651372,
"grad_norm": 7.1711931228637695,
"learning_rate": 3.4083289875586685e-06,
"loss": 3.0428,
"step": 14000
},
{
"epoch": 4.693232767651372,
"eval_bleu_greedy": 2.416340135494281,
"eval_loss": 0.45956096053123474,
"eval_runtime": 314.8396,
"eval_samples_per_second": 0.318,
"eval_steps_per_second": 0.318,
"step": 14000
},
{
"epoch": 4.699937146448774,
"grad_norm": 6.529189109802246,
"learning_rate": 3.3338299932950906e-06,
"loss": 3.1417,
"step": 14020
},
{
"epoch": 4.706641525246177,
"grad_norm": 7.026646137237549,
"learning_rate": 3.259330999031513e-06,
"loss": 3.1099,
"step": 14040
},
{
"epoch": 4.713345904043578,
"grad_norm": 6.561285972595215,
"learning_rate": 3.1848320047679354e-06,
"loss": 3.0675,
"step": 14060
},
{
"epoch": 4.7200502828409805,
"grad_norm": 7.228313446044922,
"learning_rate": 3.1103330105043583e-06,
"loss": 3.0952,
"step": 14080
},
{
"epoch": 4.726754661638383,
"grad_norm": 6.70543098449707,
"learning_rate": 3.035834016240781e-06,
"loss": 3.1087,
"step": 14100
},
{
"epoch": 4.733459040435784,
"grad_norm": 6.371490478515625,
"learning_rate": 2.9613350219772035e-06,
"loss": 3.0608,
"step": 14120
},
{
"epoch": 4.740163419233187,
"grad_norm": 6.534164905548096,
"learning_rate": 2.886836027713626e-06,
"loss": 3.055,
"step": 14140
},
{
"epoch": 4.746867798030589,
"grad_norm": 6.988217353820801,
"learning_rate": 2.8123370334500486e-06,
"loss": 3.066,
"step": 14160
},
{
"epoch": 4.753572176827991,
"grad_norm": 7.489045143127441,
"learning_rate": 2.737838039186471e-06,
"loss": 3.1232,
"step": 14180
},
{
"epoch": 4.760276555625393,
"grad_norm": 6.6933512687683105,
"learning_rate": 2.663339044922894e-06,
"loss": 3.0314,
"step": 14200
},
{
"epoch": 4.766980934422795,
"grad_norm": 6.849923133850098,
"learning_rate": 2.5888400506593164e-06,
"loss": 3.0855,
"step": 14220
},
{
"epoch": 4.773685313220197,
"grad_norm": 6.958053112030029,
"learning_rate": 2.5143410563957385e-06,
"loss": 3.085,
"step": 14240
},
{
"epoch": 4.780389692017599,
"grad_norm": 7.325470924377441,
"learning_rate": 2.4398420621321615e-06,
"loss": 3.1435,
"step": 14260
},
{
"epoch": 4.787094070815001,
"grad_norm": 6.421871185302734,
"learning_rate": 2.3653430678685837e-06,
"loss": 3.0735,
"step": 14280
},
{
"epoch": 4.793798449612403,
"grad_norm": 6.636096000671387,
"learning_rate": 2.2908440736050062e-06,
"loss": 3.083,
"step": 14300
},
{
"epoch": 4.8005028284098055,
"grad_norm": 7.074666976928711,
"learning_rate": 2.2163450793414292e-06,
"loss": 3.0699,
"step": 14320
},
{
"epoch": 4.807207207207207,
"grad_norm": 6.859719276428223,
"learning_rate": 2.1418460850778514e-06,
"loss": 3.126,
"step": 14340
},
{
"epoch": 4.813911586004609,
"grad_norm": 7.162552833557129,
"learning_rate": 2.0673470908142744e-06,
"loss": 3.066,
"step": 14360
},
{
"epoch": 4.820615964802011,
"grad_norm": 6.949527740478516,
"learning_rate": 1.9928480965506965e-06,
"loss": 3.0962,
"step": 14380
},
{
"epoch": 4.827320343599413,
"grad_norm": 6.28379487991333,
"learning_rate": 1.918349102287119e-06,
"loss": 3.0169,
"step": 14400
},
{
"epoch": 4.834024722396816,
"grad_norm": 6.530064582824707,
"learning_rate": 1.843850108023542e-06,
"loss": 3.0847,
"step": 14420
},
{
"epoch": 4.840729101194217,
"grad_norm": 7.0545783042907715,
"learning_rate": 1.7693511137599643e-06,
"loss": 3.1926,
"step": 14440
},
{
"epoch": 4.847433479991619,
"grad_norm": 6.212683200836182,
"learning_rate": 1.694852119496387e-06,
"loss": 3.0994,
"step": 14460
},
{
"epoch": 4.854137858789022,
"grad_norm": 6.651175498962402,
"learning_rate": 1.6203531252328094e-06,
"loss": 3.115,
"step": 14480
},
{
"epoch": 4.860842237586423,
"grad_norm": 6.536131381988525,
"learning_rate": 1.545854130969232e-06,
"loss": 3.0519,
"step": 14500
},
{
"epoch": 4.867546616383826,
"grad_norm": 6.119905948638916,
"learning_rate": 1.4713551367056546e-06,
"loss": 3.097,
"step": 14520
},
{
"epoch": 4.874250995181228,
"grad_norm": 7.160987854003906,
"learning_rate": 1.3968561424420771e-06,
"loss": 3.1391,
"step": 14540
},
{
"epoch": 4.8809553739786296,
"grad_norm": 6.599812984466553,
"learning_rate": 1.3223571481784995e-06,
"loss": 3.0939,
"step": 14560
},
{
"epoch": 4.887659752776032,
"grad_norm": 6.979626178741455,
"learning_rate": 1.247858153914922e-06,
"loss": 3.1038,
"step": 14580
},
{
"epoch": 4.894364131573434,
"grad_norm": 7.19669771194458,
"learning_rate": 1.1733591596513449e-06,
"loss": 3.1171,
"step": 14600
},
{
"epoch": 4.901068510370836,
"grad_norm": 7.708127975463867,
"learning_rate": 1.0988601653877674e-06,
"loss": 3.1308,
"step": 14620
},
{
"epoch": 4.907772889168238,
"grad_norm": 7.753808498382568,
"learning_rate": 1.0243611711241898e-06,
"loss": 3.0866,
"step": 14640
},
{
"epoch": 4.9144772679656406,
"grad_norm": 6.714838981628418,
"learning_rate": 9.498621768606124e-07,
"loss": 2.9962,
"step": 14660
},
{
"epoch": 4.921181646763042,
"grad_norm": 6.7879767417907715,
"learning_rate": 8.75363182597035e-07,
"loss": 3.0925,
"step": 14680
},
{
"epoch": 4.927886025560444,
"grad_norm": 7.638024806976318,
"learning_rate": 8.008641883334574e-07,
"loss": 3.0839,
"step": 14700
},
{
"epoch": 4.934590404357846,
"grad_norm": 6.672430038452148,
"learning_rate": 7.263651940698801e-07,
"loss": 3.0642,
"step": 14720
},
{
"epoch": 4.941294783155248,
"grad_norm": 6.447202682495117,
"learning_rate": 6.518661998063027e-07,
"loss": 3.0111,
"step": 14740
},
{
"epoch": 4.947999161952651,
"grad_norm": 6.621779441833496,
"learning_rate": 5.773672055427253e-07,
"loss": 3.1408,
"step": 14760
},
{
"epoch": 4.954703540750052,
"grad_norm": 7.014694690704346,
"learning_rate": 5.028682112791477e-07,
"loss": 3.1408,
"step": 14780
},
{
"epoch": 4.9614079195474545,
"grad_norm": 7.518828868865967,
"learning_rate": 4.2836921701557035e-07,
"loss": 3.1069,
"step": 14800
},
{
"epoch": 4.968112298344856,
"grad_norm": 6.64265775680542,
"learning_rate": 3.538702227519929e-07,
"loss": 3.0875,
"step": 14820
},
{
"epoch": 4.974816677142258,
"grad_norm": 6.18177604675293,
"learning_rate": 2.793712284884154e-07,
"loss": 3.1182,
"step": 14840
},
{
"epoch": 4.981521055939661,
"grad_norm": 6.46857213973999,
"learning_rate": 2.0487223422483797e-07,
"loss": 3.1629,
"step": 14860
},
{
"epoch": 4.988225434737062,
"grad_norm": 6.637886047363281,
"learning_rate": 1.3037323996126055e-07,
"loss": 3.0804,
"step": 14880
},
{
"epoch": 4.994929813534465,
"grad_norm": 6.32726526260376,
"learning_rate": 5.587424569768308e-08,
"loss": 3.0856,
"step": 14900
}
],
"logging_steps": 20,
"max_steps": 14915,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.0778373124393533e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}