{ "best_metric": 3.0770967925659036, "best_model_checkpoint": "/workspace/llm-storage/output/qwen-3B/checkpoint-1000", "epoch": 4.999958097632517, "eval_steps": 1000, "global_step": 14915, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006704378797402053, "grad_norm": 11.061790466308594, "learning_rate": 6.702412868632708e-07, "loss": 11.096, "step": 20 }, { "epoch": 0.013408757594804106, "grad_norm": 10.477948188781738, "learning_rate": 1.3404825737265416e-06, "loss": 11.0987, "step": 40 }, { "epoch": 0.02011313639220616, "grad_norm": 12.07636547088623, "learning_rate": 2.0107238605898126e-06, "loss": 11.0549, "step": 60 }, { "epoch": 0.02681751518960821, "grad_norm": 11.286941528320312, "learning_rate": 2.680965147453083e-06, "loss": 10.3064, "step": 80 }, { "epoch": 0.03352189398701027, "grad_norm": 6.030429840087891, "learning_rate": 3.351206434316354e-06, "loss": 9.5768, "step": 100 }, { "epoch": 0.04022627278441232, "grad_norm": 8.383974075317383, "learning_rate": 4.021447721179625e-06, "loss": 8.5186, "step": 120 }, { "epoch": 0.04693065158181437, "grad_norm": 5.529025554656982, "learning_rate": 4.691689008042896e-06, "loss": 7.5261, "step": 140 }, { "epoch": 0.05363503037921642, "grad_norm": 3.8176066875457764, "learning_rate": 5.361930294906166e-06, "loss": 7.0029, "step": 160 }, { "epoch": 0.06033940917661848, "grad_norm": 3.5050182342529297, "learning_rate": 6.032171581769437e-06, "loss": 6.9204, "step": 180 }, { "epoch": 0.06704378797402054, "grad_norm": 4.123743534088135, "learning_rate": 6.702412868632708e-06, "loss": 6.6679, "step": 200 }, { "epoch": 0.07374816677142258, "grad_norm": 3.595418691635132, "learning_rate": 7.372654155495978e-06, "loss": 6.6241, "step": 220 }, { "epoch": 0.08045254556882464, "grad_norm": 4.242733001708984, "learning_rate": 8.04289544235925e-06, "loss": 6.4657, "step": 240 }, { "epoch": 0.0871569243662267, "grad_norm": 4.478307247161865, "learning_rate": 8.71313672922252e-06, "loss": 6.4337, "step": 260 }, { "epoch": 0.09386130316362874, "grad_norm": 4.557178497314453, "learning_rate": 9.383378016085791e-06, "loss": 6.3413, "step": 280 }, { "epoch": 0.1005656819610308, "grad_norm": 4.335766792297363, "learning_rate": 1.0053619302949062e-05, "loss": 6.4344, "step": 300 }, { "epoch": 0.10727006075843284, "grad_norm": 4.590956687927246, "learning_rate": 1.0723860589812333e-05, "loss": 6.3337, "step": 320 }, { "epoch": 0.1139744395558349, "grad_norm": 4.8770060539245605, "learning_rate": 1.1394101876675605e-05, "loss": 6.2143, "step": 340 }, { "epoch": 0.12067881835323696, "grad_norm": 4.7008891105651855, "learning_rate": 1.2064343163538874e-05, "loss": 6.2898, "step": 360 }, { "epoch": 0.127383197150639, "grad_norm": 4.890043258666992, "learning_rate": 1.2734584450402146e-05, "loss": 6.1775, "step": 380 }, { "epoch": 0.13408757594804108, "grad_norm": 5.183588981628418, "learning_rate": 1.3404825737265417e-05, "loss": 6.1675, "step": 400 }, { "epoch": 0.14079195474544312, "grad_norm": 5.2846527099609375, "learning_rate": 1.4075067024128689e-05, "loss": 6.2091, "step": 420 }, { "epoch": 0.14749633354284516, "grad_norm": 5.238739490509033, "learning_rate": 1.4745308310991956e-05, "loss": 6.1628, "step": 440 }, { "epoch": 0.15420071234024724, "grad_norm": 5.562626838684082, "learning_rate": 1.5415549597855227e-05, "loss": 6.0092, "step": 460 }, { "epoch": 0.16090509113764928, "grad_norm": 6.380126476287842, "learning_rate": 1.60857908847185e-05, "loss": 6.0497, "step": 480 }, { "epoch": 0.16760946993505133, "grad_norm": 5.533380031585693, "learning_rate": 1.675603217158177e-05, "loss": 5.9952, "step": 500 }, { "epoch": 0.1743138487324534, "grad_norm": 6.026157379150391, "learning_rate": 1.742627345844504e-05, "loss": 6.0327, "step": 520 }, { "epoch": 0.18101822752985544, "grad_norm": 5.79816198348999, "learning_rate": 1.8096514745308312e-05, "loss": 5.8172, "step": 540 }, { "epoch": 0.18772260632725749, "grad_norm": 6.133901119232178, "learning_rate": 1.8766756032171583e-05, "loss": 5.7605, "step": 560 }, { "epoch": 0.19442698512465956, "grad_norm": 7.115331649780273, "learning_rate": 1.9436997319034853e-05, "loss": 5.9287, "step": 580 }, { "epoch": 0.2011313639220616, "grad_norm": 6.978466033935547, "learning_rate": 2.0107238605898124e-05, "loss": 5.7846, "step": 600 }, { "epoch": 0.20783574271946365, "grad_norm": 7.083895206451416, "learning_rate": 2.0777479892761395e-05, "loss": 5.6818, "step": 620 }, { "epoch": 0.2145401215168657, "grad_norm": 7.947028636932373, "learning_rate": 2.1447721179624665e-05, "loss": 5.7934, "step": 640 }, { "epoch": 0.22124450031426776, "grad_norm": 6.990066051483154, "learning_rate": 2.211796246648794e-05, "loss": 5.6991, "step": 660 }, { "epoch": 0.2279488791116698, "grad_norm": 6.842931747436523, "learning_rate": 2.278820375335121e-05, "loss": 5.75, "step": 680 }, { "epoch": 0.23465325790907185, "grad_norm": 6.710008144378662, "learning_rate": 2.3458445040214477e-05, "loss": 5.5526, "step": 700 }, { "epoch": 0.24135763670647392, "grad_norm": 6.721392631530762, "learning_rate": 2.4128686327077747e-05, "loss": 5.6124, "step": 720 }, { "epoch": 0.24806201550387597, "grad_norm": 7.801576614379883, "learning_rate": 2.479892761394102e-05, "loss": 5.6865, "step": 740 }, { "epoch": 0.254766394301278, "grad_norm": 7.230539798736572, "learning_rate": 2.5469168900804292e-05, "loss": 5.5821, "step": 760 }, { "epoch": 0.26147077309868005, "grad_norm": 7.283995151519775, "learning_rate": 2.6139410187667563e-05, "loss": 5.6686, "step": 780 }, { "epoch": 0.26817515189608215, "grad_norm": 7.756102085113525, "learning_rate": 2.6809651474530833e-05, "loss": 5.4761, "step": 800 }, { "epoch": 0.2748795306934842, "grad_norm": 7.6954569816589355, "learning_rate": 2.7479892761394104e-05, "loss": 5.6806, "step": 820 }, { "epoch": 0.28158390949088624, "grad_norm": 8.64757251739502, "learning_rate": 2.8150134048257378e-05, "loss": 5.525, "step": 840 }, { "epoch": 0.2882882882882883, "grad_norm": 7.394837379455566, "learning_rate": 2.8820375335120648e-05, "loss": 5.4771, "step": 860 }, { "epoch": 0.29499266708569033, "grad_norm": 8.593236923217773, "learning_rate": 2.9490616621983912e-05, "loss": 5.5333, "step": 880 }, { "epoch": 0.3016970458830924, "grad_norm": 7.424787998199463, "learning_rate": 3.0160857908847186e-05, "loss": 5.2982, "step": 900 }, { "epoch": 0.3084014246804945, "grad_norm": 7.988162994384766, "learning_rate": 3.083109919571045e-05, "loss": 5.4675, "step": 920 }, { "epoch": 0.3151058034778965, "grad_norm": 7.624905586242676, "learning_rate": 3.1501340482573724e-05, "loss": 5.4906, "step": 940 }, { "epoch": 0.32181018227529856, "grad_norm": 8.334522247314453, "learning_rate": 3.2171581769437e-05, "loss": 5.4069, "step": 960 }, { "epoch": 0.3285145610727006, "grad_norm": 8.150382041931152, "learning_rate": 3.284182305630027e-05, "loss": 5.4312, "step": 980 }, { "epoch": 0.33521893987010265, "grad_norm": 8.183965682983398, "learning_rate": 3.351206434316354e-05, "loss": 5.4012, "step": 1000 }, { "epoch": 0.33521893987010265, "eval_bleu_greedy": 3.0770967925659036, "eval_loss": 0.693783700466156, "eval_runtime": 309.6004, "eval_samples_per_second": 0.323, "eval_steps_per_second": 0.323, "step": 1000 }, { "epoch": 0.3419233186675047, "grad_norm": 8.621966361999512, "learning_rate": 3.418230563002681e-05, "loss": 5.3954, "step": 1020 }, { "epoch": 0.3486276974649068, "grad_norm": 8.160908699035645, "learning_rate": 3.485254691689008e-05, "loss": 5.4285, "step": 1040 }, { "epoch": 0.35533207626230884, "grad_norm": 7.480456352233887, "learning_rate": 3.5522788203753354e-05, "loss": 5.3167, "step": 1060 }, { "epoch": 0.3620364550597109, "grad_norm": 7.508689880371094, "learning_rate": 3.6193029490616625e-05, "loss": 5.3439, "step": 1080 }, { "epoch": 0.3687408338571129, "grad_norm": 9.118748664855957, "learning_rate": 3.6863270777479895e-05, "loss": 5.3604, "step": 1100 }, { "epoch": 0.37544521265451497, "grad_norm": 7.6000471115112305, "learning_rate": 3.7533512064343166e-05, "loss": 5.2976, "step": 1120 }, { "epoch": 0.382149591451917, "grad_norm": 7.776809215545654, "learning_rate": 3.8203753351206436e-05, "loss": 5.3177, "step": 1140 }, { "epoch": 0.3888539702493191, "grad_norm": 8.50612735748291, "learning_rate": 3.887399463806971e-05, "loss": 5.2408, "step": 1160 }, { "epoch": 0.39555834904672116, "grad_norm": 7.958391189575195, "learning_rate": 3.954423592493298e-05, "loss": 5.2248, "step": 1180 }, { "epoch": 0.4022627278441232, "grad_norm": 7.7386579513549805, "learning_rate": 4.021447721179625e-05, "loss": 5.2663, "step": 1200 }, { "epoch": 0.40896710664152525, "grad_norm": 8.172608375549316, "learning_rate": 4.088471849865952e-05, "loss": 5.2612, "step": 1220 }, { "epoch": 0.4156714854389273, "grad_norm": 7.354376792907715, "learning_rate": 4.155495978552279e-05, "loss": 5.2672, "step": 1240 }, { "epoch": 0.42237586423632933, "grad_norm": 7.837838649749756, "learning_rate": 4.222520107238606e-05, "loss": 5.1707, "step": 1260 }, { "epoch": 0.4290802430337314, "grad_norm": 8.173263549804688, "learning_rate": 4.289544235924933e-05, "loss": 5.2387, "step": 1280 }, { "epoch": 0.4357846218311335, "grad_norm": 7.122191905975342, "learning_rate": 4.35656836461126e-05, "loss": 5.2604, "step": 1300 }, { "epoch": 0.4424890006285355, "grad_norm": 7.6141839027404785, "learning_rate": 4.423592493297588e-05, "loss": 5.1689, "step": 1320 }, { "epoch": 0.44919337942593757, "grad_norm": 7.241093635559082, "learning_rate": 4.490616621983915e-05, "loss": 5.1451, "step": 1340 }, { "epoch": 0.4558977582233396, "grad_norm": 6.968513011932373, "learning_rate": 4.557640750670242e-05, "loss": 5.0216, "step": 1360 }, { "epoch": 0.46260213702074165, "grad_norm": 7.734285354614258, "learning_rate": 4.624664879356568e-05, "loss": 5.0932, "step": 1380 }, { "epoch": 0.4693065158181437, "grad_norm": 8.277946472167969, "learning_rate": 4.6916890080428954e-05, "loss": 5.1977, "step": 1400 }, { "epoch": 0.4760108946155458, "grad_norm": 7.437044620513916, "learning_rate": 4.7587131367292224e-05, "loss": 5.1254, "step": 1420 }, { "epoch": 0.48271527341294784, "grad_norm": 7.227113723754883, "learning_rate": 4.8257372654155495e-05, "loss": 5.0943, "step": 1440 }, { "epoch": 0.4894196522103499, "grad_norm": 7.8769683837890625, "learning_rate": 4.8927613941018765e-05, "loss": 5.0513, "step": 1460 }, { "epoch": 0.49612403100775193, "grad_norm": 6.573411464691162, "learning_rate": 4.959785522788204e-05, "loss": 5.045, "step": 1480 }, { "epoch": 0.502828409805154, "grad_norm": 6.970616817474365, "learning_rate": 4.997020040229457e-05, "loss": 5.0549, "step": 1500 }, { "epoch": 0.509532788602556, "grad_norm": 7.552504539489746, "learning_rate": 4.989570140803099e-05, "loss": 5.1787, "step": 1520 }, { "epoch": 0.5162371673999581, "grad_norm": 6.920768737792969, "learning_rate": 4.982120241376742e-05, "loss": 5.0916, "step": 1540 }, { "epoch": 0.5229415461973601, "grad_norm": 6.61656379699707, "learning_rate": 4.974670341950384e-05, "loss": 5.0528, "step": 1560 }, { "epoch": 0.5296459249947622, "grad_norm": 7.405433654785156, "learning_rate": 4.967220442524026e-05, "loss": 5.0544, "step": 1580 }, { "epoch": 0.5363503037921643, "grad_norm": 7.384308815002441, "learning_rate": 4.959770543097668e-05, "loss": 5.0058, "step": 1600 }, { "epoch": 0.5430546825895664, "grad_norm": 6.147129058837891, "learning_rate": 4.95232064367131e-05, "loss": 5.1453, "step": 1620 }, { "epoch": 0.5497590613869684, "grad_norm": 6.856501579284668, "learning_rate": 4.944870744244953e-05, "loss": 4.997, "step": 1640 }, { "epoch": 0.5564634401843704, "grad_norm": 7.677363395690918, "learning_rate": 4.937420844818596e-05, "loss": 5.0445, "step": 1660 }, { "epoch": 0.5631678189817725, "grad_norm": 6.1870269775390625, "learning_rate": 4.929970945392238e-05, "loss": 4.8786, "step": 1680 }, { "epoch": 0.5698721977791745, "grad_norm": 6.6285529136657715, "learning_rate": 4.92252104596588e-05, "loss": 4.9182, "step": 1700 }, { "epoch": 0.5765765765765766, "grad_norm": 6.762671947479248, "learning_rate": 4.915071146539522e-05, "loss": 4.836, "step": 1720 }, { "epoch": 0.5832809553739786, "grad_norm": 6.840793132781982, "learning_rate": 4.907621247113165e-05, "loss": 4.9259, "step": 1740 }, { "epoch": 0.5899853341713807, "grad_norm": 7.508916854858398, "learning_rate": 4.900171347686807e-05, "loss": 4.9955, "step": 1760 }, { "epoch": 0.5966897129687827, "grad_norm": 7.2800374031066895, "learning_rate": 4.892721448260449e-05, "loss": 4.9515, "step": 1780 }, { "epoch": 0.6033940917661847, "grad_norm": 6.398910999298096, "learning_rate": 4.885271548834091e-05, "loss": 4.9002, "step": 1800 }, { "epoch": 0.6100984705635868, "grad_norm": 7.054329872131348, "learning_rate": 4.877821649407733e-05, "loss": 4.8998, "step": 1820 }, { "epoch": 0.616802849360989, "grad_norm": 6.903358459472656, "learning_rate": 4.870371749981376e-05, "loss": 4.8227, "step": 1840 }, { "epoch": 0.623507228158391, "grad_norm": 6.382834434509277, "learning_rate": 4.862921850555018e-05, "loss": 4.8448, "step": 1860 }, { "epoch": 0.630211606955793, "grad_norm": 6.244606018066406, "learning_rate": 4.85547195112866e-05, "loss": 4.9048, "step": 1880 }, { "epoch": 0.6369159857531951, "grad_norm": 6.7048115730285645, "learning_rate": 4.848022051702302e-05, "loss": 4.8862, "step": 1900 }, { "epoch": 0.6436203645505971, "grad_norm": 6.761898994445801, "learning_rate": 4.840572152275945e-05, "loss": 4.836, "step": 1920 }, { "epoch": 0.6503247433479992, "grad_norm": 6.694396495819092, "learning_rate": 4.833122252849587e-05, "loss": 4.922, "step": 1940 }, { "epoch": 0.6570291221454012, "grad_norm": 7.083889961242676, "learning_rate": 4.825672353423229e-05, "loss": 4.801, "step": 1960 }, { "epoch": 0.6637335009428033, "grad_norm": 6.358527183532715, "learning_rate": 4.818222453996871e-05, "loss": 4.9146, "step": 1980 }, { "epoch": 0.6704378797402053, "grad_norm": 6.129880428314209, "learning_rate": 4.810772554570513e-05, "loss": 4.7806, "step": 2000 }, { "epoch": 0.6704378797402053, "eval_bleu_greedy": 1.635969783116869, "eval_loss": 0.6162992715835571, "eval_runtime": 98.9607, "eval_samples_per_second": 1.011, "eval_steps_per_second": 1.011, "step": 2000 }, { "epoch": 0.6771422585376073, "grad_norm": 6.157505989074707, "learning_rate": 4.803322655144156e-05, "loss": 4.6963, "step": 2020 }, { "epoch": 0.6838466373350094, "grad_norm": 6.576890468597412, "learning_rate": 4.795872755717798e-05, "loss": 4.7945, "step": 2040 }, { "epoch": 0.6905510161324114, "grad_norm": 6.158898830413818, "learning_rate": 4.78842285629144e-05, "loss": 4.7629, "step": 2060 }, { "epoch": 0.6972553949298136, "grad_norm": 7.410290241241455, "learning_rate": 4.780972956865082e-05, "loss": 4.6563, "step": 2080 }, { "epoch": 0.7039597737272156, "grad_norm": 6.731761455535889, "learning_rate": 4.773523057438724e-05, "loss": 4.6947, "step": 2100 }, { "epoch": 0.7106641525246177, "grad_norm": 6.34529447555542, "learning_rate": 4.766073158012367e-05, "loss": 4.7754, "step": 2120 }, { "epoch": 0.7173685313220197, "grad_norm": 6.288251876831055, "learning_rate": 4.758623258586009e-05, "loss": 4.7656, "step": 2140 }, { "epoch": 0.7240729101194218, "grad_norm": 6.207250595092773, "learning_rate": 4.751173359159651e-05, "loss": 4.6873, "step": 2160 }, { "epoch": 0.7307772889168238, "grad_norm": 6.283239841461182, "learning_rate": 4.7437234597332934e-05, "loss": 4.6951, "step": 2180 }, { "epoch": 0.7374816677142259, "grad_norm": 6.399360179901123, "learning_rate": 4.736273560306936e-05, "loss": 4.8608, "step": 2200 }, { "epoch": 0.7441860465116279, "grad_norm": 7.108467102050781, "learning_rate": 4.728823660880579e-05, "loss": 4.7696, "step": 2220 }, { "epoch": 0.7508904253090299, "grad_norm": 6.39521598815918, "learning_rate": 4.721373761454221e-05, "loss": 4.6725, "step": 2240 }, { "epoch": 0.757594804106432, "grad_norm": 7.189324855804443, "learning_rate": 4.713923862027863e-05, "loss": 4.7705, "step": 2260 }, { "epoch": 0.764299182903834, "grad_norm": 6.980968952178955, "learning_rate": 4.706473962601505e-05, "loss": 4.6747, "step": 2280 }, { "epoch": 0.7710035617012361, "grad_norm": 5.775346755981445, "learning_rate": 4.699024063175147e-05, "loss": 4.5941, "step": 2300 }, { "epoch": 0.7777079404986382, "grad_norm": 6.348450183868408, "learning_rate": 4.69157416374879e-05, "loss": 4.6419, "step": 2320 }, { "epoch": 0.7844123192960403, "grad_norm": 6.2754340171813965, "learning_rate": 4.684124264322432e-05, "loss": 4.6844, "step": 2340 }, { "epoch": 0.7911166980934423, "grad_norm": 6.037561893463135, "learning_rate": 4.676674364896074e-05, "loss": 4.6424, "step": 2360 }, { "epoch": 0.7978210768908444, "grad_norm": 6.3136372566223145, "learning_rate": 4.669224465469716e-05, "loss": 4.6775, "step": 2380 }, { "epoch": 0.8045254556882464, "grad_norm": 6.91141939163208, "learning_rate": 4.661774566043359e-05, "loss": 4.6124, "step": 2400 }, { "epoch": 0.8112298344856484, "grad_norm": 5.787718772888184, "learning_rate": 4.654324666617001e-05, "loss": 4.6595, "step": 2420 }, { "epoch": 0.8179342132830505, "grad_norm": 6.696752548217773, "learning_rate": 4.646874767190643e-05, "loss": 4.563, "step": 2440 }, { "epoch": 0.8246385920804525, "grad_norm": 6.550769805908203, "learning_rate": 4.639424867764285e-05, "loss": 4.7101, "step": 2460 }, { "epoch": 0.8313429708778546, "grad_norm": 6.5647969245910645, "learning_rate": 4.6319749683379274e-05, "loss": 4.5763, "step": 2480 }, { "epoch": 0.8380473496752566, "grad_norm": 6.71086311340332, "learning_rate": 4.62452506891157e-05, "loss": 4.4918, "step": 2500 }, { "epoch": 0.8447517284726587, "grad_norm": 6.1139445304870605, "learning_rate": 4.617075169485212e-05, "loss": 4.5095, "step": 2520 }, { "epoch": 0.8514561072700607, "grad_norm": 5.594122886657715, "learning_rate": 4.609625270058854e-05, "loss": 4.6359, "step": 2540 }, { "epoch": 0.8581604860674628, "grad_norm": 6.769913196563721, "learning_rate": 4.6021753706324964e-05, "loss": 4.4938, "step": 2560 }, { "epoch": 0.8648648648648649, "grad_norm": 6.663547039031982, "learning_rate": 4.5947254712061385e-05, "loss": 4.5619, "step": 2580 }, { "epoch": 0.871569243662267, "grad_norm": 6.266171455383301, "learning_rate": 4.587275571779781e-05, "loss": 4.4406, "step": 2600 }, { "epoch": 0.878273622459669, "grad_norm": 6.513619422912598, "learning_rate": 4.5798256723534234e-05, "loss": 4.498, "step": 2620 }, { "epoch": 0.884978001257071, "grad_norm": 6.088827133178711, "learning_rate": 4.5723757729270654e-05, "loss": 4.5242, "step": 2640 }, { "epoch": 0.8916823800544731, "grad_norm": 6.1904296875, "learning_rate": 4.5649258735007075e-05, "loss": 4.5976, "step": 2660 }, { "epoch": 0.8983867588518751, "grad_norm": 6.429610252380371, "learning_rate": 4.55747597407435e-05, "loss": 4.4584, "step": 2680 }, { "epoch": 0.9050911376492772, "grad_norm": 5.907393932342529, "learning_rate": 4.5500260746479924e-05, "loss": 4.5814, "step": 2700 }, { "epoch": 0.9117955164466792, "grad_norm": 6.102148056030273, "learning_rate": 4.5425761752216345e-05, "loss": 4.5913, "step": 2720 }, { "epoch": 0.9184998952440813, "grad_norm": 6.3552327156066895, "learning_rate": 4.5351262757952766e-05, "loss": 4.5327, "step": 2740 }, { "epoch": 0.9252042740414833, "grad_norm": 5.990479946136475, "learning_rate": 4.527676376368919e-05, "loss": 4.4025, "step": 2760 }, { "epoch": 0.9319086528388854, "grad_norm": 5.946578502655029, "learning_rate": 4.5202264769425614e-05, "loss": 4.5777, "step": 2780 }, { "epoch": 0.9386130316362874, "grad_norm": 6.422057151794434, "learning_rate": 4.512776577516204e-05, "loss": 4.4951, "step": 2800 }, { "epoch": 0.9453174104336896, "grad_norm": 6.144739151000977, "learning_rate": 4.505326678089846e-05, "loss": 4.5586, "step": 2820 }, { "epoch": 0.9520217892310916, "grad_norm": 6.011499404907227, "learning_rate": 4.4978767786634884e-05, "loss": 4.4695, "step": 2840 }, { "epoch": 0.9587261680284936, "grad_norm": 5.852478504180908, "learning_rate": 4.4904268792371304e-05, "loss": 4.4765, "step": 2860 }, { "epoch": 0.9654305468258957, "grad_norm": 5.951258182525635, "learning_rate": 4.482976979810773e-05, "loss": 4.4017, "step": 2880 }, { "epoch": 0.9721349256232977, "grad_norm": 6.046126842498779, "learning_rate": 4.475527080384415e-05, "loss": 4.4065, "step": 2900 }, { "epoch": 0.9788393044206998, "grad_norm": 7.13279390335083, "learning_rate": 4.4680771809580574e-05, "loss": 4.4527, "step": 2920 }, { "epoch": 0.9855436832181018, "grad_norm": 6.364200115203857, "learning_rate": 4.4606272815316995e-05, "loss": 4.4392, "step": 2940 }, { "epoch": 0.9922480620155039, "grad_norm": 6.487414360046387, "learning_rate": 4.4531773821053416e-05, "loss": 4.3799, "step": 2960 }, { "epoch": 0.9989524408129059, "grad_norm": 6.20075798034668, "learning_rate": 4.445727482678984e-05, "loss": 4.3814, "step": 2980 }, { "epoch": 1.0056987219777918, "grad_norm": 6.238715171813965, "learning_rate": 4.4382775832526264e-05, "loss": 4.3533, "step": 3000 }, { "epoch": 1.0056987219777918, "eval_bleu_greedy": 2.12858213528201, "eval_loss": 0.568386971950531, "eval_runtime": 434.1939, "eval_samples_per_second": 0.23, "eval_steps_per_second": 0.23, "step": 3000 }, { "epoch": 1.012403100775194, "grad_norm": 6.458903789520264, "learning_rate": 4.4308276838262685e-05, "loss": 4.218, "step": 3020 }, { "epoch": 1.0191074795725958, "grad_norm": 6.330122947692871, "learning_rate": 4.4233777843999106e-05, "loss": 4.3083, "step": 3040 }, { "epoch": 1.025811858369998, "grad_norm": 6.430805683135986, "learning_rate": 4.4159278849735534e-05, "loss": 4.2772, "step": 3060 }, { "epoch": 1.0325162371674, "grad_norm": 6.592049598693848, "learning_rate": 4.4084779855471954e-05, "loss": 4.3208, "step": 3080 }, { "epoch": 1.039220615964802, "grad_norm": 6.5312299728393555, "learning_rate": 4.4010280861208375e-05, "loss": 4.1853, "step": 3100 }, { "epoch": 1.045924994762204, "grad_norm": 5.972381591796875, "learning_rate": 4.3935781866944796e-05, "loss": 4.2346, "step": 3120 }, { "epoch": 1.0526293735596062, "grad_norm": 5.913834095001221, "learning_rate": 4.386128287268122e-05, "loss": 4.2687, "step": 3140 }, { "epoch": 1.059333752357008, "grad_norm": 6.429443836212158, "learning_rate": 4.3786783878417645e-05, "loss": 4.33, "step": 3160 }, { "epoch": 1.0660381311544103, "grad_norm": 6.044195175170898, "learning_rate": 4.3712284884154066e-05, "loss": 4.2251, "step": 3180 }, { "epoch": 1.0727425099518122, "grad_norm": 6.010583877563477, "learning_rate": 4.3637785889890487e-05, "loss": 4.3161, "step": 3200 }, { "epoch": 1.0794468887492144, "grad_norm": 5.625052452087402, "learning_rate": 4.356328689562691e-05, "loss": 4.2069, "step": 3220 }, { "epoch": 1.0861512675466163, "grad_norm": 6.308145999908447, "learning_rate": 4.348878790136333e-05, "loss": 4.2812, "step": 3240 }, { "epoch": 1.0928556463440184, "grad_norm": 6.858571529388428, "learning_rate": 4.3414288907099756e-05, "loss": 4.2798, "step": 3260 }, { "epoch": 1.0995600251414206, "grad_norm": 6.711289882659912, "learning_rate": 4.333978991283618e-05, "loss": 4.2203, "step": 3280 }, { "epoch": 1.1062644039388225, "grad_norm": 6.270653247833252, "learning_rate": 4.3265290918572605e-05, "loss": 4.259, "step": 3300 }, { "epoch": 1.1129687827362247, "grad_norm": 5.929893493652344, "learning_rate": 4.3190791924309025e-05, "loss": 4.3751, "step": 3320 }, { "epoch": 1.1196731615336266, "grad_norm": 5.837188720703125, "learning_rate": 4.3116292930045446e-05, "loss": 4.2712, "step": 3340 }, { "epoch": 1.1263775403310288, "grad_norm": 6.670574188232422, "learning_rate": 4.3041793935781874e-05, "loss": 4.2151, "step": 3360 }, { "epoch": 1.1330819191284307, "grad_norm": 6.452718734741211, "learning_rate": 4.2967294941518295e-05, "loss": 4.321, "step": 3380 }, { "epoch": 1.1397862979258329, "grad_norm": 6.4428391456604, "learning_rate": 4.2892795947254716e-05, "loss": 4.246, "step": 3400 }, { "epoch": 1.1464906767232348, "grad_norm": 6.246615409851074, "learning_rate": 4.2818296952991137e-05, "loss": 4.2763, "step": 3420 }, { "epoch": 1.153195055520637, "grad_norm": 6.092718124389648, "learning_rate": 4.274379795872756e-05, "loss": 4.1313, "step": 3440 }, { "epoch": 1.1598994343180389, "grad_norm": 5.333466529846191, "learning_rate": 4.2669298964463985e-05, "loss": 4.2007, "step": 3460 }, { "epoch": 1.166603813115441, "grad_norm": 7.08294677734375, "learning_rate": 4.2594799970200406e-05, "loss": 4.1708, "step": 3480 }, { "epoch": 1.173308191912843, "grad_norm": 6.408305644989014, "learning_rate": 4.252030097593683e-05, "loss": 4.2438, "step": 3500 }, { "epoch": 1.1800125707102451, "grad_norm": 5.942695140838623, "learning_rate": 4.244580198167325e-05, "loss": 4.1205, "step": 3520 }, { "epoch": 1.1867169495076473, "grad_norm": 6.69981050491333, "learning_rate": 4.2371302987409675e-05, "loss": 4.1979, "step": 3540 }, { "epoch": 1.1934213283050492, "grad_norm": 7.073486804962158, "learning_rate": 4.2296803993146096e-05, "loss": 4.286, "step": 3560 }, { "epoch": 1.2001257071024514, "grad_norm": 6.129693031311035, "learning_rate": 4.222230499888252e-05, "loss": 4.1252, "step": 3580 }, { "epoch": 1.2068300858998533, "grad_norm": 6.761497497558594, "learning_rate": 4.214780600461894e-05, "loss": 4.16, "step": 3600 }, { "epoch": 1.2135344646972555, "grad_norm": 6.681031703948975, "learning_rate": 4.207330701035536e-05, "loss": 4.239, "step": 3620 }, { "epoch": 1.2202388434946574, "grad_norm": 6.160006523132324, "learning_rate": 4.1998808016091787e-05, "loss": 4.1435, "step": 3640 }, { "epoch": 1.2269432222920595, "grad_norm": 6.183200359344482, "learning_rate": 4.192430902182821e-05, "loss": 4.1166, "step": 3660 }, { "epoch": 1.2336476010894615, "grad_norm": 5.975028991699219, "learning_rate": 4.184981002756463e-05, "loss": 4.0858, "step": 3680 }, { "epoch": 1.2403519798868636, "grad_norm": 6.827803134918213, "learning_rate": 4.177531103330105e-05, "loss": 4.1952, "step": 3700 }, { "epoch": 1.2470563586842656, "grad_norm": 6.478833198547363, "learning_rate": 4.170081203903747e-05, "loss": 4.059, "step": 3720 }, { "epoch": 1.2537607374816677, "grad_norm": 6.197700500488281, "learning_rate": 4.16263130447739e-05, "loss": 4.2003, "step": 3740 }, { "epoch": 1.2604651162790699, "grad_norm": 5.54361629486084, "learning_rate": 4.155181405051032e-05, "loss": 4.1438, "step": 3760 }, { "epoch": 1.2671694950764718, "grad_norm": 5.62382698059082, "learning_rate": 4.147731505624674e-05, "loss": 4.154, "step": 3780 }, { "epoch": 1.2738738738738737, "grad_norm": 5.418813228607178, "learning_rate": 4.140281606198316e-05, "loss": 4.1561, "step": 3800 }, { "epoch": 1.280578252671276, "grad_norm": 5.975061893463135, "learning_rate": 4.132831706771959e-05, "loss": 4.178, "step": 3820 }, { "epoch": 1.287282631468678, "grad_norm": 6.231929302215576, "learning_rate": 4.125381807345601e-05, "loss": 4.1253, "step": 3840 }, { "epoch": 1.29398701026608, "grad_norm": 6.092617034912109, "learning_rate": 4.1179319079192437e-05, "loss": 4.1438, "step": 3860 }, { "epoch": 1.3006913890634821, "grad_norm": 6.094106674194336, "learning_rate": 4.110482008492886e-05, "loss": 4.1144, "step": 3880 }, { "epoch": 1.307395767860884, "grad_norm": 6.208296298980713, "learning_rate": 4.103032109066528e-05, "loss": 4.1352, "step": 3900 }, { "epoch": 1.3141001466582862, "grad_norm": 5.595242500305176, "learning_rate": 4.09558220964017e-05, "loss": 4.1146, "step": 3920 }, { "epoch": 1.3208045254556882, "grad_norm": 6.050002098083496, "learning_rate": 4.088132310213813e-05, "loss": 4.0787, "step": 3940 }, { "epoch": 1.3275089042530903, "grad_norm": 5.937078952789307, "learning_rate": 4.080682410787455e-05, "loss": 4.2249, "step": 3960 }, { "epoch": 1.3342132830504925, "grad_norm": 6.359293460845947, "learning_rate": 4.073232511361097e-05, "loss": 4.0592, "step": 3980 }, { "epoch": 1.3409176618478944, "grad_norm": 5.486398696899414, "learning_rate": 4.065782611934739e-05, "loss": 4.183, "step": 4000 }, { "epoch": 1.3409176618478944, "eval_bleu_greedy": 1.8924225876289293, "eval_loss": 0.5379119515419006, "eval_runtime": 41.2638, "eval_samples_per_second": 2.423, "eval_steps_per_second": 2.423, "step": 4000 }, { "epoch": 1.3476220406452963, "grad_norm": 6.733095169067383, "learning_rate": 4.058332712508382e-05, "loss": 4.1826, "step": 4020 }, { "epoch": 1.3543264194426985, "grad_norm": 6.984947681427002, "learning_rate": 4.050882813082024e-05, "loss": 4.0244, "step": 4040 }, { "epoch": 1.3610307982401006, "grad_norm": 6.321763515472412, "learning_rate": 4.043432913655666e-05, "loss": 4.1419, "step": 4060 }, { "epoch": 1.3677351770375026, "grad_norm": 6.014941692352295, "learning_rate": 4.035983014229308e-05, "loss": 4.0579, "step": 4080 }, { "epoch": 1.3744395558349047, "grad_norm": 6.782519340515137, "learning_rate": 4.02853311480295e-05, "loss": 4.037, "step": 4100 }, { "epoch": 1.3811439346323067, "grad_norm": 5.457937717437744, "learning_rate": 4.021083215376593e-05, "loss": 4.0483, "step": 4120 }, { "epoch": 1.3878483134297088, "grad_norm": 6.121335983276367, "learning_rate": 4.013633315950235e-05, "loss": 4.0982, "step": 4140 }, { "epoch": 1.3945526922271108, "grad_norm": 6.334305763244629, "learning_rate": 4.006183416523877e-05, "loss": 4.106, "step": 4160 }, { "epoch": 1.401257071024513, "grad_norm": 6.201812744140625, "learning_rate": 3.998733517097519e-05, "loss": 4.0633, "step": 4180 }, { "epoch": 1.407961449821915, "grad_norm": 6.2243828773498535, "learning_rate": 3.991283617671162e-05, "loss": 4.0215, "step": 4200 }, { "epoch": 1.414665828619317, "grad_norm": 6.266222953796387, "learning_rate": 3.983833718244804e-05, "loss": 4.1322, "step": 4220 }, { "epoch": 1.421370207416719, "grad_norm": 5.890945911407471, "learning_rate": 3.976383818818446e-05, "loss": 3.8993, "step": 4240 }, { "epoch": 1.428074586214121, "grad_norm": 5.7960991859436035, "learning_rate": 3.968933919392088e-05, "loss": 4.1042, "step": 4260 }, { "epoch": 1.4347789650115232, "grad_norm": 6.632575988769531, "learning_rate": 3.96148401996573e-05, "loss": 4.1236, "step": 4280 }, { "epoch": 1.4414833438089252, "grad_norm": 6.313004493713379, "learning_rate": 3.954034120539373e-05, "loss": 4.0445, "step": 4300 }, { "epoch": 1.4481877226063273, "grad_norm": 6.819790840148926, "learning_rate": 3.946584221113015e-05, "loss": 4.0814, "step": 4320 }, { "epoch": 1.4548921014037293, "grad_norm": 5.719134330749512, "learning_rate": 3.939134321686657e-05, "loss": 3.9592, "step": 4340 }, { "epoch": 1.4615964802011314, "grad_norm": 6.105227470397949, "learning_rate": 3.931684422260299e-05, "loss": 4.0809, "step": 4360 }, { "epoch": 1.4683008589985334, "grad_norm": 6.068193435668945, "learning_rate": 3.924234522833941e-05, "loss": 4.1412, "step": 4380 }, { "epoch": 1.4750052377959355, "grad_norm": 5.725421905517578, "learning_rate": 3.916784623407585e-05, "loss": 4.0711, "step": 4400 }, { "epoch": 1.4817096165933374, "grad_norm": 6.790429592132568, "learning_rate": 3.909334723981227e-05, "loss": 3.9287, "step": 4420 }, { "epoch": 1.4884139953907396, "grad_norm": 6.1765971183776855, "learning_rate": 3.901884824554869e-05, "loss": 4.0219, "step": 4440 }, { "epoch": 1.4951183741881415, "grad_norm": 5.821228981018066, "learning_rate": 3.894434925128511e-05, "loss": 3.9816, "step": 4460 }, { "epoch": 1.5018227529855437, "grad_norm": 6.144356727600098, "learning_rate": 3.886985025702153e-05, "loss": 3.9802, "step": 4480 }, { "epoch": 1.5085271317829458, "grad_norm": 5.687018871307373, "learning_rate": 3.879535126275796e-05, "loss": 3.9389, "step": 4500 }, { "epoch": 1.5152315105803478, "grad_norm": 5.2722673416137695, "learning_rate": 3.872085226849438e-05, "loss": 4.115, "step": 4520 }, { "epoch": 1.5219358893777497, "grad_norm": 5.904730796813965, "learning_rate": 3.86463532742308e-05, "loss": 4.0509, "step": 4540 }, { "epoch": 1.5286402681751519, "grad_norm": 6.75799560546875, "learning_rate": 3.857185427996722e-05, "loss": 4.074, "step": 4560 }, { "epoch": 1.535344646972554, "grad_norm": 6.9323039054870605, "learning_rate": 3.849735528570364e-05, "loss": 3.9566, "step": 4580 }, { "epoch": 1.542049025769956, "grad_norm": 6.0153398513793945, "learning_rate": 3.842285629144007e-05, "loss": 4.0422, "step": 4600 }, { "epoch": 1.548753404567358, "grad_norm": 6.283784866333008, "learning_rate": 3.834835729717649e-05, "loss": 3.9645, "step": 4620 }, { "epoch": 1.5554577833647603, "grad_norm": 6.145251274108887, "learning_rate": 3.827385830291291e-05, "loss": 4.0442, "step": 4640 }, { "epoch": 1.5621621621621622, "grad_norm": 5.55891227722168, "learning_rate": 3.819935930864933e-05, "loss": 4.0023, "step": 4660 }, { "epoch": 1.5688665409595641, "grad_norm": 6.398717880249023, "learning_rate": 3.812486031438576e-05, "loss": 3.9904, "step": 4680 }, { "epoch": 1.5755709197569663, "grad_norm": 7.079937934875488, "learning_rate": 3.805036132012218e-05, "loss": 3.851, "step": 4700 }, { "epoch": 1.5822752985543684, "grad_norm": 5.542669773101807, "learning_rate": 3.79758623258586e-05, "loss": 3.9195, "step": 4720 }, { "epoch": 1.5889796773517704, "grad_norm": 6.840109348297119, "learning_rate": 3.790136333159502e-05, "loss": 4.0468, "step": 4740 }, { "epoch": 1.5956840561491723, "grad_norm": 5.582828044891357, "learning_rate": 3.7826864337331444e-05, "loss": 3.897, "step": 4760 }, { "epoch": 1.6023884349465745, "grad_norm": 5.687999725341797, "learning_rate": 3.775236534306787e-05, "loss": 4.0053, "step": 4780 }, { "epoch": 1.6090928137439766, "grad_norm": 5.822837829589844, "learning_rate": 3.767786634880429e-05, "loss": 3.9547, "step": 4800 }, { "epoch": 1.6157971925413785, "grad_norm": 6.070379257202148, "learning_rate": 3.7603367354540713e-05, "loss": 4.0644, "step": 4820 }, { "epoch": 1.6225015713387805, "grad_norm": 5.518836975097656, "learning_rate": 3.7528868360277134e-05, "loss": 3.8831, "step": 4840 }, { "epoch": 1.6292059501361829, "grad_norm": 5.762600898742676, "learning_rate": 3.745436936601356e-05, "loss": 3.9588, "step": 4860 }, { "epoch": 1.6359103289335848, "grad_norm": 6.048323154449463, "learning_rate": 3.737987037174998e-05, "loss": 3.9825, "step": 4880 }, { "epoch": 1.6426147077309867, "grad_norm": 5.4770097732543945, "learning_rate": 3.7305371377486404e-05, "loss": 3.8577, "step": 4900 }, { "epoch": 1.6493190865283889, "grad_norm": 6.0199294090271, "learning_rate": 3.7230872383222825e-05, "loss": 3.8402, "step": 4920 }, { "epoch": 1.656023465325791, "grad_norm": 5.508368492126465, "learning_rate": 3.715637338895925e-05, "loss": 3.8506, "step": 4940 }, { "epoch": 1.662727844123193, "grad_norm": 5.987868309020996, "learning_rate": 3.708187439469567e-05, "loss": 3.928, "step": 4960 }, { "epoch": 1.669432222920595, "grad_norm": 6.022989749908447, "learning_rate": 3.70073754004321e-05, "loss": 3.9483, "step": 4980 }, { "epoch": 1.676136601717997, "grad_norm": 5.780736446380615, "learning_rate": 3.693287640616852e-05, "loss": 3.9055, "step": 5000 }, { "epoch": 1.676136601717997, "eval_bleu_greedy": 2.074504503599086, "eval_loss": 0.5163004398345947, "eval_runtime": 114.7725, "eval_samples_per_second": 0.871, "eval_steps_per_second": 0.871, "step": 5000 }, { "epoch": 1.6828409805153992, "grad_norm": 7.780221462249756, "learning_rate": 3.685837741190494e-05, "loss": 4.0413, "step": 5020 }, { "epoch": 1.6895453593128011, "grad_norm": 5.654071807861328, "learning_rate": 3.6783878417641363e-05, "loss": 3.9454, "step": 5040 }, { "epoch": 1.696249738110203, "grad_norm": 5.763638019561768, "learning_rate": 3.6709379423377784e-05, "loss": 3.9884, "step": 5060 }, { "epoch": 1.7029541169076052, "grad_norm": 5.78656005859375, "learning_rate": 3.663488042911421e-05, "loss": 3.9436, "step": 5080 }, { "epoch": 1.7096584957050074, "grad_norm": 6.413984775543213, "learning_rate": 3.656038143485063e-05, "loss": 3.9803, "step": 5100 }, { "epoch": 1.7163628745024093, "grad_norm": 5.727552890777588, "learning_rate": 3.6485882440587054e-05, "loss": 4.0424, "step": 5120 }, { "epoch": 1.7230672532998115, "grad_norm": 5.366096496582031, "learning_rate": 3.6411383446323475e-05, "loss": 3.9212, "step": 5140 }, { "epoch": 1.7297716320972136, "grad_norm": 5.877246856689453, "learning_rate": 3.63368844520599e-05, "loss": 3.9313, "step": 5160 }, { "epoch": 1.7364760108946156, "grad_norm": 4.906258583068848, "learning_rate": 3.626238545779632e-05, "loss": 3.9713, "step": 5180 }, { "epoch": 1.7431803896920175, "grad_norm": 5.745492458343506, "learning_rate": 3.6187886463532744e-05, "loss": 3.9141, "step": 5200 }, { "epoch": 1.7498847684894197, "grad_norm": 5.654531002044678, "learning_rate": 3.6113387469269165e-05, "loss": 3.9644, "step": 5220 }, { "epoch": 1.7565891472868218, "grad_norm": 5.877029895782471, "learning_rate": 3.6038888475005586e-05, "loss": 3.9801, "step": 5240 }, { "epoch": 1.7632935260842237, "grad_norm": 6.766676425933838, "learning_rate": 3.5964389480742013e-05, "loss": 3.9611, "step": 5260 }, { "epoch": 1.7699979048816257, "grad_norm": 6.007946968078613, "learning_rate": 3.5889890486478434e-05, "loss": 3.9596, "step": 5280 }, { "epoch": 1.7767022836790278, "grad_norm": 5.436508655548096, "learning_rate": 3.5815391492214855e-05, "loss": 3.9234, "step": 5300 }, { "epoch": 1.78340666247643, "grad_norm": 6.275641441345215, "learning_rate": 3.5740892497951276e-05, "loss": 3.9003, "step": 5320 }, { "epoch": 1.790111041273832, "grad_norm": 5.461209774017334, "learning_rate": 3.5666393503687704e-05, "loss": 3.9583, "step": 5340 }, { "epoch": 1.796815420071234, "grad_norm": 5.981624603271484, "learning_rate": 3.5591894509424125e-05, "loss": 3.8287, "step": 5360 }, { "epoch": 1.8035197988686362, "grad_norm": 5.947275638580322, "learning_rate": 3.5517395515160546e-05, "loss": 3.9774, "step": 5380 }, { "epoch": 1.8102241776660382, "grad_norm": 6.036500930786133, "learning_rate": 3.5442896520896966e-05, "loss": 3.8426, "step": 5400 }, { "epoch": 1.81692855646344, "grad_norm": 5.644313335418701, "learning_rate": 3.536839752663339e-05, "loss": 3.8918, "step": 5420 }, { "epoch": 1.8236329352608422, "grad_norm": 6.368311882019043, "learning_rate": 3.5293898532369815e-05, "loss": 3.9701, "step": 5440 }, { "epoch": 1.8303373140582444, "grad_norm": 5.9047932624816895, "learning_rate": 3.5219399538106236e-05, "loss": 3.878, "step": 5460 }, { "epoch": 1.8370416928556463, "grad_norm": 6.0813679695129395, "learning_rate": 3.514490054384266e-05, "loss": 3.9292, "step": 5480 }, { "epoch": 1.8437460716530483, "grad_norm": 6.120319843292236, "learning_rate": 3.5070401549579084e-05, "loss": 3.8474, "step": 5500 }, { "epoch": 1.8504504504504504, "grad_norm": 6.296043395996094, "learning_rate": 3.4995902555315505e-05, "loss": 3.7354, "step": 5520 }, { "epoch": 1.8571548292478526, "grad_norm": 5.409097671508789, "learning_rate": 3.492140356105193e-05, "loss": 3.8795, "step": 5540 }, { "epoch": 1.8638592080452545, "grad_norm": 6.105241775512695, "learning_rate": 3.4846904566788354e-05, "loss": 3.9765, "step": 5560 }, { "epoch": 1.8705635868426564, "grad_norm": 5.878379821777344, "learning_rate": 3.4772405572524775e-05, "loss": 3.8718, "step": 5580 }, { "epoch": 1.8772679656400588, "grad_norm": 5.730438709259033, "learning_rate": 3.4697906578261196e-05, "loss": 4.0157, "step": 5600 }, { "epoch": 1.8839723444374608, "grad_norm": 5.375248432159424, "learning_rate": 3.4623407583997616e-05, "loss": 3.9336, "step": 5620 }, { "epoch": 1.8906767232348627, "grad_norm": 6.077249526977539, "learning_rate": 3.4548908589734044e-05, "loss": 3.8112, "step": 5640 }, { "epoch": 1.8973811020322648, "grad_norm": 5.51649808883667, "learning_rate": 3.4474409595470465e-05, "loss": 3.9677, "step": 5660 }, { "epoch": 1.904085480829667, "grad_norm": 5.96297025680542, "learning_rate": 3.4399910601206886e-05, "loss": 3.8401, "step": 5680 }, { "epoch": 1.910789859627069, "grad_norm": 5.845096588134766, "learning_rate": 3.432541160694331e-05, "loss": 3.9445, "step": 5700 }, { "epoch": 1.9174942384244709, "grad_norm": 6.597667694091797, "learning_rate": 3.425091261267973e-05, "loss": 3.8767, "step": 5720 }, { "epoch": 1.924198617221873, "grad_norm": 5.085957050323486, "learning_rate": 3.4176413618416155e-05, "loss": 3.816, "step": 5740 }, { "epoch": 1.9309029960192752, "grad_norm": 5.354710578918457, "learning_rate": 3.4101914624152576e-05, "loss": 3.7329, "step": 5760 }, { "epoch": 1.937607374816677, "grad_norm": 6.152263641357422, "learning_rate": 3.4027415629889e-05, "loss": 3.9058, "step": 5780 }, { "epoch": 1.944311753614079, "grad_norm": 5.678866863250732, "learning_rate": 3.395291663562542e-05, "loss": 3.9123, "step": 5800 }, { "epoch": 1.9510161324114812, "grad_norm": 5.211181640625, "learning_rate": 3.3878417641361846e-05, "loss": 3.857, "step": 5820 }, { "epoch": 1.9577205112088834, "grad_norm": 5.752172470092773, "learning_rate": 3.3803918647098266e-05, "loss": 3.9512, "step": 5840 }, { "epoch": 1.9644248900062853, "grad_norm": 5.982390403747559, "learning_rate": 3.372941965283469e-05, "loss": 3.8322, "step": 5860 }, { "epoch": 1.9711292688036874, "grad_norm": 6.09535551071167, "learning_rate": 3.365492065857111e-05, "loss": 3.8806, "step": 5880 }, { "epoch": 1.9778336476010896, "grad_norm": 6.2229905128479, "learning_rate": 3.358042166430753e-05, "loss": 3.885, "step": 5900 }, { "epoch": 1.9845380263984915, "grad_norm": 5.936634540557861, "learning_rate": 3.350592267004396e-05, "loss": 3.8126, "step": 5920 }, { "epoch": 1.9912424051958935, "grad_norm": 5.78571081161499, "learning_rate": 3.343142367578038e-05, "loss": 4.0106, "step": 5940 }, { "epoch": 1.9979467839932956, "grad_norm": 6.465760231018066, "learning_rate": 3.33569246815168e-05, "loss": 3.8896, "step": 5960 }, { "epoch": 2.004357846218311, "grad_norm": 6.171356201171875, "learning_rate": 3.328242568725322e-05, "loss": 3.4701, "step": 5980 }, { "epoch": 2.0110622250157135, "grad_norm": 5.609477519989014, "learning_rate": 3.320792669298965e-05, "loss": 3.6326, "step": 6000 }, { "epoch": 2.0110622250157135, "eval_bleu_greedy": 2.0660877864138794, "eval_loss": 0.5018166899681091, "eval_runtime": 118.8592, "eval_samples_per_second": 0.841, "eval_steps_per_second": 0.841, "step": 6000 }, { "epoch": 2.0177666038131155, "grad_norm": 5.845204830169678, "learning_rate": 3.313342769872607e-05, "loss": 3.7089, "step": 6020 }, { "epoch": 2.0244709826105174, "grad_norm": 6.158261775970459, "learning_rate": 3.3058928704462496e-05, "loss": 3.6354, "step": 6040 }, { "epoch": 2.0311753614079198, "grad_norm": 5.758568286895752, "learning_rate": 3.2984429710198916e-05, "loss": 3.6658, "step": 6060 }, { "epoch": 2.0378797402053217, "grad_norm": 6.4133381843566895, "learning_rate": 3.290993071593534e-05, "loss": 3.716, "step": 6080 }, { "epoch": 2.0445841190027236, "grad_norm": 5.51917028427124, "learning_rate": 3.283543172167176e-05, "loss": 3.6775, "step": 6100 }, { "epoch": 2.0512884978001256, "grad_norm": 5.898686408996582, "learning_rate": 3.2760932727408186e-05, "loss": 3.7088, "step": 6120 }, { "epoch": 2.057992876597528, "grad_norm": 6.7731852531433105, "learning_rate": 3.268643373314461e-05, "loss": 3.5724, "step": 6140 }, { "epoch": 2.06469725539493, "grad_norm": 5.695714950561523, "learning_rate": 3.261193473888103e-05, "loss": 3.6688, "step": 6160 }, { "epoch": 2.071401634192332, "grad_norm": 6.517350196838379, "learning_rate": 3.253743574461745e-05, "loss": 3.7519, "step": 6180 }, { "epoch": 2.0781060129897337, "grad_norm": 5.876154899597168, "learning_rate": 3.2462936750353876e-05, "loss": 3.6403, "step": 6200 }, { "epoch": 2.084810391787136, "grad_norm": 6.117770671844482, "learning_rate": 3.23884377560903e-05, "loss": 3.636, "step": 6220 }, { "epoch": 2.091514770584538, "grad_norm": 5.719681262969971, "learning_rate": 3.231393876182672e-05, "loss": 3.5995, "step": 6240 }, { "epoch": 2.09821914938194, "grad_norm": 6.031946182250977, "learning_rate": 3.223943976756314e-05, "loss": 3.6692, "step": 6260 }, { "epoch": 2.104923528179342, "grad_norm": 6.5091023445129395, "learning_rate": 3.216494077329956e-05, "loss": 3.6672, "step": 6280 }, { "epoch": 2.1116279069767443, "grad_norm": 6.04213285446167, "learning_rate": 3.209044177903599e-05, "loss": 3.5401, "step": 6300 }, { "epoch": 2.1183322857741462, "grad_norm": 6.8394036293029785, "learning_rate": 3.201594278477241e-05, "loss": 3.6274, "step": 6320 }, { "epoch": 2.125036664571548, "grad_norm": 5.81780481338501, "learning_rate": 3.194144379050883e-05, "loss": 3.6487, "step": 6340 }, { "epoch": 2.1317410433689505, "grad_norm": 6.043388366699219, "learning_rate": 3.186694479624525e-05, "loss": 3.6658, "step": 6360 }, { "epoch": 2.1384454221663525, "grad_norm": 6.0370378494262695, "learning_rate": 3.179244580198167e-05, "loss": 3.5821, "step": 6380 }, { "epoch": 2.1451498009637544, "grad_norm": 5.804340362548828, "learning_rate": 3.17179468077181e-05, "loss": 3.7046, "step": 6400 }, { "epoch": 2.1518541797611563, "grad_norm": 5.964964866638184, "learning_rate": 3.164344781345452e-05, "loss": 3.6777, "step": 6420 }, { "epoch": 2.1585585585585587, "grad_norm": 6.0428853034973145, "learning_rate": 3.156894881919094e-05, "loss": 3.7281, "step": 6440 }, { "epoch": 2.1652629373559606, "grad_norm": 5.866547584533691, "learning_rate": 3.149444982492736e-05, "loss": 3.5978, "step": 6460 }, { "epoch": 2.1719673161533626, "grad_norm": 6.283875465393066, "learning_rate": 3.141995083066379e-05, "loss": 3.6787, "step": 6480 }, { "epoch": 2.1786716949507645, "grad_norm": 5.971242427825928, "learning_rate": 3.134545183640021e-05, "loss": 3.5794, "step": 6500 }, { "epoch": 2.185376073748167, "grad_norm": 6.014956474304199, "learning_rate": 3.127095284213663e-05, "loss": 3.6924, "step": 6520 }, { "epoch": 2.192080452545569, "grad_norm": 5.171935558319092, "learning_rate": 3.119645384787305e-05, "loss": 3.6373, "step": 6540 }, { "epoch": 2.1987848313429708, "grad_norm": 6.373608112335205, "learning_rate": 3.112195485360947e-05, "loss": 3.6745, "step": 6560 }, { "epoch": 2.205489210140373, "grad_norm": 5.605614185333252, "learning_rate": 3.10474558593459e-05, "loss": 3.6497, "step": 6580 }, { "epoch": 2.212193588937775, "grad_norm": 5.9374613761901855, "learning_rate": 3.097295686508233e-05, "loss": 3.5979, "step": 6600 }, { "epoch": 2.218897967735177, "grad_norm": 5.818667411804199, "learning_rate": 3.089845787081875e-05, "loss": 3.5593, "step": 6620 }, { "epoch": 2.225602346532579, "grad_norm": 5.608858585357666, "learning_rate": 3.082395887655517e-05, "loss": 3.5811, "step": 6640 }, { "epoch": 2.2323067253299813, "grad_norm": 6.370382785797119, "learning_rate": 3.074945988229159e-05, "loss": 3.6893, "step": 6660 }, { "epoch": 2.2390111041273832, "grad_norm": 5.760286331176758, "learning_rate": 3.067496088802802e-05, "loss": 3.6869, "step": 6680 }, { "epoch": 2.245715482924785, "grad_norm": 5.964378833770752, "learning_rate": 3.060046189376444e-05, "loss": 3.5856, "step": 6700 }, { "epoch": 2.252419861722187, "grad_norm": 5.7680439949035645, "learning_rate": 3.052596289950086e-05, "loss": 3.6309, "step": 6720 }, { "epoch": 2.2591242405195895, "grad_norm": 6.063139915466309, "learning_rate": 3.045146390523728e-05, "loss": 3.542, "step": 6740 }, { "epoch": 2.2658286193169914, "grad_norm": 7.324517250061035, "learning_rate": 3.0376964910973705e-05, "loss": 3.5469, "step": 6760 }, { "epoch": 2.2725329981143934, "grad_norm": 5.549790859222412, "learning_rate": 3.0302465916710126e-05, "loss": 3.6051, "step": 6780 }, { "epoch": 2.2792373769117953, "grad_norm": 5.7682929039001465, "learning_rate": 3.022796692244655e-05, "loss": 3.5874, "step": 6800 }, { "epoch": 2.2859417557091977, "grad_norm": 6.120064735412598, "learning_rate": 3.015346792818297e-05, "loss": 3.6209, "step": 6820 }, { "epoch": 2.2926461345065996, "grad_norm": 5.814151287078857, "learning_rate": 3.0078968933919395e-05, "loss": 3.5889, "step": 6840 }, { "epoch": 2.2993505133040015, "grad_norm": 6.4021077156066895, "learning_rate": 3.0004469939655816e-05, "loss": 3.6508, "step": 6860 }, { "epoch": 2.306054892101404, "grad_norm": 6.688700199127197, "learning_rate": 2.9929970945392237e-05, "loss": 3.6598, "step": 6880 }, { "epoch": 2.312759270898806, "grad_norm": 5.971013069152832, "learning_rate": 2.985547195112866e-05, "loss": 3.6805, "step": 6900 }, { "epoch": 2.3194636496962078, "grad_norm": 5.857511520385742, "learning_rate": 2.9780972956865082e-05, "loss": 3.5774, "step": 6920 }, { "epoch": 2.3261680284936097, "grad_norm": 5.832746505737305, "learning_rate": 2.9706473962601506e-05, "loss": 3.6995, "step": 6940 }, { "epoch": 2.332872407291012, "grad_norm": 6.846590042114258, "learning_rate": 2.9631974968337927e-05, "loss": 3.6018, "step": 6960 }, { "epoch": 2.339576786088414, "grad_norm": 5.618002891540527, "learning_rate": 2.955747597407435e-05, "loss": 3.5473, "step": 6980 }, { "epoch": 2.346281164885816, "grad_norm": 5.3922319412231445, "learning_rate": 2.9482976979810772e-05, "loss": 3.5538, "step": 7000 }, { "epoch": 2.346281164885816, "eval_bleu_greedy": 2.167759735525418, "eval_loss": 0.4932926893234253, "eval_runtime": 337.6724, "eval_samples_per_second": 0.296, "eval_steps_per_second": 0.296, "step": 7000 }, { "epoch": 2.352985543683218, "grad_norm": 5.728118896484375, "learning_rate": 2.9408477985547193e-05, "loss": 3.637, "step": 7020 }, { "epoch": 2.3596899224806203, "grad_norm": 5.794277191162109, "learning_rate": 2.9333978991283618e-05, "loss": 3.5977, "step": 7040 }, { "epoch": 2.366394301278022, "grad_norm": 6.4884419441223145, "learning_rate": 2.925947999702004e-05, "loss": 3.5448, "step": 7060 }, { "epoch": 2.373098680075424, "grad_norm": 7.127490520477295, "learning_rate": 2.9184981002756463e-05, "loss": 3.7117, "step": 7080 }, { "epoch": 2.3798030588728265, "grad_norm": 5.834691047668457, "learning_rate": 2.9110482008492884e-05, "loss": 3.5537, "step": 7100 }, { "epoch": 2.3865074376702284, "grad_norm": 5.9557671546936035, "learning_rate": 2.9035983014229308e-05, "loss": 3.5724, "step": 7120 }, { "epoch": 2.3932118164676304, "grad_norm": 6.775606632232666, "learning_rate": 2.8961484019965735e-05, "loss": 3.5767, "step": 7140 }, { "epoch": 2.3999161952650323, "grad_norm": 5.350723743438721, "learning_rate": 2.8886985025702156e-05, "loss": 3.6228, "step": 7160 }, { "epoch": 2.4066205740624347, "grad_norm": 6.00510835647583, "learning_rate": 2.881248603143858e-05, "loss": 3.6664, "step": 7180 }, { "epoch": 2.4133249528598366, "grad_norm": 6.175734519958496, "learning_rate": 2.8737987037175e-05, "loss": 3.6428, "step": 7200 }, { "epoch": 2.4200293316572385, "grad_norm": 6.390973091125488, "learning_rate": 2.8663488042911422e-05, "loss": 3.5884, "step": 7220 }, { "epoch": 2.4267337104546405, "grad_norm": 6.262541770935059, "learning_rate": 2.8588989048647847e-05, "loss": 3.5878, "step": 7240 }, { "epoch": 2.433438089252043, "grad_norm": 5.385353088378906, "learning_rate": 2.8514490054384268e-05, "loss": 3.5796, "step": 7260 }, { "epoch": 2.440142468049445, "grad_norm": 5.796669006347656, "learning_rate": 2.8439991060120692e-05, "loss": 3.5707, "step": 7280 }, { "epoch": 2.4468468468468467, "grad_norm": 6.3658857345581055, "learning_rate": 2.8365492065857113e-05, "loss": 3.6012, "step": 7300 }, { "epoch": 2.453551225644249, "grad_norm": 5.848957538604736, "learning_rate": 2.8290993071593537e-05, "loss": 3.5894, "step": 7320 }, { "epoch": 2.460255604441651, "grad_norm": 6.327582836151123, "learning_rate": 2.8216494077329958e-05, "loss": 3.5888, "step": 7340 }, { "epoch": 2.466959983239053, "grad_norm": 6.102633476257324, "learning_rate": 2.814199508306638e-05, "loss": 3.6819, "step": 7360 }, { "epoch": 2.473664362036455, "grad_norm": 5.988522529602051, "learning_rate": 2.8067496088802803e-05, "loss": 3.4606, "step": 7380 }, { "epoch": 2.4803687408338573, "grad_norm": 5.952702522277832, "learning_rate": 2.7992997094539224e-05, "loss": 3.5249, "step": 7400 }, { "epoch": 2.487073119631259, "grad_norm": 6.563230514526367, "learning_rate": 2.7918498100275648e-05, "loss": 3.5629, "step": 7420 }, { "epoch": 2.493777498428661, "grad_norm": 5.4130988121032715, "learning_rate": 2.784399910601207e-05, "loss": 3.5616, "step": 7440 }, { "epoch": 2.500481877226063, "grad_norm": 6.370597839355469, "learning_rate": 2.7769500111748493e-05, "loss": 3.6258, "step": 7460 }, { "epoch": 2.5071862560234655, "grad_norm": 5.860075950622559, "learning_rate": 2.7695001117484914e-05, "loss": 3.584, "step": 7480 }, { "epoch": 2.5138906348208674, "grad_norm": 5.848262786865234, "learning_rate": 2.762050212322134e-05, "loss": 3.5515, "step": 7500 }, { "epoch": 2.5205950136182693, "grad_norm": 6.018378257751465, "learning_rate": 2.754600312895776e-05, "loss": 3.6001, "step": 7520 }, { "epoch": 2.5272993924156717, "grad_norm": 5.92679500579834, "learning_rate": 2.747150413469418e-05, "loss": 3.6455, "step": 7540 }, { "epoch": 2.5340037712130736, "grad_norm": 6.104831695556641, "learning_rate": 2.7397005140430604e-05, "loss": 3.5881, "step": 7560 }, { "epoch": 2.5407081500104756, "grad_norm": 5.604018211364746, "learning_rate": 2.7322506146167025e-05, "loss": 3.5521, "step": 7580 }, { "epoch": 2.5474125288078775, "grad_norm": 6.820720195770264, "learning_rate": 2.724800715190345e-05, "loss": 3.5984, "step": 7600 }, { "epoch": 2.5541169076052794, "grad_norm": 5.802369117736816, "learning_rate": 2.717350815763987e-05, "loss": 3.6231, "step": 7620 }, { "epoch": 2.560821286402682, "grad_norm": 6.830519676208496, "learning_rate": 2.7099009163376295e-05, "loss": 3.513, "step": 7640 }, { "epoch": 2.5675256652000837, "grad_norm": 5.891795635223389, "learning_rate": 2.7024510169112716e-05, "loss": 3.4983, "step": 7660 }, { "epoch": 2.5742300439974857, "grad_norm": 5.775413513183594, "learning_rate": 2.6950011174849143e-05, "loss": 3.5026, "step": 7680 }, { "epoch": 2.580934422794888, "grad_norm": 6.1186442375183105, "learning_rate": 2.6875512180585564e-05, "loss": 3.5892, "step": 7700 }, { "epoch": 2.58763880159229, "grad_norm": 6.562758445739746, "learning_rate": 2.680101318632199e-05, "loss": 3.5703, "step": 7720 }, { "epoch": 2.594343180389692, "grad_norm": 6.671054840087891, "learning_rate": 2.672651419205841e-05, "loss": 3.6545, "step": 7740 }, { "epoch": 2.6010475591870943, "grad_norm": 6.263803005218506, "learning_rate": 2.6652015197794834e-05, "loss": 3.6269, "step": 7760 }, { "epoch": 2.6077519379844962, "grad_norm": 6.635150909423828, "learning_rate": 2.6577516203531254e-05, "loss": 3.5478, "step": 7780 }, { "epoch": 2.614456316781898, "grad_norm": 6.699692726135254, "learning_rate": 2.650301720926768e-05, "loss": 3.6548, "step": 7800 }, { "epoch": 2.6211606955793, "grad_norm": 5.610607624053955, "learning_rate": 2.64285182150041e-05, "loss": 3.6276, "step": 7820 }, { "epoch": 2.627865074376702, "grad_norm": 6.077248573303223, "learning_rate": 2.6354019220740524e-05, "loss": 3.5808, "step": 7840 }, { "epoch": 2.6345694531741044, "grad_norm": 6.732864856719971, "learning_rate": 2.6279520226476945e-05, "loss": 3.6394, "step": 7860 }, { "epoch": 2.6412738319715063, "grad_norm": 7.764287948608398, "learning_rate": 2.6205021232213366e-05, "loss": 3.6177, "step": 7880 }, { "epoch": 2.6479782107689083, "grad_norm": 5.52256441116333, "learning_rate": 2.613052223794979e-05, "loss": 3.5052, "step": 7900 }, { "epoch": 2.6546825895663106, "grad_norm": 5.835344314575195, "learning_rate": 2.605602324368621e-05, "loss": 3.5943, "step": 7920 }, { "epoch": 2.6613869683637126, "grad_norm": 6.355226993560791, "learning_rate": 2.5981524249422635e-05, "loss": 3.512, "step": 7940 }, { "epoch": 2.6680913471611145, "grad_norm": 5.310232639312744, "learning_rate": 2.5907025255159056e-05, "loss": 3.4958, "step": 7960 }, { "epoch": 2.674795725958517, "grad_norm": 6.357884883880615, "learning_rate": 2.583252626089548e-05, "loss": 3.4715, "step": 7980 }, { "epoch": 2.681500104755919, "grad_norm": 5.544501304626465, "learning_rate": 2.57580272666319e-05, "loss": 3.6412, "step": 8000 }, { "epoch": 2.681500104755919, "eval_bleu_greedy": 2.0426177231502667, "eval_loss": 0.478807270526886, "eval_runtime": 461.9761, "eval_samples_per_second": 0.216, "eval_steps_per_second": 0.216, "step": 8000 }, { "epoch": 2.6882044835533208, "grad_norm": 6.2651214599609375, "learning_rate": 2.5683528272368322e-05, "loss": 3.6418, "step": 8020 }, { "epoch": 2.6949088623507227, "grad_norm": 5.576724529266357, "learning_rate": 2.5609029278104746e-05, "loss": 3.57, "step": 8040 }, { "epoch": 2.7016132411481246, "grad_norm": 6.44381856918335, "learning_rate": 2.5534530283841167e-05, "loss": 3.6091, "step": 8060 }, { "epoch": 2.708317619945527, "grad_norm": 6.368646621704102, "learning_rate": 2.546003128957759e-05, "loss": 3.6117, "step": 8080 }, { "epoch": 2.715021998742929, "grad_norm": 6.0201096534729, "learning_rate": 2.5385532295314012e-05, "loss": 3.4642, "step": 8100 }, { "epoch": 2.721726377540331, "grad_norm": 6.197525501251221, "learning_rate": 2.5311033301050437e-05, "loss": 3.5663, "step": 8120 }, { "epoch": 2.7284307563377332, "grad_norm": 7.0478434562683105, "learning_rate": 2.5236534306786857e-05, "loss": 3.5866, "step": 8140 }, { "epoch": 2.735135135135135, "grad_norm": 5.948623180389404, "learning_rate": 2.516203531252328e-05, "loss": 3.594, "step": 8160 }, { "epoch": 2.741839513932537, "grad_norm": 6.0779266357421875, "learning_rate": 2.5087536318259703e-05, "loss": 3.5592, "step": 8180 }, { "epoch": 2.7485438927299395, "grad_norm": 6.036412715911865, "learning_rate": 2.5013037323996123e-05, "loss": 3.6267, "step": 8200 }, { "epoch": 2.7552482715273414, "grad_norm": 5.411278247833252, "learning_rate": 2.493853832973255e-05, "loss": 3.4901, "step": 8220 }, { "epoch": 2.7619526503247434, "grad_norm": 5.945597171783447, "learning_rate": 2.4864039335468972e-05, "loss": 3.5108, "step": 8240 }, { "epoch": 2.7686570291221453, "grad_norm": 5.927489280700684, "learning_rate": 2.4789540341205393e-05, "loss": 3.523, "step": 8260 }, { "epoch": 2.775361407919547, "grad_norm": 5.989095211029053, "learning_rate": 2.4715041346941817e-05, "loss": 3.5256, "step": 8280 }, { "epoch": 2.7820657867169496, "grad_norm": 5.67732572555542, "learning_rate": 2.4640542352678238e-05, "loss": 3.5598, "step": 8300 }, { "epoch": 2.7887701655143515, "grad_norm": 5.954450607299805, "learning_rate": 2.4566043358414662e-05, "loss": 3.5808, "step": 8320 }, { "epoch": 2.7954745443117535, "grad_norm": 6.370481967926025, "learning_rate": 2.4491544364151083e-05, "loss": 3.4797, "step": 8340 }, { "epoch": 2.802178923109156, "grad_norm": 6.09319543838501, "learning_rate": 2.4417045369887507e-05, "loss": 3.5653, "step": 8360 }, { "epoch": 2.8088833019065578, "grad_norm": 6.781850337982178, "learning_rate": 2.4342546375623932e-05, "loss": 3.5961, "step": 8380 }, { "epoch": 2.8155876807039597, "grad_norm": 6.419500350952148, "learning_rate": 2.4268047381360353e-05, "loss": 3.5896, "step": 8400 }, { "epoch": 2.822292059501362, "grad_norm": 6.156778335571289, "learning_rate": 2.4193548387096777e-05, "loss": 3.5622, "step": 8420 }, { "epoch": 2.828996438298764, "grad_norm": 6.792672157287598, "learning_rate": 2.4119049392833198e-05, "loss": 3.5262, "step": 8440 }, { "epoch": 2.835700817096166, "grad_norm": 6.010193347930908, "learning_rate": 2.4044550398569622e-05, "loss": 3.5115, "step": 8460 }, { "epoch": 2.842405195893568, "grad_norm": 5.631977081298828, "learning_rate": 2.3970051404306043e-05, "loss": 3.5809, "step": 8480 }, { "epoch": 2.84910957469097, "grad_norm": 5.957998275756836, "learning_rate": 2.3895552410042464e-05, "loss": 3.5782, "step": 8500 }, { "epoch": 2.855813953488372, "grad_norm": 6.206627368927002, "learning_rate": 2.3821053415778888e-05, "loss": 3.5551, "step": 8520 }, { "epoch": 2.862518332285774, "grad_norm": 5.459038257598877, "learning_rate": 2.374655442151531e-05, "loss": 3.6324, "step": 8540 }, { "epoch": 2.869222711083176, "grad_norm": 6.841930866241455, "learning_rate": 2.3672055427251733e-05, "loss": 3.6122, "step": 8560 }, { "epoch": 2.8759270898805784, "grad_norm": 6.351479530334473, "learning_rate": 2.3597556432988154e-05, "loss": 3.5087, "step": 8580 }, { "epoch": 2.8826314686779804, "grad_norm": 5.920718193054199, "learning_rate": 2.352305743872458e-05, "loss": 3.5419, "step": 8600 }, { "epoch": 2.8893358474753823, "grad_norm": 6.254413604736328, "learning_rate": 2.3448558444461e-05, "loss": 3.475, "step": 8620 }, { "epoch": 2.8960402262727847, "grad_norm": 6.233896732330322, "learning_rate": 2.3374059450197424e-05, "loss": 3.6237, "step": 8640 }, { "epoch": 2.9027446050701866, "grad_norm": 6.401550769805908, "learning_rate": 2.3299560455933848e-05, "loss": 3.5505, "step": 8660 }, { "epoch": 2.9094489838675885, "grad_norm": 6.0176591873168945, "learning_rate": 2.322506146167027e-05, "loss": 3.4655, "step": 8680 }, { "epoch": 2.9161533626649905, "grad_norm": 6.907371520996094, "learning_rate": 2.3150562467406693e-05, "loss": 3.4332, "step": 8700 }, { "epoch": 2.9228577414623924, "grad_norm": 6.5138444900512695, "learning_rate": 2.3076063473143114e-05, "loss": 3.5849, "step": 8720 }, { "epoch": 2.929562120259795, "grad_norm": 7.2710347175598145, "learning_rate": 2.3001564478879538e-05, "loss": 3.5402, "step": 8740 }, { "epoch": 2.9362664990571967, "grad_norm": 6.0399909019470215, "learning_rate": 2.292706548461596e-05, "loss": 3.5121, "step": 8760 }, { "epoch": 2.9429708778545987, "grad_norm": 6.308010578155518, "learning_rate": 2.285256649035238e-05, "loss": 3.473, "step": 8780 }, { "epoch": 2.949675256652001, "grad_norm": 5.680022716522217, "learning_rate": 2.2778067496088804e-05, "loss": 3.5039, "step": 8800 }, { "epoch": 2.956379635449403, "grad_norm": 5.785823345184326, "learning_rate": 2.2703568501825225e-05, "loss": 3.5442, "step": 8820 }, { "epoch": 2.963084014246805, "grad_norm": 5.894392967224121, "learning_rate": 2.262906950756165e-05, "loss": 3.4217, "step": 8840 }, { "epoch": 2.969788393044207, "grad_norm": 5.803259372711182, "learning_rate": 2.255457051329807e-05, "loss": 3.5274, "step": 8860 }, { "epoch": 2.976492771841609, "grad_norm": 6.49872350692749, "learning_rate": 2.2480071519034494e-05, "loss": 3.5022, "step": 8880 }, { "epoch": 2.983197150639011, "grad_norm": 6.164760112762451, "learning_rate": 2.2405572524770915e-05, "loss": 3.5333, "step": 8900 }, { "epoch": 2.989901529436413, "grad_norm": 6.127744674682617, "learning_rate": 2.233107353050734e-05, "loss": 3.4646, "step": 8920 }, { "epoch": 2.996605908233815, "grad_norm": 6.783234119415283, "learning_rate": 2.2256574536243764e-05, "loss": 3.5176, "step": 8940 }, { "epoch": 3.0030169704588308, "grad_norm": 5.963592052459717, "learning_rate": 2.2182075541980185e-05, "loss": 3.202, "step": 8960 }, { "epoch": 3.009721349256233, "grad_norm": 6.0857744216918945, "learning_rate": 2.210757654771661e-05, "loss": 3.3323, "step": 8980 }, { "epoch": 3.016425728053635, "grad_norm": 6.808197975158691, "learning_rate": 2.203307755345303e-05, "loss": 3.4583, "step": 9000 }, { "epoch": 3.016425728053635, "eval_bleu_greedy": 2.4234819018656997, "eval_loss": 0.4750092625617981, "eval_runtime": 445.5917, "eval_samples_per_second": 0.224, "eval_steps_per_second": 0.224, "step": 9000 }, { "epoch": 3.023130106851037, "grad_norm": 6.107368469238281, "learning_rate": 2.195857855918945e-05, "loss": 3.2516, "step": 9020 }, { "epoch": 3.029834485648439, "grad_norm": 6.768011093139648, "learning_rate": 2.1884079564925875e-05, "loss": 3.3924, "step": 9040 }, { "epoch": 3.0365388644458413, "grad_norm": 6.5038743019104, "learning_rate": 2.1809580570662296e-05, "loss": 3.2324, "step": 9060 }, { "epoch": 3.0432432432432432, "grad_norm": 5.936684608459473, "learning_rate": 2.173508157639872e-05, "loss": 3.3328, "step": 9080 }, { "epoch": 3.049947622040645, "grad_norm": 6.226217746734619, "learning_rate": 2.166058258213514e-05, "loss": 3.2819, "step": 9100 }, { "epoch": 3.0566520008380476, "grad_norm": 5.835100173950195, "learning_rate": 2.1586083587871565e-05, "loss": 3.3229, "step": 9120 }, { "epoch": 3.0633563796354495, "grad_norm": 6.959074020385742, "learning_rate": 2.1511584593607986e-05, "loss": 3.2994, "step": 9140 }, { "epoch": 3.0700607584328514, "grad_norm": 6.586850643157959, "learning_rate": 2.1437085599344407e-05, "loss": 3.2011, "step": 9160 }, { "epoch": 3.0767651372302534, "grad_norm": 5.986579418182373, "learning_rate": 2.1362586605080835e-05, "loss": 3.3602, "step": 9180 }, { "epoch": 3.0834695160276557, "grad_norm": 5.427093029022217, "learning_rate": 2.1288087610817256e-05, "loss": 3.3568, "step": 9200 }, { "epoch": 3.0901738948250577, "grad_norm": 6.213650226593018, "learning_rate": 2.121358861655368e-05, "loss": 3.2927, "step": 9220 }, { "epoch": 3.0968782736224596, "grad_norm": 6.2159342765808105, "learning_rate": 2.11390896222901e-05, "loss": 3.3695, "step": 9240 }, { "epoch": 3.1035826524198615, "grad_norm": 6.4243597984313965, "learning_rate": 2.106459062802652e-05, "loss": 3.3794, "step": 9260 }, { "epoch": 3.110287031217264, "grad_norm": 6.737236499786377, "learning_rate": 2.0990091633762946e-05, "loss": 3.3333, "step": 9280 }, { "epoch": 3.116991410014666, "grad_norm": 6.42462158203125, "learning_rate": 2.0915592639499367e-05, "loss": 3.3944, "step": 9300 }, { "epoch": 3.1236957888120678, "grad_norm": 6.075654029846191, "learning_rate": 2.084109364523579e-05, "loss": 3.3037, "step": 9320 }, { "epoch": 3.13040016760947, "grad_norm": 5.77776575088501, "learning_rate": 2.0766594650972212e-05, "loss": 3.4129, "step": 9340 }, { "epoch": 3.137104546406872, "grad_norm": 6.13924503326416, "learning_rate": 2.0692095656708636e-05, "loss": 3.3723, "step": 9360 }, { "epoch": 3.143808925204274, "grad_norm": 6.142735481262207, "learning_rate": 2.0617596662445057e-05, "loss": 3.3321, "step": 9380 }, { "epoch": 3.150513304001676, "grad_norm": 6.672779083251953, "learning_rate": 2.0543097668181478e-05, "loss": 3.3234, "step": 9400 }, { "epoch": 3.1572176827990783, "grad_norm": 6.145503520965576, "learning_rate": 2.0468598673917902e-05, "loss": 3.2741, "step": 9420 }, { "epoch": 3.1639220615964803, "grad_norm": 6.716073513031006, "learning_rate": 2.0394099679654323e-05, "loss": 3.2791, "step": 9440 }, { "epoch": 3.170626440393882, "grad_norm": 6.335756778717041, "learning_rate": 2.031960068539075e-05, "loss": 3.2895, "step": 9460 }, { "epoch": 3.177330819191284, "grad_norm": 7.066572189331055, "learning_rate": 2.024510169112717e-05, "loss": 3.3522, "step": 9480 }, { "epoch": 3.1840351979886865, "grad_norm": 6.07637357711792, "learning_rate": 2.0170602696863593e-05, "loss": 3.2799, "step": 9500 }, { "epoch": 3.1907395767860884, "grad_norm": 6.036308288574219, "learning_rate": 2.0096103702600017e-05, "loss": 3.3748, "step": 9520 }, { "epoch": 3.1974439555834904, "grad_norm": 6.354751110076904, "learning_rate": 2.0021604708336438e-05, "loss": 3.3692, "step": 9540 }, { "epoch": 3.2041483343808927, "grad_norm": 5.642934322357178, "learning_rate": 1.9947105714072862e-05, "loss": 3.2563, "step": 9560 }, { "epoch": 3.2108527131782947, "grad_norm": 6.8418803215026855, "learning_rate": 1.9872606719809283e-05, "loss": 3.3056, "step": 9580 }, { "epoch": 3.2175570919756966, "grad_norm": 5.866108417510986, "learning_rate": 1.9798107725545707e-05, "loss": 3.2476, "step": 9600 }, { "epoch": 3.2242614707730985, "grad_norm": 5.497636795043945, "learning_rate": 1.9723608731282128e-05, "loss": 3.2634, "step": 9620 }, { "epoch": 3.230965849570501, "grad_norm": 6.034823894500732, "learning_rate": 1.9649109737018552e-05, "loss": 3.4055, "step": 9640 }, { "epoch": 3.237670228367903, "grad_norm": 7.224872589111328, "learning_rate": 1.9574610742754973e-05, "loss": 3.3118, "step": 9660 }, { "epoch": 3.244374607165305, "grad_norm": 6.321878433227539, "learning_rate": 1.9500111748491394e-05, "loss": 3.3225, "step": 9680 }, { "epoch": 3.2510789859627067, "grad_norm": 6.296338081359863, "learning_rate": 1.9425612754227818e-05, "loss": 3.349, "step": 9700 }, { "epoch": 3.257783364760109, "grad_norm": 6.3536505699157715, "learning_rate": 1.935111375996424e-05, "loss": 3.3247, "step": 9720 }, { "epoch": 3.264487743557511, "grad_norm": 5.711906433105469, "learning_rate": 1.9276614765700667e-05, "loss": 3.3193, "step": 9740 }, { "epoch": 3.271192122354913, "grad_norm": 6.689239978790283, "learning_rate": 1.9202115771437088e-05, "loss": 3.272, "step": 9760 }, { "epoch": 3.2778965011523153, "grad_norm": 6.301712989807129, "learning_rate": 1.912761677717351e-05, "loss": 3.2867, "step": 9780 }, { "epoch": 3.2846008799497173, "grad_norm": 6.167557239532471, "learning_rate": 1.9053117782909933e-05, "loss": 3.3041, "step": 9800 }, { "epoch": 3.291305258747119, "grad_norm": 6.17465353012085, "learning_rate": 1.8978618788646354e-05, "loss": 3.2369, "step": 9820 }, { "epoch": 3.298009637544521, "grad_norm": 6.496537208557129, "learning_rate": 1.8904119794382778e-05, "loss": 3.3375, "step": 9840 }, { "epoch": 3.304714016341923, "grad_norm": 6.527161598205566, "learning_rate": 1.88296208001192e-05, "loss": 3.3039, "step": 9860 }, { "epoch": 3.3114183951393255, "grad_norm": 7.209779262542725, "learning_rate": 1.8755121805855623e-05, "loss": 3.3638, "step": 9880 }, { "epoch": 3.3181227739367274, "grad_norm": 6.651127815246582, "learning_rate": 1.8680622811592044e-05, "loss": 3.3062, "step": 9900 }, { "epoch": 3.3248271527341293, "grad_norm": 5.988480567932129, "learning_rate": 1.8606123817328465e-05, "loss": 3.2656, "step": 9920 }, { "epoch": 3.3315315315315317, "grad_norm": 6.463028907775879, "learning_rate": 1.853162482306489e-05, "loss": 3.2631, "step": 9940 }, { "epoch": 3.3382359103289336, "grad_norm": 5.910898685455322, "learning_rate": 1.845712582880131e-05, "loss": 3.2458, "step": 9960 }, { "epoch": 3.3449402891263356, "grad_norm": 6.237380027770996, "learning_rate": 1.8382626834537734e-05, "loss": 3.32, "step": 9980 }, { "epoch": 3.3516446679237375, "grad_norm": 6.681026458740234, "learning_rate": 1.830812784027416e-05, "loss": 3.2698, "step": 10000 }, { "epoch": 3.3516446679237375, "eval_bleu_greedy": 2.2580959043747226, "eval_loss": 0.46980682015419006, "eval_runtime": 206.5153, "eval_samples_per_second": 0.484, "eval_steps_per_second": 0.484, "step": 10000 }, { "epoch": 3.35901948460088, "grad_norm": 6.202412128448486, "learning_rate": 1.823362884601058e-05, "loss": 3.2913, "step": 10020 }, { "epoch": 3.365723863398282, "grad_norm": 7.057358741760254, "learning_rate": 1.8159129851747004e-05, "loss": 3.2247, "step": 10040 }, { "epoch": 3.372428242195684, "grad_norm": 6.253905773162842, "learning_rate": 1.8084630857483425e-05, "loss": 3.2799, "step": 10060 }, { "epoch": 3.379132620993086, "grad_norm": 5.532748222351074, "learning_rate": 1.801013186321985e-05, "loss": 3.2562, "step": 10080 }, { "epoch": 3.385836999790488, "grad_norm": 5.992211818695068, "learning_rate": 1.793563286895627e-05, "loss": 3.3089, "step": 10100 }, { "epoch": 3.3925413785878904, "grad_norm": 5.832738876342773, "learning_rate": 1.7861133874692694e-05, "loss": 3.2162, "step": 10120 }, { "epoch": 3.3992457573852923, "grad_norm": 7.200679779052734, "learning_rate": 1.7786634880429115e-05, "loss": 3.2683, "step": 10140 }, { "epoch": 3.4059501361826943, "grad_norm": 5.809505939483643, "learning_rate": 1.7712135886165536e-05, "loss": 3.298, "step": 10160 }, { "epoch": 3.412654514980096, "grad_norm": 6.071516513824463, "learning_rate": 1.763763689190196e-05, "loss": 3.289, "step": 10180 }, { "epoch": 3.4193588937774986, "grad_norm": 7.4007744789123535, "learning_rate": 1.756313789763838e-05, "loss": 3.2438, "step": 10200 }, { "epoch": 3.4260632725749005, "grad_norm": 5.917469501495361, "learning_rate": 1.7488638903374805e-05, "loss": 3.2932, "step": 10220 }, { "epoch": 3.4327676513723024, "grad_norm": 6.20914363861084, "learning_rate": 1.7414139909111226e-05, "loss": 3.2827, "step": 10240 }, { "epoch": 3.4394720301697044, "grad_norm": 6.218352794647217, "learning_rate": 1.733964091484765e-05, "loss": 3.2885, "step": 10260 }, { "epoch": 3.4461764089671068, "grad_norm": 6.699190616607666, "learning_rate": 1.7265141920584075e-05, "loss": 3.2237, "step": 10280 }, { "epoch": 3.4528807877645087, "grad_norm": 5.649641990661621, "learning_rate": 1.7190642926320496e-05, "loss": 3.1781, "step": 10300 }, { "epoch": 3.4595851665619106, "grad_norm": 6.218810558319092, "learning_rate": 1.711614393205692e-05, "loss": 3.1712, "step": 10320 }, { "epoch": 3.466289545359313, "grad_norm": 7.203590393066406, "learning_rate": 1.704164493779334e-05, "loss": 3.2399, "step": 10340 }, { "epoch": 3.472993924156715, "grad_norm": 6.820786952972412, "learning_rate": 1.6967145943529765e-05, "loss": 3.2389, "step": 10360 }, { "epoch": 3.479698302954117, "grad_norm": 6.1062798500061035, "learning_rate": 1.6892646949266186e-05, "loss": 3.2507, "step": 10380 }, { "epoch": 3.486402681751519, "grad_norm": 6.02709436416626, "learning_rate": 1.6818147955002607e-05, "loss": 3.2714, "step": 10400 }, { "epoch": 3.493107060548921, "grad_norm": 6.64369010925293, "learning_rate": 1.674364896073903e-05, "loss": 3.2319, "step": 10420 }, { "epoch": 3.499811439346323, "grad_norm": 5.996265888214111, "learning_rate": 1.6669149966475452e-05, "loss": 3.221, "step": 10440 }, { "epoch": 3.506515818143725, "grad_norm": 6.68301248550415, "learning_rate": 1.6594650972211876e-05, "loss": 3.2714, "step": 10460 }, { "epoch": 3.513220196941127, "grad_norm": 5.8294878005981445, "learning_rate": 1.6520151977948297e-05, "loss": 3.2909, "step": 10480 }, { "epoch": 3.5199245757385293, "grad_norm": 6.585033893585205, "learning_rate": 1.644565298368472e-05, "loss": 3.2091, "step": 10500 }, { "epoch": 3.5266289545359313, "grad_norm": 6.6064934730529785, "learning_rate": 1.6371153989421142e-05, "loss": 3.2033, "step": 10520 }, { "epoch": 3.533333333333333, "grad_norm": 6.687121868133545, "learning_rate": 1.6296654995157566e-05, "loss": 3.2532, "step": 10540 }, { "epoch": 3.5400377121307356, "grad_norm": 7.002409934997559, "learning_rate": 1.622215600089399e-05, "loss": 3.2823, "step": 10560 }, { "epoch": 3.5467420909281375, "grad_norm": 6.823233127593994, "learning_rate": 1.614765700663041e-05, "loss": 3.3175, "step": 10580 }, { "epoch": 3.5534464697255395, "grad_norm": 5.88700532913208, "learning_rate": 1.6073158012366836e-05, "loss": 3.2122, "step": 10600 }, { "epoch": 3.5601508485229414, "grad_norm": 6.105057716369629, "learning_rate": 1.5998659018103257e-05, "loss": 3.1478, "step": 10620 }, { "epoch": 3.5668552273203433, "grad_norm": 6.8328022956848145, "learning_rate": 1.592416002383968e-05, "loss": 3.2097, "step": 10640 }, { "epoch": 3.5735596061177457, "grad_norm": 6.577600002288818, "learning_rate": 1.5849661029576102e-05, "loss": 3.1664, "step": 10660 }, { "epoch": 3.5802639849151476, "grad_norm": 7.270109176635742, "learning_rate": 1.5775162035312523e-05, "loss": 3.1901, "step": 10680 }, { "epoch": 3.5869683637125496, "grad_norm": 6.4257683753967285, "learning_rate": 1.5700663041048947e-05, "loss": 3.226, "step": 10700 }, { "epoch": 3.593672742509952, "grad_norm": 5.963393688201904, "learning_rate": 1.5626164046785368e-05, "loss": 3.2249, "step": 10720 }, { "epoch": 3.600377121307354, "grad_norm": 6.7239766120910645, "learning_rate": 1.5551665052521792e-05, "loss": 3.2575, "step": 10740 }, { "epoch": 3.607081500104756, "grad_norm": 6.665550231933594, "learning_rate": 1.5477166058258213e-05, "loss": 3.2503, "step": 10760 }, { "epoch": 3.613785878902158, "grad_norm": 5.777255058288574, "learning_rate": 1.5402667063994637e-05, "loss": 3.2054, "step": 10780 }, { "epoch": 3.62049025769956, "grad_norm": 6.491949558258057, "learning_rate": 1.5328168069731058e-05, "loss": 3.1708, "step": 10800 }, { "epoch": 3.627194636496962, "grad_norm": 6.6252665519714355, "learning_rate": 1.5253669075467482e-05, "loss": 3.2418, "step": 10820 }, { "epoch": 3.633899015294364, "grad_norm": 6.817688941955566, "learning_rate": 1.5179170081203905e-05, "loss": 3.259, "step": 10840 }, { "epoch": 3.640603394091766, "grad_norm": 6.675405025482178, "learning_rate": 1.5104671086940328e-05, "loss": 3.218, "step": 10860 }, { "epoch": 3.6473077728891683, "grad_norm": 6.145236015319824, "learning_rate": 1.503017209267675e-05, "loss": 3.2562, "step": 10880 }, { "epoch": 3.6540121516865702, "grad_norm": 6.741012096405029, "learning_rate": 1.4955673098413173e-05, "loss": 3.2195, "step": 10900 }, { "epoch": 3.660716530483972, "grad_norm": 6.519981861114502, "learning_rate": 1.4881174104149595e-05, "loss": 3.2343, "step": 10920 }, { "epoch": 3.6674209092813745, "grad_norm": 5.3957648277282715, "learning_rate": 1.4806675109886018e-05, "loss": 3.2527, "step": 10940 }, { "epoch": 3.6741252880787765, "grad_norm": 7.068119525909424, "learning_rate": 1.473217611562244e-05, "loss": 3.1907, "step": 10960 }, { "epoch": 3.6808296668761784, "grad_norm": 6.537194728851318, "learning_rate": 1.4657677121358861e-05, "loss": 3.1773, "step": 10980 }, { "epoch": 3.687534045673581, "grad_norm": 6.715285778045654, "learning_rate": 1.4583178127095284e-05, "loss": 3.2272, "step": 11000 }, { "epoch": 3.687534045673581, "eval_bleu_greedy": 2.3251290926774186, "eval_loss": 0.46657735109329224, "eval_runtime": 464.198, "eval_samples_per_second": 0.215, "eval_steps_per_second": 0.215, "step": 11000 }, { "epoch": 3.6942384244709827, "grad_norm": 6.894411087036133, "learning_rate": 1.4508679132831707e-05, "loss": 3.1783, "step": 11020 }, { "epoch": 3.7009428032683847, "grad_norm": 6.030000686645508, "learning_rate": 1.4434180138568129e-05, "loss": 3.2033, "step": 11040 }, { "epoch": 3.7076471820657866, "grad_norm": 6.902273654937744, "learning_rate": 1.4359681144304552e-05, "loss": 3.1508, "step": 11060 }, { "epoch": 3.7143515608631885, "grad_norm": 6.1826934814453125, "learning_rate": 1.4285182150040974e-05, "loss": 3.2302, "step": 11080 }, { "epoch": 3.721055939660591, "grad_norm": 7.401595592498779, "learning_rate": 1.4210683155777399e-05, "loss": 3.2139, "step": 11100 }, { "epoch": 3.727760318457993, "grad_norm": 6.83165168762207, "learning_rate": 1.4136184161513821e-05, "loss": 3.1857, "step": 11120 }, { "epoch": 3.7344646972553948, "grad_norm": 6.816708087921143, "learning_rate": 1.4061685167250244e-05, "loss": 3.2377, "step": 11140 }, { "epoch": 3.741169076052797, "grad_norm": 5.8378987312316895, "learning_rate": 1.3987186172986666e-05, "loss": 3.2827, "step": 11160 }, { "epoch": 3.747873454850199, "grad_norm": 7.595831394195557, "learning_rate": 1.3912687178723089e-05, "loss": 3.237, "step": 11180 }, { "epoch": 3.754577833647601, "grad_norm": 6.578879356384277, "learning_rate": 1.3838188184459511e-05, "loss": 3.2416, "step": 11200 }, { "epoch": 3.7612822124450034, "grad_norm": 5.939640522003174, "learning_rate": 1.3763689190195932e-05, "loss": 3.1734, "step": 11220 }, { "epoch": 3.7679865912424053, "grad_norm": 6.304750442504883, "learning_rate": 1.3689190195932355e-05, "loss": 3.2416, "step": 11240 }, { "epoch": 3.7746909700398072, "grad_norm": 6.177415370941162, "learning_rate": 1.3614691201668777e-05, "loss": 3.1675, "step": 11260 }, { "epoch": 3.781395348837209, "grad_norm": 6.993617057800293, "learning_rate": 1.35401922074052e-05, "loss": 3.2162, "step": 11280 }, { "epoch": 3.788099727634611, "grad_norm": 6.1251726150512695, "learning_rate": 1.3465693213141623e-05, "loss": 3.2232, "step": 11300 }, { "epoch": 3.7948041064320135, "grad_norm": 6.485012054443359, "learning_rate": 1.3391194218878045e-05, "loss": 3.1591, "step": 11320 }, { "epoch": 3.8015084852294154, "grad_norm": 6.347079277038574, "learning_rate": 1.3316695224614468e-05, "loss": 3.1745, "step": 11340 }, { "epoch": 3.8082128640268174, "grad_norm": 7.0095744132995605, "learning_rate": 1.324219623035089e-05, "loss": 3.2463, "step": 11360 }, { "epoch": 3.8149172428242197, "grad_norm": 6.158694267272949, "learning_rate": 1.3167697236087315e-05, "loss": 3.223, "step": 11380 }, { "epoch": 3.8216216216216217, "grad_norm": 7.0430827140808105, "learning_rate": 1.3093198241823737e-05, "loss": 3.1774, "step": 11400 }, { "epoch": 3.8283260004190236, "grad_norm": 6.411921501159668, "learning_rate": 1.301869924756016e-05, "loss": 3.2934, "step": 11420 }, { "epoch": 3.835030379216426, "grad_norm": 6.355661392211914, "learning_rate": 1.2944200253296582e-05, "loss": 3.129, "step": 11440 }, { "epoch": 3.841734758013828, "grad_norm": 5.618327617645264, "learning_rate": 1.2869701259033005e-05, "loss": 3.138, "step": 11460 }, { "epoch": 3.84843913681123, "grad_norm": 6.159928321838379, "learning_rate": 1.2795202264769426e-05, "loss": 3.2041, "step": 11480 }, { "epoch": 3.8551435156086318, "grad_norm": 7.234489917755127, "learning_rate": 1.2720703270505848e-05, "loss": 3.2447, "step": 11500 }, { "epoch": 3.8618478944060337, "grad_norm": 6.748493671417236, "learning_rate": 1.2646204276242271e-05, "loss": 3.2015, "step": 11520 }, { "epoch": 3.868552273203436, "grad_norm": 6.751996040344238, "learning_rate": 1.2571705281978693e-05, "loss": 3.1563, "step": 11540 }, { "epoch": 3.875256652000838, "grad_norm": 6.8070783615112305, "learning_rate": 1.2497206287715116e-05, "loss": 3.1416, "step": 11560 }, { "epoch": 3.88196103079824, "grad_norm": 6.117493152618408, "learning_rate": 1.242270729345154e-05, "loss": 3.1295, "step": 11580 }, { "epoch": 3.8886654095956423, "grad_norm": 6.02462100982666, "learning_rate": 1.2348208299187961e-05, "loss": 3.2225, "step": 11600 }, { "epoch": 3.8953697883930443, "grad_norm": 5.968542575836182, "learning_rate": 1.2273709304924384e-05, "loss": 3.175, "step": 11620 }, { "epoch": 3.902074167190446, "grad_norm": 7.16673469543457, "learning_rate": 1.2199210310660806e-05, "loss": 3.2582, "step": 11640 }, { "epoch": 3.9087785459878486, "grad_norm": 6.56205415725708, "learning_rate": 1.2124711316397229e-05, "loss": 3.2285, "step": 11660 }, { "epoch": 3.9154829247852505, "grad_norm": 6.418534755706787, "learning_rate": 1.2050212322133651e-05, "loss": 3.2023, "step": 11680 }, { "epoch": 3.9221873035826524, "grad_norm": 6.226400852203369, "learning_rate": 1.1975713327870074e-05, "loss": 3.1821, "step": 11700 }, { "epoch": 3.9288916823800544, "grad_norm": 6.837843894958496, "learning_rate": 1.1901214333606497e-05, "loss": 3.1677, "step": 11720 }, { "epoch": 3.9355960611774563, "grad_norm": 5.950616836547852, "learning_rate": 1.182671533934292e-05, "loss": 3.1856, "step": 11740 }, { "epoch": 3.9423004399748587, "grad_norm": 6.373692035675049, "learning_rate": 1.1752216345079342e-05, "loss": 3.2501, "step": 11760 }, { "epoch": 3.9490048187722606, "grad_norm": 6.721376895904541, "learning_rate": 1.1677717350815764e-05, "loss": 3.2044, "step": 11780 }, { "epoch": 3.9557091975696625, "grad_norm": 6.181844711303711, "learning_rate": 1.1603218356552187e-05, "loss": 3.1938, "step": 11800 }, { "epoch": 3.962413576367065, "grad_norm": 6.3947577476501465, "learning_rate": 1.152871936228861e-05, "loss": 3.1984, "step": 11820 }, { "epoch": 3.969117955164467, "grad_norm": 6.491850852966309, "learning_rate": 1.1454220368025032e-05, "loss": 3.1825, "step": 11840 }, { "epoch": 3.975822333961869, "grad_norm": 6.11356782913208, "learning_rate": 1.1379721373761455e-05, "loss": 3.1174, "step": 11860 }, { "epoch": 3.982526712759271, "grad_norm": 7.591030597686768, "learning_rate": 1.1305222379497877e-05, "loss": 3.2322, "step": 11880 }, { "epoch": 3.989231091556673, "grad_norm": 5.719244956970215, "learning_rate": 1.12307233852343e-05, "loss": 3.1449, "step": 11900 }, { "epoch": 3.995935470354075, "grad_norm": 6.756486892700195, "learning_rate": 1.1156224390970722e-05, "loss": 3.1296, "step": 11920 }, { "epoch": 4.002681751518961, "grad_norm": 6.4487528800964355, "learning_rate": 1.1081725396707145e-05, "loss": 3.259, "step": 11940 }, { "epoch": 4.009386130316363, "grad_norm": 6.847874641418457, "learning_rate": 1.1007226402443568e-05, "loss": 3.1175, "step": 11960 }, { "epoch": 4.016090509113765, "grad_norm": 6.221479892730713, "learning_rate": 1.093272740817999e-05, "loss": 3.1395, "step": 11980 }, { "epoch": 4.022794887911167, "grad_norm": 7.619890213012695, "learning_rate": 1.0858228413916413e-05, "loss": 3.0701, "step": 12000 }, { "epoch": 4.022794887911167, "eval_bleu_greedy": 2.2056340612259895, "eval_loss": 0.4672054350376129, "eval_runtime": 519.4009, "eval_samples_per_second": 0.193, "eval_steps_per_second": 0.193, "step": 12000 }, { "epoch": 4.029499266708569, "grad_norm": 6.872377872467041, "learning_rate": 1.0783729419652835e-05, "loss": 3.1056, "step": 12020 }, { "epoch": 4.036203645505971, "grad_norm": 6.781915664672852, "learning_rate": 1.0709230425389258e-05, "loss": 3.0394, "step": 12040 }, { "epoch": 4.042908024303373, "grad_norm": 6.772296905517578, "learning_rate": 1.063473143112568e-05, "loss": 3.1418, "step": 12060 }, { "epoch": 4.049612403100776, "grad_norm": 5.7719011306762695, "learning_rate": 1.0560232436862103e-05, "loss": 2.9814, "step": 12080 }, { "epoch": 4.056316781898177, "grad_norm": 6.217945575714111, "learning_rate": 1.0485733442598526e-05, "loss": 3.1063, "step": 12100 }, { "epoch": 4.0630211606955795, "grad_norm": 7.190448760986328, "learning_rate": 1.0411234448334948e-05, "loss": 3.1758, "step": 12120 }, { "epoch": 4.069725539492981, "grad_norm": 7.15440559387207, "learning_rate": 1.033673545407137e-05, "loss": 3.1914, "step": 12140 }, { "epoch": 4.076429918290383, "grad_norm": 6.738652229309082, "learning_rate": 1.0262236459807793e-05, "loss": 2.9985, "step": 12160 }, { "epoch": 4.083134297087786, "grad_norm": 6.6931962966918945, "learning_rate": 1.0187737465544216e-05, "loss": 3.1319, "step": 12180 }, { "epoch": 4.089838675885187, "grad_norm": 7.118000507354736, "learning_rate": 1.0113238471280638e-05, "loss": 3.1749, "step": 12200 }, { "epoch": 4.09654305468259, "grad_norm": 6.880594730377197, "learning_rate": 1.0038739477017061e-05, "loss": 3.151, "step": 12220 }, { "epoch": 4.103247433479992, "grad_norm": 6.61147403717041, "learning_rate": 9.964240482753482e-06, "loss": 3.0578, "step": 12240 }, { "epoch": 4.1099518122773935, "grad_norm": 8.498011589050293, "learning_rate": 9.889741488489906e-06, "loss": 3.0902, "step": 12260 }, { "epoch": 4.116656191074796, "grad_norm": 7.078530788421631, "learning_rate": 9.815242494226329e-06, "loss": 3.1134, "step": 12280 }, { "epoch": 4.123360569872197, "grad_norm": 7.098989486694336, "learning_rate": 9.740743499962751e-06, "loss": 3.0835, "step": 12300 }, { "epoch": 4.1300649486696, "grad_norm": 6.889624118804932, "learning_rate": 9.666244505699174e-06, "loss": 3.0836, "step": 12320 }, { "epoch": 4.136769327467002, "grad_norm": 6.000241279602051, "learning_rate": 9.591745511435596e-06, "loss": 3.1416, "step": 12340 }, { "epoch": 4.143473706264404, "grad_norm": 7.567933559417725, "learning_rate": 9.517246517172019e-06, "loss": 3.2006, "step": 12360 }, { "epoch": 4.150178085061806, "grad_norm": 6.100574493408203, "learning_rate": 9.44274752290844e-06, "loss": 3.0368, "step": 12380 }, { "epoch": 4.156882463859208, "grad_norm": 6.519239902496338, "learning_rate": 9.368248528644864e-06, "loss": 3.1715, "step": 12400 }, { "epoch": 4.16358684265661, "grad_norm": 7.143859386444092, "learning_rate": 9.293749534381287e-06, "loss": 3.1236, "step": 12420 }, { "epoch": 4.170291221454012, "grad_norm": 7.759309768676758, "learning_rate": 9.21925054011771e-06, "loss": 3.1434, "step": 12440 }, { "epoch": 4.176995600251415, "grad_norm": 6.477195739746094, "learning_rate": 9.144751545854132e-06, "loss": 3.1094, "step": 12460 }, { "epoch": 4.183699979048816, "grad_norm": 6.8250813484191895, "learning_rate": 9.070252551590554e-06, "loss": 3.0832, "step": 12480 }, { "epoch": 4.1904043578462185, "grad_norm": 6.915088176727295, "learning_rate": 8.995753557326975e-06, "loss": 3.1025, "step": 12500 }, { "epoch": 4.19710873664362, "grad_norm": 6.240241527557373, "learning_rate": 8.921254563063398e-06, "loss": 3.1142, "step": 12520 }, { "epoch": 4.203813115441022, "grad_norm": 5.7193522453308105, "learning_rate": 8.846755568799822e-06, "loss": 3.0886, "step": 12540 }, { "epoch": 4.210517494238425, "grad_norm": 6.799840927124023, "learning_rate": 8.772256574536245e-06, "loss": 3.0755, "step": 12560 }, { "epoch": 4.217221873035826, "grad_norm": 6.537818908691406, "learning_rate": 8.697757580272667e-06, "loss": 3.1854, "step": 12580 }, { "epoch": 4.223926251833229, "grad_norm": 6.453887939453125, "learning_rate": 8.62325858600909e-06, "loss": 3.1353, "step": 12600 }, { "epoch": 4.230630630630631, "grad_norm": 6.519958019256592, "learning_rate": 8.54875959174551e-06, "loss": 3.0728, "step": 12620 }, { "epoch": 4.237335009428032, "grad_norm": 6.916313648223877, "learning_rate": 8.474260597481933e-06, "loss": 3.1365, "step": 12640 }, { "epoch": 4.244039388225435, "grad_norm": 6.080234527587891, "learning_rate": 8.399761603218358e-06, "loss": 3.1026, "step": 12660 }, { "epoch": 4.250743767022837, "grad_norm": 6.199918746948242, "learning_rate": 8.32526260895478e-06, "loss": 3.1174, "step": 12680 }, { "epoch": 4.257448145820239, "grad_norm": 6.077798843383789, "learning_rate": 8.250763614691203e-06, "loss": 3.1353, "step": 12700 }, { "epoch": 4.264152524617641, "grad_norm": 7.003963947296143, "learning_rate": 8.176264620427625e-06, "loss": 3.1721, "step": 12720 }, { "epoch": 4.2708569034150425, "grad_norm": 7.276467323303223, "learning_rate": 8.101765626164046e-06, "loss": 3.1403, "step": 12740 }, { "epoch": 4.277561282212445, "grad_norm": 6.939758777618408, "learning_rate": 8.027266631900469e-06, "loss": 3.084, "step": 12760 }, { "epoch": 4.284265661009847, "grad_norm": 6.425601959228516, "learning_rate": 7.952767637636891e-06, "loss": 3.1602, "step": 12780 }, { "epoch": 4.290970039807249, "grad_norm": 6.354540824890137, "learning_rate": 7.878268643373316e-06, "loss": 3.0745, "step": 12800 }, { "epoch": 4.297674418604651, "grad_norm": 6.399191856384277, "learning_rate": 7.803769649109738e-06, "loss": 3.0546, "step": 12820 }, { "epoch": 4.3043787974020535, "grad_norm": 7.009991645812988, "learning_rate": 7.72927065484616e-06, "loss": 3.1064, "step": 12840 }, { "epoch": 4.311083176199455, "grad_norm": 6.549511432647705, "learning_rate": 7.654771660582582e-06, "loss": 3.119, "step": 12860 }, { "epoch": 4.317787554996857, "grad_norm": 6.825671672821045, "learning_rate": 7.580272666319005e-06, "loss": 3.0588, "step": 12880 }, { "epoch": 4.32449193379426, "grad_norm": 6.320077896118164, "learning_rate": 7.505773672055427e-06, "loss": 3.1293, "step": 12900 }, { "epoch": 4.331196312591661, "grad_norm": 6.614448070526123, "learning_rate": 7.4312746777918494e-06, "loss": 3.0686, "step": 12920 }, { "epoch": 4.337900691389064, "grad_norm": 6.515445709228516, "learning_rate": 7.356775683528274e-06, "loss": 3.0706, "step": 12940 }, { "epoch": 4.344605070186465, "grad_norm": 7.301309585571289, "learning_rate": 7.2822766892646954e-06, "loss": 3.0647, "step": 12960 }, { "epoch": 4.3513094489838675, "grad_norm": 7.263702869415283, "learning_rate": 7.207777695001118e-06, "loss": 3.1394, "step": 12980 }, { "epoch": 4.35801382778127, "grad_norm": 6.944880962371826, "learning_rate": 7.133278700737541e-06, "loss": 3.1024, "step": 13000 }, { "epoch": 4.35801382778127, "eval_bleu_greedy": 2.3552626321962884, "eval_loss": 0.4624796211719513, "eval_runtime": 534.5676, "eval_samples_per_second": 0.187, "eval_steps_per_second": 0.187, "step": 13000 }, { "epoch": 4.364718206578671, "grad_norm": 6.3444414138793945, "learning_rate": 7.058779706473963e-06, "loss": 3.0579, "step": 13020 }, { "epoch": 4.371422585376074, "grad_norm": 7.333340167999268, "learning_rate": 6.984280712210385e-06, "loss": 3.0466, "step": 13040 }, { "epoch": 4.378126964173476, "grad_norm": 7.396731376647949, "learning_rate": 6.9097817179468075e-06, "loss": 3.0913, "step": 13060 }, { "epoch": 4.384831342970878, "grad_norm": 7.228068828582764, "learning_rate": 6.835282723683231e-06, "loss": 3.0371, "step": 13080 }, { "epoch": 4.39153572176828, "grad_norm": 7.033063888549805, "learning_rate": 6.7607837294196535e-06, "loss": 3.1139, "step": 13100 }, { "epoch": 4.398240100565682, "grad_norm": 7.278767108917236, "learning_rate": 6.686284735156076e-06, "loss": 3.0944, "step": 13120 }, { "epoch": 4.404944479363084, "grad_norm": 7.024720191955566, "learning_rate": 6.611785740892499e-06, "loss": 3.1348, "step": 13140 }, { "epoch": 4.411648858160486, "grad_norm": 7.331086158752441, "learning_rate": 6.53728674662892e-06, "loss": 3.0433, "step": 13160 }, { "epoch": 4.418353236957888, "grad_norm": 6.916625499725342, "learning_rate": 6.462787752365343e-06, "loss": 3.0803, "step": 13180 }, { "epoch": 4.42505761575529, "grad_norm": 6.226632595062256, "learning_rate": 6.3882887581017655e-06, "loss": 3.1699, "step": 13200 }, { "epoch": 4.4317619945526925, "grad_norm": 6.50321626663208, "learning_rate": 6.313789763838189e-06, "loss": 3.0663, "step": 13220 }, { "epoch": 4.438466373350094, "grad_norm": 7.217479705810547, "learning_rate": 6.239290769574611e-06, "loss": 3.0969, "step": 13240 }, { "epoch": 4.445170752147496, "grad_norm": 7.183232307434082, "learning_rate": 6.164791775311034e-06, "loss": 3.0768, "step": 13260 }, { "epoch": 4.451875130944899, "grad_norm": 6.6673102378845215, "learning_rate": 6.090292781047456e-06, "loss": 3.1062, "step": 13280 }, { "epoch": 4.4585795097423, "grad_norm": 7.2146525382995605, "learning_rate": 6.015793786783878e-06, "loss": 3.1092, "step": 13300 }, { "epoch": 4.465283888539703, "grad_norm": 7.235128402709961, "learning_rate": 5.941294792520302e-06, "loss": 3.0743, "step": 13320 }, { "epoch": 4.471988267337105, "grad_norm": 6.275545597076416, "learning_rate": 5.8667957982567235e-06, "loss": 3.0743, "step": 13340 }, { "epoch": 4.4786926461345065, "grad_norm": 6.9439473152160645, "learning_rate": 5.792296803993146e-06, "loss": 3.1373, "step": 13360 }, { "epoch": 4.485397024931909, "grad_norm": 6.065330982208252, "learning_rate": 5.7177978097295695e-06, "loss": 3.1196, "step": 13380 }, { "epoch": 4.49210140372931, "grad_norm": 6.784725666046143, "learning_rate": 5.643298815465991e-06, "loss": 3.1024, "step": 13400 }, { "epoch": 4.498805782526713, "grad_norm": 6.992110729217529, "learning_rate": 5.568799821202414e-06, "loss": 3.2035, "step": 13420 }, { "epoch": 4.505510161324115, "grad_norm": 6.317196846008301, "learning_rate": 5.494300826938836e-06, "loss": 3.1222, "step": 13440 }, { "epoch": 4.512214540121517, "grad_norm": 6.293644428253174, "learning_rate": 5.419801832675259e-06, "loss": 3.1413, "step": 13460 }, { "epoch": 4.518918918918919, "grad_norm": 7.206140518188477, "learning_rate": 5.3453028384116815e-06, "loss": 3.125, "step": 13480 }, { "epoch": 4.525623297716321, "grad_norm": 6.770303726196289, "learning_rate": 5.270803844148104e-06, "loss": 3.1346, "step": 13500 }, { "epoch": 4.532327676513723, "grad_norm": 7.2229485511779785, "learning_rate": 5.196304849884527e-06, "loss": 3.1146, "step": 13520 }, { "epoch": 4.539032055311125, "grad_norm": 7.110487461090088, "learning_rate": 5.121805855620949e-06, "loss": 3.1114, "step": 13540 }, { "epoch": 4.545736434108527, "grad_norm": 7.351033687591553, "learning_rate": 5.047306861357372e-06, "loss": 3.0575, "step": 13560 }, { "epoch": 4.552440812905929, "grad_norm": 6.950778484344482, "learning_rate": 4.972807867093794e-06, "loss": 3.0641, "step": 13580 }, { "epoch": 4.559145191703331, "grad_norm": 6.550965785980225, "learning_rate": 4.898308872830217e-06, "loss": 3.1761, "step": 13600 }, { "epoch": 4.565849570500733, "grad_norm": 6.709011554718018, "learning_rate": 4.8238098785666396e-06, "loss": 2.9988, "step": 13620 }, { "epoch": 4.572553949298135, "grad_norm": 6.507779598236084, "learning_rate": 4.749310884303062e-06, "loss": 3.016, "step": 13640 }, { "epoch": 4.579258328095538, "grad_norm": 6.363673210144043, "learning_rate": 4.674811890039485e-06, "loss": 3.0496, "step": 13660 }, { "epoch": 4.585962706892939, "grad_norm": 6.965389251708984, "learning_rate": 4.600312895775907e-06, "loss": 3.1518, "step": 13680 }, { "epoch": 4.5926670856903415, "grad_norm": 6.091116905212402, "learning_rate": 4.52581390151233e-06, "loss": 3.0881, "step": 13700 }, { "epoch": 4.599371464487744, "grad_norm": 7.049524784088135, "learning_rate": 4.4513149072487524e-06, "loss": 3.1202, "step": 13720 }, { "epoch": 4.606075843285145, "grad_norm": 6.323545932769775, "learning_rate": 4.376815912985175e-06, "loss": 3.0242, "step": 13740 }, { "epoch": 4.612780222082548, "grad_norm": 7.295837879180908, "learning_rate": 4.3023169187215976e-06, "loss": 3.1243, "step": 13760 }, { "epoch": 4.61948460087995, "grad_norm": 6.582053184509277, "learning_rate": 4.22781792445802e-06, "loss": 3.0687, "step": 13780 }, { "epoch": 4.626188979677352, "grad_norm": 6.175601959228516, "learning_rate": 4.153318930194443e-06, "loss": 3.0427, "step": 13800 }, { "epoch": 4.632893358474754, "grad_norm": 7.662842273712158, "learning_rate": 4.078819935930865e-06, "loss": 3.1979, "step": 13820 }, { "epoch": 4.6395977372721555, "grad_norm": 7.036664009094238, "learning_rate": 4.004320941667288e-06, "loss": 3.1559, "step": 13840 }, { "epoch": 4.646302116069558, "grad_norm": 5.965688228607178, "learning_rate": 3.9298219474037105e-06, "loss": 3.1257, "step": 13860 }, { "epoch": 4.65300649486696, "grad_norm": 6.378177165985107, "learning_rate": 3.855322953140133e-06, "loss": 3.1514, "step": 13880 }, { "epoch": 4.659710873664362, "grad_norm": 6.670738220214844, "learning_rate": 3.780823958876555e-06, "loss": 3.0992, "step": 13900 }, { "epoch": 4.666415252461764, "grad_norm": 6.76698112487793, "learning_rate": 3.7063249646129778e-06, "loss": 3.1566, "step": 13920 }, { "epoch": 4.6731196312591665, "grad_norm": 6.328171253204346, "learning_rate": 3.6318259703494007e-06, "loss": 3.0974, "step": 13940 }, { "epoch": 4.679824010056568, "grad_norm": 7.151896953582764, "learning_rate": 3.557326976085823e-06, "loss": 3.1234, "step": 13960 }, { "epoch": 4.68652838885397, "grad_norm": 6.95003080368042, "learning_rate": 3.4828279818222455e-06, "loss": 3.156, "step": 13980 }, { "epoch": 4.693232767651372, "grad_norm": 7.1711931228637695, "learning_rate": 3.4083289875586685e-06, "loss": 3.0428, "step": 14000 }, { "epoch": 4.693232767651372, "eval_bleu_greedy": 2.416340135494281, "eval_loss": 0.45956096053123474, "eval_runtime": 314.8396, "eval_samples_per_second": 0.318, "eval_steps_per_second": 0.318, "step": 14000 }, { "epoch": 4.699937146448774, "grad_norm": 6.529189109802246, "learning_rate": 3.3338299932950906e-06, "loss": 3.1417, "step": 14020 }, { "epoch": 4.706641525246177, "grad_norm": 7.026646137237549, "learning_rate": 3.259330999031513e-06, "loss": 3.1099, "step": 14040 }, { "epoch": 4.713345904043578, "grad_norm": 6.561285972595215, "learning_rate": 3.1848320047679354e-06, "loss": 3.0675, "step": 14060 }, { "epoch": 4.7200502828409805, "grad_norm": 7.228313446044922, "learning_rate": 3.1103330105043583e-06, "loss": 3.0952, "step": 14080 }, { "epoch": 4.726754661638383, "grad_norm": 6.70543098449707, "learning_rate": 3.035834016240781e-06, "loss": 3.1087, "step": 14100 }, { "epoch": 4.733459040435784, "grad_norm": 6.371490478515625, "learning_rate": 2.9613350219772035e-06, "loss": 3.0608, "step": 14120 }, { "epoch": 4.740163419233187, "grad_norm": 6.534164905548096, "learning_rate": 2.886836027713626e-06, "loss": 3.055, "step": 14140 }, { "epoch": 4.746867798030589, "grad_norm": 6.988217353820801, "learning_rate": 2.8123370334500486e-06, "loss": 3.066, "step": 14160 }, { "epoch": 4.753572176827991, "grad_norm": 7.489045143127441, "learning_rate": 2.737838039186471e-06, "loss": 3.1232, "step": 14180 }, { "epoch": 4.760276555625393, "grad_norm": 6.6933512687683105, "learning_rate": 2.663339044922894e-06, "loss": 3.0314, "step": 14200 }, { "epoch": 4.766980934422795, "grad_norm": 6.849923133850098, "learning_rate": 2.5888400506593164e-06, "loss": 3.0855, "step": 14220 }, { "epoch": 4.773685313220197, "grad_norm": 6.958053112030029, "learning_rate": 2.5143410563957385e-06, "loss": 3.085, "step": 14240 }, { "epoch": 4.780389692017599, "grad_norm": 7.325470924377441, "learning_rate": 2.4398420621321615e-06, "loss": 3.1435, "step": 14260 }, { "epoch": 4.787094070815001, "grad_norm": 6.421871185302734, "learning_rate": 2.3653430678685837e-06, "loss": 3.0735, "step": 14280 }, { "epoch": 4.793798449612403, "grad_norm": 6.636096000671387, "learning_rate": 2.2908440736050062e-06, "loss": 3.083, "step": 14300 }, { "epoch": 4.8005028284098055, "grad_norm": 7.074666976928711, "learning_rate": 2.2163450793414292e-06, "loss": 3.0699, "step": 14320 }, { "epoch": 4.807207207207207, "grad_norm": 6.859719276428223, "learning_rate": 2.1418460850778514e-06, "loss": 3.126, "step": 14340 }, { "epoch": 4.813911586004609, "grad_norm": 7.162552833557129, "learning_rate": 2.0673470908142744e-06, "loss": 3.066, "step": 14360 }, { "epoch": 4.820615964802011, "grad_norm": 6.949527740478516, "learning_rate": 1.9928480965506965e-06, "loss": 3.0962, "step": 14380 }, { "epoch": 4.827320343599413, "grad_norm": 6.28379487991333, "learning_rate": 1.918349102287119e-06, "loss": 3.0169, "step": 14400 }, { "epoch": 4.834024722396816, "grad_norm": 6.530064582824707, "learning_rate": 1.843850108023542e-06, "loss": 3.0847, "step": 14420 }, { "epoch": 4.840729101194217, "grad_norm": 7.0545783042907715, "learning_rate": 1.7693511137599643e-06, "loss": 3.1926, "step": 14440 }, { "epoch": 4.847433479991619, "grad_norm": 6.212683200836182, "learning_rate": 1.694852119496387e-06, "loss": 3.0994, "step": 14460 }, { "epoch": 4.854137858789022, "grad_norm": 6.651175498962402, "learning_rate": 1.6203531252328094e-06, "loss": 3.115, "step": 14480 }, { "epoch": 4.860842237586423, "grad_norm": 6.536131381988525, "learning_rate": 1.545854130969232e-06, "loss": 3.0519, "step": 14500 }, { "epoch": 4.867546616383826, "grad_norm": 6.119905948638916, "learning_rate": 1.4713551367056546e-06, "loss": 3.097, "step": 14520 }, { "epoch": 4.874250995181228, "grad_norm": 7.160987854003906, "learning_rate": 1.3968561424420771e-06, "loss": 3.1391, "step": 14540 }, { "epoch": 4.8809553739786296, "grad_norm": 6.599812984466553, "learning_rate": 1.3223571481784995e-06, "loss": 3.0939, "step": 14560 }, { "epoch": 4.887659752776032, "grad_norm": 6.979626178741455, "learning_rate": 1.247858153914922e-06, "loss": 3.1038, "step": 14580 }, { "epoch": 4.894364131573434, "grad_norm": 7.19669771194458, "learning_rate": 1.1733591596513449e-06, "loss": 3.1171, "step": 14600 }, { "epoch": 4.901068510370836, "grad_norm": 7.708127975463867, "learning_rate": 1.0988601653877674e-06, "loss": 3.1308, "step": 14620 }, { "epoch": 4.907772889168238, "grad_norm": 7.753808498382568, "learning_rate": 1.0243611711241898e-06, "loss": 3.0866, "step": 14640 }, { "epoch": 4.9144772679656406, "grad_norm": 6.714838981628418, "learning_rate": 9.498621768606124e-07, "loss": 2.9962, "step": 14660 }, { "epoch": 4.921181646763042, "grad_norm": 6.7879767417907715, "learning_rate": 8.75363182597035e-07, "loss": 3.0925, "step": 14680 }, { "epoch": 4.927886025560444, "grad_norm": 7.638024806976318, "learning_rate": 8.008641883334574e-07, "loss": 3.0839, "step": 14700 }, { "epoch": 4.934590404357846, "grad_norm": 6.672430038452148, "learning_rate": 7.263651940698801e-07, "loss": 3.0642, "step": 14720 }, { "epoch": 4.941294783155248, "grad_norm": 6.447202682495117, "learning_rate": 6.518661998063027e-07, "loss": 3.0111, "step": 14740 }, { "epoch": 4.947999161952651, "grad_norm": 6.621779441833496, "learning_rate": 5.773672055427253e-07, "loss": 3.1408, "step": 14760 }, { "epoch": 4.954703540750052, "grad_norm": 7.014694690704346, "learning_rate": 5.028682112791477e-07, "loss": 3.1408, "step": 14780 }, { "epoch": 4.9614079195474545, "grad_norm": 7.518828868865967, "learning_rate": 4.2836921701557035e-07, "loss": 3.1069, "step": 14800 }, { "epoch": 4.968112298344856, "grad_norm": 6.64265775680542, "learning_rate": 3.538702227519929e-07, "loss": 3.0875, "step": 14820 }, { "epoch": 4.974816677142258, "grad_norm": 6.18177604675293, "learning_rate": 2.793712284884154e-07, "loss": 3.1182, "step": 14840 }, { "epoch": 4.981521055939661, "grad_norm": 6.46857213973999, "learning_rate": 2.0487223422483797e-07, "loss": 3.1629, "step": 14860 }, { "epoch": 4.988225434737062, "grad_norm": 6.637886047363281, "learning_rate": 1.3037323996126055e-07, "loss": 3.0804, "step": 14880 }, { "epoch": 4.994929813534465, "grad_norm": 6.32726526260376, "learning_rate": 5.587424569768308e-08, "loss": 3.0856, "step": 14900 } ], "logging_steps": 20, "max_steps": 14915, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0778373124393533e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }