| { | |
| "best_metric": 3.0770967925659036, | |
| "best_model_checkpoint": "/workspace/llm-storage/output/qwen-3B/checkpoint-1000", | |
| "epoch": 4.999958097632517, | |
| "eval_steps": 1000, | |
| "global_step": 14915, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.006704378797402053, | |
| "grad_norm": 11.061790466308594, | |
| "learning_rate": 6.702412868632708e-07, | |
| "loss": 11.096, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.013408757594804106, | |
| "grad_norm": 10.477948188781738, | |
| "learning_rate": 1.3404825737265416e-06, | |
| "loss": 11.0987, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.02011313639220616, | |
| "grad_norm": 12.07636547088623, | |
| "learning_rate": 2.0107238605898126e-06, | |
| "loss": 11.0549, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.02681751518960821, | |
| "grad_norm": 11.286941528320312, | |
| "learning_rate": 2.680965147453083e-06, | |
| "loss": 10.3064, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.03352189398701027, | |
| "grad_norm": 6.030429840087891, | |
| "learning_rate": 3.351206434316354e-06, | |
| "loss": 9.5768, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.04022627278441232, | |
| "grad_norm": 8.383974075317383, | |
| "learning_rate": 4.021447721179625e-06, | |
| "loss": 8.5186, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.04693065158181437, | |
| "grad_norm": 5.529025554656982, | |
| "learning_rate": 4.691689008042896e-06, | |
| "loss": 7.5261, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.05363503037921642, | |
| "grad_norm": 3.8176066875457764, | |
| "learning_rate": 5.361930294906166e-06, | |
| "loss": 7.0029, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.06033940917661848, | |
| "grad_norm": 3.5050182342529297, | |
| "learning_rate": 6.032171581769437e-06, | |
| "loss": 6.9204, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.06704378797402054, | |
| "grad_norm": 4.123743534088135, | |
| "learning_rate": 6.702412868632708e-06, | |
| "loss": 6.6679, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.07374816677142258, | |
| "grad_norm": 3.595418691635132, | |
| "learning_rate": 7.372654155495978e-06, | |
| "loss": 6.6241, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.08045254556882464, | |
| "grad_norm": 4.242733001708984, | |
| "learning_rate": 8.04289544235925e-06, | |
| "loss": 6.4657, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.0871569243662267, | |
| "grad_norm": 4.478307247161865, | |
| "learning_rate": 8.71313672922252e-06, | |
| "loss": 6.4337, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.09386130316362874, | |
| "grad_norm": 4.557178497314453, | |
| "learning_rate": 9.383378016085791e-06, | |
| "loss": 6.3413, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.1005656819610308, | |
| "grad_norm": 4.335766792297363, | |
| "learning_rate": 1.0053619302949062e-05, | |
| "loss": 6.4344, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.10727006075843284, | |
| "grad_norm": 4.590956687927246, | |
| "learning_rate": 1.0723860589812333e-05, | |
| "loss": 6.3337, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.1139744395558349, | |
| "grad_norm": 4.8770060539245605, | |
| "learning_rate": 1.1394101876675605e-05, | |
| "loss": 6.2143, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.12067881835323696, | |
| "grad_norm": 4.7008891105651855, | |
| "learning_rate": 1.2064343163538874e-05, | |
| "loss": 6.2898, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.127383197150639, | |
| "grad_norm": 4.890043258666992, | |
| "learning_rate": 1.2734584450402146e-05, | |
| "loss": 6.1775, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.13408757594804108, | |
| "grad_norm": 5.183588981628418, | |
| "learning_rate": 1.3404825737265417e-05, | |
| "loss": 6.1675, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.14079195474544312, | |
| "grad_norm": 5.2846527099609375, | |
| "learning_rate": 1.4075067024128689e-05, | |
| "loss": 6.2091, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.14749633354284516, | |
| "grad_norm": 5.238739490509033, | |
| "learning_rate": 1.4745308310991956e-05, | |
| "loss": 6.1628, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.15420071234024724, | |
| "grad_norm": 5.562626838684082, | |
| "learning_rate": 1.5415549597855227e-05, | |
| "loss": 6.0092, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.16090509113764928, | |
| "grad_norm": 6.380126476287842, | |
| "learning_rate": 1.60857908847185e-05, | |
| "loss": 6.0497, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.16760946993505133, | |
| "grad_norm": 5.533380031585693, | |
| "learning_rate": 1.675603217158177e-05, | |
| "loss": 5.9952, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.1743138487324534, | |
| "grad_norm": 6.026157379150391, | |
| "learning_rate": 1.742627345844504e-05, | |
| "loss": 6.0327, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.18101822752985544, | |
| "grad_norm": 5.79816198348999, | |
| "learning_rate": 1.8096514745308312e-05, | |
| "loss": 5.8172, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.18772260632725749, | |
| "grad_norm": 6.133901119232178, | |
| "learning_rate": 1.8766756032171583e-05, | |
| "loss": 5.7605, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.19442698512465956, | |
| "grad_norm": 7.115331649780273, | |
| "learning_rate": 1.9436997319034853e-05, | |
| "loss": 5.9287, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.2011313639220616, | |
| "grad_norm": 6.978466033935547, | |
| "learning_rate": 2.0107238605898124e-05, | |
| "loss": 5.7846, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.20783574271946365, | |
| "grad_norm": 7.083895206451416, | |
| "learning_rate": 2.0777479892761395e-05, | |
| "loss": 5.6818, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.2145401215168657, | |
| "grad_norm": 7.947028636932373, | |
| "learning_rate": 2.1447721179624665e-05, | |
| "loss": 5.7934, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.22124450031426776, | |
| "grad_norm": 6.990066051483154, | |
| "learning_rate": 2.211796246648794e-05, | |
| "loss": 5.6991, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.2279488791116698, | |
| "grad_norm": 6.842931747436523, | |
| "learning_rate": 2.278820375335121e-05, | |
| "loss": 5.75, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.23465325790907185, | |
| "grad_norm": 6.710008144378662, | |
| "learning_rate": 2.3458445040214477e-05, | |
| "loss": 5.5526, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.24135763670647392, | |
| "grad_norm": 6.721392631530762, | |
| "learning_rate": 2.4128686327077747e-05, | |
| "loss": 5.6124, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.24806201550387597, | |
| "grad_norm": 7.801576614379883, | |
| "learning_rate": 2.479892761394102e-05, | |
| "loss": 5.6865, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.254766394301278, | |
| "grad_norm": 7.230539798736572, | |
| "learning_rate": 2.5469168900804292e-05, | |
| "loss": 5.5821, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.26147077309868005, | |
| "grad_norm": 7.283995151519775, | |
| "learning_rate": 2.6139410187667563e-05, | |
| "loss": 5.6686, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.26817515189608215, | |
| "grad_norm": 7.756102085113525, | |
| "learning_rate": 2.6809651474530833e-05, | |
| "loss": 5.4761, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.2748795306934842, | |
| "grad_norm": 7.6954569816589355, | |
| "learning_rate": 2.7479892761394104e-05, | |
| "loss": 5.6806, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.28158390949088624, | |
| "grad_norm": 8.64757251739502, | |
| "learning_rate": 2.8150134048257378e-05, | |
| "loss": 5.525, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.2882882882882883, | |
| "grad_norm": 7.394837379455566, | |
| "learning_rate": 2.8820375335120648e-05, | |
| "loss": 5.4771, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.29499266708569033, | |
| "grad_norm": 8.593236923217773, | |
| "learning_rate": 2.9490616621983912e-05, | |
| "loss": 5.5333, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.3016970458830924, | |
| "grad_norm": 7.424787998199463, | |
| "learning_rate": 3.0160857908847186e-05, | |
| "loss": 5.2982, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.3084014246804945, | |
| "grad_norm": 7.988162994384766, | |
| "learning_rate": 3.083109919571045e-05, | |
| "loss": 5.4675, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.3151058034778965, | |
| "grad_norm": 7.624905586242676, | |
| "learning_rate": 3.1501340482573724e-05, | |
| "loss": 5.4906, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.32181018227529856, | |
| "grad_norm": 8.334522247314453, | |
| "learning_rate": 3.2171581769437e-05, | |
| "loss": 5.4069, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.3285145610727006, | |
| "grad_norm": 8.150382041931152, | |
| "learning_rate": 3.284182305630027e-05, | |
| "loss": 5.4312, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.33521893987010265, | |
| "grad_norm": 8.183965682983398, | |
| "learning_rate": 3.351206434316354e-05, | |
| "loss": 5.4012, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.33521893987010265, | |
| "eval_bleu_greedy": 3.0770967925659036, | |
| "eval_loss": 0.693783700466156, | |
| "eval_runtime": 309.6004, | |
| "eval_samples_per_second": 0.323, | |
| "eval_steps_per_second": 0.323, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.3419233186675047, | |
| "grad_norm": 8.621966361999512, | |
| "learning_rate": 3.418230563002681e-05, | |
| "loss": 5.3954, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.3486276974649068, | |
| "grad_norm": 8.160908699035645, | |
| "learning_rate": 3.485254691689008e-05, | |
| "loss": 5.4285, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.35533207626230884, | |
| "grad_norm": 7.480456352233887, | |
| "learning_rate": 3.5522788203753354e-05, | |
| "loss": 5.3167, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.3620364550597109, | |
| "grad_norm": 7.508689880371094, | |
| "learning_rate": 3.6193029490616625e-05, | |
| "loss": 5.3439, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.3687408338571129, | |
| "grad_norm": 9.118748664855957, | |
| "learning_rate": 3.6863270777479895e-05, | |
| "loss": 5.3604, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.37544521265451497, | |
| "grad_norm": 7.6000471115112305, | |
| "learning_rate": 3.7533512064343166e-05, | |
| "loss": 5.2976, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.382149591451917, | |
| "grad_norm": 7.776809215545654, | |
| "learning_rate": 3.8203753351206436e-05, | |
| "loss": 5.3177, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.3888539702493191, | |
| "grad_norm": 8.50612735748291, | |
| "learning_rate": 3.887399463806971e-05, | |
| "loss": 5.2408, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.39555834904672116, | |
| "grad_norm": 7.958391189575195, | |
| "learning_rate": 3.954423592493298e-05, | |
| "loss": 5.2248, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.4022627278441232, | |
| "grad_norm": 7.7386579513549805, | |
| "learning_rate": 4.021447721179625e-05, | |
| "loss": 5.2663, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.40896710664152525, | |
| "grad_norm": 8.172608375549316, | |
| "learning_rate": 4.088471849865952e-05, | |
| "loss": 5.2612, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.4156714854389273, | |
| "grad_norm": 7.354376792907715, | |
| "learning_rate": 4.155495978552279e-05, | |
| "loss": 5.2672, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.42237586423632933, | |
| "grad_norm": 7.837838649749756, | |
| "learning_rate": 4.222520107238606e-05, | |
| "loss": 5.1707, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.4290802430337314, | |
| "grad_norm": 8.173263549804688, | |
| "learning_rate": 4.289544235924933e-05, | |
| "loss": 5.2387, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.4357846218311335, | |
| "grad_norm": 7.122191905975342, | |
| "learning_rate": 4.35656836461126e-05, | |
| "loss": 5.2604, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.4424890006285355, | |
| "grad_norm": 7.6141839027404785, | |
| "learning_rate": 4.423592493297588e-05, | |
| "loss": 5.1689, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.44919337942593757, | |
| "grad_norm": 7.241093635559082, | |
| "learning_rate": 4.490616621983915e-05, | |
| "loss": 5.1451, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.4558977582233396, | |
| "grad_norm": 6.968513011932373, | |
| "learning_rate": 4.557640750670242e-05, | |
| "loss": 5.0216, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.46260213702074165, | |
| "grad_norm": 7.734285354614258, | |
| "learning_rate": 4.624664879356568e-05, | |
| "loss": 5.0932, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.4693065158181437, | |
| "grad_norm": 8.277946472167969, | |
| "learning_rate": 4.6916890080428954e-05, | |
| "loss": 5.1977, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.4760108946155458, | |
| "grad_norm": 7.437044620513916, | |
| "learning_rate": 4.7587131367292224e-05, | |
| "loss": 5.1254, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.48271527341294784, | |
| "grad_norm": 7.227113723754883, | |
| "learning_rate": 4.8257372654155495e-05, | |
| "loss": 5.0943, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.4894196522103499, | |
| "grad_norm": 7.8769683837890625, | |
| "learning_rate": 4.8927613941018765e-05, | |
| "loss": 5.0513, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.49612403100775193, | |
| "grad_norm": 6.573411464691162, | |
| "learning_rate": 4.959785522788204e-05, | |
| "loss": 5.045, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.502828409805154, | |
| "grad_norm": 6.970616817474365, | |
| "learning_rate": 4.997020040229457e-05, | |
| "loss": 5.0549, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.509532788602556, | |
| "grad_norm": 7.552504539489746, | |
| "learning_rate": 4.989570140803099e-05, | |
| "loss": 5.1787, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.5162371673999581, | |
| "grad_norm": 6.920768737792969, | |
| "learning_rate": 4.982120241376742e-05, | |
| "loss": 5.0916, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.5229415461973601, | |
| "grad_norm": 6.61656379699707, | |
| "learning_rate": 4.974670341950384e-05, | |
| "loss": 5.0528, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.5296459249947622, | |
| "grad_norm": 7.405433654785156, | |
| "learning_rate": 4.967220442524026e-05, | |
| "loss": 5.0544, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.5363503037921643, | |
| "grad_norm": 7.384308815002441, | |
| "learning_rate": 4.959770543097668e-05, | |
| "loss": 5.0058, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.5430546825895664, | |
| "grad_norm": 6.147129058837891, | |
| "learning_rate": 4.95232064367131e-05, | |
| "loss": 5.1453, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.5497590613869684, | |
| "grad_norm": 6.856501579284668, | |
| "learning_rate": 4.944870744244953e-05, | |
| "loss": 4.997, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.5564634401843704, | |
| "grad_norm": 7.677363395690918, | |
| "learning_rate": 4.937420844818596e-05, | |
| "loss": 5.0445, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.5631678189817725, | |
| "grad_norm": 6.1870269775390625, | |
| "learning_rate": 4.929970945392238e-05, | |
| "loss": 4.8786, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.5698721977791745, | |
| "grad_norm": 6.6285529136657715, | |
| "learning_rate": 4.92252104596588e-05, | |
| "loss": 4.9182, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.5765765765765766, | |
| "grad_norm": 6.762671947479248, | |
| "learning_rate": 4.915071146539522e-05, | |
| "loss": 4.836, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.5832809553739786, | |
| "grad_norm": 6.840793132781982, | |
| "learning_rate": 4.907621247113165e-05, | |
| "loss": 4.9259, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.5899853341713807, | |
| "grad_norm": 7.508916854858398, | |
| "learning_rate": 4.900171347686807e-05, | |
| "loss": 4.9955, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.5966897129687827, | |
| "grad_norm": 7.2800374031066895, | |
| "learning_rate": 4.892721448260449e-05, | |
| "loss": 4.9515, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.6033940917661847, | |
| "grad_norm": 6.398910999298096, | |
| "learning_rate": 4.885271548834091e-05, | |
| "loss": 4.9002, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.6100984705635868, | |
| "grad_norm": 7.054329872131348, | |
| "learning_rate": 4.877821649407733e-05, | |
| "loss": 4.8998, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.616802849360989, | |
| "grad_norm": 6.903358459472656, | |
| "learning_rate": 4.870371749981376e-05, | |
| "loss": 4.8227, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.623507228158391, | |
| "grad_norm": 6.382834434509277, | |
| "learning_rate": 4.862921850555018e-05, | |
| "loss": 4.8448, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.630211606955793, | |
| "grad_norm": 6.244606018066406, | |
| "learning_rate": 4.85547195112866e-05, | |
| "loss": 4.9048, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.6369159857531951, | |
| "grad_norm": 6.7048115730285645, | |
| "learning_rate": 4.848022051702302e-05, | |
| "loss": 4.8862, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.6436203645505971, | |
| "grad_norm": 6.761898994445801, | |
| "learning_rate": 4.840572152275945e-05, | |
| "loss": 4.836, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.6503247433479992, | |
| "grad_norm": 6.694396495819092, | |
| "learning_rate": 4.833122252849587e-05, | |
| "loss": 4.922, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.6570291221454012, | |
| "grad_norm": 7.083889961242676, | |
| "learning_rate": 4.825672353423229e-05, | |
| "loss": 4.801, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.6637335009428033, | |
| "grad_norm": 6.358527183532715, | |
| "learning_rate": 4.818222453996871e-05, | |
| "loss": 4.9146, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.6704378797402053, | |
| "grad_norm": 6.129880428314209, | |
| "learning_rate": 4.810772554570513e-05, | |
| "loss": 4.7806, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6704378797402053, | |
| "eval_bleu_greedy": 1.635969783116869, | |
| "eval_loss": 0.6162992715835571, | |
| "eval_runtime": 98.9607, | |
| "eval_samples_per_second": 1.011, | |
| "eval_steps_per_second": 1.011, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6771422585376073, | |
| "grad_norm": 6.157505989074707, | |
| "learning_rate": 4.803322655144156e-05, | |
| "loss": 4.6963, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.6838466373350094, | |
| "grad_norm": 6.576890468597412, | |
| "learning_rate": 4.795872755717798e-05, | |
| "loss": 4.7945, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.6905510161324114, | |
| "grad_norm": 6.158898830413818, | |
| "learning_rate": 4.78842285629144e-05, | |
| "loss": 4.7629, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.6972553949298136, | |
| "grad_norm": 7.410290241241455, | |
| "learning_rate": 4.780972956865082e-05, | |
| "loss": 4.6563, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.7039597737272156, | |
| "grad_norm": 6.731761455535889, | |
| "learning_rate": 4.773523057438724e-05, | |
| "loss": 4.6947, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.7106641525246177, | |
| "grad_norm": 6.34529447555542, | |
| "learning_rate": 4.766073158012367e-05, | |
| "loss": 4.7754, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.7173685313220197, | |
| "grad_norm": 6.288251876831055, | |
| "learning_rate": 4.758623258586009e-05, | |
| "loss": 4.7656, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.7240729101194218, | |
| "grad_norm": 6.207250595092773, | |
| "learning_rate": 4.751173359159651e-05, | |
| "loss": 4.6873, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.7307772889168238, | |
| "grad_norm": 6.283239841461182, | |
| "learning_rate": 4.7437234597332934e-05, | |
| "loss": 4.6951, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.7374816677142259, | |
| "grad_norm": 6.399360179901123, | |
| "learning_rate": 4.736273560306936e-05, | |
| "loss": 4.8608, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.7441860465116279, | |
| "grad_norm": 7.108467102050781, | |
| "learning_rate": 4.728823660880579e-05, | |
| "loss": 4.7696, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.7508904253090299, | |
| "grad_norm": 6.39521598815918, | |
| "learning_rate": 4.721373761454221e-05, | |
| "loss": 4.6725, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.757594804106432, | |
| "grad_norm": 7.189324855804443, | |
| "learning_rate": 4.713923862027863e-05, | |
| "loss": 4.7705, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.764299182903834, | |
| "grad_norm": 6.980968952178955, | |
| "learning_rate": 4.706473962601505e-05, | |
| "loss": 4.6747, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.7710035617012361, | |
| "grad_norm": 5.775346755981445, | |
| "learning_rate": 4.699024063175147e-05, | |
| "loss": 4.5941, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.7777079404986382, | |
| "grad_norm": 6.348450183868408, | |
| "learning_rate": 4.69157416374879e-05, | |
| "loss": 4.6419, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.7844123192960403, | |
| "grad_norm": 6.2754340171813965, | |
| "learning_rate": 4.684124264322432e-05, | |
| "loss": 4.6844, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.7911166980934423, | |
| "grad_norm": 6.037561893463135, | |
| "learning_rate": 4.676674364896074e-05, | |
| "loss": 4.6424, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.7978210768908444, | |
| "grad_norm": 6.3136372566223145, | |
| "learning_rate": 4.669224465469716e-05, | |
| "loss": 4.6775, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.8045254556882464, | |
| "grad_norm": 6.91141939163208, | |
| "learning_rate": 4.661774566043359e-05, | |
| "loss": 4.6124, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.8112298344856484, | |
| "grad_norm": 5.787718772888184, | |
| "learning_rate": 4.654324666617001e-05, | |
| "loss": 4.6595, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.8179342132830505, | |
| "grad_norm": 6.696752548217773, | |
| "learning_rate": 4.646874767190643e-05, | |
| "loss": 4.563, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.8246385920804525, | |
| "grad_norm": 6.550769805908203, | |
| "learning_rate": 4.639424867764285e-05, | |
| "loss": 4.7101, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.8313429708778546, | |
| "grad_norm": 6.5647969245910645, | |
| "learning_rate": 4.6319749683379274e-05, | |
| "loss": 4.5763, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.8380473496752566, | |
| "grad_norm": 6.71086311340332, | |
| "learning_rate": 4.62452506891157e-05, | |
| "loss": 4.4918, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8447517284726587, | |
| "grad_norm": 6.1139445304870605, | |
| "learning_rate": 4.617075169485212e-05, | |
| "loss": 4.5095, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.8514561072700607, | |
| "grad_norm": 5.594122886657715, | |
| "learning_rate": 4.609625270058854e-05, | |
| "loss": 4.6359, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.8581604860674628, | |
| "grad_norm": 6.769913196563721, | |
| "learning_rate": 4.6021753706324964e-05, | |
| "loss": 4.4938, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.8648648648648649, | |
| "grad_norm": 6.663547039031982, | |
| "learning_rate": 4.5947254712061385e-05, | |
| "loss": 4.5619, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.871569243662267, | |
| "grad_norm": 6.266171455383301, | |
| "learning_rate": 4.587275571779781e-05, | |
| "loss": 4.4406, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.878273622459669, | |
| "grad_norm": 6.513619422912598, | |
| "learning_rate": 4.5798256723534234e-05, | |
| "loss": 4.498, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.884978001257071, | |
| "grad_norm": 6.088827133178711, | |
| "learning_rate": 4.5723757729270654e-05, | |
| "loss": 4.5242, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.8916823800544731, | |
| "grad_norm": 6.1904296875, | |
| "learning_rate": 4.5649258735007075e-05, | |
| "loss": 4.5976, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.8983867588518751, | |
| "grad_norm": 6.429610252380371, | |
| "learning_rate": 4.55747597407435e-05, | |
| "loss": 4.4584, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.9050911376492772, | |
| "grad_norm": 5.907393932342529, | |
| "learning_rate": 4.5500260746479924e-05, | |
| "loss": 4.5814, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.9117955164466792, | |
| "grad_norm": 6.102148056030273, | |
| "learning_rate": 4.5425761752216345e-05, | |
| "loss": 4.5913, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.9184998952440813, | |
| "grad_norm": 6.3552327156066895, | |
| "learning_rate": 4.5351262757952766e-05, | |
| "loss": 4.5327, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.9252042740414833, | |
| "grad_norm": 5.990479946136475, | |
| "learning_rate": 4.527676376368919e-05, | |
| "loss": 4.4025, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.9319086528388854, | |
| "grad_norm": 5.946578502655029, | |
| "learning_rate": 4.5202264769425614e-05, | |
| "loss": 4.5777, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.9386130316362874, | |
| "grad_norm": 6.422057151794434, | |
| "learning_rate": 4.512776577516204e-05, | |
| "loss": 4.4951, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.9453174104336896, | |
| "grad_norm": 6.144739151000977, | |
| "learning_rate": 4.505326678089846e-05, | |
| "loss": 4.5586, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.9520217892310916, | |
| "grad_norm": 6.011499404907227, | |
| "learning_rate": 4.4978767786634884e-05, | |
| "loss": 4.4695, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.9587261680284936, | |
| "grad_norm": 5.852478504180908, | |
| "learning_rate": 4.4904268792371304e-05, | |
| "loss": 4.4765, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.9654305468258957, | |
| "grad_norm": 5.951258182525635, | |
| "learning_rate": 4.482976979810773e-05, | |
| "loss": 4.4017, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.9721349256232977, | |
| "grad_norm": 6.046126842498779, | |
| "learning_rate": 4.475527080384415e-05, | |
| "loss": 4.4065, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.9788393044206998, | |
| "grad_norm": 7.13279390335083, | |
| "learning_rate": 4.4680771809580574e-05, | |
| "loss": 4.4527, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.9855436832181018, | |
| "grad_norm": 6.364200115203857, | |
| "learning_rate": 4.4606272815316995e-05, | |
| "loss": 4.4392, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.9922480620155039, | |
| "grad_norm": 6.487414360046387, | |
| "learning_rate": 4.4531773821053416e-05, | |
| "loss": 4.3799, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.9989524408129059, | |
| "grad_norm": 6.20075798034668, | |
| "learning_rate": 4.445727482678984e-05, | |
| "loss": 4.3814, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 1.0056987219777918, | |
| "grad_norm": 6.238715171813965, | |
| "learning_rate": 4.4382775832526264e-05, | |
| "loss": 4.3533, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.0056987219777918, | |
| "eval_bleu_greedy": 2.12858213528201, | |
| "eval_loss": 0.568386971950531, | |
| "eval_runtime": 434.1939, | |
| "eval_samples_per_second": 0.23, | |
| "eval_steps_per_second": 0.23, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.012403100775194, | |
| "grad_norm": 6.458903789520264, | |
| "learning_rate": 4.4308276838262685e-05, | |
| "loss": 4.218, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 1.0191074795725958, | |
| "grad_norm": 6.330122947692871, | |
| "learning_rate": 4.4233777843999106e-05, | |
| "loss": 4.3083, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 1.025811858369998, | |
| "grad_norm": 6.430805683135986, | |
| "learning_rate": 4.4159278849735534e-05, | |
| "loss": 4.2772, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 1.0325162371674, | |
| "grad_norm": 6.592049598693848, | |
| "learning_rate": 4.4084779855471954e-05, | |
| "loss": 4.3208, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 1.039220615964802, | |
| "grad_norm": 6.5312299728393555, | |
| "learning_rate": 4.4010280861208375e-05, | |
| "loss": 4.1853, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.045924994762204, | |
| "grad_norm": 5.972381591796875, | |
| "learning_rate": 4.3935781866944796e-05, | |
| "loss": 4.2346, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 1.0526293735596062, | |
| "grad_norm": 5.913834095001221, | |
| "learning_rate": 4.386128287268122e-05, | |
| "loss": 4.2687, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 1.059333752357008, | |
| "grad_norm": 6.429443836212158, | |
| "learning_rate": 4.3786783878417645e-05, | |
| "loss": 4.33, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 1.0660381311544103, | |
| "grad_norm": 6.044195175170898, | |
| "learning_rate": 4.3712284884154066e-05, | |
| "loss": 4.2251, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 1.0727425099518122, | |
| "grad_norm": 6.010583877563477, | |
| "learning_rate": 4.3637785889890487e-05, | |
| "loss": 4.3161, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.0794468887492144, | |
| "grad_norm": 5.625052452087402, | |
| "learning_rate": 4.356328689562691e-05, | |
| "loss": 4.2069, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 1.0861512675466163, | |
| "grad_norm": 6.308145999908447, | |
| "learning_rate": 4.348878790136333e-05, | |
| "loss": 4.2812, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 1.0928556463440184, | |
| "grad_norm": 6.858571529388428, | |
| "learning_rate": 4.3414288907099756e-05, | |
| "loss": 4.2798, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 1.0995600251414206, | |
| "grad_norm": 6.711289882659912, | |
| "learning_rate": 4.333978991283618e-05, | |
| "loss": 4.2203, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 1.1062644039388225, | |
| "grad_norm": 6.270653247833252, | |
| "learning_rate": 4.3265290918572605e-05, | |
| "loss": 4.259, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.1129687827362247, | |
| "grad_norm": 5.929893493652344, | |
| "learning_rate": 4.3190791924309025e-05, | |
| "loss": 4.3751, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 1.1196731615336266, | |
| "grad_norm": 5.837188720703125, | |
| "learning_rate": 4.3116292930045446e-05, | |
| "loss": 4.2712, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 1.1263775403310288, | |
| "grad_norm": 6.670574188232422, | |
| "learning_rate": 4.3041793935781874e-05, | |
| "loss": 4.2151, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 1.1330819191284307, | |
| "grad_norm": 6.452718734741211, | |
| "learning_rate": 4.2967294941518295e-05, | |
| "loss": 4.321, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 1.1397862979258329, | |
| "grad_norm": 6.4428391456604, | |
| "learning_rate": 4.2892795947254716e-05, | |
| "loss": 4.246, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.1464906767232348, | |
| "grad_norm": 6.246615409851074, | |
| "learning_rate": 4.2818296952991137e-05, | |
| "loss": 4.2763, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 1.153195055520637, | |
| "grad_norm": 6.092718124389648, | |
| "learning_rate": 4.274379795872756e-05, | |
| "loss": 4.1313, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 1.1598994343180389, | |
| "grad_norm": 5.333466529846191, | |
| "learning_rate": 4.2669298964463985e-05, | |
| "loss": 4.2007, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 1.166603813115441, | |
| "grad_norm": 7.08294677734375, | |
| "learning_rate": 4.2594799970200406e-05, | |
| "loss": 4.1708, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 1.173308191912843, | |
| "grad_norm": 6.408305644989014, | |
| "learning_rate": 4.252030097593683e-05, | |
| "loss": 4.2438, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.1800125707102451, | |
| "grad_norm": 5.942695140838623, | |
| "learning_rate": 4.244580198167325e-05, | |
| "loss": 4.1205, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 1.1867169495076473, | |
| "grad_norm": 6.69981050491333, | |
| "learning_rate": 4.2371302987409675e-05, | |
| "loss": 4.1979, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 1.1934213283050492, | |
| "grad_norm": 7.073486804962158, | |
| "learning_rate": 4.2296803993146096e-05, | |
| "loss": 4.286, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 1.2001257071024514, | |
| "grad_norm": 6.129693031311035, | |
| "learning_rate": 4.222230499888252e-05, | |
| "loss": 4.1252, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 1.2068300858998533, | |
| "grad_norm": 6.761497497558594, | |
| "learning_rate": 4.214780600461894e-05, | |
| "loss": 4.16, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.2135344646972555, | |
| "grad_norm": 6.681031703948975, | |
| "learning_rate": 4.207330701035536e-05, | |
| "loss": 4.239, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 1.2202388434946574, | |
| "grad_norm": 6.160006523132324, | |
| "learning_rate": 4.1998808016091787e-05, | |
| "loss": 4.1435, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 1.2269432222920595, | |
| "grad_norm": 6.183200359344482, | |
| "learning_rate": 4.192430902182821e-05, | |
| "loss": 4.1166, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 1.2336476010894615, | |
| "grad_norm": 5.975028991699219, | |
| "learning_rate": 4.184981002756463e-05, | |
| "loss": 4.0858, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 1.2403519798868636, | |
| "grad_norm": 6.827803134918213, | |
| "learning_rate": 4.177531103330105e-05, | |
| "loss": 4.1952, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.2470563586842656, | |
| "grad_norm": 6.478833198547363, | |
| "learning_rate": 4.170081203903747e-05, | |
| "loss": 4.059, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 1.2537607374816677, | |
| "grad_norm": 6.197700500488281, | |
| "learning_rate": 4.16263130447739e-05, | |
| "loss": 4.2003, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 1.2604651162790699, | |
| "grad_norm": 5.54361629486084, | |
| "learning_rate": 4.155181405051032e-05, | |
| "loss": 4.1438, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 1.2671694950764718, | |
| "grad_norm": 5.62382698059082, | |
| "learning_rate": 4.147731505624674e-05, | |
| "loss": 4.154, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 1.2738738738738737, | |
| "grad_norm": 5.418813228607178, | |
| "learning_rate": 4.140281606198316e-05, | |
| "loss": 4.1561, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.280578252671276, | |
| "grad_norm": 5.975061893463135, | |
| "learning_rate": 4.132831706771959e-05, | |
| "loss": 4.178, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 1.287282631468678, | |
| "grad_norm": 6.231929302215576, | |
| "learning_rate": 4.125381807345601e-05, | |
| "loss": 4.1253, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 1.29398701026608, | |
| "grad_norm": 6.092617034912109, | |
| "learning_rate": 4.1179319079192437e-05, | |
| "loss": 4.1438, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 1.3006913890634821, | |
| "grad_norm": 6.094106674194336, | |
| "learning_rate": 4.110482008492886e-05, | |
| "loss": 4.1144, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 1.307395767860884, | |
| "grad_norm": 6.208296298980713, | |
| "learning_rate": 4.103032109066528e-05, | |
| "loss": 4.1352, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.3141001466582862, | |
| "grad_norm": 5.595242500305176, | |
| "learning_rate": 4.09558220964017e-05, | |
| "loss": 4.1146, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 1.3208045254556882, | |
| "grad_norm": 6.050002098083496, | |
| "learning_rate": 4.088132310213813e-05, | |
| "loss": 4.0787, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 1.3275089042530903, | |
| "grad_norm": 5.937078952789307, | |
| "learning_rate": 4.080682410787455e-05, | |
| "loss": 4.2249, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 1.3342132830504925, | |
| "grad_norm": 6.359293460845947, | |
| "learning_rate": 4.073232511361097e-05, | |
| "loss": 4.0592, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 1.3409176618478944, | |
| "grad_norm": 5.486398696899414, | |
| "learning_rate": 4.065782611934739e-05, | |
| "loss": 4.183, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.3409176618478944, | |
| "eval_bleu_greedy": 1.8924225876289293, | |
| "eval_loss": 0.5379119515419006, | |
| "eval_runtime": 41.2638, | |
| "eval_samples_per_second": 2.423, | |
| "eval_steps_per_second": 2.423, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.3476220406452963, | |
| "grad_norm": 6.733095169067383, | |
| "learning_rate": 4.058332712508382e-05, | |
| "loss": 4.1826, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 1.3543264194426985, | |
| "grad_norm": 6.984947681427002, | |
| "learning_rate": 4.050882813082024e-05, | |
| "loss": 4.0244, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 1.3610307982401006, | |
| "grad_norm": 6.321763515472412, | |
| "learning_rate": 4.043432913655666e-05, | |
| "loss": 4.1419, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 1.3677351770375026, | |
| "grad_norm": 6.014941692352295, | |
| "learning_rate": 4.035983014229308e-05, | |
| "loss": 4.0579, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 1.3744395558349047, | |
| "grad_norm": 6.782519340515137, | |
| "learning_rate": 4.02853311480295e-05, | |
| "loss": 4.037, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.3811439346323067, | |
| "grad_norm": 5.457937717437744, | |
| "learning_rate": 4.021083215376593e-05, | |
| "loss": 4.0483, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 1.3878483134297088, | |
| "grad_norm": 6.121335983276367, | |
| "learning_rate": 4.013633315950235e-05, | |
| "loss": 4.0982, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 1.3945526922271108, | |
| "grad_norm": 6.334305763244629, | |
| "learning_rate": 4.006183416523877e-05, | |
| "loss": 4.106, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 1.401257071024513, | |
| "grad_norm": 6.201812744140625, | |
| "learning_rate": 3.998733517097519e-05, | |
| "loss": 4.0633, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 1.407961449821915, | |
| "grad_norm": 6.2243828773498535, | |
| "learning_rate": 3.991283617671162e-05, | |
| "loss": 4.0215, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.414665828619317, | |
| "grad_norm": 6.266222953796387, | |
| "learning_rate": 3.983833718244804e-05, | |
| "loss": 4.1322, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 1.421370207416719, | |
| "grad_norm": 5.890945911407471, | |
| "learning_rate": 3.976383818818446e-05, | |
| "loss": 3.8993, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 1.428074586214121, | |
| "grad_norm": 5.7960991859436035, | |
| "learning_rate": 3.968933919392088e-05, | |
| "loss": 4.1042, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 1.4347789650115232, | |
| "grad_norm": 6.632575988769531, | |
| "learning_rate": 3.96148401996573e-05, | |
| "loss": 4.1236, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 1.4414833438089252, | |
| "grad_norm": 6.313004493713379, | |
| "learning_rate": 3.954034120539373e-05, | |
| "loss": 4.0445, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.4481877226063273, | |
| "grad_norm": 6.819790840148926, | |
| "learning_rate": 3.946584221113015e-05, | |
| "loss": 4.0814, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 1.4548921014037293, | |
| "grad_norm": 5.719134330749512, | |
| "learning_rate": 3.939134321686657e-05, | |
| "loss": 3.9592, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 1.4615964802011314, | |
| "grad_norm": 6.105227470397949, | |
| "learning_rate": 3.931684422260299e-05, | |
| "loss": 4.0809, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 1.4683008589985334, | |
| "grad_norm": 6.068193435668945, | |
| "learning_rate": 3.924234522833941e-05, | |
| "loss": 4.1412, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 1.4750052377959355, | |
| "grad_norm": 5.725421905517578, | |
| "learning_rate": 3.916784623407585e-05, | |
| "loss": 4.0711, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.4817096165933374, | |
| "grad_norm": 6.790429592132568, | |
| "learning_rate": 3.909334723981227e-05, | |
| "loss": 3.9287, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 1.4884139953907396, | |
| "grad_norm": 6.1765971183776855, | |
| "learning_rate": 3.901884824554869e-05, | |
| "loss": 4.0219, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 1.4951183741881415, | |
| "grad_norm": 5.821228981018066, | |
| "learning_rate": 3.894434925128511e-05, | |
| "loss": 3.9816, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 1.5018227529855437, | |
| "grad_norm": 6.144356727600098, | |
| "learning_rate": 3.886985025702153e-05, | |
| "loss": 3.9802, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 1.5085271317829458, | |
| "grad_norm": 5.687018871307373, | |
| "learning_rate": 3.879535126275796e-05, | |
| "loss": 3.9389, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.5152315105803478, | |
| "grad_norm": 5.2722673416137695, | |
| "learning_rate": 3.872085226849438e-05, | |
| "loss": 4.115, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 1.5219358893777497, | |
| "grad_norm": 5.904730796813965, | |
| "learning_rate": 3.86463532742308e-05, | |
| "loss": 4.0509, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 1.5286402681751519, | |
| "grad_norm": 6.75799560546875, | |
| "learning_rate": 3.857185427996722e-05, | |
| "loss": 4.074, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 1.535344646972554, | |
| "grad_norm": 6.9323039054870605, | |
| "learning_rate": 3.849735528570364e-05, | |
| "loss": 3.9566, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 1.542049025769956, | |
| "grad_norm": 6.0153398513793945, | |
| "learning_rate": 3.842285629144007e-05, | |
| "loss": 4.0422, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.548753404567358, | |
| "grad_norm": 6.283784866333008, | |
| "learning_rate": 3.834835729717649e-05, | |
| "loss": 3.9645, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 1.5554577833647603, | |
| "grad_norm": 6.145251274108887, | |
| "learning_rate": 3.827385830291291e-05, | |
| "loss": 4.0442, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 1.5621621621621622, | |
| "grad_norm": 5.55891227722168, | |
| "learning_rate": 3.819935930864933e-05, | |
| "loss": 4.0023, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 1.5688665409595641, | |
| "grad_norm": 6.398717880249023, | |
| "learning_rate": 3.812486031438576e-05, | |
| "loss": 3.9904, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 1.5755709197569663, | |
| "grad_norm": 7.079937934875488, | |
| "learning_rate": 3.805036132012218e-05, | |
| "loss": 3.851, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.5822752985543684, | |
| "grad_norm": 5.542669773101807, | |
| "learning_rate": 3.79758623258586e-05, | |
| "loss": 3.9195, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 1.5889796773517704, | |
| "grad_norm": 6.840109348297119, | |
| "learning_rate": 3.790136333159502e-05, | |
| "loss": 4.0468, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 1.5956840561491723, | |
| "grad_norm": 5.582828044891357, | |
| "learning_rate": 3.7826864337331444e-05, | |
| "loss": 3.897, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 1.6023884349465745, | |
| "grad_norm": 5.687999725341797, | |
| "learning_rate": 3.775236534306787e-05, | |
| "loss": 4.0053, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 1.6090928137439766, | |
| "grad_norm": 5.822837829589844, | |
| "learning_rate": 3.767786634880429e-05, | |
| "loss": 3.9547, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.6157971925413785, | |
| "grad_norm": 6.070379257202148, | |
| "learning_rate": 3.7603367354540713e-05, | |
| "loss": 4.0644, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 1.6225015713387805, | |
| "grad_norm": 5.518836975097656, | |
| "learning_rate": 3.7528868360277134e-05, | |
| "loss": 3.8831, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 1.6292059501361829, | |
| "grad_norm": 5.762600898742676, | |
| "learning_rate": 3.745436936601356e-05, | |
| "loss": 3.9588, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 1.6359103289335848, | |
| "grad_norm": 6.048323154449463, | |
| "learning_rate": 3.737987037174998e-05, | |
| "loss": 3.9825, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 1.6426147077309867, | |
| "grad_norm": 5.4770097732543945, | |
| "learning_rate": 3.7305371377486404e-05, | |
| "loss": 3.8577, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.6493190865283889, | |
| "grad_norm": 6.0199294090271, | |
| "learning_rate": 3.7230872383222825e-05, | |
| "loss": 3.8402, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 1.656023465325791, | |
| "grad_norm": 5.508368492126465, | |
| "learning_rate": 3.715637338895925e-05, | |
| "loss": 3.8506, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 1.662727844123193, | |
| "grad_norm": 5.987868309020996, | |
| "learning_rate": 3.708187439469567e-05, | |
| "loss": 3.928, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 1.669432222920595, | |
| "grad_norm": 6.022989749908447, | |
| "learning_rate": 3.70073754004321e-05, | |
| "loss": 3.9483, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 1.676136601717997, | |
| "grad_norm": 5.780736446380615, | |
| "learning_rate": 3.693287640616852e-05, | |
| "loss": 3.9055, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.676136601717997, | |
| "eval_bleu_greedy": 2.074504503599086, | |
| "eval_loss": 0.5163004398345947, | |
| "eval_runtime": 114.7725, | |
| "eval_samples_per_second": 0.871, | |
| "eval_steps_per_second": 0.871, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.6828409805153992, | |
| "grad_norm": 7.780221462249756, | |
| "learning_rate": 3.685837741190494e-05, | |
| "loss": 4.0413, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 1.6895453593128011, | |
| "grad_norm": 5.654071807861328, | |
| "learning_rate": 3.6783878417641363e-05, | |
| "loss": 3.9454, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 1.696249738110203, | |
| "grad_norm": 5.763638019561768, | |
| "learning_rate": 3.6709379423377784e-05, | |
| "loss": 3.9884, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 1.7029541169076052, | |
| "grad_norm": 5.78656005859375, | |
| "learning_rate": 3.663488042911421e-05, | |
| "loss": 3.9436, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 1.7096584957050074, | |
| "grad_norm": 6.413984775543213, | |
| "learning_rate": 3.656038143485063e-05, | |
| "loss": 3.9803, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.7163628745024093, | |
| "grad_norm": 5.727552890777588, | |
| "learning_rate": 3.6485882440587054e-05, | |
| "loss": 4.0424, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 1.7230672532998115, | |
| "grad_norm": 5.366096496582031, | |
| "learning_rate": 3.6411383446323475e-05, | |
| "loss": 3.9212, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 1.7297716320972136, | |
| "grad_norm": 5.877246856689453, | |
| "learning_rate": 3.63368844520599e-05, | |
| "loss": 3.9313, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 1.7364760108946156, | |
| "grad_norm": 4.906258583068848, | |
| "learning_rate": 3.626238545779632e-05, | |
| "loss": 3.9713, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 1.7431803896920175, | |
| "grad_norm": 5.745492458343506, | |
| "learning_rate": 3.6187886463532744e-05, | |
| "loss": 3.9141, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.7498847684894197, | |
| "grad_norm": 5.654531002044678, | |
| "learning_rate": 3.6113387469269165e-05, | |
| "loss": 3.9644, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 1.7565891472868218, | |
| "grad_norm": 5.877029895782471, | |
| "learning_rate": 3.6038888475005586e-05, | |
| "loss": 3.9801, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 1.7632935260842237, | |
| "grad_norm": 6.766676425933838, | |
| "learning_rate": 3.5964389480742013e-05, | |
| "loss": 3.9611, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 1.7699979048816257, | |
| "grad_norm": 6.007946968078613, | |
| "learning_rate": 3.5889890486478434e-05, | |
| "loss": 3.9596, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 1.7767022836790278, | |
| "grad_norm": 5.436508655548096, | |
| "learning_rate": 3.5815391492214855e-05, | |
| "loss": 3.9234, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.78340666247643, | |
| "grad_norm": 6.275641441345215, | |
| "learning_rate": 3.5740892497951276e-05, | |
| "loss": 3.9003, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 1.790111041273832, | |
| "grad_norm": 5.461209774017334, | |
| "learning_rate": 3.5666393503687704e-05, | |
| "loss": 3.9583, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 1.796815420071234, | |
| "grad_norm": 5.981624603271484, | |
| "learning_rate": 3.5591894509424125e-05, | |
| "loss": 3.8287, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 1.8035197988686362, | |
| "grad_norm": 5.947275638580322, | |
| "learning_rate": 3.5517395515160546e-05, | |
| "loss": 3.9774, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 1.8102241776660382, | |
| "grad_norm": 6.036500930786133, | |
| "learning_rate": 3.5442896520896966e-05, | |
| "loss": 3.8426, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.81692855646344, | |
| "grad_norm": 5.644313335418701, | |
| "learning_rate": 3.536839752663339e-05, | |
| "loss": 3.8918, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 1.8236329352608422, | |
| "grad_norm": 6.368311882019043, | |
| "learning_rate": 3.5293898532369815e-05, | |
| "loss": 3.9701, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 1.8303373140582444, | |
| "grad_norm": 5.9047932624816895, | |
| "learning_rate": 3.5219399538106236e-05, | |
| "loss": 3.878, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 1.8370416928556463, | |
| "grad_norm": 6.0813679695129395, | |
| "learning_rate": 3.514490054384266e-05, | |
| "loss": 3.9292, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 1.8437460716530483, | |
| "grad_norm": 6.120319843292236, | |
| "learning_rate": 3.5070401549579084e-05, | |
| "loss": 3.8474, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.8504504504504504, | |
| "grad_norm": 6.296043395996094, | |
| "learning_rate": 3.4995902555315505e-05, | |
| "loss": 3.7354, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 1.8571548292478526, | |
| "grad_norm": 5.409097671508789, | |
| "learning_rate": 3.492140356105193e-05, | |
| "loss": 3.8795, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 1.8638592080452545, | |
| "grad_norm": 6.105241775512695, | |
| "learning_rate": 3.4846904566788354e-05, | |
| "loss": 3.9765, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 1.8705635868426564, | |
| "grad_norm": 5.878379821777344, | |
| "learning_rate": 3.4772405572524775e-05, | |
| "loss": 3.8718, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 1.8772679656400588, | |
| "grad_norm": 5.730438709259033, | |
| "learning_rate": 3.4697906578261196e-05, | |
| "loss": 4.0157, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.8839723444374608, | |
| "grad_norm": 5.375248432159424, | |
| "learning_rate": 3.4623407583997616e-05, | |
| "loss": 3.9336, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 1.8906767232348627, | |
| "grad_norm": 6.077249526977539, | |
| "learning_rate": 3.4548908589734044e-05, | |
| "loss": 3.8112, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 1.8973811020322648, | |
| "grad_norm": 5.51649808883667, | |
| "learning_rate": 3.4474409595470465e-05, | |
| "loss": 3.9677, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 1.904085480829667, | |
| "grad_norm": 5.96297025680542, | |
| "learning_rate": 3.4399910601206886e-05, | |
| "loss": 3.8401, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 1.910789859627069, | |
| "grad_norm": 5.845096588134766, | |
| "learning_rate": 3.432541160694331e-05, | |
| "loss": 3.9445, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.9174942384244709, | |
| "grad_norm": 6.597667694091797, | |
| "learning_rate": 3.425091261267973e-05, | |
| "loss": 3.8767, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 1.924198617221873, | |
| "grad_norm": 5.085957050323486, | |
| "learning_rate": 3.4176413618416155e-05, | |
| "loss": 3.816, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 1.9309029960192752, | |
| "grad_norm": 5.354710578918457, | |
| "learning_rate": 3.4101914624152576e-05, | |
| "loss": 3.7329, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 1.937607374816677, | |
| "grad_norm": 6.152263641357422, | |
| "learning_rate": 3.4027415629889e-05, | |
| "loss": 3.9058, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 1.944311753614079, | |
| "grad_norm": 5.678866863250732, | |
| "learning_rate": 3.395291663562542e-05, | |
| "loss": 3.9123, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.9510161324114812, | |
| "grad_norm": 5.211181640625, | |
| "learning_rate": 3.3878417641361846e-05, | |
| "loss": 3.857, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 1.9577205112088834, | |
| "grad_norm": 5.752172470092773, | |
| "learning_rate": 3.3803918647098266e-05, | |
| "loss": 3.9512, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 1.9644248900062853, | |
| "grad_norm": 5.982390403747559, | |
| "learning_rate": 3.372941965283469e-05, | |
| "loss": 3.8322, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 1.9711292688036874, | |
| "grad_norm": 6.09535551071167, | |
| "learning_rate": 3.365492065857111e-05, | |
| "loss": 3.8806, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 1.9778336476010896, | |
| "grad_norm": 6.2229905128479, | |
| "learning_rate": 3.358042166430753e-05, | |
| "loss": 3.885, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.9845380263984915, | |
| "grad_norm": 5.936634540557861, | |
| "learning_rate": 3.350592267004396e-05, | |
| "loss": 3.8126, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 1.9912424051958935, | |
| "grad_norm": 5.78571081161499, | |
| "learning_rate": 3.343142367578038e-05, | |
| "loss": 4.0106, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 1.9979467839932956, | |
| "grad_norm": 6.465760231018066, | |
| "learning_rate": 3.33569246815168e-05, | |
| "loss": 3.8896, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 2.004357846218311, | |
| "grad_norm": 6.171356201171875, | |
| "learning_rate": 3.328242568725322e-05, | |
| "loss": 3.4701, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 2.0110622250157135, | |
| "grad_norm": 5.609477519989014, | |
| "learning_rate": 3.320792669298965e-05, | |
| "loss": 3.6326, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.0110622250157135, | |
| "eval_bleu_greedy": 2.0660877864138794, | |
| "eval_loss": 0.5018166899681091, | |
| "eval_runtime": 118.8592, | |
| "eval_samples_per_second": 0.841, | |
| "eval_steps_per_second": 0.841, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.0177666038131155, | |
| "grad_norm": 5.845204830169678, | |
| "learning_rate": 3.313342769872607e-05, | |
| "loss": 3.7089, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 2.0244709826105174, | |
| "grad_norm": 6.158261775970459, | |
| "learning_rate": 3.3058928704462496e-05, | |
| "loss": 3.6354, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 2.0311753614079198, | |
| "grad_norm": 5.758568286895752, | |
| "learning_rate": 3.2984429710198916e-05, | |
| "loss": 3.6658, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 2.0378797402053217, | |
| "grad_norm": 6.4133381843566895, | |
| "learning_rate": 3.290993071593534e-05, | |
| "loss": 3.716, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 2.0445841190027236, | |
| "grad_norm": 5.51917028427124, | |
| "learning_rate": 3.283543172167176e-05, | |
| "loss": 3.6775, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 2.0512884978001256, | |
| "grad_norm": 5.898686408996582, | |
| "learning_rate": 3.2760932727408186e-05, | |
| "loss": 3.7088, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 2.057992876597528, | |
| "grad_norm": 6.7731852531433105, | |
| "learning_rate": 3.268643373314461e-05, | |
| "loss": 3.5724, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 2.06469725539493, | |
| "grad_norm": 5.695714950561523, | |
| "learning_rate": 3.261193473888103e-05, | |
| "loss": 3.6688, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 2.071401634192332, | |
| "grad_norm": 6.517350196838379, | |
| "learning_rate": 3.253743574461745e-05, | |
| "loss": 3.7519, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 2.0781060129897337, | |
| "grad_norm": 5.876154899597168, | |
| "learning_rate": 3.2462936750353876e-05, | |
| "loss": 3.6403, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 2.084810391787136, | |
| "grad_norm": 6.117770671844482, | |
| "learning_rate": 3.23884377560903e-05, | |
| "loss": 3.636, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 2.091514770584538, | |
| "grad_norm": 5.719681262969971, | |
| "learning_rate": 3.231393876182672e-05, | |
| "loss": 3.5995, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 2.09821914938194, | |
| "grad_norm": 6.031946182250977, | |
| "learning_rate": 3.223943976756314e-05, | |
| "loss": 3.6692, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 2.104923528179342, | |
| "grad_norm": 6.5091023445129395, | |
| "learning_rate": 3.216494077329956e-05, | |
| "loss": 3.6672, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 2.1116279069767443, | |
| "grad_norm": 6.04213285446167, | |
| "learning_rate": 3.209044177903599e-05, | |
| "loss": 3.5401, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.1183322857741462, | |
| "grad_norm": 6.8394036293029785, | |
| "learning_rate": 3.201594278477241e-05, | |
| "loss": 3.6274, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 2.125036664571548, | |
| "grad_norm": 5.81780481338501, | |
| "learning_rate": 3.194144379050883e-05, | |
| "loss": 3.6487, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 2.1317410433689505, | |
| "grad_norm": 6.043388366699219, | |
| "learning_rate": 3.186694479624525e-05, | |
| "loss": 3.6658, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 2.1384454221663525, | |
| "grad_norm": 6.0370378494262695, | |
| "learning_rate": 3.179244580198167e-05, | |
| "loss": 3.5821, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 2.1451498009637544, | |
| "grad_norm": 5.804340362548828, | |
| "learning_rate": 3.17179468077181e-05, | |
| "loss": 3.7046, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.1518541797611563, | |
| "grad_norm": 5.964964866638184, | |
| "learning_rate": 3.164344781345452e-05, | |
| "loss": 3.6777, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 2.1585585585585587, | |
| "grad_norm": 6.0428853034973145, | |
| "learning_rate": 3.156894881919094e-05, | |
| "loss": 3.7281, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 2.1652629373559606, | |
| "grad_norm": 5.866547584533691, | |
| "learning_rate": 3.149444982492736e-05, | |
| "loss": 3.5978, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 2.1719673161533626, | |
| "grad_norm": 6.283875465393066, | |
| "learning_rate": 3.141995083066379e-05, | |
| "loss": 3.6787, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 2.1786716949507645, | |
| "grad_norm": 5.971242427825928, | |
| "learning_rate": 3.134545183640021e-05, | |
| "loss": 3.5794, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.185376073748167, | |
| "grad_norm": 6.014956474304199, | |
| "learning_rate": 3.127095284213663e-05, | |
| "loss": 3.6924, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 2.192080452545569, | |
| "grad_norm": 5.171935558319092, | |
| "learning_rate": 3.119645384787305e-05, | |
| "loss": 3.6373, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 2.1987848313429708, | |
| "grad_norm": 6.373608112335205, | |
| "learning_rate": 3.112195485360947e-05, | |
| "loss": 3.6745, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 2.205489210140373, | |
| "grad_norm": 5.605614185333252, | |
| "learning_rate": 3.10474558593459e-05, | |
| "loss": 3.6497, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 2.212193588937775, | |
| "grad_norm": 5.9374613761901855, | |
| "learning_rate": 3.097295686508233e-05, | |
| "loss": 3.5979, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 2.218897967735177, | |
| "grad_norm": 5.818667411804199, | |
| "learning_rate": 3.089845787081875e-05, | |
| "loss": 3.5593, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 2.225602346532579, | |
| "grad_norm": 5.608858585357666, | |
| "learning_rate": 3.082395887655517e-05, | |
| "loss": 3.5811, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 2.2323067253299813, | |
| "grad_norm": 6.370382785797119, | |
| "learning_rate": 3.074945988229159e-05, | |
| "loss": 3.6893, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 2.2390111041273832, | |
| "grad_norm": 5.760286331176758, | |
| "learning_rate": 3.067496088802802e-05, | |
| "loss": 3.6869, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 2.245715482924785, | |
| "grad_norm": 5.964378833770752, | |
| "learning_rate": 3.060046189376444e-05, | |
| "loss": 3.5856, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 2.252419861722187, | |
| "grad_norm": 5.7680439949035645, | |
| "learning_rate": 3.052596289950086e-05, | |
| "loss": 3.6309, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 2.2591242405195895, | |
| "grad_norm": 6.063139915466309, | |
| "learning_rate": 3.045146390523728e-05, | |
| "loss": 3.542, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 2.2658286193169914, | |
| "grad_norm": 7.324517250061035, | |
| "learning_rate": 3.0376964910973705e-05, | |
| "loss": 3.5469, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 2.2725329981143934, | |
| "grad_norm": 5.549790859222412, | |
| "learning_rate": 3.0302465916710126e-05, | |
| "loss": 3.6051, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 2.2792373769117953, | |
| "grad_norm": 5.7682929039001465, | |
| "learning_rate": 3.022796692244655e-05, | |
| "loss": 3.5874, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 2.2859417557091977, | |
| "grad_norm": 6.120064735412598, | |
| "learning_rate": 3.015346792818297e-05, | |
| "loss": 3.6209, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 2.2926461345065996, | |
| "grad_norm": 5.814151287078857, | |
| "learning_rate": 3.0078968933919395e-05, | |
| "loss": 3.5889, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 2.2993505133040015, | |
| "grad_norm": 6.4021077156066895, | |
| "learning_rate": 3.0004469939655816e-05, | |
| "loss": 3.6508, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 2.306054892101404, | |
| "grad_norm": 6.688700199127197, | |
| "learning_rate": 2.9929970945392237e-05, | |
| "loss": 3.6598, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 2.312759270898806, | |
| "grad_norm": 5.971013069152832, | |
| "learning_rate": 2.985547195112866e-05, | |
| "loss": 3.6805, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.3194636496962078, | |
| "grad_norm": 5.857511520385742, | |
| "learning_rate": 2.9780972956865082e-05, | |
| "loss": 3.5774, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 2.3261680284936097, | |
| "grad_norm": 5.832746505737305, | |
| "learning_rate": 2.9706473962601506e-05, | |
| "loss": 3.6995, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 2.332872407291012, | |
| "grad_norm": 6.846590042114258, | |
| "learning_rate": 2.9631974968337927e-05, | |
| "loss": 3.6018, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 2.339576786088414, | |
| "grad_norm": 5.618002891540527, | |
| "learning_rate": 2.955747597407435e-05, | |
| "loss": 3.5473, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 2.346281164885816, | |
| "grad_norm": 5.3922319412231445, | |
| "learning_rate": 2.9482976979810772e-05, | |
| "loss": 3.5538, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.346281164885816, | |
| "eval_bleu_greedy": 2.167759735525418, | |
| "eval_loss": 0.4932926893234253, | |
| "eval_runtime": 337.6724, | |
| "eval_samples_per_second": 0.296, | |
| "eval_steps_per_second": 0.296, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.352985543683218, | |
| "grad_norm": 5.728118896484375, | |
| "learning_rate": 2.9408477985547193e-05, | |
| "loss": 3.637, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 2.3596899224806203, | |
| "grad_norm": 5.794277191162109, | |
| "learning_rate": 2.9333978991283618e-05, | |
| "loss": 3.5977, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 2.366394301278022, | |
| "grad_norm": 6.4884419441223145, | |
| "learning_rate": 2.925947999702004e-05, | |
| "loss": 3.5448, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 2.373098680075424, | |
| "grad_norm": 7.127490520477295, | |
| "learning_rate": 2.9184981002756463e-05, | |
| "loss": 3.7117, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 2.3798030588728265, | |
| "grad_norm": 5.834691047668457, | |
| "learning_rate": 2.9110482008492884e-05, | |
| "loss": 3.5537, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.3865074376702284, | |
| "grad_norm": 5.9557671546936035, | |
| "learning_rate": 2.9035983014229308e-05, | |
| "loss": 3.5724, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 2.3932118164676304, | |
| "grad_norm": 6.775606632232666, | |
| "learning_rate": 2.8961484019965735e-05, | |
| "loss": 3.5767, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 2.3999161952650323, | |
| "grad_norm": 5.350723743438721, | |
| "learning_rate": 2.8886985025702156e-05, | |
| "loss": 3.6228, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 2.4066205740624347, | |
| "grad_norm": 6.00510835647583, | |
| "learning_rate": 2.881248603143858e-05, | |
| "loss": 3.6664, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 2.4133249528598366, | |
| "grad_norm": 6.175734519958496, | |
| "learning_rate": 2.8737987037175e-05, | |
| "loss": 3.6428, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.4200293316572385, | |
| "grad_norm": 6.390973091125488, | |
| "learning_rate": 2.8663488042911422e-05, | |
| "loss": 3.5884, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 2.4267337104546405, | |
| "grad_norm": 6.262541770935059, | |
| "learning_rate": 2.8588989048647847e-05, | |
| "loss": 3.5878, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 2.433438089252043, | |
| "grad_norm": 5.385353088378906, | |
| "learning_rate": 2.8514490054384268e-05, | |
| "loss": 3.5796, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 2.440142468049445, | |
| "grad_norm": 5.796669006347656, | |
| "learning_rate": 2.8439991060120692e-05, | |
| "loss": 3.5707, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 2.4468468468468467, | |
| "grad_norm": 6.3658857345581055, | |
| "learning_rate": 2.8365492065857113e-05, | |
| "loss": 3.6012, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.453551225644249, | |
| "grad_norm": 5.848957538604736, | |
| "learning_rate": 2.8290993071593537e-05, | |
| "loss": 3.5894, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 2.460255604441651, | |
| "grad_norm": 6.327582836151123, | |
| "learning_rate": 2.8216494077329958e-05, | |
| "loss": 3.5888, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 2.466959983239053, | |
| "grad_norm": 6.102633476257324, | |
| "learning_rate": 2.814199508306638e-05, | |
| "loss": 3.6819, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 2.473664362036455, | |
| "grad_norm": 5.988522529602051, | |
| "learning_rate": 2.8067496088802803e-05, | |
| "loss": 3.4606, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 2.4803687408338573, | |
| "grad_norm": 5.952702522277832, | |
| "learning_rate": 2.7992997094539224e-05, | |
| "loss": 3.5249, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.487073119631259, | |
| "grad_norm": 6.563230514526367, | |
| "learning_rate": 2.7918498100275648e-05, | |
| "loss": 3.5629, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 2.493777498428661, | |
| "grad_norm": 5.4130988121032715, | |
| "learning_rate": 2.784399910601207e-05, | |
| "loss": 3.5616, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 2.500481877226063, | |
| "grad_norm": 6.370597839355469, | |
| "learning_rate": 2.7769500111748493e-05, | |
| "loss": 3.6258, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 2.5071862560234655, | |
| "grad_norm": 5.860075950622559, | |
| "learning_rate": 2.7695001117484914e-05, | |
| "loss": 3.584, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 2.5138906348208674, | |
| "grad_norm": 5.848262786865234, | |
| "learning_rate": 2.762050212322134e-05, | |
| "loss": 3.5515, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.5205950136182693, | |
| "grad_norm": 6.018378257751465, | |
| "learning_rate": 2.754600312895776e-05, | |
| "loss": 3.6001, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 2.5272993924156717, | |
| "grad_norm": 5.92679500579834, | |
| "learning_rate": 2.747150413469418e-05, | |
| "loss": 3.6455, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 2.5340037712130736, | |
| "grad_norm": 6.104831695556641, | |
| "learning_rate": 2.7397005140430604e-05, | |
| "loss": 3.5881, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 2.5407081500104756, | |
| "grad_norm": 5.604018211364746, | |
| "learning_rate": 2.7322506146167025e-05, | |
| "loss": 3.5521, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 2.5474125288078775, | |
| "grad_norm": 6.820720195770264, | |
| "learning_rate": 2.724800715190345e-05, | |
| "loss": 3.5984, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.5541169076052794, | |
| "grad_norm": 5.802369117736816, | |
| "learning_rate": 2.717350815763987e-05, | |
| "loss": 3.6231, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 2.560821286402682, | |
| "grad_norm": 6.830519676208496, | |
| "learning_rate": 2.7099009163376295e-05, | |
| "loss": 3.513, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 2.5675256652000837, | |
| "grad_norm": 5.891795635223389, | |
| "learning_rate": 2.7024510169112716e-05, | |
| "loss": 3.4983, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 2.5742300439974857, | |
| "grad_norm": 5.775413513183594, | |
| "learning_rate": 2.6950011174849143e-05, | |
| "loss": 3.5026, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 2.580934422794888, | |
| "grad_norm": 6.1186442375183105, | |
| "learning_rate": 2.6875512180585564e-05, | |
| "loss": 3.5892, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.58763880159229, | |
| "grad_norm": 6.562758445739746, | |
| "learning_rate": 2.680101318632199e-05, | |
| "loss": 3.5703, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 2.594343180389692, | |
| "grad_norm": 6.671054840087891, | |
| "learning_rate": 2.672651419205841e-05, | |
| "loss": 3.6545, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 2.6010475591870943, | |
| "grad_norm": 6.263803005218506, | |
| "learning_rate": 2.6652015197794834e-05, | |
| "loss": 3.6269, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 2.6077519379844962, | |
| "grad_norm": 6.635150909423828, | |
| "learning_rate": 2.6577516203531254e-05, | |
| "loss": 3.5478, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 2.614456316781898, | |
| "grad_norm": 6.699692726135254, | |
| "learning_rate": 2.650301720926768e-05, | |
| "loss": 3.6548, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.6211606955793, | |
| "grad_norm": 5.610607624053955, | |
| "learning_rate": 2.64285182150041e-05, | |
| "loss": 3.6276, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 2.627865074376702, | |
| "grad_norm": 6.077248573303223, | |
| "learning_rate": 2.6354019220740524e-05, | |
| "loss": 3.5808, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 2.6345694531741044, | |
| "grad_norm": 6.732864856719971, | |
| "learning_rate": 2.6279520226476945e-05, | |
| "loss": 3.6394, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 2.6412738319715063, | |
| "grad_norm": 7.764287948608398, | |
| "learning_rate": 2.6205021232213366e-05, | |
| "loss": 3.6177, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 2.6479782107689083, | |
| "grad_norm": 5.52256441116333, | |
| "learning_rate": 2.613052223794979e-05, | |
| "loss": 3.5052, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.6546825895663106, | |
| "grad_norm": 5.835344314575195, | |
| "learning_rate": 2.605602324368621e-05, | |
| "loss": 3.5943, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 2.6613869683637126, | |
| "grad_norm": 6.355226993560791, | |
| "learning_rate": 2.5981524249422635e-05, | |
| "loss": 3.512, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 2.6680913471611145, | |
| "grad_norm": 5.310232639312744, | |
| "learning_rate": 2.5907025255159056e-05, | |
| "loss": 3.4958, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 2.674795725958517, | |
| "grad_norm": 6.357884883880615, | |
| "learning_rate": 2.583252626089548e-05, | |
| "loss": 3.4715, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 2.681500104755919, | |
| "grad_norm": 5.544501304626465, | |
| "learning_rate": 2.57580272666319e-05, | |
| "loss": 3.6412, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.681500104755919, | |
| "eval_bleu_greedy": 2.0426177231502667, | |
| "eval_loss": 0.478807270526886, | |
| "eval_runtime": 461.9761, | |
| "eval_samples_per_second": 0.216, | |
| "eval_steps_per_second": 0.216, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.6882044835533208, | |
| "grad_norm": 6.2651214599609375, | |
| "learning_rate": 2.5683528272368322e-05, | |
| "loss": 3.6418, | |
| "step": 8020 | |
| }, | |
| { | |
| "epoch": 2.6949088623507227, | |
| "grad_norm": 5.576724529266357, | |
| "learning_rate": 2.5609029278104746e-05, | |
| "loss": 3.57, | |
| "step": 8040 | |
| }, | |
| { | |
| "epoch": 2.7016132411481246, | |
| "grad_norm": 6.44381856918335, | |
| "learning_rate": 2.5534530283841167e-05, | |
| "loss": 3.6091, | |
| "step": 8060 | |
| }, | |
| { | |
| "epoch": 2.708317619945527, | |
| "grad_norm": 6.368646621704102, | |
| "learning_rate": 2.546003128957759e-05, | |
| "loss": 3.6117, | |
| "step": 8080 | |
| }, | |
| { | |
| "epoch": 2.715021998742929, | |
| "grad_norm": 6.0201096534729, | |
| "learning_rate": 2.5385532295314012e-05, | |
| "loss": 3.4642, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.721726377540331, | |
| "grad_norm": 6.197525501251221, | |
| "learning_rate": 2.5311033301050437e-05, | |
| "loss": 3.5663, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 2.7284307563377332, | |
| "grad_norm": 7.0478434562683105, | |
| "learning_rate": 2.5236534306786857e-05, | |
| "loss": 3.5866, | |
| "step": 8140 | |
| }, | |
| { | |
| "epoch": 2.735135135135135, | |
| "grad_norm": 5.948623180389404, | |
| "learning_rate": 2.516203531252328e-05, | |
| "loss": 3.594, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 2.741839513932537, | |
| "grad_norm": 6.0779266357421875, | |
| "learning_rate": 2.5087536318259703e-05, | |
| "loss": 3.5592, | |
| "step": 8180 | |
| }, | |
| { | |
| "epoch": 2.7485438927299395, | |
| "grad_norm": 6.036412715911865, | |
| "learning_rate": 2.5013037323996123e-05, | |
| "loss": 3.6267, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.7552482715273414, | |
| "grad_norm": 5.411278247833252, | |
| "learning_rate": 2.493853832973255e-05, | |
| "loss": 3.4901, | |
| "step": 8220 | |
| }, | |
| { | |
| "epoch": 2.7619526503247434, | |
| "grad_norm": 5.945597171783447, | |
| "learning_rate": 2.4864039335468972e-05, | |
| "loss": 3.5108, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 2.7686570291221453, | |
| "grad_norm": 5.927489280700684, | |
| "learning_rate": 2.4789540341205393e-05, | |
| "loss": 3.523, | |
| "step": 8260 | |
| }, | |
| { | |
| "epoch": 2.775361407919547, | |
| "grad_norm": 5.989095211029053, | |
| "learning_rate": 2.4715041346941817e-05, | |
| "loss": 3.5256, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 2.7820657867169496, | |
| "grad_norm": 5.67732572555542, | |
| "learning_rate": 2.4640542352678238e-05, | |
| "loss": 3.5598, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.7887701655143515, | |
| "grad_norm": 5.954450607299805, | |
| "learning_rate": 2.4566043358414662e-05, | |
| "loss": 3.5808, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 2.7954745443117535, | |
| "grad_norm": 6.370481967926025, | |
| "learning_rate": 2.4491544364151083e-05, | |
| "loss": 3.4797, | |
| "step": 8340 | |
| }, | |
| { | |
| "epoch": 2.802178923109156, | |
| "grad_norm": 6.09319543838501, | |
| "learning_rate": 2.4417045369887507e-05, | |
| "loss": 3.5653, | |
| "step": 8360 | |
| }, | |
| { | |
| "epoch": 2.8088833019065578, | |
| "grad_norm": 6.781850337982178, | |
| "learning_rate": 2.4342546375623932e-05, | |
| "loss": 3.5961, | |
| "step": 8380 | |
| }, | |
| { | |
| "epoch": 2.8155876807039597, | |
| "grad_norm": 6.419500350952148, | |
| "learning_rate": 2.4268047381360353e-05, | |
| "loss": 3.5896, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.822292059501362, | |
| "grad_norm": 6.156778335571289, | |
| "learning_rate": 2.4193548387096777e-05, | |
| "loss": 3.5622, | |
| "step": 8420 | |
| }, | |
| { | |
| "epoch": 2.828996438298764, | |
| "grad_norm": 6.792672157287598, | |
| "learning_rate": 2.4119049392833198e-05, | |
| "loss": 3.5262, | |
| "step": 8440 | |
| }, | |
| { | |
| "epoch": 2.835700817096166, | |
| "grad_norm": 6.010193347930908, | |
| "learning_rate": 2.4044550398569622e-05, | |
| "loss": 3.5115, | |
| "step": 8460 | |
| }, | |
| { | |
| "epoch": 2.842405195893568, | |
| "grad_norm": 5.631977081298828, | |
| "learning_rate": 2.3970051404306043e-05, | |
| "loss": 3.5809, | |
| "step": 8480 | |
| }, | |
| { | |
| "epoch": 2.84910957469097, | |
| "grad_norm": 5.957998275756836, | |
| "learning_rate": 2.3895552410042464e-05, | |
| "loss": 3.5782, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.855813953488372, | |
| "grad_norm": 6.206627368927002, | |
| "learning_rate": 2.3821053415778888e-05, | |
| "loss": 3.5551, | |
| "step": 8520 | |
| }, | |
| { | |
| "epoch": 2.862518332285774, | |
| "grad_norm": 5.459038257598877, | |
| "learning_rate": 2.374655442151531e-05, | |
| "loss": 3.6324, | |
| "step": 8540 | |
| }, | |
| { | |
| "epoch": 2.869222711083176, | |
| "grad_norm": 6.841930866241455, | |
| "learning_rate": 2.3672055427251733e-05, | |
| "loss": 3.6122, | |
| "step": 8560 | |
| }, | |
| { | |
| "epoch": 2.8759270898805784, | |
| "grad_norm": 6.351479530334473, | |
| "learning_rate": 2.3597556432988154e-05, | |
| "loss": 3.5087, | |
| "step": 8580 | |
| }, | |
| { | |
| "epoch": 2.8826314686779804, | |
| "grad_norm": 5.920718193054199, | |
| "learning_rate": 2.352305743872458e-05, | |
| "loss": 3.5419, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 2.8893358474753823, | |
| "grad_norm": 6.254413604736328, | |
| "learning_rate": 2.3448558444461e-05, | |
| "loss": 3.475, | |
| "step": 8620 | |
| }, | |
| { | |
| "epoch": 2.8960402262727847, | |
| "grad_norm": 6.233896732330322, | |
| "learning_rate": 2.3374059450197424e-05, | |
| "loss": 3.6237, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 2.9027446050701866, | |
| "grad_norm": 6.401550769805908, | |
| "learning_rate": 2.3299560455933848e-05, | |
| "loss": 3.5505, | |
| "step": 8660 | |
| }, | |
| { | |
| "epoch": 2.9094489838675885, | |
| "grad_norm": 6.0176591873168945, | |
| "learning_rate": 2.322506146167027e-05, | |
| "loss": 3.4655, | |
| "step": 8680 | |
| }, | |
| { | |
| "epoch": 2.9161533626649905, | |
| "grad_norm": 6.907371520996094, | |
| "learning_rate": 2.3150562467406693e-05, | |
| "loss": 3.4332, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 2.9228577414623924, | |
| "grad_norm": 6.5138444900512695, | |
| "learning_rate": 2.3076063473143114e-05, | |
| "loss": 3.5849, | |
| "step": 8720 | |
| }, | |
| { | |
| "epoch": 2.929562120259795, | |
| "grad_norm": 7.2710347175598145, | |
| "learning_rate": 2.3001564478879538e-05, | |
| "loss": 3.5402, | |
| "step": 8740 | |
| }, | |
| { | |
| "epoch": 2.9362664990571967, | |
| "grad_norm": 6.0399909019470215, | |
| "learning_rate": 2.292706548461596e-05, | |
| "loss": 3.5121, | |
| "step": 8760 | |
| }, | |
| { | |
| "epoch": 2.9429708778545987, | |
| "grad_norm": 6.308010578155518, | |
| "learning_rate": 2.285256649035238e-05, | |
| "loss": 3.473, | |
| "step": 8780 | |
| }, | |
| { | |
| "epoch": 2.949675256652001, | |
| "grad_norm": 5.680022716522217, | |
| "learning_rate": 2.2778067496088804e-05, | |
| "loss": 3.5039, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 2.956379635449403, | |
| "grad_norm": 5.785823345184326, | |
| "learning_rate": 2.2703568501825225e-05, | |
| "loss": 3.5442, | |
| "step": 8820 | |
| }, | |
| { | |
| "epoch": 2.963084014246805, | |
| "grad_norm": 5.894392967224121, | |
| "learning_rate": 2.262906950756165e-05, | |
| "loss": 3.4217, | |
| "step": 8840 | |
| }, | |
| { | |
| "epoch": 2.969788393044207, | |
| "grad_norm": 5.803259372711182, | |
| "learning_rate": 2.255457051329807e-05, | |
| "loss": 3.5274, | |
| "step": 8860 | |
| }, | |
| { | |
| "epoch": 2.976492771841609, | |
| "grad_norm": 6.49872350692749, | |
| "learning_rate": 2.2480071519034494e-05, | |
| "loss": 3.5022, | |
| "step": 8880 | |
| }, | |
| { | |
| "epoch": 2.983197150639011, | |
| "grad_norm": 6.164760112762451, | |
| "learning_rate": 2.2405572524770915e-05, | |
| "loss": 3.5333, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 2.989901529436413, | |
| "grad_norm": 6.127744674682617, | |
| "learning_rate": 2.233107353050734e-05, | |
| "loss": 3.4646, | |
| "step": 8920 | |
| }, | |
| { | |
| "epoch": 2.996605908233815, | |
| "grad_norm": 6.783234119415283, | |
| "learning_rate": 2.2256574536243764e-05, | |
| "loss": 3.5176, | |
| "step": 8940 | |
| }, | |
| { | |
| "epoch": 3.0030169704588308, | |
| "grad_norm": 5.963592052459717, | |
| "learning_rate": 2.2182075541980185e-05, | |
| "loss": 3.202, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 3.009721349256233, | |
| "grad_norm": 6.0857744216918945, | |
| "learning_rate": 2.210757654771661e-05, | |
| "loss": 3.3323, | |
| "step": 8980 | |
| }, | |
| { | |
| "epoch": 3.016425728053635, | |
| "grad_norm": 6.808197975158691, | |
| "learning_rate": 2.203307755345303e-05, | |
| "loss": 3.4583, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 3.016425728053635, | |
| "eval_bleu_greedy": 2.4234819018656997, | |
| "eval_loss": 0.4750092625617981, | |
| "eval_runtime": 445.5917, | |
| "eval_samples_per_second": 0.224, | |
| "eval_steps_per_second": 0.224, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 3.023130106851037, | |
| "grad_norm": 6.107368469238281, | |
| "learning_rate": 2.195857855918945e-05, | |
| "loss": 3.2516, | |
| "step": 9020 | |
| }, | |
| { | |
| "epoch": 3.029834485648439, | |
| "grad_norm": 6.768011093139648, | |
| "learning_rate": 2.1884079564925875e-05, | |
| "loss": 3.3924, | |
| "step": 9040 | |
| }, | |
| { | |
| "epoch": 3.0365388644458413, | |
| "grad_norm": 6.5038743019104, | |
| "learning_rate": 2.1809580570662296e-05, | |
| "loss": 3.2324, | |
| "step": 9060 | |
| }, | |
| { | |
| "epoch": 3.0432432432432432, | |
| "grad_norm": 5.936684608459473, | |
| "learning_rate": 2.173508157639872e-05, | |
| "loss": 3.3328, | |
| "step": 9080 | |
| }, | |
| { | |
| "epoch": 3.049947622040645, | |
| "grad_norm": 6.226217746734619, | |
| "learning_rate": 2.166058258213514e-05, | |
| "loss": 3.2819, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 3.0566520008380476, | |
| "grad_norm": 5.835100173950195, | |
| "learning_rate": 2.1586083587871565e-05, | |
| "loss": 3.3229, | |
| "step": 9120 | |
| }, | |
| { | |
| "epoch": 3.0633563796354495, | |
| "grad_norm": 6.959074020385742, | |
| "learning_rate": 2.1511584593607986e-05, | |
| "loss": 3.2994, | |
| "step": 9140 | |
| }, | |
| { | |
| "epoch": 3.0700607584328514, | |
| "grad_norm": 6.586850643157959, | |
| "learning_rate": 2.1437085599344407e-05, | |
| "loss": 3.2011, | |
| "step": 9160 | |
| }, | |
| { | |
| "epoch": 3.0767651372302534, | |
| "grad_norm": 5.986579418182373, | |
| "learning_rate": 2.1362586605080835e-05, | |
| "loss": 3.3602, | |
| "step": 9180 | |
| }, | |
| { | |
| "epoch": 3.0834695160276557, | |
| "grad_norm": 5.427093029022217, | |
| "learning_rate": 2.1288087610817256e-05, | |
| "loss": 3.3568, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 3.0901738948250577, | |
| "grad_norm": 6.213650226593018, | |
| "learning_rate": 2.121358861655368e-05, | |
| "loss": 3.2927, | |
| "step": 9220 | |
| }, | |
| { | |
| "epoch": 3.0968782736224596, | |
| "grad_norm": 6.2159342765808105, | |
| "learning_rate": 2.11390896222901e-05, | |
| "loss": 3.3695, | |
| "step": 9240 | |
| }, | |
| { | |
| "epoch": 3.1035826524198615, | |
| "grad_norm": 6.4243597984313965, | |
| "learning_rate": 2.106459062802652e-05, | |
| "loss": 3.3794, | |
| "step": 9260 | |
| }, | |
| { | |
| "epoch": 3.110287031217264, | |
| "grad_norm": 6.737236499786377, | |
| "learning_rate": 2.0990091633762946e-05, | |
| "loss": 3.3333, | |
| "step": 9280 | |
| }, | |
| { | |
| "epoch": 3.116991410014666, | |
| "grad_norm": 6.42462158203125, | |
| "learning_rate": 2.0915592639499367e-05, | |
| "loss": 3.3944, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 3.1236957888120678, | |
| "grad_norm": 6.075654029846191, | |
| "learning_rate": 2.084109364523579e-05, | |
| "loss": 3.3037, | |
| "step": 9320 | |
| }, | |
| { | |
| "epoch": 3.13040016760947, | |
| "grad_norm": 5.77776575088501, | |
| "learning_rate": 2.0766594650972212e-05, | |
| "loss": 3.4129, | |
| "step": 9340 | |
| }, | |
| { | |
| "epoch": 3.137104546406872, | |
| "grad_norm": 6.13924503326416, | |
| "learning_rate": 2.0692095656708636e-05, | |
| "loss": 3.3723, | |
| "step": 9360 | |
| }, | |
| { | |
| "epoch": 3.143808925204274, | |
| "grad_norm": 6.142735481262207, | |
| "learning_rate": 2.0617596662445057e-05, | |
| "loss": 3.3321, | |
| "step": 9380 | |
| }, | |
| { | |
| "epoch": 3.150513304001676, | |
| "grad_norm": 6.672779083251953, | |
| "learning_rate": 2.0543097668181478e-05, | |
| "loss": 3.3234, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 3.1572176827990783, | |
| "grad_norm": 6.145503520965576, | |
| "learning_rate": 2.0468598673917902e-05, | |
| "loss": 3.2741, | |
| "step": 9420 | |
| }, | |
| { | |
| "epoch": 3.1639220615964803, | |
| "grad_norm": 6.716073513031006, | |
| "learning_rate": 2.0394099679654323e-05, | |
| "loss": 3.2791, | |
| "step": 9440 | |
| }, | |
| { | |
| "epoch": 3.170626440393882, | |
| "grad_norm": 6.335756778717041, | |
| "learning_rate": 2.031960068539075e-05, | |
| "loss": 3.2895, | |
| "step": 9460 | |
| }, | |
| { | |
| "epoch": 3.177330819191284, | |
| "grad_norm": 7.066572189331055, | |
| "learning_rate": 2.024510169112717e-05, | |
| "loss": 3.3522, | |
| "step": 9480 | |
| }, | |
| { | |
| "epoch": 3.1840351979886865, | |
| "grad_norm": 6.07637357711792, | |
| "learning_rate": 2.0170602696863593e-05, | |
| "loss": 3.2799, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 3.1907395767860884, | |
| "grad_norm": 6.036308288574219, | |
| "learning_rate": 2.0096103702600017e-05, | |
| "loss": 3.3748, | |
| "step": 9520 | |
| }, | |
| { | |
| "epoch": 3.1974439555834904, | |
| "grad_norm": 6.354751110076904, | |
| "learning_rate": 2.0021604708336438e-05, | |
| "loss": 3.3692, | |
| "step": 9540 | |
| }, | |
| { | |
| "epoch": 3.2041483343808927, | |
| "grad_norm": 5.642934322357178, | |
| "learning_rate": 1.9947105714072862e-05, | |
| "loss": 3.2563, | |
| "step": 9560 | |
| }, | |
| { | |
| "epoch": 3.2108527131782947, | |
| "grad_norm": 6.8418803215026855, | |
| "learning_rate": 1.9872606719809283e-05, | |
| "loss": 3.3056, | |
| "step": 9580 | |
| }, | |
| { | |
| "epoch": 3.2175570919756966, | |
| "grad_norm": 5.866108417510986, | |
| "learning_rate": 1.9798107725545707e-05, | |
| "loss": 3.2476, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 3.2242614707730985, | |
| "grad_norm": 5.497636795043945, | |
| "learning_rate": 1.9723608731282128e-05, | |
| "loss": 3.2634, | |
| "step": 9620 | |
| }, | |
| { | |
| "epoch": 3.230965849570501, | |
| "grad_norm": 6.034823894500732, | |
| "learning_rate": 1.9649109737018552e-05, | |
| "loss": 3.4055, | |
| "step": 9640 | |
| }, | |
| { | |
| "epoch": 3.237670228367903, | |
| "grad_norm": 7.224872589111328, | |
| "learning_rate": 1.9574610742754973e-05, | |
| "loss": 3.3118, | |
| "step": 9660 | |
| }, | |
| { | |
| "epoch": 3.244374607165305, | |
| "grad_norm": 6.321878433227539, | |
| "learning_rate": 1.9500111748491394e-05, | |
| "loss": 3.3225, | |
| "step": 9680 | |
| }, | |
| { | |
| "epoch": 3.2510789859627067, | |
| "grad_norm": 6.296338081359863, | |
| "learning_rate": 1.9425612754227818e-05, | |
| "loss": 3.349, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 3.257783364760109, | |
| "grad_norm": 6.3536505699157715, | |
| "learning_rate": 1.935111375996424e-05, | |
| "loss": 3.3247, | |
| "step": 9720 | |
| }, | |
| { | |
| "epoch": 3.264487743557511, | |
| "grad_norm": 5.711906433105469, | |
| "learning_rate": 1.9276614765700667e-05, | |
| "loss": 3.3193, | |
| "step": 9740 | |
| }, | |
| { | |
| "epoch": 3.271192122354913, | |
| "grad_norm": 6.689239978790283, | |
| "learning_rate": 1.9202115771437088e-05, | |
| "loss": 3.272, | |
| "step": 9760 | |
| }, | |
| { | |
| "epoch": 3.2778965011523153, | |
| "grad_norm": 6.301712989807129, | |
| "learning_rate": 1.912761677717351e-05, | |
| "loss": 3.2867, | |
| "step": 9780 | |
| }, | |
| { | |
| "epoch": 3.2846008799497173, | |
| "grad_norm": 6.167557239532471, | |
| "learning_rate": 1.9053117782909933e-05, | |
| "loss": 3.3041, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 3.291305258747119, | |
| "grad_norm": 6.17465353012085, | |
| "learning_rate": 1.8978618788646354e-05, | |
| "loss": 3.2369, | |
| "step": 9820 | |
| }, | |
| { | |
| "epoch": 3.298009637544521, | |
| "grad_norm": 6.496537208557129, | |
| "learning_rate": 1.8904119794382778e-05, | |
| "loss": 3.3375, | |
| "step": 9840 | |
| }, | |
| { | |
| "epoch": 3.304714016341923, | |
| "grad_norm": 6.527161598205566, | |
| "learning_rate": 1.88296208001192e-05, | |
| "loss": 3.3039, | |
| "step": 9860 | |
| }, | |
| { | |
| "epoch": 3.3114183951393255, | |
| "grad_norm": 7.209779262542725, | |
| "learning_rate": 1.8755121805855623e-05, | |
| "loss": 3.3638, | |
| "step": 9880 | |
| }, | |
| { | |
| "epoch": 3.3181227739367274, | |
| "grad_norm": 6.651127815246582, | |
| "learning_rate": 1.8680622811592044e-05, | |
| "loss": 3.3062, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 3.3248271527341293, | |
| "grad_norm": 5.988480567932129, | |
| "learning_rate": 1.8606123817328465e-05, | |
| "loss": 3.2656, | |
| "step": 9920 | |
| }, | |
| { | |
| "epoch": 3.3315315315315317, | |
| "grad_norm": 6.463028907775879, | |
| "learning_rate": 1.853162482306489e-05, | |
| "loss": 3.2631, | |
| "step": 9940 | |
| }, | |
| { | |
| "epoch": 3.3382359103289336, | |
| "grad_norm": 5.910898685455322, | |
| "learning_rate": 1.845712582880131e-05, | |
| "loss": 3.2458, | |
| "step": 9960 | |
| }, | |
| { | |
| "epoch": 3.3449402891263356, | |
| "grad_norm": 6.237380027770996, | |
| "learning_rate": 1.8382626834537734e-05, | |
| "loss": 3.32, | |
| "step": 9980 | |
| }, | |
| { | |
| "epoch": 3.3516446679237375, | |
| "grad_norm": 6.681026458740234, | |
| "learning_rate": 1.830812784027416e-05, | |
| "loss": 3.2698, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 3.3516446679237375, | |
| "eval_bleu_greedy": 2.2580959043747226, | |
| "eval_loss": 0.46980682015419006, | |
| "eval_runtime": 206.5153, | |
| "eval_samples_per_second": 0.484, | |
| "eval_steps_per_second": 0.484, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 3.35901948460088, | |
| "grad_norm": 6.202412128448486, | |
| "learning_rate": 1.823362884601058e-05, | |
| "loss": 3.2913, | |
| "step": 10020 | |
| }, | |
| { | |
| "epoch": 3.365723863398282, | |
| "grad_norm": 7.057358741760254, | |
| "learning_rate": 1.8159129851747004e-05, | |
| "loss": 3.2247, | |
| "step": 10040 | |
| }, | |
| { | |
| "epoch": 3.372428242195684, | |
| "grad_norm": 6.253905773162842, | |
| "learning_rate": 1.8084630857483425e-05, | |
| "loss": 3.2799, | |
| "step": 10060 | |
| }, | |
| { | |
| "epoch": 3.379132620993086, | |
| "grad_norm": 5.532748222351074, | |
| "learning_rate": 1.801013186321985e-05, | |
| "loss": 3.2562, | |
| "step": 10080 | |
| }, | |
| { | |
| "epoch": 3.385836999790488, | |
| "grad_norm": 5.992211818695068, | |
| "learning_rate": 1.793563286895627e-05, | |
| "loss": 3.3089, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 3.3925413785878904, | |
| "grad_norm": 5.832738876342773, | |
| "learning_rate": 1.7861133874692694e-05, | |
| "loss": 3.2162, | |
| "step": 10120 | |
| }, | |
| { | |
| "epoch": 3.3992457573852923, | |
| "grad_norm": 7.200679779052734, | |
| "learning_rate": 1.7786634880429115e-05, | |
| "loss": 3.2683, | |
| "step": 10140 | |
| }, | |
| { | |
| "epoch": 3.4059501361826943, | |
| "grad_norm": 5.809505939483643, | |
| "learning_rate": 1.7712135886165536e-05, | |
| "loss": 3.298, | |
| "step": 10160 | |
| }, | |
| { | |
| "epoch": 3.412654514980096, | |
| "grad_norm": 6.071516513824463, | |
| "learning_rate": 1.763763689190196e-05, | |
| "loss": 3.289, | |
| "step": 10180 | |
| }, | |
| { | |
| "epoch": 3.4193588937774986, | |
| "grad_norm": 7.4007744789123535, | |
| "learning_rate": 1.756313789763838e-05, | |
| "loss": 3.2438, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 3.4260632725749005, | |
| "grad_norm": 5.917469501495361, | |
| "learning_rate": 1.7488638903374805e-05, | |
| "loss": 3.2932, | |
| "step": 10220 | |
| }, | |
| { | |
| "epoch": 3.4327676513723024, | |
| "grad_norm": 6.20914363861084, | |
| "learning_rate": 1.7414139909111226e-05, | |
| "loss": 3.2827, | |
| "step": 10240 | |
| }, | |
| { | |
| "epoch": 3.4394720301697044, | |
| "grad_norm": 6.218352794647217, | |
| "learning_rate": 1.733964091484765e-05, | |
| "loss": 3.2885, | |
| "step": 10260 | |
| }, | |
| { | |
| "epoch": 3.4461764089671068, | |
| "grad_norm": 6.699190616607666, | |
| "learning_rate": 1.7265141920584075e-05, | |
| "loss": 3.2237, | |
| "step": 10280 | |
| }, | |
| { | |
| "epoch": 3.4528807877645087, | |
| "grad_norm": 5.649641990661621, | |
| "learning_rate": 1.7190642926320496e-05, | |
| "loss": 3.1781, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 3.4595851665619106, | |
| "grad_norm": 6.218810558319092, | |
| "learning_rate": 1.711614393205692e-05, | |
| "loss": 3.1712, | |
| "step": 10320 | |
| }, | |
| { | |
| "epoch": 3.466289545359313, | |
| "grad_norm": 7.203590393066406, | |
| "learning_rate": 1.704164493779334e-05, | |
| "loss": 3.2399, | |
| "step": 10340 | |
| }, | |
| { | |
| "epoch": 3.472993924156715, | |
| "grad_norm": 6.820786952972412, | |
| "learning_rate": 1.6967145943529765e-05, | |
| "loss": 3.2389, | |
| "step": 10360 | |
| }, | |
| { | |
| "epoch": 3.479698302954117, | |
| "grad_norm": 6.1062798500061035, | |
| "learning_rate": 1.6892646949266186e-05, | |
| "loss": 3.2507, | |
| "step": 10380 | |
| }, | |
| { | |
| "epoch": 3.486402681751519, | |
| "grad_norm": 6.02709436416626, | |
| "learning_rate": 1.6818147955002607e-05, | |
| "loss": 3.2714, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 3.493107060548921, | |
| "grad_norm": 6.64369010925293, | |
| "learning_rate": 1.674364896073903e-05, | |
| "loss": 3.2319, | |
| "step": 10420 | |
| }, | |
| { | |
| "epoch": 3.499811439346323, | |
| "grad_norm": 5.996265888214111, | |
| "learning_rate": 1.6669149966475452e-05, | |
| "loss": 3.221, | |
| "step": 10440 | |
| }, | |
| { | |
| "epoch": 3.506515818143725, | |
| "grad_norm": 6.68301248550415, | |
| "learning_rate": 1.6594650972211876e-05, | |
| "loss": 3.2714, | |
| "step": 10460 | |
| }, | |
| { | |
| "epoch": 3.513220196941127, | |
| "grad_norm": 5.8294878005981445, | |
| "learning_rate": 1.6520151977948297e-05, | |
| "loss": 3.2909, | |
| "step": 10480 | |
| }, | |
| { | |
| "epoch": 3.5199245757385293, | |
| "grad_norm": 6.585033893585205, | |
| "learning_rate": 1.644565298368472e-05, | |
| "loss": 3.2091, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 3.5266289545359313, | |
| "grad_norm": 6.6064934730529785, | |
| "learning_rate": 1.6371153989421142e-05, | |
| "loss": 3.2033, | |
| "step": 10520 | |
| }, | |
| { | |
| "epoch": 3.533333333333333, | |
| "grad_norm": 6.687121868133545, | |
| "learning_rate": 1.6296654995157566e-05, | |
| "loss": 3.2532, | |
| "step": 10540 | |
| }, | |
| { | |
| "epoch": 3.5400377121307356, | |
| "grad_norm": 7.002409934997559, | |
| "learning_rate": 1.622215600089399e-05, | |
| "loss": 3.2823, | |
| "step": 10560 | |
| }, | |
| { | |
| "epoch": 3.5467420909281375, | |
| "grad_norm": 6.823233127593994, | |
| "learning_rate": 1.614765700663041e-05, | |
| "loss": 3.3175, | |
| "step": 10580 | |
| }, | |
| { | |
| "epoch": 3.5534464697255395, | |
| "grad_norm": 5.88700532913208, | |
| "learning_rate": 1.6073158012366836e-05, | |
| "loss": 3.2122, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 3.5601508485229414, | |
| "grad_norm": 6.105057716369629, | |
| "learning_rate": 1.5998659018103257e-05, | |
| "loss": 3.1478, | |
| "step": 10620 | |
| }, | |
| { | |
| "epoch": 3.5668552273203433, | |
| "grad_norm": 6.8328022956848145, | |
| "learning_rate": 1.592416002383968e-05, | |
| "loss": 3.2097, | |
| "step": 10640 | |
| }, | |
| { | |
| "epoch": 3.5735596061177457, | |
| "grad_norm": 6.577600002288818, | |
| "learning_rate": 1.5849661029576102e-05, | |
| "loss": 3.1664, | |
| "step": 10660 | |
| }, | |
| { | |
| "epoch": 3.5802639849151476, | |
| "grad_norm": 7.270109176635742, | |
| "learning_rate": 1.5775162035312523e-05, | |
| "loss": 3.1901, | |
| "step": 10680 | |
| }, | |
| { | |
| "epoch": 3.5869683637125496, | |
| "grad_norm": 6.4257683753967285, | |
| "learning_rate": 1.5700663041048947e-05, | |
| "loss": 3.226, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 3.593672742509952, | |
| "grad_norm": 5.963393688201904, | |
| "learning_rate": 1.5626164046785368e-05, | |
| "loss": 3.2249, | |
| "step": 10720 | |
| }, | |
| { | |
| "epoch": 3.600377121307354, | |
| "grad_norm": 6.7239766120910645, | |
| "learning_rate": 1.5551665052521792e-05, | |
| "loss": 3.2575, | |
| "step": 10740 | |
| }, | |
| { | |
| "epoch": 3.607081500104756, | |
| "grad_norm": 6.665550231933594, | |
| "learning_rate": 1.5477166058258213e-05, | |
| "loss": 3.2503, | |
| "step": 10760 | |
| }, | |
| { | |
| "epoch": 3.613785878902158, | |
| "grad_norm": 5.777255058288574, | |
| "learning_rate": 1.5402667063994637e-05, | |
| "loss": 3.2054, | |
| "step": 10780 | |
| }, | |
| { | |
| "epoch": 3.62049025769956, | |
| "grad_norm": 6.491949558258057, | |
| "learning_rate": 1.5328168069731058e-05, | |
| "loss": 3.1708, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 3.627194636496962, | |
| "grad_norm": 6.6252665519714355, | |
| "learning_rate": 1.5253669075467482e-05, | |
| "loss": 3.2418, | |
| "step": 10820 | |
| }, | |
| { | |
| "epoch": 3.633899015294364, | |
| "grad_norm": 6.817688941955566, | |
| "learning_rate": 1.5179170081203905e-05, | |
| "loss": 3.259, | |
| "step": 10840 | |
| }, | |
| { | |
| "epoch": 3.640603394091766, | |
| "grad_norm": 6.675405025482178, | |
| "learning_rate": 1.5104671086940328e-05, | |
| "loss": 3.218, | |
| "step": 10860 | |
| }, | |
| { | |
| "epoch": 3.6473077728891683, | |
| "grad_norm": 6.145236015319824, | |
| "learning_rate": 1.503017209267675e-05, | |
| "loss": 3.2562, | |
| "step": 10880 | |
| }, | |
| { | |
| "epoch": 3.6540121516865702, | |
| "grad_norm": 6.741012096405029, | |
| "learning_rate": 1.4955673098413173e-05, | |
| "loss": 3.2195, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 3.660716530483972, | |
| "grad_norm": 6.519981861114502, | |
| "learning_rate": 1.4881174104149595e-05, | |
| "loss": 3.2343, | |
| "step": 10920 | |
| }, | |
| { | |
| "epoch": 3.6674209092813745, | |
| "grad_norm": 5.3957648277282715, | |
| "learning_rate": 1.4806675109886018e-05, | |
| "loss": 3.2527, | |
| "step": 10940 | |
| }, | |
| { | |
| "epoch": 3.6741252880787765, | |
| "grad_norm": 7.068119525909424, | |
| "learning_rate": 1.473217611562244e-05, | |
| "loss": 3.1907, | |
| "step": 10960 | |
| }, | |
| { | |
| "epoch": 3.6808296668761784, | |
| "grad_norm": 6.537194728851318, | |
| "learning_rate": 1.4657677121358861e-05, | |
| "loss": 3.1773, | |
| "step": 10980 | |
| }, | |
| { | |
| "epoch": 3.687534045673581, | |
| "grad_norm": 6.715285778045654, | |
| "learning_rate": 1.4583178127095284e-05, | |
| "loss": 3.2272, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.687534045673581, | |
| "eval_bleu_greedy": 2.3251290926774186, | |
| "eval_loss": 0.46657735109329224, | |
| "eval_runtime": 464.198, | |
| "eval_samples_per_second": 0.215, | |
| "eval_steps_per_second": 0.215, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.6942384244709827, | |
| "grad_norm": 6.894411087036133, | |
| "learning_rate": 1.4508679132831707e-05, | |
| "loss": 3.1783, | |
| "step": 11020 | |
| }, | |
| { | |
| "epoch": 3.7009428032683847, | |
| "grad_norm": 6.030000686645508, | |
| "learning_rate": 1.4434180138568129e-05, | |
| "loss": 3.2033, | |
| "step": 11040 | |
| }, | |
| { | |
| "epoch": 3.7076471820657866, | |
| "grad_norm": 6.902273654937744, | |
| "learning_rate": 1.4359681144304552e-05, | |
| "loss": 3.1508, | |
| "step": 11060 | |
| }, | |
| { | |
| "epoch": 3.7143515608631885, | |
| "grad_norm": 6.1826934814453125, | |
| "learning_rate": 1.4285182150040974e-05, | |
| "loss": 3.2302, | |
| "step": 11080 | |
| }, | |
| { | |
| "epoch": 3.721055939660591, | |
| "grad_norm": 7.401595592498779, | |
| "learning_rate": 1.4210683155777399e-05, | |
| "loss": 3.2139, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 3.727760318457993, | |
| "grad_norm": 6.83165168762207, | |
| "learning_rate": 1.4136184161513821e-05, | |
| "loss": 3.1857, | |
| "step": 11120 | |
| }, | |
| { | |
| "epoch": 3.7344646972553948, | |
| "grad_norm": 6.816708087921143, | |
| "learning_rate": 1.4061685167250244e-05, | |
| "loss": 3.2377, | |
| "step": 11140 | |
| }, | |
| { | |
| "epoch": 3.741169076052797, | |
| "grad_norm": 5.8378987312316895, | |
| "learning_rate": 1.3987186172986666e-05, | |
| "loss": 3.2827, | |
| "step": 11160 | |
| }, | |
| { | |
| "epoch": 3.747873454850199, | |
| "grad_norm": 7.595831394195557, | |
| "learning_rate": 1.3912687178723089e-05, | |
| "loss": 3.237, | |
| "step": 11180 | |
| }, | |
| { | |
| "epoch": 3.754577833647601, | |
| "grad_norm": 6.578879356384277, | |
| "learning_rate": 1.3838188184459511e-05, | |
| "loss": 3.2416, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 3.7612822124450034, | |
| "grad_norm": 5.939640522003174, | |
| "learning_rate": 1.3763689190195932e-05, | |
| "loss": 3.1734, | |
| "step": 11220 | |
| }, | |
| { | |
| "epoch": 3.7679865912424053, | |
| "grad_norm": 6.304750442504883, | |
| "learning_rate": 1.3689190195932355e-05, | |
| "loss": 3.2416, | |
| "step": 11240 | |
| }, | |
| { | |
| "epoch": 3.7746909700398072, | |
| "grad_norm": 6.177415370941162, | |
| "learning_rate": 1.3614691201668777e-05, | |
| "loss": 3.1675, | |
| "step": 11260 | |
| }, | |
| { | |
| "epoch": 3.781395348837209, | |
| "grad_norm": 6.993617057800293, | |
| "learning_rate": 1.35401922074052e-05, | |
| "loss": 3.2162, | |
| "step": 11280 | |
| }, | |
| { | |
| "epoch": 3.788099727634611, | |
| "grad_norm": 6.1251726150512695, | |
| "learning_rate": 1.3465693213141623e-05, | |
| "loss": 3.2232, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 3.7948041064320135, | |
| "grad_norm": 6.485012054443359, | |
| "learning_rate": 1.3391194218878045e-05, | |
| "loss": 3.1591, | |
| "step": 11320 | |
| }, | |
| { | |
| "epoch": 3.8015084852294154, | |
| "grad_norm": 6.347079277038574, | |
| "learning_rate": 1.3316695224614468e-05, | |
| "loss": 3.1745, | |
| "step": 11340 | |
| }, | |
| { | |
| "epoch": 3.8082128640268174, | |
| "grad_norm": 7.0095744132995605, | |
| "learning_rate": 1.324219623035089e-05, | |
| "loss": 3.2463, | |
| "step": 11360 | |
| }, | |
| { | |
| "epoch": 3.8149172428242197, | |
| "grad_norm": 6.158694267272949, | |
| "learning_rate": 1.3167697236087315e-05, | |
| "loss": 3.223, | |
| "step": 11380 | |
| }, | |
| { | |
| "epoch": 3.8216216216216217, | |
| "grad_norm": 7.0430827140808105, | |
| "learning_rate": 1.3093198241823737e-05, | |
| "loss": 3.1774, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 3.8283260004190236, | |
| "grad_norm": 6.411921501159668, | |
| "learning_rate": 1.301869924756016e-05, | |
| "loss": 3.2934, | |
| "step": 11420 | |
| }, | |
| { | |
| "epoch": 3.835030379216426, | |
| "grad_norm": 6.355661392211914, | |
| "learning_rate": 1.2944200253296582e-05, | |
| "loss": 3.129, | |
| "step": 11440 | |
| }, | |
| { | |
| "epoch": 3.841734758013828, | |
| "grad_norm": 5.618327617645264, | |
| "learning_rate": 1.2869701259033005e-05, | |
| "loss": 3.138, | |
| "step": 11460 | |
| }, | |
| { | |
| "epoch": 3.84843913681123, | |
| "grad_norm": 6.159928321838379, | |
| "learning_rate": 1.2795202264769426e-05, | |
| "loss": 3.2041, | |
| "step": 11480 | |
| }, | |
| { | |
| "epoch": 3.8551435156086318, | |
| "grad_norm": 7.234489917755127, | |
| "learning_rate": 1.2720703270505848e-05, | |
| "loss": 3.2447, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 3.8618478944060337, | |
| "grad_norm": 6.748493671417236, | |
| "learning_rate": 1.2646204276242271e-05, | |
| "loss": 3.2015, | |
| "step": 11520 | |
| }, | |
| { | |
| "epoch": 3.868552273203436, | |
| "grad_norm": 6.751996040344238, | |
| "learning_rate": 1.2571705281978693e-05, | |
| "loss": 3.1563, | |
| "step": 11540 | |
| }, | |
| { | |
| "epoch": 3.875256652000838, | |
| "grad_norm": 6.8070783615112305, | |
| "learning_rate": 1.2497206287715116e-05, | |
| "loss": 3.1416, | |
| "step": 11560 | |
| }, | |
| { | |
| "epoch": 3.88196103079824, | |
| "grad_norm": 6.117493152618408, | |
| "learning_rate": 1.242270729345154e-05, | |
| "loss": 3.1295, | |
| "step": 11580 | |
| }, | |
| { | |
| "epoch": 3.8886654095956423, | |
| "grad_norm": 6.02462100982666, | |
| "learning_rate": 1.2348208299187961e-05, | |
| "loss": 3.2225, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 3.8953697883930443, | |
| "grad_norm": 5.968542575836182, | |
| "learning_rate": 1.2273709304924384e-05, | |
| "loss": 3.175, | |
| "step": 11620 | |
| }, | |
| { | |
| "epoch": 3.902074167190446, | |
| "grad_norm": 7.16673469543457, | |
| "learning_rate": 1.2199210310660806e-05, | |
| "loss": 3.2582, | |
| "step": 11640 | |
| }, | |
| { | |
| "epoch": 3.9087785459878486, | |
| "grad_norm": 6.56205415725708, | |
| "learning_rate": 1.2124711316397229e-05, | |
| "loss": 3.2285, | |
| "step": 11660 | |
| }, | |
| { | |
| "epoch": 3.9154829247852505, | |
| "grad_norm": 6.418534755706787, | |
| "learning_rate": 1.2050212322133651e-05, | |
| "loss": 3.2023, | |
| "step": 11680 | |
| }, | |
| { | |
| "epoch": 3.9221873035826524, | |
| "grad_norm": 6.226400852203369, | |
| "learning_rate": 1.1975713327870074e-05, | |
| "loss": 3.1821, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 3.9288916823800544, | |
| "grad_norm": 6.837843894958496, | |
| "learning_rate": 1.1901214333606497e-05, | |
| "loss": 3.1677, | |
| "step": 11720 | |
| }, | |
| { | |
| "epoch": 3.9355960611774563, | |
| "grad_norm": 5.950616836547852, | |
| "learning_rate": 1.182671533934292e-05, | |
| "loss": 3.1856, | |
| "step": 11740 | |
| }, | |
| { | |
| "epoch": 3.9423004399748587, | |
| "grad_norm": 6.373692035675049, | |
| "learning_rate": 1.1752216345079342e-05, | |
| "loss": 3.2501, | |
| "step": 11760 | |
| }, | |
| { | |
| "epoch": 3.9490048187722606, | |
| "grad_norm": 6.721376895904541, | |
| "learning_rate": 1.1677717350815764e-05, | |
| "loss": 3.2044, | |
| "step": 11780 | |
| }, | |
| { | |
| "epoch": 3.9557091975696625, | |
| "grad_norm": 6.181844711303711, | |
| "learning_rate": 1.1603218356552187e-05, | |
| "loss": 3.1938, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 3.962413576367065, | |
| "grad_norm": 6.3947577476501465, | |
| "learning_rate": 1.152871936228861e-05, | |
| "loss": 3.1984, | |
| "step": 11820 | |
| }, | |
| { | |
| "epoch": 3.969117955164467, | |
| "grad_norm": 6.491850852966309, | |
| "learning_rate": 1.1454220368025032e-05, | |
| "loss": 3.1825, | |
| "step": 11840 | |
| }, | |
| { | |
| "epoch": 3.975822333961869, | |
| "grad_norm": 6.11356782913208, | |
| "learning_rate": 1.1379721373761455e-05, | |
| "loss": 3.1174, | |
| "step": 11860 | |
| }, | |
| { | |
| "epoch": 3.982526712759271, | |
| "grad_norm": 7.591030597686768, | |
| "learning_rate": 1.1305222379497877e-05, | |
| "loss": 3.2322, | |
| "step": 11880 | |
| }, | |
| { | |
| "epoch": 3.989231091556673, | |
| "grad_norm": 5.719244956970215, | |
| "learning_rate": 1.12307233852343e-05, | |
| "loss": 3.1449, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 3.995935470354075, | |
| "grad_norm": 6.756486892700195, | |
| "learning_rate": 1.1156224390970722e-05, | |
| "loss": 3.1296, | |
| "step": 11920 | |
| }, | |
| { | |
| "epoch": 4.002681751518961, | |
| "grad_norm": 6.4487528800964355, | |
| "learning_rate": 1.1081725396707145e-05, | |
| "loss": 3.259, | |
| "step": 11940 | |
| }, | |
| { | |
| "epoch": 4.009386130316363, | |
| "grad_norm": 6.847874641418457, | |
| "learning_rate": 1.1007226402443568e-05, | |
| "loss": 3.1175, | |
| "step": 11960 | |
| }, | |
| { | |
| "epoch": 4.016090509113765, | |
| "grad_norm": 6.221479892730713, | |
| "learning_rate": 1.093272740817999e-05, | |
| "loss": 3.1395, | |
| "step": 11980 | |
| }, | |
| { | |
| "epoch": 4.022794887911167, | |
| "grad_norm": 7.619890213012695, | |
| "learning_rate": 1.0858228413916413e-05, | |
| "loss": 3.0701, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 4.022794887911167, | |
| "eval_bleu_greedy": 2.2056340612259895, | |
| "eval_loss": 0.4672054350376129, | |
| "eval_runtime": 519.4009, | |
| "eval_samples_per_second": 0.193, | |
| "eval_steps_per_second": 0.193, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 4.029499266708569, | |
| "grad_norm": 6.872377872467041, | |
| "learning_rate": 1.0783729419652835e-05, | |
| "loss": 3.1056, | |
| "step": 12020 | |
| }, | |
| { | |
| "epoch": 4.036203645505971, | |
| "grad_norm": 6.781915664672852, | |
| "learning_rate": 1.0709230425389258e-05, | |
| "loss": 3.0394, | |
| "step": 12040 | |
| }, | |
| { | |
| "epoch": 4.042908024303373, | |
| "grad_norm": 6.772296905517578, | |
| "learning_rate": 1.063473143112568e-05, | |
| "loss": 3.1418, | |
| "step": 12060 | |
| }, | |
| { | |
| "epoch": 4.049612403100776, | |
| "grad_norm": 5.7719011306762695, | |
| "learning_rate": 1.0560232436862103e-05, | |
| "loss": 2.9814, | |
| "step": 12080 | |
| }, | |
| { | |
| "epoch": 4.056316781898177, | |
| "grad_norm": 6.217945575714111, | |
| "learning_rate": 1.0485733442598526e-05, | |
| "loss": 3.1063, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 4.0630211606955795, | |
| "grad_norm": 7.190448760986328, | |
| "learning_rate": 1.0411234448334948e-05, | |
| "loss": 3.1758, | |
| "step": 12120 | |
| }, | |
| { | |
| "epoch": 4.069725539492981, | |
| "grad_norm": 7.15440559387207, | |
| "learning_rate": 1.033673545407137e-05, | |
| "loss": 3.1914, | |
| "step": 12140 | |
| }, | |
| { | |
| "epoch": 4.076429918290383, | |
| "grad_norm": 6.738652229309082, | |
| "learning_rate": 1.0262236459807793e-05, | |
| "loss": 2.9985, | |
| "step": 12160 | |
| }, | |
| { | |
| "epoch": 4.083134297087786, | |
| "grad_norm": 6.6931962966918945, | |
| "learning_rate": 1.0187737465544216e-05, | |
| "loss": 3.1319, | |
| "step": 12180 | |
| }, | |
| { | |
| "epoch": 4.089838675885187, | |
| "grad_norm": 7.118000507354736, | |
| "learning_rate": 1.0113238471280638e-05, | |
| "loss": 3.1749, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 4.09654305468259, | |
| "grad_norm": 6.880594730377197, | |
| "learning_rate": 1.0038739477017061e-05, | |
| "loss": 3.151, | |
| "step": 12220 | |
| }, | |
| { | |
| "epoch": 4.103247433479992, | |
| "grad_norm": 6.61147403717041, | |
| "learning_rate": 9.964240482753482e-06, | |
| "loss": 3.0578, | |
| "step": 12240 | |
| }, | |
| { | |
| "epoch": 4.1099518122773935, | |
| "grad_norm": 8.498011589050293, | |
| "learning_rate": 9.889741488489906e-06, | |
| "loss": 3.0902, | |
| "step": 12260 | |
| }, | |
| { | |
| "epoch": 4.116656191074796, | |
| "grad_norm": 7.078530788421631, | |
| "learning_rate": 9.815242494226329e-06, | |
| "loss": 3.1134, | |
| "step": 12280 | |
| }, | |
| { | |
| "epoch": 4.123360569872197, | |
| "grad_norm": 7.098989486694336, | |
| "learning_rate": 9.740743499962751e-06, | |
| "loss": 3.0835, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 4.1300649486696, | |
| "grad_norm": 6.889624118804932, | |
| "learning_rate": 9.666244505699174e-06, | |
| "loss": 3.0836, | |
| "step": 12320 | |
| }, | |
| { | |
| "epoch": 4.136769327467002, | |
| "grad_norm": 6.000241279602051, | |
| "learning_rate": 9.591745511435596e-06, | |
| "loss": 3.1416, | |
| "step": 12340 | |
| }, | |
| { | |
| "epoch": 4.143473706264404, | |
| "grad_norm": 7.567933559417725, | |
| "learning_rate": 9.517246517172019e-06, | |
| "loss": 3.2006, | |
| "step": 12360 | |
| }, | |
| { | |
| "epoch": 4.150178085061806, | |
| "grad_norm": 6.100574493408203, | |
| "learning_rate": 9.44274752290844e-06, | |
| "loss": 3.0368, | |
| "step": 12380 | |
| }, | |
| { | |
| "epoch": 4.156882463859208, | |
| "grad_norm": 6.519239902496338, | |
| "learning_rate": 9.368248528644864e-06, | |
| "loss": 3.1715, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 4.16358684265661, | |
| "grad_norm": 7.143859386444092, | |
| "learning_rate": 9.293749534381287e-06, | |
| "loss": 3.1236, | |
| "step": 12420 | |
| }, | |
| { | |
| "epoch": 4.170291221454012, | |
| "grad_norm": 7.759309768676758, | |
| "learning_rate": 9.21925054011771e-06, | |
| "loss": 3.1434, | |
| "step": 12440 | |
| }, | |
| { | |
| "epoch": 4.176995600251415, | |
| "grad_norm": 6.477195739746094, | |
| "learning_rate": 9.144751545854132e-06, | |
| "loss": 3.1094, | |
| "step": 12460 | |
| }, | |
| { | |
| "epoch": 4.183699979048816, | |
| "grad_norm": 6.8250813484191895, | |
| "learning_rate": 9.070252551590554e-06, | |
| "loss": 3.0832, | |
| "step": 12480 | |
| }, | |
| { | |
| "epoch": 4.1904043578462185, | |
| "grad_norm": 6.915088176727295, | |
| "learning_rate": 8.995753557326975e-06, | |
| "loss": 3.1025, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 4.19710873664362, | |
| "grad_norm": 6.240241527557373, | |
| "learning_rate": 8.921254563063398e-06, | |
| "loss": 3.1142, | |
| "step": 12520 | |
| }, | |
| { | |
| "epoch": 4.203813115441022, | |
| "grad_norm": 5.7193522453308105, | |
| "learning_rate": 8.846755568799822e-06, | |
| "loss": 3.0886, | |
| "step": 12540 | |
| }, | |
| { | |
| "epoch": 4.210517494238425, | |
| "grad_norm": 6.799840927124023, | |
| "learning_rate": 8.772256574536245e-06, | |
| "loss": 3.0755, | |
| "step": 12560 | |
| }, | |
| { | |
| "epoch": 4.217221873035826, | |
| "grad_norm": 6.537818908691406, | |
| "learning_rate": 8.697757580272667e-06, | |
| "loss": 3.1854, | |
| "step": 12580 | |
| }, | |
| { | |
| "epoch": 4.223926251833229, | |
| "grad_norm": 6.453887939453125, | |
| "learning_rate": 8.62325858600909e-06, | |
| "loss": 3.1353, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 4.230630630630631, | |
| "grad_norm": 6.519958019256592, | |
| "learning_rate": 8.54875959174551e-06, | |
| "loss": 3.0728, | |
| "step": 12620 | |
| }, | |
| { | |
| "epoch": 4.237335009428032, | |
| "grad_norm": 6.916313648223877, | |
| "learning_rate": 8.474260597481933e-06, | |
| "loss": 3.1365, | |
| "step": 12640 | |
| }, | |
| { | |
| "epoch": 4.244039388225435, | |
| "grad_norm": 6.080234527587891, | |
| "learning_rate": 8.399761603218358e-06, | |
| "loss": 3.1026, | |
| "step": 12660 | |
| }, | |
| { | |
| "epoch": 4.250743767022837, | |
| "grad_norm": 6.199918746948242, | |
| "learning_rate": 8.32526260895478e-06, | |
| "loss": 3.1174, | |
| "step": 12680 | |
| }, | |
| { | |
| "epoch": 4.257448145820239, | |
| "grad_norm": 6.077798843383789, | |
| "learning_rate": 8.250763614691203e-06, | |
| "loss": 3.1353, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 4.264152524617641, | |
| "grad_norm": 7.003963947296143, | |
| "learning_rate": 8.176264620427625e-06, | |
| "loss": 3.1721, | |
| "step": 12720 | |
| }, | |
| { | |
| "epoch": 4.2708569034150425, | |
| "grad_norm": 7.276467323303223, | |
| "learning_rate": 8.101765626164046e-06, | |
| "loss": 3.1403, | |
| "step": 12740 | |
| }, | |
| { | |
| "epoch": 4.277561282212445, | |
| "grad_norm": 6.939758777618408, | |
| "learning_rate": 8.027266631900469e-06, | |
| "loss": 3.084, | |
| "step": 12760 | |
| }, | |
| { | |
| "epoch": 4.284265661009847, | |
| "grad_norm": 6.425601959228516, | |
| "learning_rate": 7.952767637636891e-06, | |
| "loss": 3.1602, | |
| "step": 12780 | |
| }, | |
| { | |
| "epoch": 4.290970039807249, | |
| "grad_norm": 6.354540824890137, | |
| "learning_rate": 7.878268643373316e-06, | |
| "loss": 3.0745, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 4.297674418604651, | |
| "grad_norm": 6.399191856384277, | |
| "learning_rate": 7.803769649109738e-06, | |
| "loss": 3.0546, | |
| "step": 12820 | |
| }, | |
| { | |
| "epoch": 4.3043787974020535, | |
| "grad_norm": 7.009991645812988, | |
| "learning_rate": 7.72927065484616e-06, | |
| "loss": 3.1064, | |
| "step": 12840 | |
| }, | |
| { | |
| "epoch": 4.311083176199455, | |
| "grad_norm": 6.549511432647705, | |
| "learning_rate": 7.654771660582582e-06, | |
| "loss": 3.119, | |
| "step": 12860 | |
| }, | |
| { | |
| "epoch": 4.317787554996857, | |
| "grad_norm": 6.825671672821045, | |
| "learning_rate": 7.580272666319005e-06, | |
| "loss": 3.0588, | |
| "step": 12880 | |
| }, | |
| { | |
| "epoch": 4.32449193379426, | |
| "grad_norm": 6.320077896118164, | |
| "learning_rate": 7.505773672055427e-06, | |
| "loss": 3.1293, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 4.331196312591661, | |
| "grad_norm": 6.614448070526123, | |
| "learning_rate": 7.4312746777918494e-06, | |
| "loss": 3.0686, | |
| "step": 12920 | |
| }, | |
| { | |
| "epoch": 4.337900691389064, | |
| "grad_norm": 6.515445709228516, | |
| "learning_rate": 7.356775683528274e-06, | |
| "loss": 3.0706, | |
| "step": 12940 | |
| }, | |
| { | |
| "epoch": 4.344605070186465, | |
| "grad_norm": 7.301309585571289, | |
| "learning_rate": 7.2822766892646954e-06, | |
| "loss": 3.0647, | |
| "step": 12960 | |
| }, | |
| { | |
| "epoch": 4.3513094489838675, | |
| "grad_norm": 7.263702869415283, | |
| "learning_rate": 7.207777695001118e-06, | |
| "loss": 3.1394, | |
| "step": 12980 | |
| }, | |
| { | |
| "epoch": 4.35801382778127, | |
| "grad_norm": 6.944880962371826, | |
| "learning_rate": 7.133278700737541e-06, | |
| "loss": 3.1024, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 4.35801382778127, | |
| "eval_bleu_greedy": 2.3552626321962884, | |
| "eval_loss": 0.4624796211719513, | |
| "eval_runtime": 534.5676, | |
| "eval_samples_per_second": 0.187, | |
| "eval_steps_per_second": 0.187, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 4.364718206578671, | |
| "grad_norm": 6.3444414138793945, | |
| "learning_rate": 7.058779706473963e-06, | |
| "loss": 3.0579, | |
| "step": 13020 | |
| }, | |
| { | |
| "epoch": 4.371422585376074, | |
| "grad_norm": 7.333340167999268, | |
| "learning_rate": 6.984280712210385e-06, | |
| "loss": 3.0466, | |
| "step": 13040 | |
| }, | |
| { | |
| "epoch": 4.378126964173476, | |
| "grad_norm": 7.396731376647949, | |
| "learning_rate": 6.9097817179468075e-06, | |
| "loss": 3.0913, | |
| "step": 13060 | |
| }, | |
| { | |
| "epoch": 4.384831342970878, | |
| "grad_norm": 7.228068828582764, | |
| "learning_rate": 6.835282723683231e-06, | |
| "loss": 3.0371, | |
| "step": 13080 | |
| }, | |
| { | |
| "epoch": 4.39153572176828, | |
| "grad_norm": 7.033063888549805, | |
| "learning_rate": 6.7607837294196535e-06, | |
| "loss": 3.1139, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 4.398240100565682, | |
| "grad_norm": 7.278767108917236, | |
| "learning_rate": 6.686284735156076e-06, | |
| "loss": 3.0944, | |
| "step": 13120 | |
| }, | |
| { | |
| "epoch": 4.404944479363084, | |
| "grad_norm": 7.024720191955566, | |
| "learning_rate": 6.611785740892499e-06, | |
| "loss": 3.1348, | |
| "step": 13140 | |
| }, | |
| { | |
| "epoch": 4.411648858160486, | |
| "grad_norm": 7.331086158752441, | |
| "learning_rate": 6.53728674662892e-06, | |
| "loss": 3.0433, | |
| "step": 13160 | |
| }, | |
| { | |
| "epoch": 4.418353236957888, | |
| "grad_norm": 6.916625499725342, | |
| "learning_rate": 6.462787752365343e-06, | |
| "loss": 3.0803, | |
| "step": 13180 | |
| }, | |
| { | |
| "epoch": 4.42505761575529, | |
| "grad_norm": 6.226632595062256, | |
| "learning_rate": 6.3882887581017655e-06, | |
| "loss": 3.1699, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 4.4317619945526925, | |
| "grad_norm": 6.50321626663208, | |
| "learning_rate": 6.313789763838189e-06, | |
| "loss": 3.0663, | |
| "step": 13220 | |
| }, | |
| { | |
| "epoch": 4.438466373350094, | |
| "grad_norm": 7.217479705810547, | |
| "learning_rate": 6.239290769574611e-06, | |
| "loss": 3.0969, | |
| "step": 13240 | |
| }, | |
| { | |
| "epoch": 4.445170752147496, | |
| "grad_norm": 7.183232307434082, | |
| "learning_rate": 6.164791775311034e-06, | |
| "loss": 3.0768, | |
| "step": 13260 | |
| }, | |
| { | |
| "epoch": 4.451875130944899, | |
| "grad_norm": 6.6673102378845215, | |
| "learning_rate": 6.090292781047456e-06, | |
| "loss": 3.1062, | |
| "step": 13280 | |
| }, | |
| { | |
| "epoch": 4.4585795097423, | |
| "grad_norm": 7.2146525382995605, | |
| "learning_rate": 6.015793786783878e-06, | |
| "loss": 3.1092, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 4.465283888539703, | |
| "grad_norm": 7.235128402709961, | |
| "learning_rate": 5.941294792520302e-06, | |
| "loss": 3.0743, | |
| "step": 13320 | |
| }, | |
| { | |
| "epoch": 4.471988267337105, | |
| "grad_norm": 6.275545597076416, | |
| "learning_rate": 5.8667957982567235e-06, | |
| "loss": 3.0743, | |
| "step": 13340 | |
| }, | |
| { | |
| "epoch": 4.4786926461345065, | |
| "grad_norm": 6.9439473152160645, | |
| "learning_rate": 5.792296803993146e-06, | |
| "loss": 3.1373, | |
| "step": 13360 | |
| }, | |
| { | |
| "epoch": 4.485397024931909, | |
| "grad_norm": 6.065330982208252, | |
| "learning_rate": 5.7177978097295695e-06, | |
| "loss": 3.1196, | |
| "step": 13380 | |
| }, | |
| { | |
| "epoch": 4.49210140372931, | |
| "grad_norm": 6.784725666046143, | |
| "learning_rate": 5.643298815465991e-06, | |
| "loss": 3.1024, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 4.498805782526713, | |
| "grad_norm": 6.992110729217529, | |
| "learning_rate": 5.568799821202414e-06, | |
| "loss": 3.2035, | |
| "step": 13420 | |
| }, | |
| { | |
| "epoch": 4.505510161324115, | |
| "grad_norm": 6.317196846008301, | |
| "learning_rate": 5.494300826938836e-06, | |
| "loss": 3.1222, | |
| "step": 13440 | |
| }, | |
| { | |
| "epoch": 4.512214540121517, | |
| "grad_norm": 6.293644428253174, | |
| "learning_rate": 5.419801832675259e-06, | |
| "loss": 3.1413, | |
| "step": 13460 | |
| }, | |
| { | |
| "epoch": 4.518918918918919, | |
| "grad_norm": 7.206140518188477, | |
| "learning_rate": 5.3453028384116815e-06, | |
| "loss": 3.125, | |
| "step": 13480 | |
| }, | |
| { | |
| "epoch": 4.525623297716321, | |
| "grad_norm": 6.770303726196289, | |
| "learning_rate": 5.270803844148104e-06, | |
| "loss": 3.1346, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 4.532327676513723, | |
| "grad_norm": 7.2229485511779785, | |
| "learning_rate": 5.196304849884527e-06, | |
| "loss": 3.1146, | |
| "step": 13520 | |
| }, | |
| { | |
| "epoch": 4.539032055311125, | |
| "grad_norm": 7.110487461090088, | |
| "learning_rate": 5.121805855620949e-06, | |
| "loss": 3.1114, | |
| "step": 13540 | |
| }, | |
| { | |
| "epoch": 4.545736434108527, | |
| "grad_norm": 7.351033687591553, | |
| "learning_rate": 5.047306861357372e-06, | |
| "loss": 3.0575, | |
| "step": 13560 | |
| }, | |
| { | |
| "epoch": 4.552440812905929, | |
| "grad_norm": 6.950778484344482, | |
| "learning_rate": 4.972807867093794e-06, | |
| "loss": 3.0641, | |
| "step": 13580 | |
| }, | |
| { | |
| "epoch": 4.559145191703331, | |
| "grad_norm": 6.550965785980225, | |
| "learning_rate": 4.898308872830217e-06, | |
| "loss": 3.1761, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 4.565849570500733, | |
| "grad_norm": 6.709011554718018, | |
| "learning_rate": 4.8238098785666396e-06, | |
| "loss": 2.9988, | |
| "step": 13620 | |
| }, | |
| { | |
| "epoch": 4.572553949298135, | |
| "grad_norm": 6.507779598236084, | |
| "learning_rate": 4.749310884303062e-06, | |
| "loss": 3.016, | |
| "step": 13640 | |
| }, | |
| { | |
| "epoch": 4.579258328095538, | |
| "grad_norm": 6.363673210144043, | |
| "learning_rate": 4.674811890039485e-06, | |
| "loss": 3.0496, | |
| "step": 13660 | |
| }, | |
| { | |
| "epoch": 4.585962706892939, | |
| "grad_norm": 6.965389251708984, | |
| "learning_rate": 4.600312895775907e-06, | |
| "loss": 3.1518, | |
| "step": 13680 | |
| }, | |
| { | |
| "epoch": 4.5926670856903415, | |
| "grad_norm": 6.091116905212402, | |
| "learning_rate": 4.52581390151233e-06, | |
| "loss": 3.0881, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 4.599371464487744, | |
| "grad_norm": 7.049524784088135, | |
| "learning_rate": 4.4513149072487524e-06, | |
| "loss": 3.1202, | |
| "step": 13720 | |
| }, | |
| { | |
| "epoch": 4.606075843285145, | |
| "grad_norm": 6.323545932769775, | |
| "learning_rate": 4.376815912985175e-06, | |
| "loss": 3.0242, | |
| "step": 13740 | |
| }, | |
| { | |
| "epoch": 4.612780222082548, | |
| "grad_norm": 7.295837879180908, | |
| "learning_rate": 4.3023169187215976e-06, | |
| "loss": 3.1243, | |
| "step": 13760 | |
| }, | |
| { | |
| "epoch": 4.61948460087995, | |
| "grad_norm": 6.582053184509277, | |
| "learning_rate": 4.22781792445802e-06, | |
| "loss": 3.0687, | |
| "step": 13780 | |
| }, | |
| { | |
| "epoch": 4.626188979677352, | |
| "grad_norm": 6.175601959228516, | |
| "learning_rate": 4.153318930194443e-06, | |
| "loss": 3.0427, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 4.632893358474754, | |
| "grad_norm": 7.662842273712158, | |
| "learning_rate": 4.078819935930865e-06, | |
| "loss": 3.1979, | |
| "step": 13820 | |
| }, | |
| { | |
| "epoch": 4.6395977372721555, | |
| "grad_norm": 7.036664009094238, | |
| "learning_rate": 4.004320941667288e-06, | |
| "loss": 3.1559, | |
| "step": 13840 | |
| }, | |
| { | |
| "epoch": 4.646302116069558, | |
| "grad_norm": 5.965688228607178, | |
| "learning_rate": 3.9298219474037105e-06, | |
| "loss": 3.1257, | |
| "step": 13860 | |
| }, | |
| { | |
| "epoch": 4.65300649486696, | |
| "grad_norm": 6.378177165985107, | |
| "learning_rate": 3.855322953140133e-06, | |
| "loss": 3.1514, | |
| "step": 13880 | |
| }, | |
| { | |
| "epoch": 4.659710873664362, | |
| "grad_norm": 6.670738220214844, | |
| "learning_rate": 3.780823958876555e-06, | |
| "loss": 3.0992, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 4.666415252461764, | |
| "grad_norm": 6.76698112487793, | |
| "learning_rate": 3.7063249646129778e-06, | |
| "loss": 3.1566, | |
| "step": 13920 | |
| }, | |
| { | |
| "epoch": 4.6731196312591665, | |
| "grad_norm": 6.328171253204346, | |
| "learning_rate": 3.6318259703494007e-06, | |
| "loss": 3.0974, | |
| "step": 13940 | |
| }, | |
| { | |
| "epoch": 4.679824010056568, | |
| "grad_norm": 7.151896953582764, | |
| "learning_rate": 3.557326976085823e-06, | |
| "loss": 3.1234, | |
| "step": 13960 | |
| }, | |
| { | |
| "epoch": 4.68652838885397, | |
| "grad_norm": 6.95003080368042, | |
| "learning_rate": 3.4828279818222455e-06, | |
| "loss": 3.156, | |
| "step": 13980 | |
| }, | |
| { | |
| "epoch": 4.693232767651372, | |
| "grad_norm": 7.1711931228637695, | |
| "learning_rate": 3.4083289875586685e-06, | |
| "loss": 3.0428, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.693232767651372, | |
| "eval_bleu_greedy": 2.416340135494281, | |
| "eval_loss": 0.45956096053123474, | |
| "eval_runtime": 314.8396, | |
| "eval_samples_per_second": 0.318, | |
| "eval_steps_per_second": 0.318, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.699937146448774, | |
| "grad_norm": 6.529189109802246, | |
| "learning_rate": 3.3338299932950906e-06, | |
| "loss": 3.1417, | |
| "step": 14020 | |
| }, | |
| { | |
| "epoch": 4.706641525246177, | |
| "grad_norm": 7.026646137237549, | |
| "learning_rate": 3.259330999031513e-06, | |
| "loss": 3.1099, | |
| "step": 14040 | |
| }, | |
| { | |
| "epoch": 4.713345904043578, | |
| "grad_norm": 6.561285972595215, | |
| "learning_rate": 3.1848320047679354e-06, | |
| "loss": 3.0675, | |
| "step": 14060 | |
| }, | |
| { | |
| "epoch": 4.7200502828409805, | |
| "grad_norm": 7.228313446044922, | |
| "learning_rate": 3.1103330105043583e-06, | |
| "loss": 3.0952, | |
| "step": 14080 | |
| }, | |
| { | |
| "epoch": 4.726754661638383, | |
| "grad_norm": 6.70543098449707, | |
| "learning_rate": 3.035834016240781e-06, | |
| "loss": 3.1087, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 4.733459040435784, | |
| "grad_norm": 6.371490478515625, | |
| "learning_rate": 2.9613350219772035e-06, | |
| "loss": 3.0608, | |
| "step": 14120 | |
| }, | |
| { | |
| "epoch": 4.740163419233187, | |
| "grad_norm": 6.534164905548096, | |
| "learning_rate": 2.886836027713626e-06, | |
| "loss": 3.055, | |
| "step": 14140 | |
| }, | |
| { | |
| "epoch": 4.746867798030589, | |
| "grad_norm": 6.988217353820801, | |
| "learning_rate": 2.8123370334500486e-06, | |
| "loss": 3.066, | |
| "step": 14160 | |
| }, | |
| { | |
| "epoch": 4.753572176827991, | |
| "grad_norm": 7.489045143127441, | |
| "learning_rate": 2.737838039186471e-06, | |
| "loss": 3.1232, | |
| "step": 14180 | |
| }, | |
| { | |
| "epoch": 4.760276555625393, | |
| "grad_norm": 6.6933512687683105, | |
| "learning_rate": 2.663339044922894e-06, | |
| "loss": 3.0314, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 4.766980934422795, | |
| "grad_norm": 6.849923133850098, | |
| "learning_rate": 2.5888400506593164e-06, | |
| "loss": 3.0855, | |
| "step": 14220 | |
| }, | |
| { | |
| "epoch": 4.773685313220197, | |
| "grad_norm": 6.958053112030029, | |
| "learning_rate": 2.5143410563957385e-06, | |
| "loss": 3.085, | |
| "step": 14240 | |
| }, | |
| { | |
| "epoch": 4.780389692017599, | |
| "grad_norm": 7.325470924377441, | |
| "learning_rate": 2.4398420621321615e-06, | |
| "loss": 3.1435, | |
| "step": 14260 | |
| }, | |
| { | |
| "epoch": 4.787094070815001, | |
| "grad_norm": 6.421871185302734, | |
| "learning_rate": 2.3653430678685837e-06, | |
| "loss": 3.0735, | |
| "step": 14280 | |
| }, | |
| { | |
| "epoch": 4.793798449612403, | |
| "grad_norm": 6.636096000671387, | |
| "learning_rate": 2.2908440736050062e-06, | |
| "loss": 3.083, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 4.8005028284098055, | |
| "grad_norm": 7.074666976928711, | |
| "learning_rate": 2.2163450793414292e-06, | |
| "loss": 3.0699, | |
| "step": 14320 | |
| }, | |
| { | |
| "epoch": 4.807207207207207, | |
| "grad_norm": 6.859719276428223, | |
| "learning_rate": 2.1418460850778514e-06, | |
| "loss": 3.126, | |
| "step": 14340 | |
| }, | |
| { | |
| "epoch": 4.813911586004609, | |
| "grad_norm": 7.162552833557129, | |
| "learning_rate": 2.0673470908142744e-06, | |
| "loss": 3.066, | |
| "step": 14360 | |
| }, | |
| { | |
| "epoch": 4.820615964802011, | |
| "grad_norm": 6.949527740478516, | |
| "learning_rate": 1.9928480965506965e-06, | |
| "loss": 3.0962, | |
| "step": 14380 | |
| }, | |
| { | |
| "epoch": 4.827320343599413, | |
| "grad_norm": 6.28379487991333, | |
| "learning_rate": 1.918349102287119e-06, | |
| "loss": 3.0169, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 4.834024722396816, | |
| "grad_norm": 6.530064582824707, | |
| "learning_rate": 1.843850108023542e-06, | |
| "loss": 3.0847, | |
| "step": 14420 | |
| }, | |
| { | |
| "epoch": 4.840729101194217, | |
| "grad_norm": 7.0545783042907715, | |
| "learning_rate": 1.7693511137599643e-06, | |
| "loss": 3.1926, | |
| "step": 14440 | |
| }, | |
| { | |
| "epoch": 4.847433479991619, | |
| "grad_norm": 6.212683200836182, | |
| "learning_rate": 1.694852119496387e-06, | |
| "loss": 3.0994, | |
| "step": 14460 | |
| }, | |
| { | |
| "epoch": 4.854137858789022, | |
| "grad_norm": 6.651175498962402, | |
| "learning_rate": 1.6203531252328094e-06, | |
| "loss": 3.115, | |
| "step": 14480 | |
| }, | |
| { | |
| "epoch": 4.860842237586423, | |
| "grad_norm": 6.536131381988525, | |
| "learning_rate": 1.545854130969232e-06, | |
| "loss": 3.0519, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 4.867546616383826, | |
| "grad_norm": 6.119905948638916, | |
| "learning_rate": 1.4713551367056546e-06, | |
| "loss": 3.097, | |
| "step": 14520 | |
| }, | |
| { | |
| "epoch": 4.874250995181228, | |
| "grad_norm": 7.160987854003906, | |
| "learning_rate": 1.3968561424420771e-06, | |
| "loss": 3.1391, | |
| "step": 14540 | |
| }, | |
| { | |
| "epoch": 4.8809553739786296, | |
| "grad_norm": 6.599812984466553, | |
| "learning_rate": 1.3223571481784995e-06, | |
| "loss": 3.0939, | |
| "step": 14560 | |
| }, | |
| { | |
| "epoch": 4.887659752776032, | |
| "grad_norm": 6.979626178741455, | |
| "learning_rate": 1.247858153914922e-06, | |
| "loss": 3.1038, | |
| "step": 14580 | |
| }, | |
| { | |
| "epoch": 4.894364131573434, | |
| "grad_norm": 7.19669771194458, | |
| "learning_rate": 1.1733591596513449e-06, | |
| "loss": 3.1171, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 4.901068510370836, | |
| "grad_norm": 7.708127975463867, | |
| "learning_rate": 1.0988601653877674e-06, | |
| "loss": 3.1308, | |
| "step": 14620 | |
| }, | |
| { | |
| "epoch": 4.907772889168238, | |
| "grad_norm": 7.753808498382568, | |
| "learning_rate": 1.0243611711241898e-06, | |
| "loss": 3.0866, | |
| "step": 14640 | |
| }, | |
| { | |
| "epoch": 4.9144772679656406, | |
| "grad_norm": 6.714838981628418, | |
| "learning_rate": 9.498621768606124e-07, | |
| "loss": 2.9962, | |
| "step": 14660 | |
| }, | |
| { | |
| "epoch": 4.921181646763042, | |
| "grad_norm": 6.7879767417907715, | |
| "learning_rate": 8.75363182597035e-07, | |
| "loss": 3.0925, | |
| "step": 14680 | |
| }, | |
| { | |
| "epoch": 4.927886025560444, | |
| "grad_norm": 7.638024806976318, | |
| "learning_rate": 8.008641883334574e-07, | |
| "loss": 3.0839, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 4.934590404357846, | |
| "grad_norm": 6.672430038452148, | |
| "learning_rate": 7.263651940698801e-07, | |
| "loss": 3.0642, | |
| "step": 14720 | |
| }, | |
| { | |
| "epoch": 4.941294783155248, | |
| "grad_norm": 6.447202682495117, | |
| "learning_rate": 6.518661998063027e-07, | |
| "loss": 3.0111, | |
| "step": 14740 | |
| }, | |
| { | |
| "epoch": 4.947999161952651, | |
| "grad_norm": 6.621779441833496, | |
| "learning_rate": 5.773672055427253e-07, | |
| "loss": 3.1408, | |
| "step": 14760 | |
| }, | |
| { | |
| "epoch": 4.954703540750052, | |
| "grad_norm": 7.014694690704346, | |
| "learning_rate": 5.028682112791477e-07, | |
| "loss": 3.1408, | |
| "step": 14780 | |
| }, | |
| { | |
| "epoch": 4.9614079195474545, | |
| "grad_norm": 7.518828868865967, | |
| "learning_rate": 4.2836921701557035e-07, | |
| "loss": 3.1069, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 4.968112298344856, | |
| "grad_norm": 6.64265775680542, | |
| "learning_rate": 3.538702227519929e-07, | |
| "loss": 3.0875, | |
| "step": 14820 | |
| }, | |
| { | |
| "epoch": 4.974816677142258, | |
| "grad_norm": 6.18177604675293, | |
| "learning_rate": 2.793712284884154e-07, | |
| "loss": 3.1182, | |
| "step": 14840 | |
| }, | |
| { | |
| "epoch": 4.981521055939661, | |
| "grad_norm": 6.46857213973999, | |
| "learning_rate": 2.0487223422483797e-07, | |
| "loss": 3.1629, | |
| "step": 14860 | |
| }, | |
| { | |
| "epoch": 4.988225434737062, | |
| "grad_norm": 6.637886047363281, | |
| "learning_rate": 1.3037323996126055e-07, | |
| "loss": 3.0804, | |
| "step": 14880 | |
| }, | |
| { | |
| "epoch": 4.994929813534465, | |
| "grad_norm": 6.32726526260376, | |
| "learning_rate": 5.587424569768308e-08, | |
| "loss": 3.0856, | |
| "step": 14900 | |
| } | |
| ], | |
| "logging_steps": 20, | |
| "max_steps": 14915, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.0778373124393533e+18, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |