| { | |
| "best_metric": 44.91524109269023, | |
| "best_model_checkpoint": "/workspace/llm-storage/output/llama-3B-Base/checkpoint-14000", | |
| "epoch": 4.999958097632517, | |
| "eval_steps": 1000, | |
| "global_step": 14915, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.006704378797402053, | |
| "grad_norm": 6.649622440338135, | |
| "learning_rate": 6.702412868632708e-07, | |
| "loss": 7.6699, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.013408757594804106, | |
| "grad_norm": 5.821969985961914, | |
| "learning_rate": 1.3404825737265416e-06, | |
| "loss": 7.6967, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.02011313639220616, | |
| "grad_norm": 7.613153457641602, | |
| "learning_rate": 2.0107238605898126e-06, | |
| "loss": 7.68, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.02681751518960821, | |
| "grad_norm": 7.550271511077881, | |
| "learning_rate": 2.680965147453083e-06, | |
| "loss": 7.2033, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.03352189398701027, | |
| "grad_norm": 5.453164100646973, | |
| "learning_rate": 3.351206434316354e-06, | |
| "loss": 6.5851, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.04022627278441232, | |
| "grad_norm": 6.310723304748535, | |
| "learning_rate": 4.021447721179625e-06, | |
| "loss": 5.9701, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.04693065158181437, | |
| "grad_norm": 5.457154750823975, | |
| "learning_rate": 4.691689008042896e-06, | |
| "loss": 5.5137, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.05363503037921642, | |
| "grad_norm": 5.355574131011963, | |
| "learning_rate": 5.361930294906166e-06, | |
| "loss": 5.125, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.06033940917661848, | |
| "grad_norm": 5.482090473175049, | |
| "learning_rate": 6.032171581769437e-06, | |
| "loss": 5.171, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.06704378797402054, | |
| "grad_norm": 6.155514717102051, | |
| "learning_rate": 6.702412868632708e-06, | |
| "loss": 4.9362, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.07374816677142258, | |
| "grad_norm": 5.355989933013916, | |
| "learning_rate": 7.372654155495978e-06, | |
| "loss": 4.9172, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.08045254556882464, | |
| "grad_norm": 5.774538993835449, | |
| "learning_rate": 8.04289544235925e-06, | |
| "loss": 4.8096, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.0871569243662267, | |
| "grad_norm": 5.671195030212402, | |
| "learning_rate": 8.71313672922252e-06, | |
| "loss": 4.8034, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.09386130316362874, | |
| "grad_norm": 6.051366806030273, | |
| "learning_rate": 9.383378016085791e-06, | |
| "loss": 4.7359, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.1005656819610308, | |
| "grad_norm": 6.272449493408203, | |
| "learning_rate": 1.0053619302949062e-05, | |
| "loss": 4.6895, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.10727006075843284, | |
| "grad_norm": 6.4355669021606445, | |
| "learning_rate": 1.0723860589812333e-05, | |
| "loss": 4.5595, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.1139744395558349, | |
| "grad_norm": 8.054056167602539, | |
| "learning_rate": 1.1394101876675605e-05, | |
| "loss": 4.4998, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.12067881835323696, | |
| "grad_norm": 6.529457092285156, | |
| "learning_rate": 1.2064343163538874e-05, | |
| "loss": 4.5933, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.127383197150639, | |
| "grad_norm": 6.745616912841797, | |
| "learning_rate": 1.2734584450402146e-05, | |
| "loss": 4.5153, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.13408757594804108, | |
| "grad_norm": 6.701639175415039, | |
| "learning_rate": 1.3404825737265417e-05, | |
| "loss": 4.5117, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.14079195474544312, | |
| "grad_norm": 7.223796844482422, | |
| "learning_rate": 1.4075067024128689e-05, | |
| "loss": 4.5095, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.14749633354284516, | |
| "grad_norm": 6.9919891357421875, | |
| "learning_rate": 1.4745308310991956e-05, | |
| "loss": 4.504, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.15420071234024724, | |
| "grad_norm": 6.506685256958008, | |
| "learning_rate": 1.5415549597855227e-05, | |
| "loss": 4.3921, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.16090509113764928, | |
| "grad_norm": 6.946916103363037, | |
| "learning_rate": 1.60857908847185e-05, | |
| "loss": 4.4041, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.16760946993505133, | |
| "grad_norm": 6.313327789306641, | |
| "learning_rate": 1.675603217158177e-05, | |
| "loss": 4.3512, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.1743138487324534, | |
| "grad_norm": 7.055306911468506, | |
| "learning_rate": 1.742627345844504e-05, | |
| "loss": 4.3819, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.18101822752985544, | |
| "grad_norm": 6.881680965423584, | |
| "learning_rate": 1.8096514745308312e-05, | |
| "loss": 4.2658, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.18772260632725749, | |
| "grad_norm": 6.35658073425293, | |
| "learning_rate": 1.8766756032171583e-05, | |
| "loss": 4.1747, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.19442698512465956, | |
| "grad_norm": 7.282151222229004, | |
| "learning_rate": 1.9436997319034853e-05, | |
| "loss": 4.392, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.2011313639220616, | |
| "grad_norm": 7.252215385437012, | |
| "learning_rate": 2.0107238605898124e-05, | |
| "loss": 4.2709, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.20783574271946365, | |
| "grad_norm": 7.683035850524902, | |
| "learning_rate": 2.0777479892761395e-05, | |
| "loss": 4.1557, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.2145401215168657, | |
| "grad_norm": 7.786757469177246, | |
| "learning_rate": 2.1447721179624665e-05, | |
| "loss": 4.2957, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.22124450031426776, | |
| "grad_norm": 7.157670497894287, | |
| "learning_rate": 2.211796246648794e-05, | |
| "loss": 4.1558, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.2279488791116698, | |
| "grad_norm": 6.556661128997803, | |
| "learning_rate": 2.278820375335121e-05, | |
| "loss": 4.219, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.23465325790907185, | |
| "grad_norm": 7.472171306610107, | |
| "learning_rate": 2.3458445040214477e-05, | |
| "loss": 4.0597, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.24135763670647392, | |
| "grad_norm": 6.373785018920898, | |
| "learning_rate": 2.4128686327077747e-05, | |
| "loss": 4.0991, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.24806201550387597, | |
| "grad_norm": 7.932548999786377, | |
| "learning_rate": 2.479892761394102e-05, | |
| "loss": 4.2809, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.254766394301278, | |
| "grad_norm": 6.911128520965576, | |
| "learning_rate": 2.5469168900804292e-05, | |
| "loss": 4.107, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.26147077309868005, | |
| "grad_norm": 6.957625865936279, | |
| "learning_rate": 2.6139410187667563e-05, | |
| "loss": 4.2279, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.26817515189608215, | |
| "grad_norm": 6.987204551696777, | |
| "learning_rate": 2.6809651474530833e-05, | |
| "loss": 4.0514, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.2748795306934842, | |
| "grad_norm": 6.837357997894287, | |
| "learning_rate": 2.7479892761394104e-05, | |
| "loss": 4.2328, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.28158390949088624, | |
| "grad_norm": 7.49316930770874, | |
| "learning_rate": 2.8150134048257378e-05, | |
| "loss": 4.1407, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.2882882882882883, | |
| "grad_norm": 6.795347690582275, | |
| "learning_rate": 2.8820375335120648e-05, | |
| "loss": 4.0602, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.29499266708569033, | |
| "grad_norm": 6.921504020690918, | |
| "learning_rate": 2.9490616621983912e-05, | |
| "loss": 4.0738, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.3016970458830924, | |
| "grad_norm": 6.316340923309326, | |
| "learning_rate": 3.0160857908847186e-05, | |
| "loss": 3.9296, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.3084014246804945, | |
| "grad_norm": 6.55310583114624, | |
| "learning_rate": 3.083109919571045e-05, | |
| "loss": 4.0316, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.3151058034778965, | |
| "grad_norm": 5.821455001831055, | |
| "learning_rate": 3.1501340482573724e-05, | |
| "loss": 4.0779, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.32181018227529856, | |
| "grad_norm": 7.125136852264404, | |
| "learning_rate": 3.2171581769437e-05, | |
| "loss": 4.0213, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.3285145610727006, | |
| "grad_norm": 6.5631794929504395, | |
| "learning_rate": 3.284182305630027e-05, | |
| "loss": 4.0513, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.33521893987010265, | |
| "grad_norm": 6.145415782928467, | |
| "learning_rate": 3.351206434316354e-05, | |
| "loss": 3.9952, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.33521893987010265, | |
| "eval_bleu_greedy": 42.15344877780225, | |
| "eval_loss": 0.5149964690208435, | |
| "eval_runtime": 264.7887, | |
| "eval_samples_per_second": 0.378, | |
| "eval_steps_per_second": 0.378, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.3419233186675047, | |
| "grad_norm": 6.734564304351807, | |
| "learning_rate": 3.418230563002681e-05, | |
| "loss": 4.048, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.3486276974649068, | |
| "grad_norm": 6.978208541870117, | |
| "learning_rate": 3.485254691689008e-05, | |
| "loss": 4.0318, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.35533207626230884, | |
| "grad_norm": 5.949651718139648, | |
| "learning_rate": 3.5522788203753354e-05, | |
| "loss": 3.9334, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.3620364550597109, | |
| "grad_norm": 5.890812397003174, | |
| "learning_rate": 3.6193029490616625e-05, | |
| "loss": 3.9546, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.3687408338571129, | |
| "grad_norm": 6.559944152832031, | |
| "learning_rate": 3.6863270777479895e-05, | |
| "loss": 4.0145, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.37544521265451497, | |
| "grad_norm": 6.098083972930908, | |
| "learning_rate": 3.7533512064343166e-05, | |
| "loss": 3.9196, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.382149591451917, | |
| "grad_norm": 6.418199062347412, | |
| "learning_rate": 3.8203753351206436e-05, | |
| "loss": 3.9634, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.3888539702493191, | |
| "grad_norm": 6.399519920349121, | |
| "learning_rate": 3.887399463806971e-05, | |
| "loss": 3.8874, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.39555834904672116, | |
| "grad_norm": 6.154569149017334, | |
| "learning_rate": 3.954423592493298e-05, | |
| "loss": 3.8637, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.4022627278441232, | |
| "grad_norm": 5.589611530303955, | |
| "learning_rate": 4.021447721179625e-05, | |
| "loss": 3.9136, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.40896710664152525, | |
| "grad_norm": 6.433633327484131, | |
| "learning_rate": 4.088471849865952e-05, | |
| "loss": 3.9185, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.4156714854389273, | |
| "grad_norm": 5.366619110107422, | |
| "learning_rate": 4.155495978552279e-05, | |
| "loss": 3.9046, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.42237586423632933, | |
| "grad_norm": 5.7498884201049805, | |
| "learning_rate": 4.222520107238606e-05, | |
| "loss": 3.8305, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.4290802430337314, | |
| "grad_norm": 6.001119613647461, | |
| "learning_rate": 4.289544235924933e-05, | |
| "loss": 3.8793, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.4357846218311335, | |
| "grad_norm": 5.743397235870361, | |
| "learning_rate": 4.35656836461126e-05, | |
| "loss": 3.8746, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.4424890006285355, | |
| "grad_norm": 6.727258205413818, | |
| "learning_rate": 4.423592493297588e-05, | |
| "loss": 3.7943, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.44919337942593757, | |
| "grad_norm": 5.452583312988281, | |
| "learning_rate": 4.490616621983915e-05, | |
| "loss": 3.8095, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.4558977582233396, | |
| "grad_norm": 4.872672080993652, | |
| "learning_rate": 4.557640750670242e-05, | |
| "loss": 3.7152, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.46260213702074165, | |
| "grad_norm": 5.5072174072265625, | |
| "learning_rate": 4.624664879356568e-05, | |
| "loss": 3.7729, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.4693065158181437, | |
| "grad_norm": 6.116482257843018, | |
| "learning_rate": 4.6916890080428954e-05, | |
| "loss": 3.8463, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.4760108946155458, | |
| "grad_norm": 5.880556583404541, | |
| "learning_rate": 4.7587131367292224e-05, | |
| "loss": 3.8045, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.48271527341294784, | |
| "grad_norm": 5.663512229919434, | |
| "learning_rate": 4.8257372654155495e-05, | |
| "loss": 3.7668, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.4894196522103499, | |
| "grad_norm": 5.742367744445801, | |
| "learning_rate": 4.8927613941018765e-05, | |
| "loss": 3.7065, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.49612403100775193, | |
| "grad_norm": 5.253595352172852, | |
| "learning_rate": 4.959785522788204e-05, | |
| "loss": 3.7373, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.502828409805154, | |
| "grad_norm": 5.620617866516113, | |
| "learning_rate": 4.997020040229457e-05, | |
| "loss": 3.766, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.509532788602556, | |
| "grad_norm": 5.477814197540283, | |
| "learning_rate": 4.989570140803099e-05, | |
| "loss": 3.8301, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.5162371673999581, | |
| "grad_norm": 5.931734561920166, | |
| "learning_rate": 4.982120241376742e-05, | |
| "loss": 3.7464, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.5229415461973601, | |
| "grad_norm": 4.500419616699219, | |
| "learning_rate": 4.974670341950384e-05, | |
| "loss": 3.7418, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.5296459249947622, | |
| "grad_norm": 5.702874660491943, | |
| "learning_rate": 4.967220442524026e-05, | |
| "loss": 3.7451, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.5363503037921643, | |
| "grad_norm": 6.070137023925781, | |
| "learning_rate": 4.959770543097668e-05, | |
| "loss": 3.6802, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.5430546825895664, | |
| "grad_norm": 5.067490100860596, | |
| "learning_rate": 4.95232064367131e-05, | |
| "loss": 3.7898, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.5497590613869684, | |
| "grad_norm": 6.116696834564209, | |
| "learning_rate": 4.944870744244953e-05, | |
| "loss": 3.6728, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.5564634401843704, | |
| "grad_norm": 5.9352827072143555, | |
| "learning_rate": 4.937420844818596e-05, | |
| "loss": 3.7049, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.5631678189817725, | |
| "grad_norm": 4.784963130950928, | |
| "learning_rate": 4.929970945392238e-05, | |
| "loss": 3.5477, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.5698721977791745, | |
| "grad_norm": 5.402501106262207, | |
| "learning_rate": 4.92252104596588e-05, | |
| "loss": 3.6234, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.5765765765765766, | |
| "grad_norm": 5.293265342712402, | |
| "learning_rate": 4.915071146539522e-05, | |
| "loss": 3.5493, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.5832809553739786, | |
| "grad_norm": 5.232183933258057, | |
| "learning_rate": 4.907621247113165e-05, | |
| "loss": 3.5725, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.5899853341713807, | |
| "grad_norm": 5.71828031539917, | |
| "learning_rate": 4.900171347686807e-05, | |
| "loss": 3.6631, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.5966897129687827, | |
| "grad_norm": 5.855662822723389, | |
| "learning_rate": 4.892721448260449e-05, | |
| "loss": 3.6428, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.6033940917661847, | |
| "grad_norm": 5.2928290367126465, | |
| "learning_rate": 4.885271548834091e-05, | |
| "loss": 3.5862, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.6100984705635868, | |
| "grad_norm": 5.530043125152588, | |
| "learning_rate": 4.877821649407733e-05, | |
| "loss": 3.5894, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.616802849360989, | |
| "grad_norm": 5.8700079917907715, | |
| "learning_rate": 4.870371749981376e-05, | |
| "loss": 3.4918, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.623507228158391, | |
| "grad_norm": 4.9393510818481445, | |
| "learning_rate": 4.862921850555018e-05, | |
| "loss": 3.5439, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.630211606955793, | |
| "grad_norm": 5.189883232116699, | |
| "learning_rate": 4.85547195112866e-05, | |
| "loss": 3.6312, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.6369159857531951, | |
| "grad_norm": 4.674147605895996, | |
| "learning_rate": 4.848022051702302e-05, | |
| "loss": 3.5674, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.6436203645505971, | |
| "grad_norm": 5.3367919921875, | |
| "learning_rate": 4.840572152275945e-05, | |
| "loss": 3.5407, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.6503247433479992, | |
| "grad_norm": 5.333773612976074, | |
| "learning_rate": 4.833122252849587e-05, | |
| "loss": 3.5916, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.6570291221454012, | |
| "grad_norm": 5.1199235916137695, | |
| "learning_rate": 4.825672353423229e-05, | |
| "loss": 3.5203, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.6637335009428033, | |
| "grad_norm": 5.012689590454102, | |
| "learning_rate": 4.818222453996871e-05, | |
| "loss": 3.6189, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.6704378797402053, | |
| "grad_norm": 4.8355231285095215, | |
| "learning_rate": 4.810772554570513e-05, | |
| "loss": 3.4858, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6704378797402053, | |
| "eval_bleu_greedy": 42.772572026733215, | |
| "eval_loss": 0.44412651658058167, | |
| "eval_runtime": 257.8686, | |
| "eval_samples_per_second": 0.388, | |
| "eval_steps_per_second": 0.388, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6771422585376073, | |
| "grad_norm": 5.045526504516602, | |
| "learning_rate": 4.803322655144156e-05, | |
| "loss": 3.4094, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.6838466373350094, | |
| "grad_norm": 4.954864501953125, | |
| "learning_rate": 4.795872755717798e-05, | |
| "loss": 3.5093, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.6905510161324114, | |
| "grad_norm": 4.870156288146973, | |
| "learning_rate": 4.78842285629144e-05, | |
| "loss": 3.4555, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.6972553949298136, | |
| "grad_norm": 5.512898921966553, | |
| "learning_rate": 4.780972956865082e-05, | |
| "loss": 3.3533, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.7039597737272156, | |
| "grad_norm": 5.5289788246154785, | |
| "learning_rate": 4.773523057438724e-05, | |
| "loss": 3.4053, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.7106641525246177, | |
| "grad_norm": 4.736262798309326, | |
| "learning_rate": 4.766073158012367e-05, | |
| "loss": 3.4689, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.7173685313220197, | |
| "grad_norm": 5.111291885375977, | |
| "learning_rate": 4.758623258586009e-05, | |
| "loss": 3.4979, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.7240729101194218, | |
| "grad_norm": 5.018069267272949, | |
| "learning_rate": 4.751173359159651e-05, | |
| "loss": 3.3954, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.7307772889168238, | |
| "grad_norm": 4.47459077835083, | |
| "learning_rate": 4.7437234597332934e-05, | |
| "loss": 3.4213, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.7374816677142259, | |
| "grad_norm": 4.8868279457092285, | |
| "learning_rate": 4.736273560306936e-05, | |
| "loss": 3.5459, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.7441860465116279, | |
| "grad_norm": 5.392606735229492, | |
| "learning_rate": 4.728823660880579e-05, | |
| "loss": 3.4798, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.7508904253090299, | |
| "grad_norm": 4.868163108825684, | |
| "learning_rate": 4.721373761454221e-05, | |
| "loss": 3.3849, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.757594804106432, | |
| "grad_norm": 5.640343189239502, | |
| "learning_rate": 4.713923862027863e-05, | |
| "loss": 3.4752, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.764299182903834, | |
| "grad_norm": 5.552671432495117, | |
| "learning_rate": 4.706473962601505e-05, | |
| "loss": 3.3831, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.7710035617012361, | |
| "grad_norm": 4.874811172485352, | |
| "learning_rate": 4.699024063175147e-05, | |
| "loss": 3.3214, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.7777079404986382, | |
| "grad_norm": 5.65765380859375, | |
| "learning_rate": 4.69157416374879e-05, | |
| "loss": 3.3833, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.7844123192960403, | |
| "grad_norm": 5.0529351234436035, | |
| "learning_rate": 4.684124264322432e-05, | |
| "loss": 3.3898, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.7911166980934423, | |
| "grad_norm": 4.97359561920166, | |
| "learning_rate": 4.676674364896074e-05, | |
| "loss": 3.3449, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.7978210768908444, | |
| "grad_norm": 5.133802890777588, | |
| "learning_rate": 4.669224465469716e-05, | |
| "loss": 3.3804, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.8045254556882464, | |
| "grad_norm": 5.423835754394531, | |
| "learning_rate": 4.661774566043359e-05, | |
| "loss": 3.3506, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.8112298344856484, | |
| "grad_norm": 4.704217910766602, | |
| "learning_rate": 4.654324666617001e-05, | |
| "loss": 3.3602, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.8179342132830505, | |
| "grad_norm": 5.2804365158081055, | |
| "learning_rate": 4.646874767190643e-05, | |
| "loss": 3.2892, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.8246385920804525, | |
| "grad_norm": 5.329574108123779, | |
| "learning_rate": 4.639424867764285e-05, | |
| "loss": 3.4255, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.8313429708778546, | |
| "grad_norm": 5.078803062438965, | |
| "learning_rate": 4.6319749683379274e-05, | |
| "loss": 3.2791, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.8380473496752566, | |
| "grad_norm": 5.376737117767334, | |
| "learning_rate": 4.62452506891157e-05, | |
| "loss": 3.2312, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8447517284726587, | |
| "grad_norm": 4.886073589324951, | |
| "learning_rate": 4.617075169485212e-05, | |
| "loss": 3.2618, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.8514561072700607, | |
| "grad_norm": 4.772372722625732, | |
| "learning_rate": 4.609625270058854e-05, | |
| "loss": 3.3566, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.8581604860674628, | |
| "grad_norm": 5.242844581604004, | |
| "learning_rate": 4.6021753706324964e-05, | |
| "loss": 3.1969, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.8648648648648649, | |
| "grad_norm": 5.419410228729248, | |
| "learning_rate": 4.5947254712061385e-05, | |
| "loss": 3.2977, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.871569243662267, | |
| "grad_norm": 5.235103130340576, | |
| "learning_rate": 4.587275571779781e-05, | |
| "loss": 3.2264, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.878273622459669, | |
| "grad_norm": 6.972829341888428, | |
| "learning_rate": 4.5798256723534234e-05, | |
| "loss": 3.2103, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.884978001257071, | |
| "grad_norm": 5.612443923950195, | |
| "learning_rate": 4.5723757729270654e-05, | |
| "loss": 3.2591, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.8916823800544731, | |
| "grad_norm": 5.727964401245117, | |
| "learning_rate": 4.5649258735007075e-05, | |
| "loss": 3.3569, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.8983867588518751, | |
| "grad_norm": 5.101806640625, | |
| "learning_rate": 4.55747597407435e-05, | |
| "loss": 3.2256, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.9050911376492772, | |
| "grad_norm": 4.5348358154296875, | |
| "learning_rate": 4.5500260746479924e-05, | |
| "loss": 3.3395, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.9117955164466792, | |
| "grad_norm": 4.911803722381592, | |
| "learning_rate": 4.5425761752216345e-05, | |
| "loss": 3.3203, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.9184998952440813, | |
| "grad_norm": 5.24350643157959, | |
| "learning_rate": 4.5351262757952766e-05, | |
| "loss": 3.2867, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.9252042740414833, | |
| "grad_norm": 5.0574493408203125, | |
| "learning_rate": 4.527676376368919e-05, | |
| "loss": 3.1712, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.9319086528388854, | |
| "grad_norm": 4.724992752075195, | |
| "learning_rate": 4.5202264769425614e-05, | |
| "loss": 3.3225, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.9386130316362874, | |
| "grad_norm": 5.762344837188721, | |
| "learning_rate": 4.512776577516204e-05, | |
| "loss": 3.277, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.9453174104336896, | |
| "grad_norm": 5.392731666564941, | |
| "learning_rate": 4.505326678089846e-05, | |
| "loss": 3.3034, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.9520217892310916, | |
| "grad_norm": 4.567736625671387, | |
| "learning_rate": 4.4978767786634884e-05, | |
| "loss": 3.2544, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.9587261680284936, | |
| "grad_norm": 4.565167427062988, | |
| "learning_rate": 4.4904268792371304e-05, | |
| "loss": 3.2303, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.9654305468258957, | |
| "grad_norm": 4.804454326629639, | |
| "learning_rate": 4.482976979810773e-05, | |
| "loss": 3.1671, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.9721349256232977, | |
| "grad_norm": 4.879951477050781, | |
| "learning_rate": 4.475527080384415e-05, | |
| "loss": 3.1655, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.9788393044206998, | |
| "grad_norm": 5.604247570037842, | |
| "learning_rate": 4.4680771809580574e-05, | |
| "loss": 3.2109, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.9855436832181018, | |
| "grad_norm": 5.372274875640869, | |
| "learning_rate": 4.4606272815316995e-05, | |
| "loss": 3.1978, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.9922480620155039, | |
| "grad_norm": 5.262199878692627, | |
| "learning_rate": 4.4531773821053416e-05, | |
| "loss": 3.1311, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.9989524408129059, | |
| "grad_norm": 4.829686641693115, | |
| "learning_rate": 4.445727482678984e-05, | |
| "loss": 3.1546, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 1.0053635030379215, | |
| "grad_norm": 4.690941333770752, | |
| "learning_rate": 4.4382775832526264e-05, | |
| "loss": 2.9199, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.0053635030379215, | |
| "eval_bleu_greedy": 42.24584689498649, | |
| "eval_loss": 0.4063108563423157, | |
| "eval_runtime": 176.1437, | |
| "eval_samples_per_second": 0.568, | |
| "eval_steps_per_second": 0.568, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.0120678818353237, | |
| "grad_norm": 5.380918025970459, | |
| "learning_rate": 4.4308276838262685e-05, | |
| "loss": 2.9888, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 1.0187722606327259, | |
| "grad_norm": 4.866960048675537, | |
| "learning_rate": 4.4233777843999106e-05, | |
| "loss": 3.0593, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 1.0254766394301278, | |
| "grad_norm": 5.36842155456543, | |
| "learning_rate": 4.4159278849735534e-05, | |
| "loss": 3.0062, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 1.03218101822753, | |
| "grad_norm": 5.316088676452637, | |
| "learning_rate": 4.4084779855471954e-05, | |
| "loss": 3.0334, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 1.0388853970249319, | |
| "grad_norm": 5.1895012855529785, | |
| "learning_rate": 4.4010280861208375e-05, | |
| "loss": 2.9251, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.045589775822334, | |
| "grad_norm": 4.736208438873291, | |
| "learning_rate": 4.3935781866944796e-05, | |
| "loss": 2.9938, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 1.052294154619736, | |
| "grad_norm": 5.429833889007568, | |
| "learning_rate": 4.386128287268122e-05, | |
| "loss": 3.0089, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 1.0589985334171381, | |
| "grad_norm": 4.854675769805908, | |
| "learning_rate": 4.3786783878417645e-05, | |
| "loss": 3.0657, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 1.06570291221454, | |
| "grad_norm": 4.766360759735107, | |
| "learning_rate": 4.3712284884154066e-05, | |
| "loss": 2.9854, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 1.0724072910119422, | |
| "grad_norm": 5.70284366607666, | |
| "learning_rate": 4.3637785889890487e-05, | |
| "loss": 3.0657, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.0791116698093441, | |
| "grad_norm": 5.729391574859619, | |
| "learning_rate": 4.356328689562691e-05, | |
| "loss": 2.9516, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 1.0858160486067463, | |
| "grad_norm": 5.109742164611816, | |
| "learning_rate": 4.348878790136333e-05, | |
| "loss": 3.0221, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 1.0925204274041482, | |
| "grad_norm": 5.132044315338135, | |
| "learning_rate": 4.3414288907099756e-05, | |
| "loss": 3.0308, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 1.0992248062015504, | |
| "grad_norm": 4.71274995803833, | |
| "learning_rate": 4.333978991283618e-05, | |
| "loss": 2.9777, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 1.1059291849989525, | |
| "grad_norm": 5.162365436553955, | |
| "learning_rate": 4.3265290918572605e-05, | |
| "loss": 3.0288, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.1126335637963545, | |
| "grad_norm": 4.953509330749512, | |
| "learning_rate": 4.3190791924309025e-05, | |
| "loss": 3.1481, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 1.1193379425937566, | |
| "grad_norm": 4.702221870422363, | |
| "learning_rate": 4.3116292930045446e-05, | |
| "loss": 3.0185, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 1.1260423213911586, | |
| "grad_norm": 4.8475446701049805, | |
| "learning_rate": 4.3041793935781874e-05, | |
| "loss": 2.9887, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 1.1327467001885607, | |
| "grad_norm": 5.558666706085205, | |
| "learning_rate": 4.2967294941518295e-05, | |
| "loss": 3.0207, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 1.1394510789859627, | |
| "grad_norm": 5.5212602615356445, | |
| "learning_rate": 4.2892795947254716e-05, | |
| "loss": 3.0179, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.1461554577833648, | |
| "grad_norm": 5.226992607116699, | |
| "learning_rate": 4.2818296952991137e-05, | |
| "loss": 3.0295, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 1.1528598365807667, | |
| "grad_norm": 5.354259014129639, | |
| "learning_rate": 4.274379795872756e-05, | |
| "loss": 2.9132, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 1.159564215378169, | |
| "grad_norm": 4.869040489196777, | |
| "learning_rate": 4.2669298964463985e-05, | |
| "loss": 2.9472, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 1.1662685941755708, | |
| "grad_norm": 7.068531036376953, | |
| "learning_rate": 4.2594799970200406e-05, | |
| "loss": 2.915, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 1.172972972972973, | |
| "grad_norm": 5.257974147796631, | |
| "learning_rate": 4.252030097593683e-05, | |
| "loss": 2.9875, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.179677351770375, | |
| "grad_norm": 5.5318474769592285, | |
| "learning_rate": 4.244580198167325e-05, | |
| "loss": 2.9196, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 1.186381730567777, | |
| "grad_norm": 5.792961597442627, | |
| "learning_rate": 4.2371302987409675e-05, | |
| "loss": 2.9411, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 1.1930861093651792, | |
| "grad_norm": 5.819127559661865, | |
| "learning_rate": 4.2296803993146096e-05, | |
| "loss": 3.0377, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 1.1997904881625812, | |
| "grad_norm": 5.471036911010742, | |
| "learning_rate": 4.222230499888252e-05, | |
| "loss": 2.9347, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 1.2064948669599833, | |
| "grad_norm": 5.375743865966797, | |
| "learning_rate": 4.214780600461894e-05, | |
| "loss": 2.9063, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.2131992457573852, | |
| "grad_norm": 5.005084991455078, | |
| "learning_rate": 4.207330701035536e-05, | |
| "loss": 2.9835, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 1.2199036245547874, | |
| "grad_norm": 4.6130475997924805, | |
| "learning_rate": 4.1998808016091787e-05, | |
| "loss": 2.9734, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 1.2266080033521893, | |
| "grad_norm": 4.87117338180542, | |
| "learning_rate": 4.192430902182821e-05, | |
| "loss": 2.873, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 1.2333123821495915, | |
| "grad_norm": 5.72168493270874, | |
| "learning_rate": 4.184981002756463e-05, | |
| "loss": 2.8803, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 1.2400167609469934, | |
| "grad_norm": 5.60111665725708, | |
| "learning_rate": 4.177531103330105e-05, | |
| "loss": 2.9545, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.2467211397443956, | |
| "grad_norm": 5.848781108856201, | |
| "learning_rate": 4.170081203903747e-05, | |
| "loss": 2.8407, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 1.2534255185417975, | |
| "grad_norm": 5.473219394683838, | |
| "learning_rate": 4.16263130447739e-05, | |
| "loss": 2.957, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 1.2601298973391997, | |
| "grad_norm": 4.950867652893066, | |
| "learning_rate": 4.155181405051032e-05, | |
| "loss": 2.9443, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 1.2668342761366018, | |
| "grad_norm": 5.851542949676514, | |
| "learning_rate": 4.147731505624674e-05, | |
| "loss": 2.9224, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 1.2735386549340038, | |
| "grad_norm": 5.347169876098633, | |
| "learning_rate": 4.140281606198316e-05, | |
| "loss": 2.9604, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.2802430337314057, | |
| "grad_norm": 4.820839881896973, | |
| "learning_rate": 4.132831706771959e-05, | |
| "loss": 2.9367, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 1.2869474125288078, | |
| "grad_norm": 4.836258411407471, | |
| "learning_rate": 4.125381807345601e-05, | |
| "loss": 2.9184, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 1.29365179132621, | |
| "grad_norm": 4.943812847137451, | |
| "learning_rate": 4.1179319079192437e-05, | |
| "loss": 2.935, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 1.300356170123612, | |
| "grad_norm": 6.155868053436279, | |
| "learning_rate": 4.110482008492886e-05, | |
| "loss": 2.9132, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 1.307060548921014, | |
| "grad_norm": 5.259906768798828, | |
| "learning_rate": 4.103032109066528e-05, | |
| "loss": 2.8939, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.313764927718416, | |
| "grad_norm": 5.179457664489746, | |
| "learning_rate": 4.09558220964017e-05, | |
| "loss": 2.9348, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 1.3204693065158182, | |
| "grad_norm": 4.546389579772949, | |
| "learning_rate": 4.088132310213813e-05, | |
| "loss": 2.8441, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 1.32717368531322, | |
| "grad_norm": 5.706070423126221, | |
| "learning_rate": 4.080682410787455e-05, | |
| "loss": 3.0033, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 1.3338780641106223, | |
| "grad_norm": 6.009608268737793, | |
| "learning_rate": 4.073232511361097e-05, | |
| "loss": 2.8735, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 1.3405824429080244, | |
| "grad_norm": 4.916238784790039, | |
| "learning_rate": 4.065782611934739e-05, | |
| "loss": 2.9641, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.3405824429080244, | |
| "eval_bleu_greedy": 42.46623138458086, | |
| "eval_loss": 0.3836318850517273, | |
| "eval_runtime": 102.4476, | |
| "eval_samples_per_second": 0.976, | |
| "eval_steps_per_second": 0.976, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.3472868217054264, | |
| "grad_norm": 4.503624439239502, | |
| "learning_rate": 4.058332712508382e-05, | |
| "loss": 2.9549, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 1.3539912005028283, | |
| "grad_norm": 4.97296142578125, | |
| "learning_rate": 4.050882813082024e-05, | |
| "loss": 2.8055, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 1.3606955793002304, | |
| "grad_norm": 5.414029598236084, | |
| "learning_rate": 4.043432913655666e-05, | |
| "loss": 2.9634, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 1.3673999580976326, | |
| "grad_norm": 5.9259033203125, | |
| "learning_rate": 4.035983014229308e-05, | |
| "loss": 2.8464, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 1.3741043368950345, | |
| "grad_norm": 5.309682846069336, | |
| "learning_rate": 4.02853311480295e-05, | |
| "loss": 2.82, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.3808087156924367, | |
| "grad_norm": 5.620110034942627, | |
| "learning_rate": 4.021083215376593e-05, | |
| "loss": 2.8368, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 1.3875130944898386, | |
| "grad_norm": 5.198604583740234, | |
| "learning_rate": 4.013633315950235e-05, | |
| "loss": 2.8964, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 1.3942174732872408, | |
| "grad_norm": 5.072934150695801, | |
| "learning_rate": 4.006183416523877e-05, | |
| "loss": 2.8942, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 1.4009218520846427, | |
| "grad_norm": 4.95306921005249, | |
| "learning_rate": 3.998733517097519e-05, | |
| "loss": 2.838, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 1.4076262308820449, | |
| "grad_norm": 4.760760307312012, | |
| "learning_rate": 3.991283617671162e-05, | |
| "loss": 2.8592, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.414330609679447, | |
| "grad_norm": 4.957187652587891, | |
| "learning_rate": 3.983833718244804e-05, | |
| "loss": 2.904, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 1.421034988476849, | |
| "grad_norm": 4.402778625488281, | |
| "learning_rate": 3.976383818818446e-05, | |
| "loss": 2.7385, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 1.4277393672742509, | |
| "grad_norm": 5.658669471740723, | |
| "learning_rate": 3.968933919392088e-05, | |
| "loss": 2.8683, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 1.434443746071653, | |
| "grad_norm": 4.498071670532227, | |
| "learning_rate": 3.96148401996573e-05, | |
| "loss": 2.9447, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 1.4411481248690552, | |
| "grad_norm": 5.859282970428467, | |
| "learning_rate": 3.954034120539373e-05, | |
| "loss": 2.7831, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.4478525036664571, | |
| "grad_norm": 4.9684062004089355, | |
| "learning_rate": 3.946584221113015e-05, | |
| "loss": 2.8563, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 1.4545568824638593, | |
| "grad_norm": 4.819912910461426, | |
| "learning_rate": 3.939134321686657e-05, | |
| "loss": 2.8287, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 1.4612612612612612, | |
| "grad_norm": 5.544245719909668, | |
| "learning_rate": 3.931684422260299e-05, | |
| "loss": 2.8593, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 1.4679656400586634, | |
| "grad_norm": 5.144392013549805, | |
| "learning_rate": 3.924234522833941e-05, | |
| "loss": 2.925, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 1.4746700188560653, | |
| "grad_norm": 5.183608531951904, | |
| "learning_rate": 3.916784623407585e-05, | |
| "loss": 2.842, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.4813743976534675, | |
| "grad_norm": 5.042162895202637, | |
| "learning_rate": 3.909334723981227e-05, | |
| "loss": 2.7355, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 1.4880787764508696, | |
| "grad_norm": 4.931463241577148, | |
| "learning_rate": 3.901884824554869e-05, | |
| "loss": 2.8361, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 1.4947831552482715, | |
| "grad_norm": 5.274477481842041, | |
| "learning_rate": 3.894434925128511e-05, | |
| "loss": 2.8082, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 1.5014875340456735, | |
| "grad_norm": 5.249208450317383, | |
| "learning_rate": 3.886985025702153e-05, | |
| "loss": 2.8168, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 1.5081919128430756, | |
| "grad_norm": 4.666344165802002, | |
| "learning_rate": 3.879535126275796e-05, | |
| "loss": 2.7648, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.5148962916404778, | |
| "grad_norm": 5.37315559387207, | |
| "learning_rate": 3.872085226849438e-05, | |
| "loss": 2.9321, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 1.5216006704378797, | |
| "grad_norm": 6.209960460662842, | |
| "learning_rate": 3.86463532742308e-05, | |
| "loss": 2.8484, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 1.5283050492352817, | |
| "grad_norm": 4.266842365264893, | |
| "learning_rate": 3.857185427996722e-05, | |
| "loss": 2.8629, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 1.5350094280326838, | |
| "grad_norm": 4.92935848236084, | |
| "learning_rate": 3.849735528570364e-05, | |
| "loss": 2.8121, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 1.541713806830086, | |
| "grad_norm": 5.50532865524292, | |
| "learning_rate": 3.842285629144007e-05, | |
| "loss": 2.8396, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.548418185627488, | |
| "grad_norm": 4.804236888885498, | |
| "learning_rate": 3.834835729717649e-05, | |
| "loss": 2.7826, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 1.55512256442489, | |
| "grad_norm": 4.995475769042969, | |
| "learning_rate": 3.827385830291291e-05, | |
| "loss": 2.8538, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 1.5618269432222922, | |
| "grad_norm": 4.848133087158203, | |
| "learning_rate": 3.819935930864933e-05, | |
| "loss": 2.8618, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 1.5685313220196941, | |
| "grad_norm": 4.338679790496826, | |
| "learning_rate": 3.812486031438576e-05, | |
| "loss": 2.7829, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 1.575235700817096, | |
| "grad_norm": 4.795759201049805, | |
| "learning_rate": 3.805036132012218e-05, | |
| "loss": 2.7286, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.5819400796144982, | |
| "grad_norm": 5.834278106689453, | |
| "learning_rate": 3.79758623258586e-05, | |
| "loss": 2.7273, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 1.5886444584119004, | |
| "grad_norm": 4.841015338897705, | |
| "learning_rate": 3.790136333159502e-05, | |
| "loss": 2.8229, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 1.5953488372093023, | |
| "grad_norm": 5.714345932006836, | |
| "learning_rate": 3.7826864337331444e-05, | |
| "loss": 2.7513, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 1.6020532160067043, | |
| "grad_norm": 4.734280586242676, | |
| "learning_rate": 3.775236534306787e-05, | |
| "loss": 2.8036, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 1.6087575948041064, | |
| "grad_norm": 5.643955707550049, | |
| "learning_rate": 3.767786634880429e-05, | |
| "loss": 2.7828, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.6154619736015086, | |
| "grad_norm": 5.758233070373535, | |
| "learning_rate": 3.7603367354540713e-05, | |
| "loss": 2.868, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 1.6221663523989105, | |
| "grad_norm": 4.5780110359191895, | |
| "learning_rate": 3.7528868360277134e-05, | |
| "loss": 2.7409, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 1.6288707311963124, | |
| "grad_norm": 4.986692428588867, | |
| "learning_rate": 3.745436936601356e-05, | |
| "loss": 2.765, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 1.6355751099937148, | |
| "grad_norm": 4.5255913734436035, | |
| "learning_rate": 3.737987037174998e-05, | |
| "loss": 2.7851, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 1.6422794887911167, | |
| "grad_norm": 4.560822486877441, | |
| "learning_rate": 3.7305371377486404e-05, | |
| "loss": 2.7158, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.6489838675885187, | |
| "grad_norm": 5.068883419036865, | |
| "learning_rate": 3.7230872383222825e-05, | |
| "loss": 2.6394, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 1.6556882463859208, | |
| "grad_norm": 5.706743240356445, | |
| "learning_rate": 3.715637338895925e-05, | |
| "loss": 2.6872, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 1.662392625183323, | |
| "grad_norm": 5.257516860961914, | |
| "learning_rate": 3.708187439469567e-05, | |
| "loss": 2.7565, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 1.669097003980725, | |
| "grad_norm": 5.708479881286621, | |
| "learning_rate": 3.70073754004321e-05, | |
| "loss": 2.7354, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 1.6758013827781268, | |
| "grad_norm": 5.4913482666015625, | |
| "learning_rate": 3.693287640616852e-05, | |
| "loss": 2.7493, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.6758013827781268, | |
| "eval_bleu_greedy": 43.18356706938715, | |
| "eval_loss": 0.3662695586681366, | |
| "eval_runtime": 57.826, | |
| "eval_samples_per_second": 1.729, | |
| "eval_steps_per_second": 1.729, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.682505761575529, | |
| "grad_norm": 5.242050647735596, | |
| "learning_rate": 3.685837741190494e-05, | |
| "loss": 2.8011, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 1.6892101403729312, | |
| "grad_norm": 5.231212615966797, | |
| "learning_rate": 3.6783878417641363e-05, | |
| "loss": 2.8461, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 1.695914519170333, | |
| "grad_norm": 4.671517848968506, | |
| "learning_rate": 3.6709379423377784e-05, | |
| "loss": 2.7855, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 1.702618897967735, | |
| "grad_norm": 6.603331565856934, | |
| "learning_rate": 3.663488042911421e-05, | |
| "loss": 2.7619, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 1.7093232767651372, | |
| "grad_norm": 5.420257568359375, | |
| "learning_rate": 3.656038143485063e-05, | |
| "loss": 2.8161, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.7160276555625393, | |
| "grad_norm": 6.228781700134277, | |
| "learning_rate": 3.6485882440587054e-05, | |
| "loss": 2.8839, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 1.7227320343599413, | |
| "grad_norm": 5.307265758514404, | |
| "learning_rate": 3.6411383446323475e-05, | |
| "loss": 2.7141, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 1.7294364131573434, | |
| "grad_norm": 5.219127178192139, | |
| "learning_rate": 3.63368844520599e-05, | |
| "loss": 2.6867, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 1.7361407919547456, | |
| "grad_norm": 7.216719150543213, | |
| "learning_rate": 3.626238545779632e-05, | |
| "loss": 2.8654, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 1.7428451707521475, | |
| "grad_norm": 5.994204998016357, | |
| "learning_rate": 3.6187886463532744e-05, | |
| "loss": 2.7267, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.7495495495495494, | |
| "grad_norm": 5.58418607711792, | |
| "learning_rate": 3.6113387469269165e-05, | |
| "loss": 2.809, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 1.7562539283469516, | |
| "grad_norm": 5.072906970977783, | |
| "learning_rate": 3.6038888475005586e-05, | |
| "loss": 2.7781, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 1.7629583071443538, | |
| "grad_norm": 5.0261335372924805, | |
| "learning_rate": 3.5964389480742013e-05, | |
| "loss": 2.7554, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 1.7696626859417557, | |
| "grad_norm": 5.358790874481201, | |
| "learning_rate": 3.5889890486478434e-05, | |
| "loss": 2.8101, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 1.7763670647391576, | |
| "grad_norm": 5.563721656799316, | |
| "learning_rate": 3.5815391492214855e-05, | |
| "loss": 2.8028, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.7830714435365598, | |
| "grad_norm": 4.921150207519531, | |
| "learning_rate": 3.5740892497951276e-05, | |
| "loss": 2.7203, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 1.789775822333962, | |
| "grad_norm": 5.121066093444824, | |
| "learning_rate": 3.5666393503687704e-05, | |
| "loss": 2.8034, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 1.7964802011313639, | |
| "grad_norm": 5.708595275878906, | |
| "learning_rate": 3.5591894509424125e-05, | |
| "loss": 2.6728, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 1.803184579928766, | |
| "grad_norm": 6.127959728240967, | |
| "learning_rate": 3.5517395515160546e-05, | |
| "loss": 2.8047, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 1.8098889587261682, | |
| "grad_norm": 4.8620405197143555, | |
| "learning_rate": 3.5442896520896966e-05, | |
| "loss": 2.6712, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.81659333752357, | |
| "grad_norm": 5.093926906585693, | |
| "learning_rate": 3.536839752663339e-05, | |
| "loss": 2.7286, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 1.823297716320972, | |
| "grad_norm": 5.522273063659668, | |
| "learning_rate": 3.5293898532369815e-05, | |
| "loss": 2.7907, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 1.8300020951183742, | |
| "grad_norm": 4.671688556671143, | |
| "learning_rate": 3.5219399538106236e-05, | |
| "loss": 2.7235, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 1.8367064739157763, | |
| "grad_norm": 4.80985689163208, | |
| "learning_rate": 3.514490054384266e-05, | |
| "loss": 2.7607, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 1.8434108527131783, | |
| "grad_norm": 5.098107814788818, | |
| "learning_rate": 3.5070401549579084e-05, | |
| "loss": 2.6983, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.8501152315105802, | |
| "grad_norm": 5.1361236572265625, | |
| "learning_rate": 3.4995902555315505e-05, | |
| "loss": 2.6083, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 1.8568196103079824, | |
| "grad_norm": 5.15921688079834, | |
| "learning_rate": 3.492140356105193e-05, | |
| "loss": 2.7306, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 1.8635239891053845, | |
| "grad_norm": 5.7626519203186035, | |
| "learning_rate": 3.4846904566788354e-05, | |
| "loss": 2.774, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 1.8702283679027865, | |
| "grad_norm": 5.519023895263672, | |
| "learning_rate": 3.4772405572524775e-05, | |
| "loss": 2.7246, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 1.8769327467001886, | |
| "grad_norm": 5.080531120300293, | |
| "learning_rate": 3.4697906578261196e-05, | |
| "loss": 2.8704, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.8836371254975908, | |
| "grad_norm": 4.591396808624268, | |
| "learning_rate": 3.4623407583997616e-05, | |
| "loss": 2.7437, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 1.8903415042949927, | |
| "grad_norm": 5.403082847595215, | |
| "learning_rate": 3.4548908589734044e-05, | |
| "loss": 2.6397, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 1.8970458830923946, | |
| "grad_norm": 6.168285369873047, | |
| "learning_rate": 3.4474409595470465e-05, | |
| "loss": 2.8026, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 1.9037502618897968, | |
| "grad_norm": 6.647582530975342, | |
| "learning_rate": 3.4399910601206886e-05, | |
| "loss": 2.6709, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 1.910454640687199, | |
| "grad_norm": 5.180938720703125, | |
| "learning_rate": 3.432541160694331e-05, | |
| "loss": 2.794, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.9171590194846009, | |
| "grad_norm": 5.552131175994873, | |
| "learning_rate": 3.425091261267973e-05, | |
| "loss": 2.7171, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 1.9238633982820028, | |
| "grad_norm": 5.482553958892822, | |
| "learning_rate": 3.4176413618416155e-05, | |
| "loss": 2.7021, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 1.930567777079405, | |
| "grad_norm": 4.879262447357178, | |
| "learning_rate": 3.4101914624152576e-05, | |
| "loss": 2.5908, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 1.9372721558768071, | |
| "grad_norm": 5.676559925079346, | |
| "learning_rate": 3.4027415629889e-05, | |
| "loss": 2.7314, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 1.943976534674209, | |
| "grad_norm": 5.349039077758789, | |
| "learning_rate": 3.395291663562542e-05, | |
| "loss": 2.7559, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.950680913471611, | |
| "grad_norm": 4.996124744415283, | |
| "learning_rate": 3.3878417641361846e-05, | |
| "loss": 2.696, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 1.9573852922690131, | |
| "grad_norm": 5.283078193664551, | |
| "learning_rate": 3.3803918647098266e-05, | |
| "loss": 2.7735, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 1.9640896710664153, | |
| "grad_norm": 5.360789775848389, | |
| "learning_rate": 3.372941965283469e-05, | |
| "loss": 2.6893, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 1.9707940498638172, | |
| "grad_norm": 4.45306921005249, | |
| "learning_rate": 3.365492065857111e-05, | |
| "loss": 2.7067, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 1.9774984286612194, | |
| "grad_norm": 5.298338890075684, | |
| "learning_rate": 3.358042166430753e-05, | |
| "loss": 2.7175, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.9842028074586215, | |
| "grad_norm": 5.592753887176514, | |
| "learning_rate": 3.350592267004396e-05, | |
| "loss": 2.6956, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 1.9909071862560235, | |
| "grad_norm": 4.717761993408203, | |
| "learning_rate": 3.343142367578038e-05, | |
| "loss": 2.8405, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 1.9976115650534254, | |
| "grad_norm": 5.031712532043457, | |
| "learning_rate": 3.33569246815168e-05, | |
| "loss": 2.7726, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 2.004022627278441, | |
| "grad_norm": 5.590005397796631, | |
| "learning_rate": 3.328242568725322e-05, | |
| "loss": 2.3678, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 2.010727006075843, | |
| "grad_norm": 5.272172451019287, | |
| "learning_rate": 3.320792669298965e-05, | |
| "loss": 2.4877, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.010727006075843, | |
| "eval_bleu_greedy": 43.32802392418832, | |
| "eval_loss": 0.3537669777870178, | |
| "eval_runtime": 89.2292, | |
| "eval_samples_per_second": 1.121, | |
| "eval_steps_per_second": 1.121, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.0174313848732455, | |
| "grad_norm": 5.440276622772217, | |
| "learning_rate": 3.313342769872607e-05, | |
| "loss": 2.5165, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 2.0241357636706474, | |
| "grad_norm": 4.8560709953308105, | |
| "learning_rate": 3.3058928704462496e-05, | |
| "loss": 2.4584, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 2.0308401424680493, | |
| "grad_norm": 4.232001781463623, | |
| "learning_rate": 3.2984429710198916e-05, | |
| "loss": 2.5005, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 2.0375445212654517, | |
| "grad_norm": 4.847684383392334, | |
| "learning_rate": 3.290993071593534e-05, | |
| "loss": 2.5064, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 2.0442489000628536, | |
| "grad_norm": 4.831601619720459, | |
| "learning_rate": 3.283543172167176e-05, | |
| "loss": 2.5224, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 2.0509532788602556, | |
| "grad_norm": 4.691544055938721, | |
| "learning_rate": 3.2760932727408186e-05, | |
| "loss": 2.5174, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 2.0576576576576575, | |
| "grad_norm": 4.6900248527526855, | |
| "learning_rate": 3.268643373314461e-05, | |
| "loss": 2.4071, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 2.06436203645506, | |
| "grad_norm": 4.56058931350708, | |
| "learning_rate": 3.261193473888103e-05, | |
| "loss": 2.4984, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 2.071066415252462, | |
| "grad_norm": 4.817535877227783, | |
| "learning_rate": 3.253743574461745e-05, | |
| "loss": 2.5249, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 2.0777707940498638, | |
| "grad_norm": 4.966011047363281, | |
| "learning_rate": 3.2462936750353876e-05, | |
| "loss": 2.4622, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 2.0844751728472657, | |
| "grad_norm": 5.804232120513916, | |
| "learning_rate": 3.23884377560903e-05, | |
| "loss": 2.4868, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 2.091179551644668, | |
| "grad_norm": 4.816470623016357, | |
| "learning_rate": 3.231393876182672e-05, | |
| "loss": 2.3994, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 2.09788393044207, | |
| "grad_norm": 5.60403299331665, | |
| "learning_rate": 3.223943976756314e-05, | |
| "loss": 2.511, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 2.104588309239472, | |
| "grad_norm": 4.696137428283691, | |
| "learning_rate": 3.216494077329956e-05, | |
| "loss": 2.4767, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 2.1112926880368743, | |
| "grad_norm": 4.613682270050049, | |
| "learning_rate": 3.209044177903599e-05, | |
| "loss": 2.3844, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.1179970668342762, | |
| "grad_norm": 4.697776794433594, | |
| "learning_rate": 3.201594278477241e-05, | |
| "loss": 2.4177, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 2.124701445631678, | |
| "grad_norm": 5.78491735458374, | |
| "learning_rate": 3.194144379050883e-05, | |
| "loss": 2.4688, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 2.13140582442908, | |
| "grad_norm": 5.266840934753418, | |
| "learning_rate": 3.186694479624525e-05, | |
| "loss": 2.5086, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 2.1381102032264825, | |
| "grad_norm": 6.113589286804199, | |
| "learning_rate": 3.179244580198167e-05, | |
| "loss": 2.4037, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 2.1448145820238844, | |
| "grad_norm": 5.600616455078125, | |
| "learning_rate": 3.17179468077181e-05, | |
| "loss": 2.5343, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.1515189608212864, | |
| "grad_norm": 5.7312774658203125, | |
| "learning_rate": 3.164344781345452e-05, | |
| "loss": 2.4902, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 2.1582233396186883, | |
| "grad_norm": 5.237963676452637, | |
| "learning_rate": 3.156894881919094e-05, | |
| "loss": 2.5499, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 2.1649277184160907, | |
| "grad_norm": 5.6732916831970215, | |
| "learning_rate": 3.149444982492736e-05, | |
| "loss": 2.4206, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 2.1716320972134926, | |
| "grad_norm": 5.351031303405762, | |
| "learning_rate": 3.141995083066379e-05, | |
| "loss": 2.4755, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 2.1783364760108945, | |
| "grad_norm": 5.774845123291016, | |
| "learning_rate": 3.134545183640021e-05, | |
| "loss": 2.405, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.1850408548082965, | |
| "grad_norm": 6.55280876159668, | |
| "learning_rate": 3.127095284213663e-05, | |
| "loss": 2.5264, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 2.191745233605699, | |
| "grad_norm": 5.785919666290283, | |
| "learning_rate": 3.119645384787305e-05, | |
| "loss": 2.4565, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 2.1984496124031008, | |
| "grad_norm": 5.330859184265137, | |
| "learning_rate": 3.112195485360947e-05, | |
| "loss": 2.4884, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 2.2051539912005027, | |
| "grad_norm": 5.32330322265625, | |
| "learning_rate": 3.10474558593459e-05, | |
| "loss": 2.4773, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 2.211858369997905, | |
| "grad_norm": 6.741830825805664, | |
| "learning_rate": 3.097295686508233e-05, | |
| "loss": 2.424, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 2.218562748795307, | |
| "grad_norm": 4.638687610626221, | |
| "learning_rate": 3.089845787081875e-05, | |
| "loss": 2.41, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 2.225267127592709, | |
| "grad_norm": 5.136049270629883, | |
| "learning_rate": 3.082395887655517e-05, | |
| "loss": 2.4351, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 2.231971506390111, | |
| "grad_norm": 5.4034528732299805, | |
| "learning_rate": 3.074945988229159e-05, | |
| "loss": 2.4679, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 2.2386758851875133, | |
| "grad_norm": 5.091638088226318, | |
| "learning_rate": 3.067496088802802e-05, | |
| "loss": 2.5217, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 2.245380263984915, | |
| "grad_norm": 4.8217668533325195, | |
| "learning_rate": 3.060046189376444e-05, | |
| "loss": 2.4203, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 2.252084642782317, | |
| "grad_norm": 5.1358113288879395, | |
| "learning_rate": 3.052596289950086e-05, | |
| "loss": 2.4624, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 2.2587890215797195, | |
| "grad_norm": 5.394964218139648, | |
| "learning_rate": 3.045146390523728e-05, | |
| "loss": 2.3956, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 2.2654934003771214, | |
| "grad_norm": 5.930358409881592, | |
| "learning_rate": 3.0376964910973705e-05, | |
| "loss": 2.3428, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 2.2721977791745234, | |
| "grad_norm": 5.13174295425415, | |
| "learning_rate": 3.0302465916710126e-05, | |
| "loss": 2.4459, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 2.2789021579719253, | |
| "grad_norm": 5.4948248863220215, | |
| "learning_rate": 3.022796692244655e-05, | |
| "loss": 2.4351, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 2.2856065367693272, | |
| "grad_norm": 5.601160049438477, | |
| "learning_rate": 3.015346792818297e-05, | |
| "loss": 2.4617, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 2.2923109155667296, | |
| "grad_norm": 5.33076810836792, | |
| "learning_rate": 3.0078968933919395e-05, | |
| "loss": 2.435, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 2.2990152943641315, | |
| "grad_norm": 5.344810485839844, | |
| "learning_rate": 3.0004469939655816e-05, | |
| "loss": 2.4458, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 2.3057196731615335, | |
| "grad_norm": 5.304954528808594, | |
| "learning_rate": 2.9929970945392237e-05, | |
| "loss": 2.4774, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 2.312424051958936, | |
| "grad_norm": 5.514897346496582, | |
| "learning_rate": 2.985547195112866e-05, | |
| "loss": 2.5046, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.319128430756338, | |
| "grad_norm": 5.057281494140625, | |
| "learning_rate": 2.9780972956865082e-05, | |
| "loss": 2.4099, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 2.3258328095537397, | |
| "grad_norm": 5.301017761230469, | |
| "learning_rate": 2.9706473962601506e-05, | |
| "loss": 2.4959, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 2.3325371883511417, | |
| "grad_norm": 5.437694549560547, | |
| "learning_rate": 2.9631974968337927e-05, | |
| "loss": 2.4614, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 2.339241567148544, | |
| "grad_norm": 4.668396949768066, | |
| "learning_rate": 2.955747597407435e-05, | |
| "loss": 2.4309, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 2.345945945945946, | |
| "grad_norm": 5.610180854797363, | |
| "learning_rate": 2.9482976979810772e-05, | |
| "loss": 2.4053, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.345945945945946, | |
| "eval_bleu_greedy": 43.68322019190237, | |
| "eval_loss": 0.34515950083732605, | |
| "eval_runtime": 129.474, | |
| "eval_samples_per_second": 0.772, | |
| "eval_steps_per_second": 0.772, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.352650324743348, | |
| "grad_norm": 5.010399341583252, | |
| "learning_rate": 2.9408477985547193e-05, | |
| "loss": 2.4909, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 2.35935470354075, | |
| "grad_norm": 5.367332935333252, | |
| "learning_rate": 2.9333978991283618e-05, | |
| "loss": 2.4067, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 2.366059082338152, | |
| "grad_norm": 6.149716854095459, | |
| "learning_rate": 2.925947999702004e-05, | |
| "loss": 2.3804, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 2.372763461135554, | |
| "grad_norm": 5.402404308319092, | |
| "learning_rate": 2.9184981002756463e-05, | |
| "loss": 2.5418, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 2.379467839932956, | |
| "grad_norm": 5.056270599365234, | |
| "learning_rate": 2.9110482008492884e-05, | |
| "loss": 2.426, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.3861722187303585, | |
| "grad_norm": 5.246522426605225, | |
| "learning_rate": 2.9035983014229308e-05, | |
| "loss": 2.402, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 2.3928765975277604, | |
| "grad_norm": 5.366527557373047, | |
| "learning_rate": 2.8961484019965735e-05, | |
| "loss": 2.4275, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 2.3995809763251623, | |
| "grad_norm": 5.639255523681641, | |
| "learning_rate": 2.8886985025702156e-05, | |
| "loss": 2.4606, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 2.4062853551225643, | |
| "grad_norm": 4.962663650512695, | |
| "learning_rate": 2.881248603143858e-05, | |
| "loss": 2.4537, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 2.4129897339199666, | |
| "grad_norm": 5.757133483886719, | |
| "learning_rate": 2.8737987037175e-05, | |
| "loss": 2.4827, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.4196941127173686, | |
| "grad_norm": 5.6149516105651855, | |
| "learning_rate": 2.8663488042911422e-05, | |
| "loss": 2.457, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 2.4263984915147705, | |
| "grad_norm": 5.717596054077148, | |
| "learning_rate": 2.8588989048647847e-05, | |
| "loss": 2.4372, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 2.4331028703121724, | |
| "grad_norm": 6.159657001495361, | |
| "learning_rate": 2.8514490054384268e-05, | |
| "loss": 2.4332, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 2.439807249109575, | |
| "grad_norm": 5.708389759063721, | |
| "learning_rate": 2.8439991060120692e-05, | |
| "loss": 2.3862, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 2.4465116279069767, | |
| "grad_norm": 5.569727420806885, | |
| "learning_rate": 2.8365492065857113e-05, | |
| "loss": 2.4465, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.4532160067043787, | |
| "grad_norm": 5.431128025054932, | |
| "learning_rate": 2.8290993071593537e-05, | |
| "loss": 2.4079, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 2.459920385501781, | |
| "grad_norm": 5.69559907913208, | |
| "learning_rate": 2.8216494077329958e-05, | |
| "loss": 2.4039, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 2.466624764299183, | |
| "grad_norm": 5.6564226150512695, | |
| "learning_rate": 2.814199508306638e-05, | |
| "loss": 2.5032, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 2.473329143096585, | |
| "grad_norm": 5.606060028076172, | |
| "learning_rate": 2.8067496088802803e-05, | |
| "loss": 2.2959, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 2.480033521893987, | |
| "grad_norm": 5.51323127746582, | |
| "learning_rate": 2.7992997094539224e-05, | |
| "loss": 2.3528, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.4867379006913892, | |
| "grad_norm": 4.964369773864746, | |
| "learning_rate": 2.7918498100275648e-05, | |
| "loss": 2.4204, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 2.493442279488791, | |
| "grad_norm": 4.708135604858398, | |
| "learning_rate": 2.784399910601207e-05, | |
| "loss": 2.4221, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 2.500146658286193, | |
| "grad_norm": 4.968412399291992, | |
| "learning_rate": 2.7769500111748493e-05, | |
| "loss": 2.4488, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 2.506851037083595, | |
| "grad_norm": 4.854182243347168, | |
| "learning_rate": 2.7695001117484914e-05, | |
| "loss": 2.412, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 2.5135554158809974, | |
| "grad_norm": 5.069591999053955, | |
| "learning_rate": 2.762050212322134e-05, | |
| "loss": 2.4355, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.5202597946783993, | |
| "grad_norm": 5.306612014770508, | |
| "learning_rate": 2.754600312895776e-05, | |
| "loss": 2.4293, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 2.5269641734758013, | |
| "grad_norm": 5.377747058868408, | |
| "learning_rate": 2.747150413469418e-05, | |
| "loss": 2.5042, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 2.5336685522732036, | |
| "grad_norm": 5.272029399871826, | |
| "learning_rate": 2.7397005140430604e-05, | |
| "loss": 2.4412, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 2.5403729310706056, | |
| "grad_norm": 6.27547550201416, | |
| "learning_rate": 2.7322506146167025e-05, | |
| "loss": 2.4129, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 2.5470773098680075, | |
| "grad_norm": 4.881494045257568, | |
| "learning_rate": 2.724800715190345e-05, | |
| "loss": 2.4177, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.5537816886654094, | |
| "grad_norm": 4.519489288330078, | |
| "learning_rate": 2.717350815763987e-05, | |
| "loss": 2.486, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 2.5604860674628114, | |
| "grad_norm": 5.494065284729004, | |
| "learning_rate": 2.7099009163376295e-05, | |
| "loss": 2.3461, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 2.5671904462602138, | |
| "grad_norm": 5.3841776847839355, | |
| "learning_rate": 2.7024510169112716e-05, | |
| "loss": 2.4056, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 2.5738948250576157, | |
| "grad_norm": 4.913102626800537, | |
| "learning_rate": 2.6950011174849143e-05, | |
| "loss": 2.3517, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 2.5805992038550176, | |
| "grad_norm": 5.246600151062012, | |
| "learning_rate": 2.6875512180585564e-05, | |
| "loss": 2.4435, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.58730358265242, | |
| "grad_norm": 5.883936882019043, | |
| "learning_rate": 2.680101318632199e-05, | |
| "loss": 2.4236, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 2.594007961449822, | |
| "grad_norm": 4.634848117828369, | |
| "learning_rate": 2.672651419205841e-05, | |
| "loss": 2.4566, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 2.600712340247224, | |
| "grad_norm": 5.388699531555176, | |
| "learning_rate": 2.6652015197794834e-05, | |
| "loss": 2.4541, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 2.6074167190446262, | |
| "grad_norm": 5.410739421844482, | |
| "learning_rate": 2.6577516203531254e-05, | |
| "loss": 2.4242, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 2.614121097842028, | |
| "grad_norm": 6.787940979003906, | |
| "learning_rate": 2.650301720926768e-05, | |
| "loss": 2.4763, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.62082547663943, | |
| "grad_norm": 5.818297863006592, | |
| "learning_rate": 2.64285182150041e-05, | |
| "loss": 2.4732, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 2.627529855436832, | |
| "grad_norm": 5.29996395111084, | |
| "learning_rate": 2.6354019220740524e-05, | |
| "loss": 2.4218, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 2.634234234234234, | |
| "grad_norm": 5.816316604614258, | |
| "learning_rate": 2.6279520226476945e-05, | |
| "loss": 2.4662, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 2.6409386130316364, | |
| "grad_norm": 6.0321149826049805, | |
| "learning_rate": 2.6205021232213366e-05, | |
| "loss": 2.437, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 2.6476429918290383, | |
| "grad_norm": 5.97547721862793, | |
| "learning_rate": 2.613052223794979e-05, | |
| "loss": 2.3648, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.65434737062644, | |
| "grad_norm": 5.269493579864502, | |
| "learning_rate": 2.605602324368621e-05, | |
| "loss": 2.4529, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 2.6610517494238426, | |
| "grad_norm": 5.6759114265441895, | |
| "learning_rate": 2.5981524249422635e-05, | |
| "loss": 2.347, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 2.6677561282212445, | |
| "grad_norm": 5.74458122253418, | |
| "learning_rate": 2.5907025255159056e-05, | |
| "loss": 2.3639, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 2.6744605070186465, | |
| "grad_norm": 6.335291385650635, | |
| "learning_rate": 2.583252626089548e-05, | |
| "loss": 2.2973, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 2.681164885816049, | |
| "grad_norm": 4.999330520629883, | |
| "learning_rate": 2.57580272666319e-05, | |
| "loss": 2.4947, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.681164885816049, | |
| "eval_bleu_greedy": 43.85960445724608, | |
| "eval_loss": 0.3381543755531311, | |
| "eval_runtime": 116.2511, | |
| "eval_samples_per_second": 0.86, | |
| "eval_steps_per_second": 0.86, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.6878692646134508, | |
| "grad_norm": 5.183805465698242, | |
| "learning_rate": 2.5683528272368322e-05, | |
| "loss": 2.4735, | |
| "step": 8020 | |
| }, | |
| { | |
| "epoch": 2.6945736434108527, | |
| "grad_norm": 5.173423767089844, | |
| "learning_rate": 2.5609029278104746e-05, | |
| "loss": 2.4424, | |
| "step": 8040 | |
| }, | |
| { | |
| "epoch": 2.7012780222082546, | |
| "grad_norm": 5.538326740264893, | |
| "learning_rate": 2.5534530283841167e-05, | |
| "loss": 2.4517, | |
| "step": 8060 | |
| }, | |
| { | |
| "epoch": 2.7079824010056566, | |
| "grad_norm": 5.611026287078857, | |
| "learning_rate": 2.546003128957759e-05, | |
| "loss": 2.4325, | |
| "step": 8080 | |
| }, | |
| { | |
| "epoch": 2.714686779803059, | |
| "grad_norm": 5.152528762817383, | |
| "learning_rate": 2.5385532295314012e-05, | |
| "loss": 2.3573, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.721391158600461, | |
| "grad_norm": 5.090909481048584, | |
| "learning_rate": 2.5311033301050437e-05, | |
| "loss": 2.3834, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 2.728095537397863, | |
| "grad_norm": 5.87515926361084, | |
| "learning_rate": 2.5236534306786857e-05, | |
| "loss": 2.4277, | |
| "step": 8140 | |
| }, | |
| { | |
| "epoch": 2.734799916195265, | |
| "grad_norm": 5.354443073272705, | |
| "learning_rate": 2.516203531252328e-05, | |
| "loss": 2.4928, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 2.741504294992667, | |
| "grad_norm": 5.228540897369385, | |
| "learning_rate": 2.5087536318259703e-05, | |
| "loss": 2.4103, | |
| "step": 8180 | |
| }, | |
| { | |
| "epoch": 2.748208673790069, | |
| "grad_norm": 5.605003356933594, | |
| "learning_rate": 2.5013037323996123e-05, | |
| "loss": 2.4431, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.7549130525874714, | |
| "grad_norm": 5.046622276306152, | |
| "learning_rate": 2.493853832973255e-05, | |
| "loss": 2.3604, | |
| "step": 8220 | |
| }, | |
| { | |
| "epoch": 2.7616174313848734, | |
| "grad_norm": 5.582128047943115, | |
| "learning_rate": 2.4864039335468972e-05, | |
| "loss": 2.3736, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 2.7683218101822753, | |
| "grad_norm": 5.479374408721924, | |
| "learning_rate": 2.4789540341205393e-05, | |
| "loss": 2.3829, | |
| "step": 8260 | |
| }, | |
| { | |
| "epoch": 2.7750261889796772, | |
| "grad_norm": 5.341858386993408, | |
| "learning_rate": 2.4715041346941817e-05, | |
| "loss": 2.391, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 2.781730567777079, | |
| "grad_norm": 5.183146953582764, | |
| "learning_rate": 2.4640542352678238e-05, | |
| "loss": 2.4453, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.7884349465744815, | |
| "grad_norm": 5.269819736480713, | |
| "learning_rate": 2.4566043358414662e-05, | |
| "loss": 2.393, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 2.7951393253718835, | |
| "grad_norm": 5.160492420196533, | |
| "learning_rate": 2.4491544364151083e-05, | |
| "loss": 2.3292, | |
| "step": 8340 | |
| }, | |
| { | |
| "epoch": 2.8018437041692854, | |
| "grad_norm": 5.365363121032715, | |
| "learning_rate": 2.4417045369887507e-05, | |
| "loss": 2.423, | |
| "step": 8360 | |
| }, | |
| { | |
| "epoch": 2.808548082966688, | |
| "grad_norm": 5.038601398468018, | |
| "learning_rate": 2.4342546375623932e-05, | |
| "loss": 2.4559, | |
| "step": 8380 | |
| }, | |
| { | |
| "epoch": 2.8152524617640897, | |
| "grad_norm": 6.426731109619141, | |
| "learning_rate": 2.4268047381360353e-05, | |
| "loss": 2.4339, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.8219568405614917, | |
| "grad_norm": 5.634438991546631, | |
| "learning_rate": 2.4193548387096777e-05, | |
| "loss": 2.4238, | |
| "step": 8420 | |
| }, | |
| { | |
| "epoch": 2.828661219358894, | |
| "grad_norm": 5.305586338043213, | |
| "learning_rate": 2.4119049392833198e-05, | |
| "loss": 2.3275, | |
| "step": 8440 | |
| }, | |
| { | |
| "epoch": 2.835365598156296, | |
| "grad_norm": 5.70265531539917, | |
| "learning_rate": 2.4044550398569622e-05, | |
| "loss": 2.3728, | |
| "step": 8460 | |
| }, | |
| { | |
| "epoch": 2.842069976953698, | |
| "grad_norm": 5.732183933258057, | |
| "learning_rate": 2.3970051404306043e-05, | |
| "loss": 2.4455, | |
| "step": 8480 | |
| }, | |
| { | |
| "epoch": 2.8487743557511, | |
| "grad_norm": 5.569748401641846, | |
| "learning_rate": 2.3895552410042464e-05, | |
| "loss": 2.4491, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.8554787345485018, | |
| "grad_norm": 6.472934246063232, | |
| "learning_rate": 2.3821053415778888e-05, | |
| "loss": 2.4048, | |
| "step": 8520 | |
| }, | |
| { | |
| "epoch": 2.862183113345904, | |
| "grad_norm": 6.311745643615723, | |
| "learning_rate": 2.374655442151531e-05, | |
| "loss": 2.4798, | |
| "step": 8540 | |
| }, | |
| { | |
| "epoch": 2.868887492143306, | |
| "grad_norm": 5.956141948699951, | |
| "learning_rate": 2.3672055427251733e-05, | |
| "loss": 2.4407, | |
| "step": 8560 | |
| }, | |
| { | |
| "epoch": 2.875591870940708, | |
| "grad_norm": 5.066039085388184, | |
| "learning_rate": 2.3597556432988154e-05, | |
| "loss": 2.4024, | |
| "step": 8580 | |
| }, | |
| { | |
| "epoch": 2.8822962497381104, | |
| "grad_norm": 5.680649757385254, | |
| "learning_rate": 2.352305743872458e-05, | |
| "loss": 2.3765, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 2.8890006285355123, | |
| "grad_norm": 6.591916561126709, | |
| "learning_rate": 2.3448558444461e-05, | |
| "loss": 2.3552, | |
| "step": 8620 | |
| }, | |
| { | |
| "epoch": 2.8957050073329142, | |
| "grad_norm": 5.818574905395508, | |
| "learning_rate": 2.3374059450197424e-05, | |
| "loss": 2.4488, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 2.9024093861303166, | |
| "grad_norm": 5.694536209106445, | |
| "learning_rate": 2.3299560455933848e-05, | |
| "loss": 2.3762, | |
| "step": 8660 | |
| }, | |
| { | |
| "epoch": 2.9091137649277186, | |
| "grad_norm": 4.8993659019470215, | |
| "learning_rate": 2.322506146167027e-05, | |
| "loss": 2.3233, | |
| "step": 8680 | |
| }, | |
| { | |
| "epoch": 2.9158181437251205, | |
| "grad_norm": 5.326479911804199, | |
| "learning_rate": 2.3150562467406693e-05, | |
| "loss": 2.2993, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 2.9225225225225224, | |
| "grad_norm": 5.418346405029297, | |
| "learning_rate": 2.3076063473143114e-05, | |
| "loss": 2.3986, | |
| "step": 8720 | |
| }, | |
| { | |
| "epoch": 2.9292269013199244, | |
| "grad_norm": 5.561238765716553, | |
| "learning_rate": 2.3001564478879538e-05, | |
| "loss": 2.3875, | |
| "step": 8740 | |
| }, | |
| { | |
| "epoch": 2.9359312801173267, | |
| "grad_norm": 5.399714946746826, | |
| "learning_rate": 2.292706548461596e-05, | |
| "loss": 2.4019, | |
| "step": 8760 | |
| }, | |
| { | |
| "epoch": 2.9426356589147287, | |
| "grad_norm": 5.176342964172363, | |
| "learning_rate": 2.285256649035238e-05, | |
| "loss": 2.3222, | |
| "step": 8780 | |
| }, | |
| { | |
| "epoch": 2.9493400377121306, | |
| "grad_norm": 4.745908260345459, | |
| "learning_rate": 2.2778067496088804e-05, | |
| "loss": 2.4057, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 2.956044416509533, | |
| "grad_norm": 6.640355587005615, | |
| "learning_rate": 2.2703568501825225e-05, | |
| "loss": 2.3954, | |
| "step": 8820 | |
| }, | |
| { | |
| "epoch": 2.962748795306935, | |
| "grad_norm": 6.0387749671936035, | |
| "learning_rate": 2.262906950756165e-05, | |
| "loss": 2.2882, | |
| "step": 8840 | |
| }, | |
| { | |
| "epoch": 2.969453174104337, | |
| "grad_norm": 6.5941596031188965, | |
| "learning_rate": 2.255457051329807e-05, | |
| "loss": 2.3585, | |
| "step": 8860 | |
| }, | |
| { | |
| "epoch": 2.9761575529017392, | |
| "grad_norm": 6.242068767547607, | |
| "learning_rate": 2.2480071519034494e-05, | |
| "loss": 2.3699, | |
| "step": 8880 | |
| }, | |
| { | |
| "epoch": 2.982861931699141, | |
| "grad_norm": 6.592676162719727, | |
| "learning_rate": 2.2405572524770915e-05, | |
| "loss": 2.376, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 2.989566310496543, | |
| "grad_norm": 5.625716209411621, | |
| "learning_rate": 2.233107353050734e-05, | |
| "loss": 2.321, | |
| "step": 8920 | |
| }, | |
| { | |
| "epoch": 2.996270689293945, | |
| "grad_norm": 5.320464134216309, | |
| "learning_rate": 2.2256574536243764e-05, | |
| "loss": 2.3896, | |
| "step": 8940 | |
| }, | |
| { | |
| "epoch": 3.0026817515189608, | |
| "grad_norm": 4.8960418701171875, | |
| "learning_rate": 2.2182075541980185e-05, | |
| "loss": 2.1192, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 3.0093861303163627, | |
| "grad_norm": 5.314412593841553, | |
| "learning_rate": 2.210757654771661e-05, | |
| "loss": 2.1751, | |
| "step": 8980 | |
| }, | |
| { | |
| "epoch": 3.016090509113765, | |
| "grad_norm": 5.510196208953857, | |
| "learning_rate": 2.203307755345303e-05, | |
| "loss": 2.2691, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 3.016090509113765, | |
| "eval_bleu_greedy": 43.740830458991255, | |
| "eval_loss": 0.33234962821006775, | |
| "eval_runtime": 141.0542, | |
| "eval_samples_per_second": 0.709, | |
| "eval_steps_per_second": 0.709, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 3.022794887911167, | |
| "grad_norm": 5.984165191650391, | |
| "learning_rate": 2.195857855918945e-05, | |
| "loss": 2.1367, | |
| "step": 9020 | |
| }, | |
| { | |
| "epoch": 3.029499266708569, | |
| "grad_norm": 5.3130669593811035, | |
| "learning_rate": 2.1884079564925875e-05, | |
| "loss": 2.2206, | |
| "step": 9040 | |
| }, | |
| { | |
| "epoch": 3.036203645505971, | |
| "grad_norm": 5.565290927886963, | |
| "learning_rate": 2.1809580570662296e-05, | |
| "loss": 2.1031, | |
| "step": 9060 | |
| }, | |
| { | |
| "epoch": 3.0429080243033733, | |
| "grad_norm": 5.757679462432861, | |
| "learning_rate": 2.173508157639872e-05, | |
| "loss": 2.2013, | |
| "step": 9080 | |
| }, | |
| { | |
| "epoch": 3.049612403100775, | |
| "grad_norm": 5.874185085296631, | |
| "learning_rate": 2.166058258213514e-05, | |
| "loss": 2.118, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 3.056316781898177, | |
| "grad_norm": 5.614380836486816, | |
| "learning_rate": 2.1586083587871565e-05, | |
| "loss": 2.1824, | |
| "step": 9120 | |
| }, | |
| { | |
| "epoch": 3.0630211606955795, | |
| "grad_norm": 5.548777103424072, | |
| "learning_rate": 2.1511584593607986e-05, | |
| "loss": 2.1007, | |
| "step": 9140 | |
| }, | |
| { | |
| "epoch": 3.0697255394929814, | |
| "grad_norm": 5.060155868530273, | |
| "learning_rate": 2.1437085599344407e-05, | |
| "loss": 2.0839, | |
| "step": 9160 | |
| }, | |
| { | |
| "epoch": 3.0764299182903834, | |
| "grad_norm": 5.213876724243164, | |
| "learning_rate": 2.1362586605080835e-05, | |
| "loss": 2.1763, | |
| "step": 9180 | |
| }, | |
| { | |
| "epoch": 3.0831342970877853, | |
| "grad_norm": 5.595554351806641, | |
| "learning_rate": 2.1288087610817256e-05, | |
| "loss": 2.2138, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 3.0898386758851877, | |
| "grad_norm": 6.205844402313232, | |
| "learning_rate": 2.121358861655368e-05, | |
| "loss": 2.1493, | |
| "step": 9220 | |
| }, | |
| { | |
| "epoch": 3.0965430546825896, | |
| "grad_norm": 5.1306352615356445, | |
| "learning_rate": 2.11390896222901e-05, | |
| "loss": 2.2229, | |
| "step": 9240 | |
| }, | |
| { | |
| "epoch": 3.1032474334799915, | |
| "grad_norm": 6.070255756378174, | |
| "learning_rate": 2.106459062802652e-05, | |
| "loss": 2.1823, | |
| "step": 9260 | |
| }, | |
| { | |
| "epoch": 3.1099518122773935, | |
| "grad_norm": 5.469337463378906, | |
| "learning_rate": 2.0990091633762946e-05, | |
| "loss": 2.1498, | |
| "step": 9280 | |
| }, | |
| { | |
| "epoch": 3.116656191074796, | |
| "grad_norm": 5.587215900421143, | |
| "learning_rate": 2.0915592639499367e-05, | |
| "loss": 2.2351, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 3.123360569872198, | |
| "grad_norm": 5.845519065856934, | |
| "learning_rate": 2.084109364523579e-05, | |
| "loss": 2.1333, | |
| "step": 9320 | |
| }, | |
| { | |
| "epoch": 3.1300649486695997, | |
| "grad_norm": 5.8296732902526855, | |
| "learning_rate": 2.0766594650972212e-05, | |
| "loss": 2.2287, | |
| "step": 9340 | |
| }, | |
| { | |
| "epoch": 3.136769327467002, | |
| "grad_norm": 5.801497459411621, | |
| "learning_rate": 2.0692095656708636e-05, | |
| "loss": 2.2238, | |
| "step": 9360 | |
| }, | |
| { | |
| "epoch": 3.143473706264404, | |
| "grad_norm": 6.177136421203613, | |
| "learning_rate": 2.0617596662445057e-05, | |
| "loss": 2.1735, | |
| "step": 9380 | |
| }, | |
| { | |
| "epoch": 3.150178085061806, | |
| "grad_norm": 6.680226802825928, | |
| "learning_rate": 2.0543097668181478e-05, | |
| "loss": 2.1529, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 3.156882463859208, | |
| "grad_norm": 6.010555267333984, | |
| "learning_rate": 2.0468598673917902e-05, | |
| "loss": 2.1644, | |
| "step": 9420 | |
| }, | |
| { | |
| "epoch": 3.1635868426566103, | |
| "grad_norm": 5.7896528244018555, | |
| "learning_rate": 2.0394099679654323e-05, | |
| "loss": 2.1184, | |
| "step": 9440 | |
| }, | |
| { | |
| "epoch": 3.170291221454012, | |
| "grad_norm": 5.442397594451904, | |
| "learning_rate": 2.031960068539075e-05, | |
| "loss": 2.1438, | |
| "step": 9460 | |
| }, | |
| { | |
| "epoch": 3.176995600251414, | |
| "grad_norm": 6.730420112609863, | |
| "learning_rate": 2.024510169112717e-05, | |
| "loss": 2.2004, | |
| "step": 9480 | |
| }, | |
| { | |
| "epoch": 3.183699979048816, | |
| "grad_norm": 5.808672904968262, | |
| "learning_rate": 2.0170602696863593e-05, | |
| "loss": 2.1597, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 3.1904043578462185, | |
| "grad_norm": 5.694986820220947, | |
| "learning_rate": 2.0096103702600017e-05, | |
| "loss": 2.2067, | |
| "step": 9520 | |
| }, | |
| { | |
| "epoch": 3.1971087366436204, | |
| "grad_norm": 5.049539089202881, | |
| "learning_rate": 2.0021604708336438e-05, | |
| "loss": 2.2071, | |
| "step": 9540 | |
| }, | |
| { | |
| "epoch": 3.2038131154410223, | |
| "grad_norm": 5.884223937988281, | |
| "learning_rate": 1.9947105714072862e-05, | |
| "loss": 2.1081, | |
| "step": 9560 | |
| }, | |
| { | |
| "epoch": 3.2105174942384247, | |
| "grad_norm": 6.397696495056152, | |
| "learning_rate": 1.9872606719809283e-05, | |
| "loss": 2.1475, | |
| "step": 9580 | |
| }, | |
| { | |
| "epoch": 3.2172218730358266, | |
| "grad_norm": 6.474588394165039, | |
| "learning_rate": 1.9798107725545707e-05, | |
| "loss": 2.0934, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 3.2239262518332286, | |
| "grad_norm": 5.678287506103516, | |
| "learning_rate": 1.9723608731282128e-05, | |
| "loss": 2.1236, | |
| "step": 9620 | |
| }, | |
| { | |
| "epoch": 3.2306306306306305, | |
| "grad_norm": 5.606823921203613, | |
| "learning_rate": 1.9649109737018552e-05, | |
| "loss": 2.2182, | |
| "step": 9640 | |
| }, | |
| { | |
| "epoch": 3.237335009428033, | |
| "grad_norm": 5.8254499435424805, | |
| "learning_rate": 1.9574610742754973e-05, | |
| "loss": 2.1691, | |
| "step": 9660 | |
| }, | |
| { | |
| "epoch": 3.244039388225435, | |
| "grad_norm": 5.576842784881592, | |
| "learning_rate": 1.9500111748491394e-05, | |
| "loss": 2.1609, | |
| "step": 9680 | |
| }, | |
| { | |
| "epoch": 3.2507437670228367, | |
| "grad_norm": 5.685579776763916, | |
| "learning_rate": 1.9425612754227818e-05, | |
| "loss": 2.1802, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 3.2574481458202387, | |
| "grad_norm": 7.00796365737915, | |
| "learning_rate": 1.935111375996424e-05, | |
| "loss": 2.1535, | |
| "step": 9720 | |
| }, | |
| { | |
| "epoch": 3.264152524617641, | |
| "grad_norm": 5.360428333282471, | |
| "learning_rate": 1.9276614765700667e-05, | |
| "loss": 2.1896, | |
| "step": 9740 | |
| }, | |
| { | |
| "epoch": 3.270856903415043, | |
| "grad_norm": 5.5369696617126465, | |
| "learning_rate": 1.9202115771437088e-05, | |
| "loss": 2.114, | |
| "step": 9760 | |
| }, | |
| { | |
| "epoch": 3.277561282212445, | |
| "grad_norm": 6.027918815612793, | |
| "learning_rate": 1.912761677717351e-05, | |
| "loss": 2.1165, | |
| "step": 9780 | |
| }, | |
| { | |
| "epoch": 3.2842656610098473, | |
| "grad_norm": 6.253023624420166, | |
| "learning_rate": 1.9053117782909933e-05, | |
| "loss": 2.132, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 3.2909700398072492, | |
| "grad_norm": 5.557729721069336, | |
| "learning_rate": 1.8978618788646354e-05, | |
| "loss": 2.098, | |
| "step": 9820 | |
| }, | |
| { | |
| "epoch": 3.297674418604651, | |
| "grad_norm": 5.2757344245910645, | |
| "learning_rate": 1.8904119794382778e-05, | |
| "loss": 2.1631, | |
| "step": 9840 | |
| }, | |
| { | |
| "epoch": 3.304378797402053, | |
| "grad_norm": 6.308342456817627, | |
| "learning_rate": 1.88296208001192e-05, | |
| "loss": 2.1348, | |
| "step": 9860 | |
| }, | |
| { | |
| "epoch": 3.311083176199455, | |
| "grad_norm": 5.6540913581848145, | |
| "learning_rate": 1.8755121805855623e-05, | |
| "loss": 2.1916, | |
| "step": 9880 | |
| }, | |
| { | |
| "epoch": 3.3177875549968574, | |
| "grad_norm": 5.817383289337158, | |
| "learning_rate": 1.8680622811592044e-05, | |
| "loss": 2.1844, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 3.3244919337942593, | |
| "grad_norm": 6.088453769683838, | |
| "learning_rate": 1.8606123817328465e-05, | |
| "loss": 2.1049, | |
| "step": 9920 | |
| }, | |
| { | |
| "epoch": 3.3311963125916613, | |
| "grad_norm": 5.837181091308594, | |
| "learning_rate": 1.853162482306489e-05, | |
| "loss": 2.1038, | |
| "step": 9940 | |
| }, | |
| { | |
| "epoch": 3.3379006913890636, | |
| "grad_norm": 5.060699939727783, | |
| "learning_rate": 1.845712582880131e-05, | |
| "loss": 2.0959, | |
| "step": 9960 | |
| }, | |
| { | |
| "epoch": 3.3446050701864656, | |
| "grad_norm": 5.951335430145264, | |
| "learning_rate": 1.8382626834537734e-05, | |
| "loss": 2.1495, | |
| "step": 9980 | |
| }, | |
| { | |
| "epoch": 3.3513094489838675, | |
| "grad_norm": 5.952776908874512, | |
| "learning_rate": 1.830812784027416e-05, | |
| "loss": 2.124, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 3.3513094489838675, | |
| "eval_bleu_greedy": 44.34903037092752, | |
| "eval_loss": 0.3302690088748932, | |
| "eval_runtime": 118.2606, | |
| "eval_samples_per_second": 0.846, | |
| "eval_steps_per_second": 0.846, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 3.35801382778127, | |
| "grad_norm": 5.622983455657959, | |
| "learning_rate": 1.823362884601058e-05, | |
| "loss": 2.1254, | |
| "step": 10020 | |
| }, | |
| { | |
| "epoch": 3.364718206578672, | |
| "grad_norm": 5.697426795959473, | |
| "learning_rate": 1.8159129851747004e-05, | |
| "loss": 2.147, | |
| "step": 10040 | |
| }, | |
| { | |
| "epoch": 3.3714225853760738, | |
| "grad_norm": 5.682360649108887, | |
| "learning_rate": 1.8084630857483425e-05, | |
| "loss": 2.1331, | |
| "step": 10060 | |
| }, | |
| { | |
| "epoch": 3.3781269641734757, | |
| "grad_norm": 6.510776519775391, | |
| "learning_rate": 1.801013186321985e-05, | |
| "loss": 2.2537, | |
| "step": 10080 | |
| }, | |
| { | |
| "epoch": 3.3848313429708776, | |
| "grad_norm": 6.060893535614014, | |
| "learning_rate": 1.793563286895627e-05, | |
| "loss": 2.1469, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 3.39153572176828, | |
| "grad_norm": 6.059525012969971, | |
| "learning_rate": 1.7861133874692694e-05, | |
| "loss": 2.1388, | |
| "step": 10120 | |
| }, | |
| { | |
| "epoch": 3.398240100565682, | |
| "grad_norm": 6.2327117919921875, | |
| "learning_rate": 1.7786634880429115e-05, | |
| "loss": 2.118, | |
| "step": 10140 | |
| }, | |
| { | |
| "epoch": 3.404944479363084, | |
| "grad_norm": 5.623783588409424, | |
| "learning_rate": 1.7712135886165536e-05, | |
| "loss": 2.1141, | |
| "step": 10160 | |
| }, | |
| { | |
| "epoch": 3.4116488581604862, | |
| "grad_norm": 6.822098255157471, | |
| "learning_rate": 1.763763689190196e-05, | |
| "loss": 2.1523, | |
| "step": 10180 | |
| }, | |
| { | |
| "epoch": 3.418353236957888, | |
| "grad_norm": 5.375659942626953, | |
| "learning_rate": 1.756313789763838e-05, | |
| "loss": 2.1232, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 3.42505761575529, | |
| "grad_norm": 6.811357498168945, | |
| "learning_rate": 1.7488638903374805e-05, | |
| "loss": 2.1598, | |
| "step": 10220 | |
| }, | |
| { | |
| "epoch": 3.431761994552692, | |
| "grad_norm": 5.709009647369385, | |
| "learning_rate": 1.7414139909111226e-05, | |
| "loss": 2.1571, | |
| "step": 10240 | |
| }, | |
| { | |
| "epoch": 3.4384663733500944, | |
| "grad_norm": 6.078985214233398, | |
| "learning_rate": 1.733964091484765e-05, | |
| "loss": 2.1624, | |
| "step": 10260 | |
| }, | |
| { | |
| "epoch": 3.4451707521474964, | |
| "grad_norm": 5.974235534667969, | |
| "learning_rate": 1.7265141920584075e-05, | |
| "loss": 2.179, | |
| "step": 10280 | |
| }, | |
| { | |
| "epoch": 3.4518751309448983, | |
| "grad_norm": 6.026799201965332, | |
| "learning_rate": 1.7190642926320496e-05, | |
| "loss": 2.1788, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 3.4585795097423, | |
| "grad_norm": 6.061217308044434, | |
| "learning_rate": 1.711614393205692e-05, | |
| "loss": 2.1601, | |
| "step": 10320 | |
| }, | |
| { | |
| "epoch": 3.4652838885397026, | |
| "grad_norm": 6.014716625213623, | |
| "learning_rate": 1.704164493779334e-05, | |
| "loss": 2.1726, | |
| "step": 10340 | |
| }, | |
| { | |
| "epoch": 3.4719882673371045, | |
| "grad_norm": 5.476884365081787, | |
| "learning_rate": 1.6967145943529765e-05, | |
| "loss": 2.1277, | |
| "step": 10360 | |
| }, | |
| { | |
| "epoch": 3.4786926461345065, | |
| "grad_norm": 6.806579113006592, | |
| "learning_rate": 1.6892646949266186e-05, | |
| "loss": 2.1742, | |
| "step": 10380 | |
| }, | |
| { | |
| "epoch": 3.485397024931909, | |
| "grad_norm": 6.386503219604492, | |
| "learning_rate": 1.6818147955002607e-05, | |
| "loss": 2.093, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 3.4921014037293108, | |
| "grad_norm": 5.830183982849121, | |
| "learning_rate": 1.674364896073903e-05, | |
| "loss": 2.2129, | |
| "step": 10420 | |
| }, | |
| { | |
| "epoch": 3.4988057825267127, | |
| "grad_norm": 7.050870895385742, | |
| "learning_rate": 1.6669149966475452e-05, | |
| "loss": 2.0302, | |
| "step": 10440 | |
| }, | |
| { | |
| "epoch": 3.505510161324115, | |
| "grad_norm": 6.339367866516113, | |
| "learning_rate": 1.6594650972211876e-05, | |
| "loss": 2.2095, | |
| "step": 10460 | |
| }, | |
| { | |
| "epoch": 3.512214540121517, | |
| "grad_norm": 6.269477844238281, | |
| "learning_rate": 1.6520151977948297e-05, | |
| "loss": 2.1547, | |
| "step": 10480 | |
| }, | |
| { | |
| "epoch": 3.518918918918919, | |
| "grad_norm": 5.818455219268799, | |
| "learning_rate": 1.644565298368472e-05, | |
| "loss": 2.1763, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 3.525623297716321, | |
| "grad_norm": 5.608326435089111, | |
| "learning_rate": 1.6371153989421142e-05, | |
| "loss": 2.1408, | |
| "step": 10520 | |
| }, | |
| { | |
| "epoch": 3.532327676513723, | |
| "grad_norm": 5.302171230316162, | |
| "learning_rate": 1.6296654995157566e-05, | |
| "loss": 2.1537, | |
| "step": 10540 | |
| }, | |
| { | |
| "epoch": 3.539032055311125, | |
| "grad_norm": 5.502221584320068, | |
| "learning_rate": 1.622215600089399e-05, | |
| "loss": 2.0883, | |
| "step": 10560 | |
| }, | |
| { | |
| "epoch": 3.545736434108527, | |
| "grad_norm": 6.126551628112793, | |
| "learning_rate": 1.614765700663041e-05, | |
| "loss": 2.1461, | |
| "step": 10580 | |
| }, | |
| { | |
| "epoch": 3.552440812905929, | |
| "grad_norm": 6.817451000213623, | |
| "learning_rate": 1.6073158012366836e-05, | |
| "loss": 2.2614, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 3.5591451917033314, | |
| "grad_norm": 6.151493072509766, | |
| "learning_rate": 1.5998659018103257e-05, | |
| "loss": 2.2219, | |
| "step": 10620 | |
| }, | |
| { | |
| "epoch": 3.5658495705007334, | |
| "grad_norm": 5.696464538574219, | |
| "learning_rate": 1.592416002383968e-05, | |
| "loss": 2.0755, | |
| "step": 10640 | |
| }, | |
| { | |
| "epoch": 3.5725539492981353, | |
| "grad_norm": 5.986370086669922, | |
| "learning_rate": 1.5849661029576102e-05, | |
| "loss": 2.1614, | |
| "step": 10660 | |
| }, | |
| { | |
| "epoch": 3.5792583280955372, | |
| "grad_norm": 5.665823936462402, | |
| "learning_rate": 1.5775162035312523e-05, | |
| "loss": 2.1477, | |
| "step": 10680 | |
| }, | |
| { | |
| "epoch": 3.5859627068929396, | |
| "grad_norm": 6.486339092254639, | |
| "learning_rate": 1.5700663041048947e-05, | |
| "loss": 2.1662, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 3.5926670856903415, | |
| "grad_norm": 6.461730003356934, | |
| "learning_rate": 1.5626164046785368e-05, | |
| "loss": 2.2309, | |
| "step": 10720 | |
| }, | |
| { | |
| "epoch": 3.5993714644877435, | |
| "grad_norm": 5.702849864959717, | |
| "learning_rate": 1.5551665052521792e-05, | |
| "loss": 2.0836, | |
| "step": 10740 | |
| }, | |
| { | |
| "epoch": 3.6060758432851454, | |
| "grad_norm": 5.847084999084473, | |
| "learning_rate": 1.5477166058258213e-05, | |
| "loss": 2.1142, | |
| "step": 10760 | |
| }, | |
| { | |
| "epoch": 3.612780222082548, | |
| "grad_norm": 6.496880531311035, | |
| "learning_rate": 1.5402667063994637e-05, | |
| "loss": 2.22, | |
| "step": 10780 | |
| }, | |
| { | |
| "epoch": 3.6194846008799497, | |
| "grad_norm": 5.130465507507324, | |
| "learning_rate": 1.5328168069731058e-05, | |
| "loss": 2.1727, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 3.6261889796773517, | |
| "grad_norm": 5.424370288848877, | |
| "learning_rate": 1.5253669075467482e-05, | |
| "loss": 2.1294, | |
| "step": 10820 | |
| }, | |
| { | |
| "epoch": 3.632893358474754, | |
| "grad_norm": 5.883772373199463, | |
| "learning_rate": 1.5179170081203905e-05, | |
| "loss": 2.2141, | |
| "step": 10840 | |
| }, | |
| { | |
| "epoch": 3.639597737272156, | |
| "grad_norm": 6.7028374671936035, | |
| "learning_rate": 1.5104671086940328e-05, | |
| "loss": 2.1481, | |
| "step": 10860 | |
| }, | |
| { | |
| "epoch": 3.646302116069558, | |
| "grad_norm": 5.938512325286865, | |
| "learning_rate": 1.503017209267675e-05, | |
| "loss": 2.0623, | |
| "step": 10880 | |
| }, | |
| { | |
| "epoch": 3.65300649486696, | |
| "grad_norm": 6.206230640411377, | |
| "learning_rate": 1.4955673098413173e-05, | |
| "loss": 2.1146, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 3.6597108736643618, | |
| "grad_norm": 5.823507308959961, | |
| "learning_rate": 1.4881174104149595e-05, | |
| "loss": 2.2025, | |
| "step": 10920 | |
| }, | |
| { | |
| "epoch": 3.666415252461764, | |
| "grad_norm": 6.715165138244629, | |
| "learning_rate": 1.4806675109886018e-05, | |
| "loss": 2.1292, | |
| "step": 10940 | |
| }, | |
| { | |
| "epoch": 3.673119631259166, | |
| "grad_norm": 5.429907321929932, | |
| "learning_rate": 1.473217611562244e-05, | |
| "loss": 2.0976, | |
| "step": 10960 | |
| }, | |
| { | |
| "epoch": 3.679824010056568, | |
| "grad_norm": 6.245195388793945, | |
| "learning_rate": 1.4657677121358861e-05, | |
| "loss": 2.086, | |
| "step": 10980 | |
| }, | |
| { | |
| "epoch": 3.6865283888539704, | |
| "grad_norm": 5.78788948059082, | |
| "learning_rate": 1.4583178127095284e-05, | |
| "loss": 2.1993, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.6865283888539704, | |
| "eval_bleu_greedy": 44.647707129353364, | |
| "eval_loss": 0.3233819603919983, | |
| "eval_runtime": 93.9426, | |
| "eval_samples_per_second": 1.064, | |
| "eval_steps_per_second": 1.064, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.6942384244709827, | |
| "grad_norm": 6.311983108520508, | |
| "learning_rate": 1.4508679132831707e-05, | |
| "loss": 1.9803, | |
| "step": 11020 | |
| }, | |
| { | |
| "epoch": 3.7009428032683847, | |
| "grad_norm": 5.580496788024902, | |
| "learning_rate": 1.4434180138568129e-05, | |
| "loss": 1.9703, | |
| "step": 11040 | |
| }, | |
| { | |
| "epoch": 3.7076471820657866, | |
| "grad_norm": 5.837003707885742, | |
| "learning_rate": 1.4359681144304552e-05, | |
| "loss": 1.9426, | |
| "step": 11060 | |
| }, | |
| { | |
| "epoch": 3.7143515608631885, | |
| "grad_norm": 5.717148780822754, | |
| "learning_rate": 1.4285182150040974e-05, | |
| "loss": 1.9986, | |
| "step": 11080 | |
| }, | |
| { | |
| "epoch": 3.721055939660591, | |
| "grad_norm": 6.6829023361206055, | |
| "learning_rate": 1.4210683155777399e-05, | |
| "loss": 1.9822, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 3.727760318457993, | |
| "grad_norm": 6.537654876708984, | |
| "learning_rate": 1.4136184161513821e-05, | |
| "loss": 1.9483, | |
| "step": 11120 | |
| }, | |
| { | |
| "epoch": 3.7344646972553948, | |
| "grad_norm": 6.249990940093994, | |
| "learning_rate": 1.4061685167250244e-05, | |
| "loss": 2.0025, | |
| "step": 11140 | |
| }, | |
| { | |
| "epoch": 3.741169076052797, | |
| "grad_norm": 5.1023736000061035, | |
| "learning_rate": 1.3987186172986666e-05, | |
| "loss": 2.0612, | |
| "step": 11160 | |
| }, | |
| { | |
| "epoch": 3.747873454850199, | |
| "grad_norm": 7.344115257263184, | |
| "learning_rate": 1.3912687178723089e-05, | |
| "loss": 1.9921, | |
| "step": 11180 | |
| }, | |
| { | |
| "epoch": 3.754577833647601, | |
| "grad_norm": 6.085274696350098, | |
| "learning_rate": 1.3838188184459511e-05, | |
| "loss": 2.0035, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 3.7612822124450034, | |
| "grad_norm": 6.330926418304443, | |
| "learning_rate": 1.3763689190195932e-05, | |
| "loss": 1.9726, | |
| "step": 11220 | |
| }, | |
| { | |
| "epoch": 3.7679865912424053, | |
| "grad_norm": 6.293400287628174, | |
| "learning_rate": 1.3689190195932355e-05, | |
| "loss": 2.0186, | |
| "step": 11240 | |
| }, | |
| { | |
| "epoch": 3.7746909700398072, | |
| "grad_norm": 5.887099266052246, | |
| "learning_rate": 1.3614691201668777e-05, | |
| "loss": 1.9584, | |
| "step": 11260 | |
| }, | |
| { | |
| "epoch": 3.781395348837209, | |
| "grad_norm": 7.026653289794922, | |
| "learning_rate": 1.35401922074052e-05, | |
| "loss": 1.9965, | |
| "step": 11280 | |
| }, | |
| { | |
| "epoch": 3.788099727634611, | |
| "grad_norm": 5.680566787719727, | |
| "learning_rate": 1.3465693213141623e-05, | |
| "loss": 1.9619, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 3.7948041064320135, | |
| "grad_norm": 6.181577682495117, | |
| "learning_rate": 1.3391194218878045e-05, | |
| "loss": 1.932, | |
| "step": 11320 | |
| }, | |
| { | |
| "epoch": 3.8015084852294154, | |
| "grad_norm": 5.5085015296936035, | |
| "learning_rate": 1.3316695224614468e-05, | |
| "loss": 1.9788, | |
| "step": 11340 | |
| }, | |
| { | |
| "epoch": 3.8082128640268174, | |
| "grad_norm": 5.90690279006958, | |
| "learning_rate": 1.324219623035089e-05, | |
| "loss": 2.0387, | |
| "step": 11360 | |
| }, | |
| { | |
| "epoch": 3.8149172428242197, | |
| "grad_norm": 5.351406097412109, | |
| "learning_rate": 1.3167697236087315e-05, | |
| "loss": 2.0106, | |
| "step": 11380 | |
| }, | |
| { | |
| "epoch": 3.8216216216216217, | |
| "grad_norm": 6.080597400665283, | |
| "learning_rate": 1.3093198241823737e-05, | |
| "loss": 1.9646, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 3.8283260004190236, | |
| "grad_norm": 6.259003639221191, | |
| "learning_rate": 1.301869924756016e-05, | |
| "loss": 2.0344, | |
| "step": 11420 | |
| }, | |
| { | |
| "epoch": 3.835030379216426, | |
| "grad_norm": 5.44300651550293, | |
| "learning_rate": 1.2944200253296582e-05, | |
| "loss": 1.8795, | |
| "step": 11440 | |
| }, | |
| { | |
| "epoch": 3.841734758013828, | |
| "grad_norm": 4.950876235961914, | |
| "learning_rate": 1.2869701259033005e-05, | |
| "loss": 1.9399, | |
| "step": 11460 | |
| }, | |
| { | |
| "epoch": 3.84843913681123, | |
| "grad_norm": 5.528983116149902, | |
| "learning_rate": 1.2795202264769426e-05, | |
| "loss": 1.9733, | |
| "step": 11480 | |
| }, | |
| { | |
| "epoch": 3.8551435156086318, | |
| "grad_norm": 6.489850044250488, | |
| "learning_rate": 1.2720703270505848e-05, | |
| "loss": 1.9987, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 3.8618478944060337, | |
| "grad_norm": 6.303378105163574, | |
| "learning_rate": 1.2646204276242271e-05, | |
| "loss": 1.9656, | |
| "step": 11520 | |
| }, | |
| { | |
| "epoch": 3.868552273203436, | |
| "grad_norm": 6.525257587432861, | |
| "learning_rate": 1.2571705281978693e-05, | |
| "loss": 1.9482, | |
| "step": 11540 | |
| }, | |
| { | |
| "epoch": 3.875256652000838, | |
| "grad_norm": 6.339199066162109, | |
| "learning_rate": 1.2497206287715116e-05, | |
| "loss": 1.9149, | |
| "step": 11560 | |
| }, | |
| { | |
| "epoch": 3.88196103079824, | |
| "grad_norm": 5.968883037567139, | |
| "learning_rate": 1.242270729345154e-05, | |
| "loss": 1.89, | |
| "step": 11580 | |
| }, | |
| { | |
| "epoch": 3.8886654095956423, | |
| "grad_norm": 5.6695990562438965, | |
| "learning_rate": 1.2348208299187961e-05, | |
| "loss": 2.0011, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 3.8953697883930443, | |
| "grad_norm": 5.211636066436768, | |
| "learning_rate": 1.2273709304924384e-05, | |
| "loss": 1.9736, | |
| "step": 11620 | |
| }, | |
| { | |
| "epoch": 3.902074167190446, | |
| "grad_norm": 6.887118816375732, | |
| "learning_rate": 1.2199210310660806e-05, | |
| "loss": 2.0432, | |
| "step": 11640 | |
| }, | |
| { | |
| "epoch": 3.9087785459878486, | |
| "grad_norm": 5.855321407318115, | |
| "learning_rate": 1.2124711316397229e-05, | |
| "loss": 2.0008, | |
| "step": 11660 | |
| }, | |
| { | |
| "epoch": 3.9154829247852505, | |
| "grad_norm": 5.927514553070068, | |
| "learning_rate": 1.2050212322133651e-05, | |
| "loss": 1.9975, | |
| "step": 11680 | |
| }, | |
| { | |
| "epoch": 3.9221873035826524, | |
| "grad_norm": 5.483107089996338, | |
| "learning_rate": 1.1975713327870074e-05, | |
| "loss": 1.9729, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 3.9288916823800544, | |
| "grad_norm": 6.38045072555542, | |
| "learning_rate": 1.1901214333606497e-05, | |
| "loss": 1.957, | |
| "step": 11720 | |
| }, | |
| { | |
| "epoch": 3.9355960611774563, | |
| "grad_norm": 5.1667561531066895, | |
| "learning_rate": 1.182671533934292e-05, | |
| "loss": 1.9953, | |
| "step": 11740 | |
| }, | |
| { | |
| "epoch": 3.9423004399748587, | |
| "grad_norm": 6.497701168060303, | |
| "learning_rate": 1.1752216345079342e-05, | |
| "loss": 2.0336, | |
| "step": 11760 | |
| }, | |
| { | |
| "epoch": 3.9490048187722606, | |
| "grad_norm": 6.4357194900512695, | |
| "learning_rate": 1.1677717350815764e-05, | |
| "loss": 1.9825, | |
| "step": 11780 | |
| }, | |
| { | |
| "epoch": 3.9557091975696625, | |
| "grad_norm": 5.45937442779541, | |
| "learning_rate": 1.1603218356552187e-05, | |
| "loss": 2.0056, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 3.962413576367065, | |
| "grad_norm": 6.294889450073242, | |
| "learning_rate": 1.152871936228861e-05, | |
| "loss": 1.9627, | |
| "step": 11820 | |
| }, | |
| { | |
| "epoch": 3.969117955164467, | |
| "grad_norm": 6.30310583114624, | |
| "learning_rate": 1.1454220368025032e-05, | |
| "loss": 1.9793, | |
| "step": 11840 | |
| }, | |
| { | |
| "epoch": 3.975822333961869, | |
| "grad_norm": 5.265941143035889, | |
| "learning_rate": 1.1379721373761455e-05, | |
| "loss": 1.9109, | |
| "step": 11860 | |
| }, | |
| { | |
| "epoch": 3.982526712759271, | |
| "grad_norm": 6.546343803405762, | |
| "learning_rate": 1.1305222379497877e-05, | |
| "loss": 1.9913, | |
| "step": 11880 | |
| }, | |
| { | |
| "epoch": 3.989231091556673, | |
| "grad_norm": 5.486214637756348, | |
| "learning_rate": 1.12307233852343e-05, | |
| "loss": 1.917, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 3.995935470354075, | |
| "grad_norm": 6.186803817749023, | |
| "learning_rate": 1.1156224390970722e-05, | |
| "loss": 1.9161, | |
| "step": 11920 | |
| }, | |
| { | |
| "epoch": 4.002681751518961, | |
| "grad_norm": 6.181544780731201, | |
| "learning_rate": 1.1081725396707145e-05, | |
| "loss": 2.0623, | |
| "step": 11940 | |
| }, | |
| { | |
| "epoch": 4.009386130316363, | |
| "grad_norm": 6.881202220916748, | |
| "learning_rate": 1.1007226402443568e-05, | |
| "loss": 1.9415, | |
| "step": 11960 | |
| }, | |
| { | |
| "epoch": 4.016090509113765, | |
| "grad_norm": 5.8216633796691895, | |
| "learning_rate": 1.093272740817999e-05, | |
| "loss": 1.9856, | |
| "step": 11980 | |
| }, | |
| { | |
| "epoch": 4.022794887911167, | |
| "grad_norm": 7.12168025970459, | |
| "learning_rate": 1.0858228413916413e-05, | |
| "loss": 1.923, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 4.022794887911167, | |
| "eval_bleu_greedy": 44.50439408731819, | |
| "eval_loss": 0.32926422357559204, | |
| "eval_runtime": 73.5162, | |
| "eval_samples_per_second": 1.36, | |
| "eval_steps_per_second": 1.36, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 4.029499266708569, | |
| "grad_norm": 6.934213161468506, | |
| "learning_rate": 1.0783729419652835e-05, | |
| "loss": 1.9524, | |
| "step": 12020 | |
| }, | |
| { | |
| "epoch": 4.036203645505971, | |
| "grad_norm": 5.604892730712891, | |
| "learning_rate": 1.0709230425389258e-05, | |
| "loss": 1.899, | |
| "step": 12040 | |
| }, | |
| { | |
| "epoch": 4.042908024303373, | |
| "grad_norm": 6.461178302764893, | |
| "learning_rate": 1.063473143112568e-05, | |
| "loss": 1.9747, | |
| "step": 12060 | |
| }, | |
| { | |
| "epoch": 4.049612403100776, | |
| "grad_norm": 5.3131866455078125, | |
| "learning_rate": 1.0560232436862103e-05, | |
| "loss": 1.8441, | |
| "step": 12080 | |
| }, | |
| { | |
| "epoch": 4.056316781898177, | |
| "grad_norm": 6.133450508117676, | |
| "learning_rate": 1.0485733442598526e-05, | |
| "loss": 1.9745, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 4.0630211606955795, | |
| "grad_norm": 6.431405067443848, | |
| "learning_rate": 1.0411234448334948e-05, | |
| "loss": 2.0148, | |
| "step": 12120 | |
| }, | |
| { | |
| "epoch": 4.069725539492981, | |
| "grad_norm": 6.80940055847168, | |
| "learning_rate": 1.033673545407137e-05, | |
| "loss": 2.0231, | |
| "step": 12140 | |
| }, | |
| { | |
| "epoch": 4.076429918290383, | |
| "grad_norm": 6.070991516113281, | |
| "learning_rate": 1.0262236459807793e-05, | |
| "loss": 1.8877, | |
| "step": 12160 | |
| }, | |
| { | |
| "epoch": 4.083134297087786, | |
| "grad_norm": 6.459563255310059, | |
| "learning_rate": 1.0187737465544216e-05, | |
| "loss": 1.9614, | |
| "step": 12180 | |
| }, | |
| { | |
| "epoch": 4.089838675885187, | |
| "grad_norm": 6.669441223144531, | |
| "learning_rate": 1.0113238471280638e-05, | |
| "loss": 1.9879, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 4.09654305468259, | |
| "grad_norm": 6.027960300445557, | |
| "learning_rate": 1.0038739477017061e-05, | |
| "loss": 2.0168, | |
| "step": 12220 | |
| }, | |
| { | |
| "epoch": 4.103247433479992, | |
| "grad_norm": 6.358060359954834, | |
| "learning_rate": 9.964240482753482e-06, | |
| "loss": 1.9143, | |
| "step": 12240 | |
| }, | |
| { | |
| "epoch": 4.1099518122773935, | |
| "grad_norm": 7.689465045928955, | |
| "learning_rate": 9.889741488489906e-06, | |
| "loss": 1.9383, | |
| "step": 12260 | |
| }, | |
| { | |
| "epoch": 4.116656191074796, | |
| "grad_norm": 6.243340969085693, | |
| "learning_rate": 9.815242494226329e-06, | |
| "loss": 1.9484, | |
| "step": 12280 | |
| }, | |
| { | |
| "epoch": 4.123360569872197, | |
| "grad_norm": 6.634798049926758, | |
| "learning_rate": 9.740743499962751e-06, | |
| "loss": 1.9055, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 4.1300649486696, | |
| "grad_norm": 6.069347858428955, | |
| "learning_rate": 9.666244505699174e-06, | |
| "loss": 1.9314, | |
| "step": 12320 | |
| }, | |
| { | |
| "epoch": 4.136769327467002, | |
| "grad_norm": 5.5364251136779785, | |
| "learning_rate": 9.591745511435596e-06, | |
| "loss": 1.9819, | |
| "step": 12340 | |
| }, | |
| { | |
| "epoch": 4.143473706264404, | |
| "grad_norm": 7.139957904815674, | |
| "learning_rate": 9.517246517172019e-06, | |
| "loss": 2.0314, | |
| "step": 12360 | |
| }, | |
| { | |
| "epoch": 4.150178085061806, | |
| "grad_norm": 6.039831161499023, | |
| "learning_rate": 9.44274752290844e-06, | |
| "loss": 1.8655, | |
| "step": 12380 | |
| }, | |
| { | |
| "epoch": 4.156882463859208, | |
| "grad_norm": 6.120654582977295, | |
| "learning_rate": 9.368248528644864e-06, | |
| "loss": 1.9809, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 4.16358684265661, | |
| "grad_norm": 6.08513879776001, | |
| "learning_rate": 9.293749534381287e-06, | |
| "loss": 1.963, | |
| "step": 12420 | |
| }, | |
| { | |
| "epoch": 4.170291221454012, | |
| "grad_norm": 6.9406208992004395, | |
| "learning_rate": 9.21925054011771e-06, | |
| "loss": 1.9999, | |
| "step": 12440 | |
| }, | |
| { | |
| "epoch": 4.176995600251415, | |
| "grad_norm": 5.910630226135254, | |
| "learning_rate": 9.144751545854132e-06, | |
| "loss": 1.9606, | |
| "step": 12460 | |
| }, | |
| { | |
| "epoch": 4.183699979048816, | |
| "grad_norm": 6.108793258666992, | |
| "learning_rate": 9.070252551590554e-06, | |
| "loss": 1.9267, | |
| "step": 12480 | |
| }, | |
| { | |
| "epoch": 4.1904043578462185, | |
| "grad_norm": 6.7554497718811035, | |
| "learning_rate": 8.995753557326975e-06, | |
| "loss": 1.9704, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 4.19710873664362, | |
| "grad_norm": 5.927764415740967, | |
| "learning_rate": 8.921254563063398e-06, | |
| "loss": 1.9711, | |
| "step": 12520 | |
| }, | |
| { | |
| "epoch": 4.203813115441022, | |
| "grad_norm": 5.403862476348877, | |
| "learning_rate": 8.846755568799822e-06, | |
| "loss": 1.9401, | |
| "step": 12540 | |
| }, | |
| { | |
| "epoch": 4.210517494238425, | |
| "grad_norm": 6.7553181648254395, | |
| "learning_rate": 8.772256574536245e-06, | |
| "loss": 1.9349, | |
| "step": 12560 | |
| }, | |
| { | |
| "epoch": 4.217221873035826, | |
| "grad_norm": 5.846068382263184, | |
| "learning_rate": 8.697757580272667e-06, | |
| "loss": 1.9993, | |
| "step": 12580 | |
| }, | |
| { | |
| "epoch": 4.223926251833229, | |
| "grad_norm": 5.979813575744629, | |
| "learning_rate": 8.62325858600909e-06, | |
| "loss": 2.0041, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 4.230630630630631, | |
| "grad_norm": 6.5849080085754395, | |
| "learning_rate": 8.54875959174551e-06, | |
| "loss": 1.9836, | |
| "step": 12620 | |
| }, | |
| { | |
| "epoch": 4.237335009428032, | |
| "grad_norm": 6.310152530670166, | |
| "learning_rate": 8.474260597481933e-06, | |
| "loss": 1.9863, | |
| "step": 12640 | |
| }, | |
| { | |
| "epoch": 4.244039388225435, | |
| "grad_norm": 5.853137493133545, | |
| "learning_rate": 8.399761603218358e-06, | |
| "loss": 1.9395, | |
| "step": 12660 | |
| }, | |
| { | |
| "epoch": 4.250743767022837, | |
| "grad_norm": 6.041353225708008, | |
| "learning_rate": 8.32526260895478e-06, | |
| "loss": 1.9732, | |
| "step": 12680 | |
| }, | |
| { | |
| "epoch": 4.257448145820239, | |
| "grad_norm": 5.885549545288086, | |
| "learning_rate": 8.250763614691203e-06, | |
| "loss": 1.9788, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 4.264152524617641, | |
| "grad_norm": 6.728966236114502, | |
| "learning_rate": 8.176264620427625e-06, | |
| "loss": 2.0255, | |
| "step": 12720 | |
| }, | |
| { | |
| "epoch": 4.2708569034150425, | |
| "grad_norm": 6.438037395477295, | |
| "learning_rate": 8.101765626164046e-06, | |
| "loss": 1.964, | |
| "step": 12740 | |
| }, | |
| { | |
| "epoch": 4.277561282212445, | |
| "grad_norm": 6.513815879821777, | |
| "learning_rate": 8.027266631900469e-06, | |
| "loss": 1.9188, | |
| "step": 12760 | |
| }, | |
| { | |
| "epoch": 4.284265661009847, | |
| "grad_norm": 5.92940616607666, | |
| "learning_rate": 7.952767637636891e-06, | |
| "loss": 1.9952, | |
| "step": 12780 | |
| }, | |
| { | |
| "epoch": 4.290970039807249, | |
| "grad_norm": 5.621717929840088, | |
| "learning_rate": 7.878268643373316e-06, | |
| "loss": 1.944, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 4.297674418604651, | |
| "grad_norm": 5.906243324279785, | |
| "learning_rate": 7.803769649109738e-06, | |
| "loss": 1.8989, | |
| "step": 12820 | |
| }, | |
| { | |
| "epoch": 4.3043787974020535, | |
| "grad_norm": 6.153092861175537, | |
| "learning_rate": 7.72927065484616e-06, | |
| "loss": 1.9672, | |
| "step": 12840 | |
| }, | |
| { | |
| "epoch": 4.311083176199455, | |
| "grad_norm": 6.292876720428467, | |
| "learning_rate": 7.654771660582582e-06, | |
| "loss": 1.9418, | |
| "step": 12860 | |
| }, | |
| { | |
| "epoch": 4.317787554996857, | |
| "grad_norm": 6.2637858390808105, | |
| "learning_rate": 7.580272666319005e-06, | |
| "loss": 1.9032, | |
| "step": 12880 | |
| }, | |
| { | |
| "epoch": 4.32449193379426, | |
| "grad_norm": 6.278430461883545, | |
| "learning_rate": 7.505773672055427e-06, | |
| "loss": 2.0032, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 4.331196312591661, | |
| "grad_norm": 6.218298435211182, | |
| "learning_rate": 7.4312746777918494e-06, | |
| "loss": 1.9331, | |
| "step": 12920 | |
| }, | |
| { | |
| "epoch": 4.337900691389064, | |
| "grad_norm": 5.950978755950928, | |
| "learning_rate": 7.356775683528274e-06, | |
| "loss": 1.9175, | |
| "step": 12940 | |
| }, | |
| { | |
| "epoch": 4.344605070186465, | |
| "grad_norm": 6.4616780281066895, | |
| "learning_rate": 7.2822766892646954e-06, | |
| "loss": 1.9171, | |
| "step": 12960 | |
| }, | |
| { | |
| "epoch": 4.3513094489838675, | |
| "grad_norm": 6.791792869567871, | |
| "learning_rate": 7.207777695001118e-06, | |
| "loss": 1.9694, | |
| "step": 12980 | |
| }, | |
| { | |
| "epoch": 4.35801382778127, | |
| "grad_norm": 6.647165775299072, | |
| "learning_rate": 7.133278700737541e-06, | |
| "loss": 1.9414, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 4.35801382778127, | |
| "eval_bleu_greedy": 44.55894868872612, | |
| "eval_loss": 0.3250446319580078, | |
| "eval_runtime": 38.3365, | |
| "eval_samples_per_second": 2.608, | |
| "eval_steps_per_second": 2.608, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 4.364718206578671, | |
| "grad_norm": 6.127755165100098, | |
| "learning_rate": 7.058779706473963e-06, | |
| "loss": 1.907, | |
| "step": 13020 | |
| }, | |
| { | |
| "epoch": 4.371422585376074, | |
| "grad_norm": 7.295255184173584, | |
| "learning_rate": 6.984280712210385e-06, | |
| "loss": 1.9099, | |
| "step": 13040 | |
| }, | |
| { | |
| "epoch": 4.378126964173476, | |
| "grad_norm": 5.896840572357178, | |
| "learning_rate": 6.9097817179468075e-06, | |
| "loss": 1.9675, | |
| "step": 13060 | |
| }, | |
| { | |
| "epoch": 4.384831342970878, | |
| "grad_norm": 6.170024394989014, | |
| "learning_rate": 6.835282723683231e-06, | |
| "loss": 1.9138, | |
| "step": 13080 | |
| }, | |
| { | |
| "epoch": 4.39153572176828, | |
| "grad_norm": 7.63161039352417, | |
| "learning_rate": 6.7607837294196535e-06, | |
| "loss": 1.9458, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 4.398240100565682, | |
| "grad_norm": 6.906017780303955, | |
| "learning_rate": 6.686284735156076e-06, | |
| "loss": 1.9417, | |
| "step": 13120 | |
| }, | |
| { | |
| "epoch": 4.404944479363084, | |
| "grad_norm": 6.07291316986084, | |
| "learning_rate": 6.611785740892499e-06, | |
| "loss": 1.9937, | |
| "step": 13140 | |
| }, | |
| { | |
| "epoch": 4.411648858160486, | |
| "grad_norm": 7.198663234710693, | |
| "learning_rate": 6.53728674662892e-06, | |
| "loss": 1.9158, | |
| "step": 13160 | |
| }, | |
| { | |
| "epoch": 4.418353236957888, | |
| "grad_norm": 6.324075222015381, | |
| "learning_rate": 6.462787752365343e-06, | |
| "loss": 1.9194, | |
| "step": 13180 | |
| }, | |
| { | |
| "epoch": 4.42505761575529, | |
| "grad_norm": 5.920898914337158, | |
| "learning_rate": 6.3882887581017655e-06, | |
| "loss": 2.0104, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 4.4317619945526925, | |
| "grad_norm": 5.628123760223389, | |
| "learning_rate": 6.313789763838189e-06, | |
| "loss": 1.9472, | |
| "step": 13220 | |
| }, | |
| { | |
| "epoch": 4.438466373350094, | |
| "grad_norm": 7.292685031890869, | |
| "learning_rate": 6.239290769574611e-06, | |
| "loss": 1.9488, | |
| "step": 13240 | |
| }, | |
| { | |
| "epoch": 4.445170752147496, | |
| "grad_norm": 6.090283393859863, | |
| "learning_rate": 6.164791775311034e-06, | |
| "loss": 1.9137, | |
| "step": 13260 | |
| }, | |
| { | |
| "epoch": 4.451875130944899, | |
| "grad_norm": 6.441646099090576, | |
| "learning_rate": 6.090292781047456e-06, | |
| "loss": 1.9346, | |
| "step": 13280 | |
| }, | |
| { | |
| "epoch": 4.4585795097423, | |
| "grad_norm": 5.901956081390381, | |
| "learning_rate": 6.015793786783878e-06, | |
| "loss": 1.9465, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 4.465283888539703, | |
| "grad_norm": 6.282461643218994, | |
| "learning_rate": 5.941294792520302e-06, | |
| "loss": 1.9368, | |
| "step": 13320 | |
| }, | |
| { | |
| "epoch": 4.471988267337105, | |
| "grad_norm": 5.921506404876709, | |
| "learning_rate": 5.8667957982567235e-06, | |
| "loss": 1.9144, | |
| "step": 13340 | |
| }, | |
| { | |
| "epoch": 4.4786926461345065, | |
| "grad_norm": 6.149923801422119, | |
| "learning_rate": 5.792296803993146e-06, | |
| "loss": 1.9946, | |
| "step": 13360 | |
| }, | |
| { | |
| "epoch": 4.485397024931909, | |
| "grad_norm": 5.411886692047119, | |
| "learning_rate": 5.7177978097295695e-06, | |
| "loss": 1.9656, | |
| "step": 13380 | |
| }, | |
| { | |
| "epoch": 4.49210140372931, | |
| "grad_norm": 6.29611349105835, | |
| "learning_rate": 5.643298815465991e-06, | |
| "loss": 1.9522, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 4.498805782526713, | |
| "grad_norm": 6.302646160125732, | |
| "learning_rate": 5.568799821202414e-06, | |
| "loss": 2.028, | |
| "step": 13420 | |
| }, | |
| { | |
| "epoch": 4.505510161324115, | |
| "grad_norm": 6.083780765533447, | |
| "learning_rate": 5.494300826938836e-06, | |
| "loss": 1.9707, | |
| "step": 13440 | |
| }, | |
| { | |
| "epoch": 4.512214540121517, | |
| "grad_norm": 5.8760857582092285, | |
| "learning_rate": 5.419801832675259e-06, | |
| "loss": 1.9808, | |
| "step": 13460 | |
| }, | |
| { | |
| "epoch": 4.518918918918919, | |
| "grad_norm": 6.291037559509277, | |
| "learning_rate": 5.3453028384116815e-06, | |
| "loss": 1.9806, | |
| "step": 13480 | |
| }, | |
| { | |
| "epoch": 4.525623297716321, | |
| "grad_norm": 6.295835494995117, | |
| "learning_rate": 5.270803844148104e-06, | |
| "loss": 1.9802, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 4.532327676513723, | |
| "grad_norm": 6.249302387237549, | |
| "learning_rate": 5.196304849884527e-06, | |
| "loss": 1.9503, | |
| "step": 13520 | |
| }, | |
| { | |
| "epoch": 4.539032055311125, | |
| "grad_norm": 6.210274696350098, | |
| "learning_rate": 5.121805855620949e-06, | |
| "loss": 1.945, | |
| "step": 13540 | |
| }, | |
| { | |
| "epoch": 4.545736434108527, | |
| "grad_norm": 6.666885852813721, | |
| "learning_rate": 5.047306861357372e-06, | |
| "loss": 1.906, | |
| "step": 13560 | |
| }, | |
| { | |
| "epoch": 4.552440812905929, | |
| "grad_norm": 6.02937650680542, | |
| "learning_rate": 4.972807867093794e-06, | |
| "loss": 1.8988, | |
| "step": 13580 | |
| }, | |
| { | |
| "epoch": 4.559145191703331, | |
| "grad_norm": 5.890073776245117, | |
| "learning_rate": 4.898308872830217e-06, | |
| "loss": 2.0141, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 4.565849570500733, | |
| "grad_norm": 6.179807662963867, | |
| "learning_rate": 4.8238098785666396e-06, | |
| "loss": 1.8731, | |
| "step": 13620 | |
| }, | |
| { | |
| "epoch": 4.572553949298135, | |
| "grad_norm": 6.25577974319458, | |
| "learning_rate": 4.749310884303062e-06, | |
| "loss": 1.881, | |
| "step": 13640 | |
| }, | |
| { | |
| "epoch": 4.579258328095538, | |
| "grad_norm": 5.9553303718566895, | |
| "learning_rate": 4.674811890039485e-06, | |
| "loss": 1.9032, | |
| "step": 13660 | |
| }, | |
| { | |
| "epoch": 4.585962706892939, | |
| "grad_norm": 6.600682735443115, | |
| "learning_rate": 4.600312895775907e-06, | |
| "loss": 2.0216, | |
| "step": 13680 | |
| }, | |
| { | |
| "epoch": 4.5926670856903415, | |
| "grad_norm": 5.493414878845215, | |
| "learning_rate": 4.52581390151233e-06, | |
| "loss": 1.9494, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 4.599371464487744, | |
| "grad_norm": 6.232935905456543, | |
| "learning_rate": 4.4513149072487524e-06, | |
| "loss": 1.9676, | |
| "step": 13720 | |
| }, | |
| { | |
| "epoch": 4.606075843285145, | |
| "grad_norm": 5.859748840332031, | |
| "learning_rate": 4.376815912985175e-06, | |
| "loss": 1.904, | |
| "step": 13740 | |
| }, | |
| { | |
| "epoch": 4.612780222082548, | |
| "grad_norm": 6.198875427246094, | |
| "learning_rate": 4.3023169187215976e-06, | |
| "loss": 1.9837, | |
| "step": 13760 | |
| }, | |
| { | |
| "epoch": 4.61948460087995, | |
| "grad_norm": 5.675384521484375, | |
| "learning_rate": 4.22781792445802e-06, | |
| "loss": 1.9275, | |
| "step": 13780 | |
| }, | |
| { | |
| "epoch": 4.626188979677352, | |
| "grad_norm": 6.3175129890441895, | |
| "learning_rate": 4.153318930194443e-06, | |
| "loss": 1.9237, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 4.632893358474754, | |
| "grad_norm": 7.661358833312988, | |
| "learning_rate": 4.078819935930865e-06, | |
| "loss": 2.0447, | |
| "step": 13820 | |
| }, | |
| { | |
| "epoch": 4.6395977372721555, | |
| "grad_norm": 6.920125484466553, | |
| "learning_rate": 4.004320941667288e-06, | |
| "loss": 1.9915, | |
| "step": 13840 | |
| }, | |
| { | |
| "epoch": 4.646302116069558, | |
| "grad_norm": 5.844318866729736, | |
| "learning_rate": 3.9298219474037105e-06, | |
| "loss": 1.9599, | |
| "step": 13860 | |
| }, | |
| { | |
| "epoch": 4.65300649486696, | |
| "grad_norm": 6.021313667297363, | |
| "learning_rate": 3.855322953140133e-06, | |
| "loss": 1.985, | |
| "step": 13880 | |
| }, | |
| { | |
| "epoch": 4.659710873664362, | |
| "grad_norm": 6.7081298828125, | |
| "learning_rate": 3.780823958876555e-06, | |
| "loss": 1.9395, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 4.666415252461764, | |
| "grad_norm": 6.496410846710205, | |
| "learning_rate": 3.7063249646129778e-06, | |
| "loss": 1.9911, | |
| "step": 13920 | |
| }, | |
| { | |
| "epoch": 4.6731196312591665, | |
| "grad_norm": 6.401267051696777, | |
| "learning_rate": 3.6318259703494007e-06, | |
| "loss": 1.9315, | |
| "step": 13940 | |
| }, | |
| { | |
| "epoch": 4.679824010056568, | |
| "grad_norm": 6.458618640899658, | |
| "learning_rate": 3.557326976085823e-06, | |
| "loss": 1.9749, | |
| "step": 13960 | |
| }, | |
| { | |
| "epoch": 4.68652838885397, | |
| "grad_norm": 6.759514331817627, | |
| "learning_rate": 3.4828279818222455e-06, | |
| "loss": 2.0216, | |
| "step": 13980 | |
| }, | |
| { | |
| "epoch": 4.693232767651372, | |
| "grad_norm": 6.3571953773498535, | |
| "learning_rate": 3.4083289875586685e-06, | |
| "loss": 1.912, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.693232767651372, | |
| "eval_bleu_greedy": 44.91524109269023, | |
| "eval_loss": 0.32178983092308044, | |
| "eval_runtime": 58.0406, | |
| "eval_samples_per_second": 1.723, | |
| "eval_steps_per_second": 1.723, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.699937146448774, | |
| "grad_norm": 6.1466593742370605, | |
| "learning_rate": 3.3338299932950906e-06, | |
| "loss": 1.9918, | |
| "step": 14020 | |
| }, | |
| { | |
| "epoch": 4.706641525246177, | |
| "grad_norm": 6.885775089263916, | |
| "learning_rate": 3.259330999031513e-06, | |
| "loss": 1.9707, | |
| "step": 14040 | |
| }, | |
| { | |
| "epoch": 4.713345904043578, | |
| "grad_norm": 6.186959743499756, | |
| "learning_rate": 3.1848320047679354e-06, | |
| "loss": 1.9067, | |
| "step": 14060 | |
| }, | |
| { | |
| "epoch": 4.7200502828409805, | |
| "grad_norm": 6.915486812591553, | |
| "learning_rate": 3.1103330105043583e-06, | |
| "loss": 1.9584, | |
| "step": 14080 | |
| }, | |
| { | |
| "epoch": 4.726754661638383, | |
| "grad_norm": 6.508053302764893, | |
| "learning_rate": 3.035834016240781e-06, | |
| "loss": 1.9481, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 4.733459040435784, | |
| "grad_norm": 5.979186058044434, | |
| "learning_rate": 2.9613350219772035e-06, | |
| "loss": 1.9316, | |
| "step": 14120 | |
| }, | |
| { | |
| "epoch": 4.740163419233187, | |
| "grad_norm": 6.08104944229126, | |
| "learning_rate": 2.886836027713626e-06, | |
| "loss": 1.9287, | |
| "step": 14140 | |
| }, | |
| { | |
| "epoch": 4.746867798030589, | |
| "grad_norm": 6.154195308685303, | |
| "learning_rate": 2.8123370334500486e-06, | |
| "loss": 1.935, | |
| "step": 14160 | |
| }, | |
| { | |
| "epoch": 4.753572176827991, | |
| "grad_norm": 6.8738508224487305, | |
| "learning_rate": 2.737838039186471e-06, | |
| "loss": 1.9573, | |
| "step": 14180 | |
| }, | |
| { | |
| "epoch": 4.760276555625393, | |
| "grad_norm": 6.409038066864014, | |
| "learning_rate": 2.663339044922894e-06, | |
| "loss": 1.868, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 4.766980934422795, | |
| "grad_norm": 6.662640571594238, | |
| "learning_rate": 2.5888400506593164e-06, | |
| "loss": 1.9635, | |
| "step": 14220 | |
| }, | |
| { | |
| "epoch": 4.773685313220197, | |
| "grad_norm": 6.879613876342773, | |
| "learning_rate": 2.5143410563957385e-06, | |
| "loss": 1.9151, | |
| "step": 14240 | |
| }, | |
| { | |
| "epoch": 4.780389692017599, | |
| "grad_norm": 6.439156532287598, | |
| "learning_rate": 2.4398420621321615e-06, | |
| "loss": 1.991, | |
| "step": 14260 | |
| }, | |
| { | |
| "epoch": 4.787094070815001, | |
| "grad_norm": 5.503256797790527, | |
| "learning_rate": 2.3653430678685837e-06, | |
| "loss": 1.8901, | |
| "step": 14280 | |
| }, | |
| { | |
| "epoch": 4.793798449612403, | |
| "grad_norm": 6.81868839263916, | |
| "learning_rate": 2.2908440736050062e-06, | |
| "loss": 1.9397, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 4.8005028284098055, | |
| "grad_norm": 6.4171857833862305, | |
| "learning_rate": 2.2163450793414292e-06, | |
| "loss": 1.9112, | |
| "step": 14320 | |
| }, | |
| { | |
| "epoch": 4.807207207207207, | |
| "grad_norm": 7.29513692855835, | |
| "learning_rate": 2.1418460850778514e-06, | |
| "loss": 1.9406, | |
| "step": 14340 | |
| }, | |
| { | |
| "epoch": 4.813911586004609, | |
| "grad_norm": 6.337085723876953, | |
| "learning_rate": 2.0673470908142744e-06, | |
| "loss": 1.9374, | |
| "step": 14360 | |
| }, | |
| { | |
| "epoch": 4.820615964802011, | |
| "grad_norm": 6.306519508361816, | |
| "learning_rate": 1.9928480965506965e-06, | |
| "loss": 1.9493, | |
| "step": 14380 | |
| }, | |
| { | |
| "epoch": 4.827320343599413, | |
| "grad_norm": 5.782825469970703, | |
| "learning_rate": 1.918349102287119e-06, | |
| "loss": 1.886, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 4.834024722396816, | |
| "grad_norm": 6.214239597320557, | |
| "learning_rate": 1.843850108023542e-06, | |
| "loss": 1.9407, | |
| "step": 14420 | |
| }, | |
| { | |
| "epoch": 4.840729101194217, | |
| "grad_norm": 6.3050336837768555, | |
| "learning_rate": 1.7693511137599643e-06, | |
| "loss": 2.0325, | |
| "step": 14440 | |
| }, | |
| { | |
| "epoch": 4.847433479991619, | |
| "grad_norm": 5.516683101654053, | |
| "learning_rate": 1.694852119496387e-06, | |
| "loss": 1.9344, | |
| "step": 14460 | |
| }, | |
| { | |
| "epoch": 4.854137858789022, | |
| "grad_norm": 6.436117172241211, | |
| "learning_rate": 1.6203531252328094e-06, | |
| "loss": 1.9626, | |
| "step": 14480 | |
| }, | |
| { | |
| "epoch": 4.860842237586423, | |
| "grad_norm": 6.287826061248779, | |
| "learning_rate": 1.545854130969232e-06, | |
| "loss": 1.9281, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 4.867546616383826, | |
| "grad_norm": 5.765741348266602, | |
| "learning_rate": 1.4713551367056546e-06, | |
| "loss": 1.9629, | |
| "step": 14520 | |
| }, | |
| { | |
| "epoch": 4.874250995181228, | |
| "grad_norm": 7.784135818481445, | |
| "learning_rate": 1.3968561424420771e-06, | |
| "loss": 1.9842, | |
| "step": 14540 | |
| }, | |
| { | |
| "epoch": 4.8809553739786296, | |
| "grad_norm": 6.396987438201904, | |
| "learning_rate": 1.3223571481784995e-06, | |
| "loss": 1.9481, | |
| "step": 14560 | |
| }, | |
| { | |
| "epoch": 4.887659752776032, | |
| "grad_norm": 6.332607746124268, | |
| "learning_rate": 1.247858153914922e-06, | |
| "loss": 1.9444, | |
| "step": 14580 | |
| }, | |
| { | |
| "epoch": 4.894364131573434, | |
| "grad_norm": 7.538281440734863, | |
| "learning_rate": 1.1733591596513449e-06, | |
| "loss": 1.9695, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 4.901068510370836, | |
| "grad_norm": 6.990569591522217, | |
| "learning_rate": 1.0988601653877674e-06, | |
| "loss": 1.9596, | |
| "step": 14620 | |
| }, | |
| { | |
| "epoch": 4.907772889168238, | |
| "grad_norm": 7.1072998046875, | |
| "learning_rate": 1.0243611711241898e-06, | |
| "loss": 1.9351, | |
| "step": 14640 | |
| }, | |
| { | |
| "epoch": 4.9144772679656406, | |
| "grad_norm": 6.343371391296387, | |
| "learning_rate": 9.498621768606124e-07, | |
| "loss": 1.862, | |
| "step": 14660 | |
| }, | |
| { | |
| "epoch": 4.921181646763042, | |
| "grad_norm": 6.640133857727051, | |
| "learning_rate": 8.75363182597035e-07, | |
| "loss": 1.9533, | |
| "step": 14680 | |
| }, | |
| { | |
| "epoch": 4.927886025560444, | |
| "grad_norm": 6.360694408416748, | |
| "learning_rate": 8.008641883334574e-07, | |
| "loss": 1.9342, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 4.934590404357846, | |
| "grad_norm": 5.88399076461792, | |
| "learning_rate": 7.263651940698801e-07, | |
| "loss": 1.9337, | |
| "step": 14720 | |
| }, | |
| { | |
| "epoch": 4.941294783155248, | |
| "grad_norm": 6.079113006591797, | |
| "learning_rate": 6.518661998063027e-07, | |
| "loss": 1.8845, | |
| "step": 14740 | |
| }, | |
| { | |
| "epoch": 4.947999161952651, | |
| "grad_norm": 6.270729064941406, | |
| "learning_rate": 5.773672055427253e-07, | |
| "loss": 1.9492, | |
| "step": 14760 | |
| }, | |
| { | |
| "epoch": 4.954703540750052, | |
| "grad_norm": 6.091942310333252, | |
| "learning_rate": 5.028682112791477e-07, | |
| "loss": 1.9806, | |
| "step": 14780 | |
| }, | |
| { | |
| "epoch": 4.9614079195474545, | |
| "grad_norm": 7.632548809051514, | |
| "learning_rate": 4.2836921701557035e-07, | |
| "loss": 1.969, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 4.968112298344856, | |
| "grad_norm": 6.508310794830322, | |
| "learning_rate": 3.538702227519929e-07, | |
| "loss": 1.946, | |
| "step": 14820 | |
| }, | |
| { | |
| "epoch": 4.974816677142258, | |
| "grad_norm": 5.643795967102051, | |
| "learning_rate": 2.793712284884154e-07, | |
| "loss": 1.966, | |
| "step": 14840 | |
| }, | |
| { | |
| "epoch": 4.981521055939661, | |
| "grad_norm": 5.909745693206787, | |
| "learning_rate": 2.0487223422483797e-07, | |
| "loss": 2.0078, | |
| "step": 14860 | |
| }, | |
| { | |
| "epoch": 4.988225434737062, | |
| "grad_norm": 6.114766597747803, | |
| "learning_rate": 1.3037323996126055e-07, | |
| "loss": 1.9257, | |
| "step": 14880 | |
| }, | |
| { | |
| "epoch": 4.994929813534465, | |
| "grad_norm": 6.140356063842773, | |
| "learning_rate": 5.587424569768308e-08, | |
| "loss": 1.9642, | |
| "step": 14900 | |
| } | |
| ], | |
| "logging_steps": 20, | |
| "max_steps": 14915, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.1016910755245588e+18, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |