diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -24,8 +24,8 @@ "completions/min_terminated_length": 212.0, "epoch": 0.0008954555630176853, "frac_reward_zero_std": 0.03125, - "grad_norm": 1.324537021697667, - "kl": 0.0518798828125, + "grad_norm": 1.3225010711337324, + "kl": 0.05487060546875, "learning_rate": 0.0, "loss": 0.0165, "num_tokens": 614764.0, @@ -52,8 +52,8 @@ "completions/min_terminated_length": 144.0, "epoch": 0.0017909111260353706, "frac_reward_zero_std": 0.03125, - "grad_norm": 1.1898987030436239, - "kl": 0.0516357421875, + "grad_norm": 1.1878297521431689, + "kl": 0.05389404296875, "learning_rate": 3.3333333333333335e-07, "loss": 0.0218, "num_tokens": 1251315.0, @@ -72,25 +72,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1477.0, - "completions/mean_length": 619.98828125, - "completions/mean_terminated_length": 617.1937255859375, - "completions/min_length": 169.0, - "completions/min_terminated_length": 169.0, + "completions/max_length": 1771.0, + "completions/max_terminated_length": 1771.0, + "completions/mean_length": 641.310546875, + "completions/mean_terminated_length": 640.5048828125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.002686366689053056, "frac_reward_zero_std": 0.0, - "grad_norm": 1.161155142512, - "kl": 0.05609130859375, + "grad_norm": 6.374360193421834, + "kl": 0.060302734375, "learning_rate": 6.666666666666667e-07, - "loss": 0.0261, - "num_tokens": 1895949.0, - "reward": 0.06425781548023224, - "reward_std": 0.04574279859662056, + "loss": 0.0187, + "num_tokens": 1906866.0, + "reward": 0.05839844048023224, + "reward_std": 0.04519660770893097, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.642578125, - "rewards/format_reward/std": 0.4797092080116272, + "rewards/format_reward/mean": 0.583984375, + "rewards/format_reward/std": 0.493378221988678, "step": 3 }, { @@ -101,20 +101,20 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 641.892578125, - "completions/mean_terminated_length": 632.3051147460938, - "completions/min_length": 244.0, - "completions/min_terminated_length": 244.0, + "completions/max_terminated_length": 1589.0, + "completions/mean_length": 641.28125, + "completions/mean_terminated_length": 630.2047119140625, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.003581822252070741, - "frac_reward_zero_std": 0.0, - "grad_norm": 6.062290227835526, - "kl": 0.15771484375, + "frac_reward_zero_std": 0.03125, + "grad_norm": 1.0358462731569396, + "kl": 0.0552978515625, "learning_rate": 1.0000000000000002e-06, - "loss": 0.0292, - "num_tokens": 2549926.0, - "reward": 0.06679688394069672, - "reward_std": 0.04454650729894638, + "loss": 0.032, + "num_tokens": 2560530.0, + "reward": 0.06679687649011612, + "reward_std": 0.045152708888053894, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.66796875, @@ -127,26 +127,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1832.0, - "completions/mean_length": 602.048828125, - "completions/mean_terminated_length": 598.4530029296875, - "completions/min_length": 271.0, - "completions/min_terminated_length": 271.0, + "completions/max_terminated_length": 1533.0, + "completions/mean_length": 589.8203125, + "completions/mean_terminated_length": 586.9667358398438, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.0044772778150884264, "frac_reward_zero_std": 0.0625, - "grad_norm": 1.0733703140487003, - "kl": 0.07025146484375, + "grad_norm": 0.8048559127300979, + "kl": 0.05413818359375, "learning_rate": 1.3333333333333334e-06, - "loss": 0.0339, - "num_tokens": 3140687.0, - "reward": 0.07949218899011612, - "reward_std": 0.03763638436794281, + "loss": 0.0198, + "num_tokens": 3145030.0, + "reward": 0.07988281548023224, + "reward_std": 0.03725838661193848, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.794921875, - "rewards/format_reward/std": 0.4041535556316376, + "rewards/format_reward/mean": 0.798828125, + "rewards/format_reward/std": 0.4012683033943176, "step": 5 }, { @@ -156,25 +156,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1348.0, - "completions/mean_length": 593.43359375, - "completions/mean_terminated_length": 590.5870971679688, - "completions/min_length": 201.0, - "completions/min_terminated_length": 201.0, + "completions/max_length": 1625.0, + "completions/max_terminated_length": 1625.0, + "completions/mean_length": 602.583984375, + "completions/mean_terminated_length": 601.2994384765625, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, "epoch": 0.005372733378106112, - "frac_reward_zero_std": 0.15625, - "grad_norm": 0.46792041518870797, - "kl": 0.05133056640625, + "frac_reward_zero_std": 0.03125, + "grad_norm": 1.1964223192844583, + "kl": 0.0577392578125, "learning_rate": 1.6666666666666667e-06, - "loss": 0.0476, - "num_tokens": 3759885.0, - "reward": 0.08847656846046448, - "reward_std": 0.028867140412330627, + "loss": 0.0274, + "num_tokens": 3768913.0, + "reward": 0.087890625, + "reward_std": 0.03155777230858803, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.884765625, - "rewards/format_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.87890625, + "rewards/format_reward/std": 0.3265552520751953, "step": 6 }, { @@ -183,26 +183,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1760.0, - "completions/mean_length": 622.640625, - "completions/mean_terminated_length": 619.8512573242188, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1803.0, + "completions/max_terminated_length": 1803.0, + "completions/mean_length": 625.125, + "completions/mean_terminated_length": 625.125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, "epoch": 0.006268188941123797, - "frac_reward_zero_std": 0.3125, - "grad_norm": 0.3715829196297699, - "kl": 0.0545654296875, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.3470545070077696, + "kl": 0.05816650390625, "learning_rate": 2.0000000000000003e-06, - "loss": 0.0305, - "num_tokens": 4475749.0, - "reward": 0.09160156548023224, - "reward_std": 0.022055473178625107, + "loss": 0.0427, + "num_tokens": 4486049.0, + "reward": 0.09316406399011612, + "reward_std": 0.018348829820752144, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.916015625, - "rewards/format_reward/std": 0.2776356339454651, + "rewards/format_reward/mean": 0.931640625, + "rewards/format_reward/std": 0.25260838866233826, "step": 7 }, { @@ -211,26 +211,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1935.0, - "completions/max_terminated_length": 1935.0, - "completions/mean_length": 581.716796875, - "completions/mean_terminated_length": 581.716796875, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 564.080078125, + "completions/mean_terminated_length": 561.1761474609375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, "epoch": 0.007163644504141482, - "frac_reward_zero_std": 0.375, - "grad_norm": 0.31874462958935845, - "kl": 0.05224609375, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.31201491179112845, + "kl": 0.056396484375, "learning_rate": 2.3333333333333336e-06, - "loss": 0.0226, - "num_tokens": 5085092.0, - "reward": 0.09453125298023224, - "reward_std": 0.017671994864940643, + "loss": 0.0299, + "num_tokens": 5086362.0, + "reward": 0.09589843451976776, + "reward_std": 0.013836899772286415, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.9453125, - "rewards/format_reward/std": 0.2275916188955307, + "rewards/format_reward/mean": 0.958984375, + "rewards/format_reward/std": 0.19852031767368317, "step": 8 }, { @@ -241,24 +241,24 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1669.0, - "completions/mean_length": 538.501953125, - "completions/mean_terminated_length": 535.5479125976562, - "completions/min_length": 194.0, - "completions/min_terminated_length": 194.0, + "completions/max_terminated_length": 1808.0, + "completions/mean_length": 541.20703125, + "completions/mean_terminated_length": 538.25830078125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, "epoch": 0.008059100067159167, - "frac_reward_zero_std": 0.71875, - "grad_norm": 0.22707584816837714, - "kl": 0.05548095703125, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.1881523338020528, + "kl": 0.0604248046875, "learning_rate": 2.666666666666667e-06, - "loss": 0.0184, - "num_tokens": 5653829.0, - "reward": 0.09804687649011612, - "reward_std": 0.007317390292882919, + "loss": 0.0108, + "num_tokens": 5656484.0, + "reward": 0.0986328125, + "reward_std": 0.00546875037252903, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.98046875, - "rewards/format_reward/std": 0.1385180652141571, + "rewards/format_reward/mean": 0.986328125, + "rewards/format_reward/std": 0.1162383034825325, "step": 9 }, { @@ -267,26 +267,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1623.0, - "completions/max_terminated_length": 1623.0, - "completions/mean_length": 508.15625, - "completions/mean_terminated_length": 508.15625, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1424.0, + "completions/mean_length": 522.45703125, + "completions/mean_terminated_length": 519.4716186523438, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, "epoch": 0.008954555630176853, "frac_reward_zero_std": 0.9375, - "grad_norm": 0.10154622002931386, - "kl": 0.0634765625, + "grad_norm": 0.10940100591695406, + "kl": 0.069091796875, "learning_rate": 3e-06, - "loss": 0.0023, - "num_tokens": 6199173.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0085, + "num_tokens": 6209150.0, + "reward": 0.09941406548023224, + "reward_std": 0.0018486406188458204, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 0.994140625, + "rewards/format_reward/std": 0.07639661431312561, "step": 10 }, { @@ -295,26 +295,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1765.0, - "completions/mean_length": 553.5234375, - "completions/mean_terminated_length": 549.9627685546875, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/max_terminated_length": 1976.0, + "completions/mean_length": 568.38671875, + "completions/mean_terminated_length": 565.4912109375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, "epoch": 0.009850011193194537, - "frac_reward_zero_std": 0.90625, - "grad_norm": 0.14394814844886103, - "kl": 0.06744384765625, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.14514624791380354, + "kl": 0.0689697265625, "learning_rate": 3.3333333333333333e-06, - "loss": 0.0189, - "num_tokens": 6803009.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": 0.0119, + "num_tokens": 6820596.0, + "reward": 0.09921875596046448, + "reward_std": 0.0031250000465661287, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08812850713729858, "step": 11 }, { @@ -324,25 +324,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1550.0, - "completions/max_terminated_length": 1550.0, - "completions/mean_length": 454.251953125, - "completions/mean_terminated_length": 454.251953125, - "completions/min_length": 78.0, - "completions/min_terminated_length": 78.0, + "completions/max_length": 1487.0, + "completions/max_terminated_length": 1487.0, + "completions/mean_length": 470.0703125, + "completions/mean_terminated_length": 470.0703125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, "epoch": 0.010745466756212223, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.16220747501541533, - "kl": 0.0635986328125, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.09849439146568009, + "kl": 0.07958984375, "learning_rate": 3.6666666666666666e-06, - "loss": 0.0039, - "num_tokens": 7340802.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0002, + "num_tokens": 7366488.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 12 }, { @@ -352,25 +352,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1469.0, - "completions/max_terminated_length": 1469.0, - "completions/mean_length": 469.560546875, - "completions/mean_terminated_length": 469.560546875, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/max_length": 1582.0, + "completions/max_terminated_length": 1582.0, + "completions/mean_length": 516.744140625, + "completions/mean_terminated_length": 516.744140625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, "epoch": 0.011640922319229908, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.1336388368219293, - "kl": 0.0775146484375, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.1514482044995211, + "kl": 0.079345703125, "learning_rate": 4.000000000000001e-06, - "loss": -0.0062, - "num_tokens": 7896049.0, - "reward": 0.09941406548023224, - "reward_std": 0.0018486406188458204, + "loss": 0.0022, + "num_tokens": 7945893.0, + "reward": 0.09921874850988388, + "reward_std": 0.0026298905722796917, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08812850713729858, "step": 13 }, { @@ -381,24 +381,24 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1832.0, - "completions/mean_length": 472.15234375, - "completions/mean_terminated_length": 469.0684814453125, - "completions/min_length": 100.0, - "completions/min_terminated_length": 100.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 531.853515625, + "completions/mean_terminated_length": 528.886474609375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, "epoch": 0.012536377882247594, - "frac_reward_zero_std": 0.84375, - "grad_norm": 0.2405259644807092, - "kl": 0.097412109375, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.28626495216884457, + "kl": 0.1065673828125, "learning_rate": 4.333333333333334e-06, - "loss": -0.0011, - "num_tokens": 8492799.0, - "reward": 0.09882812947034836, - "reward_std": 0.004192390479147434, + "loss": -0.0048, + "num_tokens": 8573210.0, + "reward": 0.09941406548023224, + "reward_std": 0.0023437500931322575, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.98828125, - "rewards/format_reward/std": 0.10772226005792618, + "rewards/format_reward/mean": 0.994140625, + "rewards/format_reward/std": 0.07639661431312561, "step": 14 }, { @@ -407,26 +407,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.875, + "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1911.0, - "completions/mean_length": 452.572265625, - "completions/mean_terminated_length": 433.5972595214844, - "completions/min_length": 98.0, - "completions/min_terminated_length": 98.0, + "completions/max_terminated_length": 1480.0, + "completions/mean_length": 483.9921875, + "completions/mean_terminated_length": 480.0843505859375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, "epoch": 0.013431833445265278, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.1753416604203705, - "kl": 0.171630859375, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.31691147197037534, + "kl": 0.1241455078125, "learning_rate": 4.666666666666667e-06, - "loss": 0.0017, - "num_tokens": 9043940.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.019, + "num_tokens": 9140438.0, + "reward": 0.09843750298023224, + "reward_std": 0.004670868627727032, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12414088100194931, "step": 15 }, { @@ -435,26 +435,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1929.0, - "completions/mean_length": 476.982421875, - "completions/mean_terminated_length": 464.00592041015625, - "completions/min_length": 121.0, - "completions/min_terminated_length": 121.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 1855.0, + "completions/max_terminated_length": 1855.0, + "completions/mean_length": 519.79296875, + "completions/mean_terminated_length": 512.6004028320312, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.014327289008282965, - "frac_reward_zero_std": 0.96875, - "grad_norm": 1.0552577658632907, - "kl": 0.169189453125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1648930734651514, + "kl": 0.158447265625, "learning_rate": 5e-06, - "loss": 0.0024, - "num_tokens": 9613259.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0016, + "num_tokens": 9731676.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 16 }, { @@ -465,24 +465,24 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1642.0, - "completions/mean_length": 417.689453125, - "completions/mean_terminated_length": 406.4466552734375, - "completions/min_length": 57.0, - "completions/min_terminated_length": 57.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 504.20703125, + "completions/mean_terminated_length": 493.0751037597656, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, "epoch": 0.015222744571300649, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.26917824685344643, - "kl": 0.1390380859375, + "grad_norm": 1.4330453571845125, + "kl": 0.161865234375, "learning_rate": 4.999952797253148e-06, - "loss": 0.0027, - "num_tokens": 10112252.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": -0.0019, + "num_tokens": 10274966.0, + "reward": 0.099609375, + "reward_std": 0.0010673906654119492, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 17 }, { @@ -491,26 +491,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.953125, + "completions/clipped_ratio": -6.796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1843.0, - "completions/mean_length": 462.291015625, - "completions/mean_terminated_length": 452.94500732421875, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 615.103515625, + "completions/mean_terminated_length": 580.6132202148438, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, "epoch": 0.016118200134318333, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.2947535239705964, - "kl": 0.117431640625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 1.110370207766606, + "kl": 0.204833984375, "learning_rate": 4.9998111909931225e-06, - "loss": 0.0012, - "num_tokens": 10691873.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.002, + "num_tokens": 10932827.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 18 }, { @@ -519,26 +519,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 901.0, - "completions/max_terminated_length": 901.0, - "completions/mean_length": 376.89453125, - "completions/mean_terminated_length": 376.89453125, - "completions/min_length": 114.0, - "completions/min_terminated_length": 114.0, + "completions/clipped_ratio": -6.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 600.470703125, + "completions/mean_terminated_length": 537.7999877929688, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, "epoch": 0.01701365569733602, - "frac_reward_zero_std": 0.90625, - "grad_norm": 0.21872537582484822, - "kl": 0.0916748046875, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.4145181105492008, + "kl": 0.245361328125, "learning_rate": 4.999575187161439e-06, - "loss": 0.0064, - "num_tokens": 11171243.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": 0.0078, + "num_tokens": 11526668.0, + "reward": 0.09921875596046448, + "reward_std": 0.0031250000465661287, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08812850713729858, "step": 19 }, { @@ -547,20 +547,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1383.0, - "completions/max_terminated_length": 1383.0, - "completions/mean_length": 386.00390625, - "completions/mean_terminated_length": 386.00390625, - "completions/min_length": 72.0, - "completions/min_terminated_length": 72.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 584.4921875, + "completions/mean_terminated_length": 549.3680419921875, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.017909111260353706, "frac_reward_zero_std": 0.9375, - "grad_norm": 0.13652379466986567, - "kl": 0.0869140625, + "grad_norm": 1.2229303856627542, + "kl": 0.143310546875, "learning_rate": 4.9992447956603455e-06, - "loss": -0.0022, - "num_tokens": 11662493.0, + "loss": -0.0046, + "num_tokens": 12119544.0, "reward": 0.099609375, "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, @@ -575,26 +575,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 893.0, - "completions/max_terminated_length": 893.0, - "completions/mean_length": 333.017578125, - "completions/mean_terminated_length": 333.017578125, - "completions/min_length": 70.0, - "completions/min_terminated_length": 70.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1949.0, + "completions/mean_length": 480.513671875, + "completions/mean_terminated_length": 471.2750549316406, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, "epoch": 0.01880456682337139, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.04417150040050226, - "kl": 0.095703125, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.10797165131936656, + "kl": 0.093505859375, "learning_rate": 4.998820030352409e-06, - "loss": 0.001, - "num_tokens": 12113686.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0073, + "num_tokens": 12646255.0, + "reward": 0.099609375, + "reward_std": 0.0010673906654119492, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 21 }, { @@ -603,26 +603,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1031.0, - "completions/max_terminated_length": 1031.0, - "completions/mean_length": 387.27734375, - "completions/mean_terminated_length": 387.27734375, - "completions/min_length": 117.0, - "completions/min_terminated_length": 117.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1508.0, + "completions/mean_length": 522.076171875, + "completions/mean_terminated_length": 513.08251953125, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.019700022386389075, - "frac_reward_zero_std": 0.90625, - "grad_norm": 0.19366315929923666, - "kl": 0.0830078125, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.1267456048768045, + "kl": 0.0908203125, "learning_rate": 4.998300909059929e-06, - "loss": 0.0035, - "num_tokens": 12608996.0, - "reward": 0.09902343899011612, - "reward_std": 0.0028222277760505676, + "loss": 0.0084, + "num_tokens": 13210582.0, + "reward": 0.099609375, + "reward_std": 0.0010673906654119492, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.990234375, - "rewards/format_reward/std": 0.09843364357948303, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 22 }, { @@ -631,26 +631,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 900.0, - "completions/max_terminated_length": 900.0, - "completions/mean_length": 393.1796875, - "completions/mean_terminated_length": 393.1796875, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/clipped_ratio": -6.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1926.0, + "completions/mean_length": 593.544921875, + "completions/mean_terminated_length": 528.2428588867188, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, "epoch": 0.02059547794940676, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.08926508511842418, - "kl": 0.081298828125, + "frac_reward_zero_std": 0.5625, + "grad_norm": 3.0222232303189247, + "kl": 0.0986328125, "learning_rate": 4.997687453564198e-06, - "loss": 0.0004, - "num_tokens": 13080752.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0604, + "num_tokens": 13784925.0, + "reward": 0.09121093899011612, + "reward_std": 0.015414923429489136, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 0.912109375, + "rewards/format_reward/std": 0.2834126651287079, "step": 23 }, { @@ -659,26 +659,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1882.0, - "completions/max_terminated_length": 1882.0, - "completions/mean_length": 444.2734375, - "completions/mean_terminated_length": 444.2734375, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, - "epoch": 0.021490933512424447, - "frac_reward_zero_std": 0.90625, - "grad_norm": 0.12692069614371276, - "kl": 0.085693359375, + "completions/clipped_ratio": -6.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1936.0, + "completions/mean_length": 547.720703125, + "completions/mean_terminated_length": 526.9247436523438, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.021490933512424447, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12835045627092287, + "kl": 0.0906982421875, "learning_rate": 4.9969796896045775e-06, - "loss": 0.0076, - "num_tokens": 13661468.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": 0.0009, + "num_tokens": 14418606.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 24 }, { @@ -687,26 +687,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 1259.0, - "completions/max_terminated_length": 1259.0, - "completions/mean_length": 439.455078125, - "completions/mean_terminated_length": 436.3902282714844, - "completions/min_length": 160.0, - "completions/min_terminated_length": 160.0, + "completions/clipped_ratio": -6.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1878.0, + "completions/mean_length": 543.8203125, + "completions/mean_terminated_length": 504.6332702636719, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, "epoch": 0.02238638907544213, - "frac_reward_zero_std": 0.90625, - "grad_norm": 20.877169149767465, - "kl": 0.09375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035870702477187856, + "kl": 0.091064453125, "learning_rate": 4.996177646877426e-06, - "loss": 0.0144, - "num_tokens": 14154069.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": 0.0009, + "num_tokens": 14964642.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 25 }, { @@ -715,26 +715,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.90625, + "completions/clipped_ratio": -6.6875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1895.0, - "completions/mean_length": 474.58984375, - "completions/mean_terminated_length": 462.3340148925781, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 1726.0, + "completions/mean_length": 569.7109375, + "completions/mean_terminated_length": 509.61785888671875, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, "epoch": 0.023281844638459816, - "frac_reward_zero_std": 0.90625, - "grad_norm": 92.43206042257317, - "kl": 7.2659912109375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06374825486217874, + "kl": 0.0888671875, "learning_rate": 4.995281359034851e-06, - "loss": 0.0913, - "num_tokens": 14701171.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": 0.0009, + "num_tokens": 15560446.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 26 }, { @@ -743,26 +743,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 1354.0, - "completions/max_terminated_length": 1354.0, - "completions/mean_length": 451.283203125, - "completions/mean_terminated_length": 449.9921569824219, - "completions/min_length": 125.0, - "completions/min_terminated_length": 125.0, + "completions/clipped_ratio": -6.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 578.88671875, + "completions/mean_terminated_length": 493.89666748046875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, "epoch": 0.0241773002014775, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.15669283865685832, - "kl": 0.0904541015625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04349623576785538, + "kl": 0.08447265625, "learning_rate": 4.994290863683296e-06, - "loss": 0.0103, - "num_tokens": 15202436.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0008, + "num_tokens": 16127044.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 27 }, { @@ -771,26 +771,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 1410.0, - "completions/max_terminated_length": 1410.0, - "completions/mean_length": 478.384765625, - "completions/mean_terminated_length": 476.5616455078125, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/clipped_ratio": -6.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 635.462890625, + "completions/mean_terminated_length": 544.4262084960938, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, "epoch": 0.025072755764495188, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.07431876710651732, - "kl": 0.088623046875, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.10869987782546223, + "kl": 0.0858154296875, "learning_rate": 4.99320620238196e-06, - "loss": 0.0051, - "num_tokens": 15732137.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0015, + "num_tokens": 16737169.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 28 }, { @@ -799,20 +799,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.953125, - "completions/max_length": 1981.0, - "completions/max_terminated_length": 1981.0, - "completions/mean_length": 495.1640625, - "completions/mean_terminated_length": 489.8369445800781, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/clipped_ratio": -6.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 549.55859375, + "completions/mean_terminated_length": 516.65869140625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, "epoch": 0.025968211327512872, "frac_reward_zero_std": 0.96875, - "grad_norm": 1.419703912441118, - "kl": 0.1038818359375, + "grad_norm": 0.060121213159789126, + "kl": 0.0743408203125, "learning_rate": 4.99202742064106e-06, - "loss": -0.0017, - "num_tokens": 16339613.0, + "loss": 0.0017, + "num_tokens": 17372495.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -827,26 +827,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.953125, - "completions/max_length": 1830.0, - "completions/max_terminated_length": 1830.0, - "completions/mean_length": 483.849609375, - "completions/mean_terminated_length": 479.77606201171875, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1435.0, + "completions/mean_length": 512.666015625, + "completions/mean_terminated_length": 494.4604797363281, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, "epoch": 0.026863666890530557, - "frac_reward_zero_std": 0.96875, - "grad_norm": 20.526980124430374, - "kl": 0.110595703125, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.08729934209278342, + "kl": 0.0672607421875, "learning_rate": 4.990754567919917e-06, - "loss": 0.0162, - "num_tokens": 16889936.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0012, + "num_tokens": 17937572.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 30 }, { @@ -855,20 +855,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 1573.0, - "completions/max_terminated_length": 1573.0, - "completions/mean_length": 477.513671875, - "completions/mean_terminated_length": 475.369873046875, - "completions/min_length": 159.0, - "completions/min_terminated_length": 159.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1663.0, + "completions/max_terminated_length": 1663.0, + "completions/mean_length": 490.388671875, + "completions/mean_terminated_length": 490.388671875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, "epoch": 0.02775912245354824, "frac_reward_zero_std": 1.0, - "grad_norm": 0.3115792345317698, - "kl": 0.1025390625, + "grad_norm": 0.014912187748278594, + "kl": 0.065185546875, "learning_rate": 4.989387697624881e-06, - "loss": 0.001, - "num_tokens": 17437175.0, + "loss": 0.0007, + "num_tokens": 18491403.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -884,25 +884,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, - "completions/max_length": 1254.0, - "completions/max_terminated_length": 1254.0, - "completions/mean_length": 497.666015625, - "completions/mean_terminated_length": 496.96673583984375, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1426.0, + "completions/mean_length": 563.92578125, + "completions/mean_terminated_length": 561.0215454101562, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, "epoch": 0.02865457801656593, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.4880352589785924, - "kl": 0.1168212890625, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.10732893682943874, + "kl": 0.069091796875, "learning_rate": 4.987926867107095e-06, - "loss": 0.0045, - "num_tokens": 18036748.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0117, + "num_tokens": 19124901.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 32 }, { @@ -911,26 +911,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, - "completions/max_length": 1841.0, - "completions/max_terminated_length": 1841.0, - "completions/mean_length": 492.521484375, - "completions/mean_terminated_length": 484.14959716796875, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1379.0, + "completions/max_terminated_length": 1379.0, + "completions/mean_length": 527.953125, + "completions/mean_terminated_length": 527.953125, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.029550033579583614, - "frac_reward_zero_std": 0.96875, - "grad_norm": 5.555072095753837, - "kl": 0.425537109375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013880347239008294, + "kl": 0.0657958984375, "learning_rate": 4.986372137660078e-06, - "loss": 0.0105, - "num_tokens": 18605991.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0007, + "num_tokens": 19712285.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 33 }, { @@ -939,26 +939,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.921875, - "completions/max_length": 1231.0, - "completions/max_terminated_length": 1231.0, - "completions/mean_length": 470.818359375, - "completions/mean_terminated_length": 466.25048828125, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1420.0, + "completions/max_terminated_length": 1420.0, + "completions/mean_length": 478.341796875, + "completions/mean_terminated_length": 478.341796875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.030445489142601298, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.061669261382861086, - "kl": 0.1256103515625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.11245191546489028, + "kl": 0.06451416015625, "learning_rate": 4.984723574517165e-06, - "loss": 0.0013, - "num_tokens": 19125818.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0043, + "num_tokens": 20235964.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 34 }, { @@ -967,20 +967,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.890625, - "completions/max_length": 1387.0, - "completions/max_terminated_length": 1387.0, - "completions/mean_length": 532.9296875, - "completions/mean_terminated_length": 525.10498046875, - "completions/min_length": 172.0, - "completions/min_terminated_length": 172.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1743.0, + "completions/max_terminated_length": 1743.0, + "completions/mean_length": 538.541015625, + "completions/mean_terminated_length": 538.541015625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, "epoch": 0.03134094470561898, "frac_reward_zero_std": 0.90625, - "grad_norm": 17.57107852556159, - "kl": 0.1639404296875, + "grad_norm": 0.10103218237797797, + "kl": 0.0614013671875, "learning_rate": 4.9829812468487655e-06, - "loss": 0.0174, - "num_tokens": 19718790.0, + "loss": -0.0031, + "num_tokens": 20831809.0, "reward": 0.09941406548023224, "reward_std": 0.0023437500931322575, "rewards/code_reward/mean": 0.0, @@ -995,26 +995,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.734375, - "completions/max_length": 1184.0, - "completions/max_terminated_length": 1184.0, - "completions/mean_length": 516.59765625, - "completions/mean_terminated_length": 496.25860595703125, - "completions/min_length": 188.0, - "completions/min_terminated_length": 188.0, + "completions/clipped_ratio": -6.765625, + "completions/max_length": 1589.0, + "completions/max_terminated_length": 1589.0, + "completions/mean_length": 534.1484375, + "completions/mean_terminated_length": 521.9295654296875, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, "epoch": 0.03223640026863667, - "frac_reward_zero_std": 0.9375, - "grad_norm": 17.78488865540282, - "kl": 0.3154296875, + "frac_reward_zero_std": 0.96875, + "grad_norm": 20.200206186476855, + "kl": 2.783203125, "learning_rate": 4.981145227759457e-06, - "loss": 0.0204, - "num_tokens": 20301240.0, - "reward": 0.09726562350988388, - "reward_std": 0.0020409778226166964, + "loss": 0.0329, + "num_tokens": 21423245.0, + "reward": 0.09707031399011612, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.97265625, - "rewards/format_reward/std": 0.16324250400066376, + "rewards/format_reward/mean": 0.970703125, + "rewards/format_reward/std": 0.16880230605602264, "step": 36 }, { @@ -1023,26 +1023,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.84375, - "completions/max_length": 1759.0, - "completions/max_terminated_length": 1759.0, - "completions/mean_length": 550.166015625, - "completions/mean_terminated_length": 536.2271118164062, - "completions/min_length": 180.0, - "completions/min_terminated_length": 180.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1672.0, + "completions/max_terminated_length": 1672.0, + "completions/mean_length": 572.818359375, + "completions/mean_terminated_length": 571.872802734375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, "epoch": 0.03313185583165435, - "frac_reward_zero_std": 0.90625, - "grad_norm": 7806.320146811803, - "kl": 572.0, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.6440547223266587, + "kl": 0.12445068359375, "learning_rate": 4.979215594284924e-06, - "loss": 5.7231, - "num_tokens": 20888077.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": 0.0065, + "num_tokens": 22021680.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 37 }, { @@ -1051,26 +1051,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.890625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1769.0, - "completions/mean_length": 570.6875, - "completions/mean_terminated_length": 558.4415893554688, - "completions/min_length": 100.0, - "completions/min_terminated_length": 100.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1848.0, + "completions/max_terminated_length": 1848.0, + "completions/mean_length": 580.0234375, + "completions/mean_terminated_length": 580.0234375, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, "epoch": 0.03402731139467204, - "frac_reward_zero_std": 0.9375, - "grad_norm": 29.0123039860226, - "kl": 2.9072265625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.058757743235516635, + "kl": 0.06072998046875, "learning_rate": 4.977192427388722e-06, - "loss": 0.051, - "num_tokens": 21505389.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0025, + "num_tokens": 22643772.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 38 }, { @@ -1079,26 +1079,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.90625, - "completions/max_length": 1847.0, - "completions/max_terminated_length": 1847.0, - "completions/mean_length": 538.337890625, - "completions/mean_terminated_length": 531.9466552734375, - "completions/min_length": 113.0, - "completions/min_terminated_length": 113.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1413.0, + "completions/max_terminated_length": 1413.0, + "completions/mean_length": 548.609375, + "completions/mean_terminated_length": 548.609375, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.03492276695768973, - "frac_reward_zero_std": 0.90625, - "grad_norm": 1.830223160421892, - "kl": 0.1162109375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012598270654231074, + "kl": 0.058837890625, "learning_rate": 4.9750758119588824e-06, - "loss": -0.0029, - "num_tokens": 22082218.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": 0.0006, + "num_tokens": 23225860.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 39 }, { @@ -1107,26 +1107,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.84375, - "completions/max_length": 1363.0, - "completions/max_terminated_length": 1363.0, - "completions/mean_length": 560.26953125, - "completions/mean_terminated_length": 548.7888793945312, - "completions/min_length": 186.0, - "completions/min_terminated_length": 186.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1677.0, + "completions/max_terminated_length": 1677.0, + "completions/mean_length": 580.900390625, + "completions/mean_terminated_length": 580.900390625, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, "epoch": 0.03581822252070741, - "frac_reward_zero_std": 0.96875, - "grad_norm": 5.189694636533056, - "kl": 0.1761474609375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012729652253403266, + "kl": 0.05987548828125, "learning_rate": 4.972865836804349e-06, - "loss": 0.006, - "num_tokens": 22659380.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0006, + "num_tokens": 23813585.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 40 }, { @@ -1135,20 +1135,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, - "completions/max_length": 1279.0, - "completions/max_terminated_length": 1279.0, - "completions/mean_length": 478.83203125, - "completions/mean_terminated_length": 475.21063232421875, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1436.0, + "completions/max_terminated_length": 1436.0, + "completions/mean_length": 509.9296875, + "completions/mean_terminated_length": 509.9296875, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, "epoch": 0.036713678083725096, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.08930606384058017, - "kl": 0.115966796875, + "grad_norm": 0.07495467584140274, + "kl": 0.05712890625, "learning_rate": 4.970562594651254e-06, - "loss": -0.0013, - "num_tokens": 23177790.0, + "loss": 0.0019, + "num_tokens": 24347917.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -1163,26 +1163,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.921875, - "completions/max_length": 1525.0, - "completions/max_terminated_length": 1525.0, - "completions/mean_length": 544.818359375, - "completions/mean_terminated_length": 535.7534790039062, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1801.0, + "completions/max_terminated_length": 1801.0, + "completions/mean_length": 552.830078125, + "completions/mean_terminated_length": 552.830078125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, "epoch": 0.03760913364674278, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.12171028839751209, - "kl": 0.1319580078125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014256239329774286, + "kl": 0.05963134765625, "learning_rate": 4.968166182139026e-06, - "loss": 0.0031, - "num_tokens": 23784705.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0006, + "num_tokens": 24958934.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 42 }, { @@ -1191,26 +1191,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.796875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1370.0, - "completions/mean_length": 490.720703125, - "completions/mean_terminated_length": 470.34869384765625, - "completions/min_length": 85.0, - "completions/min_terminated_length": 85.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1480.0, + "completions/max_terminated_length": 1480.0, + "completions/mean_length": 481.08984375, + "completions/mean_terminated_length": 481.08984375, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, "epoch": 0.038504589209760465, - "frac_reward_zero_std": 0.96875, - "grad_norm": 7.7636935355164045, - "kl": 0.246826171875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012410355612420237, + "kl": 0.06182861328125, "learning_rate": 4.9656766998163306e-06, - "loss": 0.0094, - "num_tokens": 24299730.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0006, + "num_tokens": 25469028.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 43 }, { @@ -1219,26 +1219,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.921875, + "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1637.0, - "completions/mean_length": 527.525390625, - "completions/mean_terminated_length": 520.1400756835938, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/max_terminated_length": 1624.0, + "completions/mean_length": 538.57421875, + "completions/mean_terminated_length": 533.486328125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, "epoch": 0.03940004477277815, - "frac_reward_zero_std": 0.96875, - "grad_norm": 4.974293908259754, - "kl": 0.6517333984375, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.11410330840934924, + "kl": 0.081298828125, "learning_rate": 4.963094252136865e-06, - "loss": 0.0236, - "num_tokens": 24869215.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0112, + "num_tokens": 26044170.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 44 }, { @@ -1247,26 +1247,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.890625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1675.0, - "completions/mean_length": 517.080078125, - "completions/mean_terminated_length": 506.8396301269531, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1548.0, + "completions/max_terminated_length": 1548.0, + "completions/mean_length": 526.244140625, + "completions/mean_terminated_length": 526.244140625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, "epoch": 0.040295500335795834, - "frac_reward_zero_std": 0.90625, - "grad_norm": 5.969358530129748, - "kl": 0.1383056640625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008982764516293115, + "kl": 0.06011962890625, "learning_rate": 4.960418947454958e-06, - "loss": 0.0292, - "num_tokens": 25461880.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": 0.0006, + "num_tokens": 26641527.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 45 }, { @@ -1275,26 +1275,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, - "completions/max_length": 1767.0, - "completions/max_terminated_length": 1767.0, - "completions/mean_length": 547.33203125, - "completions/mean_terminated_length": 540.8327026367188, - "completions/min_length": 75.0, - "completions/min_terminated_length": 75.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1526.0, + "completions/max_terminated_length": 1526.0, + "completions/mean_length": 545.90234375, + "completions/mean_terminated_length": 545.90234375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, "epoch": 0.04119095589881352, - "frac_reward_zero_std": 0.96875, - "grad_norm": 3.964515594688607, - "kl": 0.09521484375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011841241698184731, + "kl": 0.0606689453125, "learning_rate": 4.957650898021038e-06, - "loss": 0.0095, - "num_tokens": 26040930.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0006, + "num_tokens": 27219845.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 46 }, { @@ -1303,26 +1303,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.875, + "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 569.171875, - "completions/mean_terminated_length": 545.6984252929688, - "completions/min_length": 55.0, - "completions/min_terminated_length": 55.0, + "completions/max_terminated_length": 1706.0, + "completions/mean_length": 566.806640625, + "completions/mean_terminated_length": 563.9080200195312, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, "epoch": 0.04208641146183121, "frac_reward_zero_std": 0.90625, - "grad_norm": 3.633363104209369, - "kl": 0.6241455078125, + "grad_norm": 0.11052168377753285, + "kl": 0.0595703125, "learning_rate": 4.954790219976915e-06, - "loss": 0.0428, - "num_tokens": 26644090.0, - "reward": 0.09921875596046448, - "reward_std": 0.0026298905722796917, + "loss": 0.0108, + "num_tokens": 27821794.0, + "reward": 0.09941406548023224, + "reward_std": 0.0023437500931322575, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.9921875, - "rewards/format_reward/std": 0.08812850713729858, + "rewards/format_reward/mean": 0.994140625, + "rewards/format_reward/std": 0.07639661431312561, "step": 47 }, { @@ -1331,26 +1331,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.671875, + "completions/clipped_ratio": -6.8125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1566.0, - "completions/mean_length": 581.9375, - "completions/mean_terminated_length": 558.0386962890625, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/max_terminated_length": 1283.0, + "completions/mean_length": 598.818359375, + "completions/mean_terminated_length": 588.4580078125, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, "epoch": 0.042981867024848894, - "frac_reward_zero_std": 0.90625, - "grad_norm": 1.3607360038408776, - "kl": 0.390625, + "frac_reward_zero_std": 0.875, + "grad_norm": 3.73818212098571, + "kl": 0.09808349609375, "learning_rate": 4.95183703335091e-06, - "loss": 0.0081, - "num_tokens": 27297130.0, - "reward": 0.0966796875, - "reward_std": 0.0023437500931322575, + "loss": 0.0196, + "num_tokens": 28483477.0, + "reward": 0.09726563096046448, + "reward_std": 0.003839729819446802, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.966796875, - "rewards/format_reward/std": 0.17934183776378632, + "rewards/format_reward/mean": 0.97265625, + "rewards/format_reward/std": 0.16324250400066376, "step": 48 }, { @@ -1359,20 +1359,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, + "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1945.0, - "completions/mean_length": 546.5859375, - "completions/mean_terminated_length": 539.001953125, - "completions/min_length": 128.0, - "completions/min_terminated_length": 128.0, + "completions/max_terminated_length": 1792.0, + "completions/mean_length": 584.134765625, + "completions/mean_terminated_length": 581.2700805664062, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, "epoch": 0.04387732258786658, "frac_reward_zero_std": 1.0, - "grad_norm": 0.9432769696523784, - "kl": 0.1865234375, + "grad_norm": 0.009853203927592551, + "kl": 0.05950927734375, "learning_rate": 4.948791462052819e-06, - "loss": 0.0019, - "num_tokens": 27855910.0, + "loss": 0.0006, + "num_tokens": 29061482.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -1387,20 +1387,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, - "completions/max_length": 1071.0, - "completions/max_terminated_length": 1071.0, - "completions/mean_length": 510.458984375, - "completions/mean_terminated_length": 506.47833251953125, - "completions/min_length": 160.0, - "completions/min_terminated_length": 160.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1504.0, + "completions/max_terminated_length": 1504.0, + "completions/mean_length": 558.048828125, + "completions/mean_terminated_length": 558.048828125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, "epoch": 0.04477277815088426, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.08737982579415644, - "kl": 0.091796875, + "grad_norm": 0.06158028008338189, + "kl": 0.06207275390625, "learning_rate": 4.945653633868716e-06, - "loss": 0.0053, - "num_tokens": 28437265.0, + "loss": 0.0002, + "num_tokens": 29667203.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -1415,22 +1415,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.953125, - "completions/max_length": 1840.0, - "completions/max_terminated_length": 1840.0, - "completions/mean_length": 559.35546875, - "completions/mean_terminated_length": 553.75048828125, - "completions/min_length": 115.0, - "completions/min_terminated_length": 115.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1496.0, + "completions/max_terminated_length": 1496.0, + "completions/mean_length": 589.54296875, + "completions/mean_terminated_length": 589.54296875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.04566823371390195, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06582492357236211, - "kl": 0.0966796875, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.085918900087203, + "kl": 0.06072998046875, "learning_rate": 4.942423680455584e-06, - "loss": 0.0007, - "num_tokens": 29032375.0, + "loss": 0.0021, + "num_tokens": 30277769.0, "reward": 0.099609375, - "reward_std": 0.0010673906654119492, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.99609375, @@ -1443,20 +1443,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.859375, - "completions/max_length": 1732.0, - "completions/max_terminated_length": 1732.0, - "completions/mean_length": 529.138671875, - "completions/mean_terminated_length": 513.461181640625, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1400.0, + "completions/max_terminated_length": 1400.0, + "completions/mean_length": 566.83203125, + "completions/mean_terminated_length": 565.9589233398438, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, "epoch": 0.04656368927691963, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.24454712354121277, - "kl": 0.193115234375, + "grad_norm": 13.53527608696407, + "kl": 0.06634521484375, "learning_rate": 4.939101737335802e-06, - "loss": 0.012, - "num_tokens": 29592686.0, + "loss": 0.0112, + "num_tokens": 30857379.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -1471,26 +1471,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1830.0, - "completions/mean_length": 514.66796875, - "completions/mean_terminated_length": 507.16534423828125, - "completions/min_length": 125.0, - "completions/min_terminated_length": 125.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2036.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 568.298828125, + "completions/mean_terminated_length": 566.6731567382812, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.047459144839937316, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.07209160390847483, - "kl": 0.0982666015625, + "frac_reward_zero_std": 1.0, + "grad_norm": 753.4933997875953, + "kl": 40.29571533203125, "learning_rate": 4.935687943891447e-06, - "loss": 0.0171, - "num_tokens": 30153252.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.404, + "num_tokens": 31445404.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 53 }, { @@ -1499,20 +1499,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.90625, - "completions/max_length": 1958.0, - "completions/max_terminated_length": 1958.0, - "completions/mean_length": 556.134765625, - "completions/mean_terminated_length": 545.2826538085938, - "completions/min_length": 74.0, - "completions/min_terminated_length": 74.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1762.0, + "completions/mean_length": 606.25, + "completions/mean_terminated_length": 603.4285888671875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.048354600402955, "frac_reward_zero_std": 0.96875, - "grad_norm": 1.2182649822769658, - "kl": 0.1866455078125, + "grad_norm": 0.07880783979440148, + "kl": 0.0592041015625, "learning_rate": 4.932182443358458e-06, - "loss": 0.013, - "num_tokens": 30773801.0, + "loss": 0.0101, + "num_tokens": 32091612.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -1527,26 +1527,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.921875, - "completions/max_length": 1496.0, - "completions/max_terminated_length": 1496.0, - "completions/mean_length": 520.10546875, - "completions/mean_terminated_length": 513.6646728515625, - "completions/min_length": 160.0, - "completions/min_terminated_length": 160.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1750.0, + "completions/mean_length": 554.294921875, + "completions/mean_terminated_length": 548.4373168945312, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, "epoch": 0.04925005596597269, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.06929038708329119, - "kl": 0.1387939453125, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.08726278881698582, + "kl": 0.05682373046875, "learning_rate": 4.928585382820616e-06, - "loss": 0.0014, - "num_tokens": 31299967.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0059, + "num_tokens": 32635283.0, + "reward": 0.09941406548023224, + "reward_std": 0.0018486406188458204, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.994140625, + "rewards/format_reward/std": 0.07639661431312561, "step": 55 }, { @@ -1555,26 +1555,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 1904.0, - "completions/max_terminated_length": 1904.0, - "completions/mean_length": 542.359375, - "completions/mean_terminated_length": 540.060791015625, - "completions/min_length": 185.0, - "completions/min_terminated_length": 185.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1862.0, + "completions/max_terminated_length": 1862.0, + "completions/mean_length": 616.517578125, + "completions/mean_terminated_length": 616.517578125, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, "epoch": 0.050145511528990376, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.15563688859048186, - "kl": 0.104736328125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00787517172635136, + "kl": 0.05615234375, "learning_rate": 4.924896913203376e-06, - "loss": -0.0022, - "num_tokens": 31888535.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0006, + "num_tokens": 33261820.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 56 }, { @@ -1584,25 +1584,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, - "completions/max_length": 1506.0, - "completions/max_terminated_length": 1506.0, - "completions/mean_length": 516.908203125, - "completions/mean_terminated_length": 515.5968627929688, - "completions/min_length": 153.0, - "completions/min_terminated_length": 153.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1443.0, + "completions/mean_length": 579.927734375, + "completions/mean_terminated_length": 577.0548095703125, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, "epoch": 0.05104096709200806, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.09565916384806829, - "kl": 0.07861328125, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.09467657721643793, + "kl": 0.0556640625, "learning_rate": 4.921117189267535e-06, - "loss": 0.0044, - "num_tokens": 32461608.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0123, + "num_tokens": 33867159.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 57 }, { @@ -1611,26 +1611,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 1916.0, - "completions/max_terminated_length": 1916.0, - "completions/mean_length": 525.482421875, - "completions/mean_terminated_length": 521.7431640625, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1406.0, + "completions/max_terminated_length": 1406.0, + "completions/mean_length": 573.78125, + "completions/mean_terminated_length": 573.78125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, "epoch": 0.051936422655025745, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.0748699769594785, - "kl": 0.1156005859375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007058775392801076, + "kl": 0.05938720703125, "learning_rate": 4.917246369602742e-06, - "loss": 0.0002, - "num_tokens": 33044847.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0006, + "num_tokens": 34475127.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 58 }, { @@ -1640,19 +1640,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, - "completions/max_length": 2001.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 514.23046875, - "completions/mean_terminated_length": 513.3111572265625, - "completions/min_length": 188.0, - "completions/min_terminated_length": 188.0, + "completions/max_length": 1731.0, + "completions/max_terminated_length": 1731.0, + "completions/mean_length": 574.1796875, + "completions/mean_terminated_length": 572.4011840820312, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, "epoch": 0.05283187821804343, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.03709928067002307, - "kl": 0.079833984375, + "grad_norm": 6.081099094392287, + "kl": 0.35699462890625, "learning_rate": 4.9132846166208355e-06, - "loss": -0.0006, - "num_tokens": 33620501.0, + "loss": 0.0019, + "num_tokens": 35081475.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -1667,26 +1667,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1458.0, - "completions/mean_length": 511.310546875, - "completions/mean_terminated_length": 505.2843322753906, - "completions/min_length": 118.0, - "completions/min_terminated_length": 118.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1659.0, + "completions/max_terminated_length": 1659.0, + "completions/mean_length": 580.228515625, + "completions/mean_terminated_length": 580.228515625, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, "epoch": 0.053727333781061114, - "frac_reward_zero_std": 0.875, - "grad_norm": 0.13840494930716188, - "kl": 0.06787109375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007985885460781206, + "kl": 0.05706787109375, "learning_rate": 4.9092320965490365e-06, - "loss": 0.0211, - "num_tokens": 34206116.0, - "reward": 0.09902343899011612, - "reward_std": 0.003411140525713563, + "loss": 0.0006, + "num_tokens": 35702376.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.990234375, - "rewards/format_reward/std": 0.09843364357948303, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 60 }, { @@ -1696,25 +1696,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1895.0, - "completions/max_terminated_length": 1895.0, - "completions/mean_length": 518.853515625, - "completions/mean_terminated_length": 518.853515625, - "completions/min_length": 121.0, - "completions/min_terminated_length": 121.0, + "completions/max_length": 1325.0, + "completions/max_terminated_length": 1325.0, + "completions/mean_length": 575.150390625, + "completions/mean_terminated_length": 575.150390625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, "epoch": 0.0546227893440788, - "frac_reward_zero_std": 0.875, - "grad_norm": 0.14872642942212774, - "kl": 0.0692138671875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008989013474146063, + "kl": 0.05889892578125, "learning_rate": 4.905088979422971e-06, - "loss": 0.0128, - "num_tokens": 34793785.0, - "reward": 0.09921875596046448, - "reward_std": 0.0031250000465661287, + "loss": 0.0006, + "num_tokens": 36318869.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.9921875, - "rewards/format_reward/std": 0.08812850713729858, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 61 }, { @@ -1724,25 +1724,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, - "completions/max_length": 1878.0, - "completions/max_terminated_length": 1878.0, - "completions/mean_length": 474.60546875, - "completions/mean_terminated_length": 473.1917724609375, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/max_length": 1401.0, + "completions/max_terminated_length": 1401.0, + "completions/mean_length": 543.685546875, + "completions/mean_terminated_length": 542.7866821289062, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, "epoch": 0.05551824490709648, - "frac_reward_zero_std": 0.90625, - "grad_norm": 0.12988434784735217, - "kl": 0.0841064453125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01919284660807941, + "kl": 0.06610107421875, "learning_rate": 4.900855439079536e-06, - "loss": -0.0012, - "num_tokens": 35339935.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": 0.0007, + "num_tokens": 36900388.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 62 }, { @@ -1752,19 +1752,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1579.0, - "completions/max_terminated_length": 1579.0, - "completions/mean_length": 504.658203125, - "completions/mean_terminated_length": 504.658203125, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/max_length": 1739.0, + "completions/max_terminated_length": 1739.0, + "completions/mean_length": 568.287109375, + "completions/mean_terminated_length": 568.287109375, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, "epoch": 0.056413700470114174, "frac_reward_zero_std": 0.9375, - "grad_norm": 0.07899678142978885, - "kl": 0.06787109375, + "grad_norm": 0.07846205338958102, + "kl": 0.05694580078125, "learning_rate": 4.8965316531496055e-06, - "loss": -0.0002, - "num_tokens": 35911392.0, + "loss": -0.0019, + "num_tokens": 37504423.0, "reward": 0.099609375, "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, @@ -1779,26 +1779,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1658.0, - "completions/mean_length": 541.5859375, - "completions/mean_terminated_length": 535.678466796875, - "completions/min_length": 141.0, - "completions/min_terminated_length": 141.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1904.0, + "completions/max_terminated_length": 1904.0, + "completions/mean_length": 603.353515625, + "completions/mean_terminated_length": 601.3953247070312, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, "epoch": 0.05730915603313186, - "frac_reward_zero_std": 0.84375, - "grad_norm": 0.1638638882495904, - "kl": 0.06365966796875, + "frac_reward_zero_std": 0.9375, + "grad_norm": 4.810189704077544, + "kl": 0.062744140625, "learning_rate": 4.892117803050578e-06, - "loss": 0.0252, - "num_tokens": 36471548.0, - "reward": 0.09882812947034836, - "reward_std": 0.004192390479147434, + "loss": 0.0096, + "num_tokens": 38096204.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.98828125, - "rewards/format_reward/std": 0.10772226005792618, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 64 }, { @@ -1807,26 +1807,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1190.0, - "completions/mean_length": 509.482421875, - "completions/mean_terminated_length": 505.13140869140625, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1312.0, + "completions/max_terminated_length": 1312.0, + "completions/mean_length": 587.44921875, + "completions/mean_terminated_length": 586.2974243164062, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.05820461159614954, - "frac_reward_zero_std": 0.90625, - "grad_norm": 0.12491007191362062, - "kl": 0.0806884765625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 6.992893766082191, + "kl": 0.5133056640625, "learning_rate": 4.887614073978761e-06, - "loss": 0.0189, - "num_tokens": 37031171.0, - "reward": 0.09921875596046448, - "reward_std": 0.0026298905722796917, + "loss": 0.0079, + "num_tokens": 38695746.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.9921875, - "rewards/format_reward/std": 0.08812850713729858, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 65 }, { @@ -1836,25 +1836,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1841.0, - "completions/max_terminated_length": 1841.0, - "completions/mean_length": 521.2890625, - "completions/mean_terminated_length": 521.2890625, - "completions/min_length": 98.0, - "completions/min_terminated_length": 98.0, + "completions/max_length": 1648.0, + "completions/max_terminated_length": 1648.0, + "completions/mean_length": 590.52734375, + "completions/mean_terminated_length": 590.52734375, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, "epoch": 0.05910006715916723, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.08381886765190119, - "kl": 0.06219482421875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006834195519283801, + "kl": 0.05694580078125, "learning_rate": 4.883020654901609e-06, - "loss": 0.0019, - "num_tokens": 37627703.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0006, + "num_tokens": 39327728.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 66 }, { @@ -1863,26 +1863,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1381.0, - "completions/max_terminated_length": 1381.0, - "completions/mean_length": 477.830078125, - "completions/mean_terminated_length": 477.830078125, - "completions/min_length": 157.0, - "completions/min_terminated_length": 157.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1189.0, + "completions/max_terminated_length": 1189.0, + "completions/mean_length": 539.552734375, + "completions/mean_terminated_length": 538.74169921875, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.05999552272218491, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.013399015519228851, - "kl": 0.0623779296875, + "frac_reward_zero_std": 0.96875, + "grad_norm": 1445536.0669629306, + "kl": 23296.041015625, "learning_rate": 4.878337738549785e-06, - "loss": 0.0006, - "num_tokens": 38178656.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 233.4053, + "num_tokens": 39910283.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 67 }, { @@ -1891,20 +1891,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 1351.0, - "completions/max_terminated_length": 1351.0, - "completions/mean_length": 559.1171875, - "completions/mean_terminated_length": 558.129150390625, - "completions/min_length": 203.0, - "completions/min_terminated_length": 203.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1645.0, + "completions/max_terminated_length": 1645.0, + "completions/mean_length": 603.494140625, + "completions/mean_terminated_length": 603.494140625, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, "epoch": 0.060890978285202596, "frac_reward_zero_std": 1.0, - "grad_norm": 0.02053915419301268, - "kl": 0.0797119140625, + "grad_norm": 0.007074653107084449, + "kl": 0.05511474609375, "learning_rate": 4.873565521409082e-06, - "loss": 0.0008, - "num_tokens": 38780044.0, + "loss": 0.0006, + "num_tokens": 40534392.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -1920,25 +1920,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1736.0, - "completions/max_terminated_length": 1736.0, - "completions/mean_length": 528.73046875, - "completions/mean_terminated_length": 528.73046875, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/max_length": 1478.0, + "completions/max_terminated_length": 1478.0, + "completions/mean_length": 589.14453125, + "completions/mean_terminated_length": 589.14453125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, "epoch": 0.06178643384822028, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.013023668854051146, - "kl": 0.06231689453125, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.10404347225648633, + "kl": 0.05804443359375, "learning_rate": 4.868704203712173e-06, - "loss": 0.0006, - "num_tokens": 39342786.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0054, + "num_tokens": 41128066.0, + "reward": 0.09941406548023224, + "reward_std": 0.0023437500931322575, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.994140625, + "rewards/format_reward/std": 0.07639661431312561, "step": 69 }, { @@ -1947,26 +1947,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1704.0, - "completions/mean_length": 539.8671875, - "completions/mean_terminated_length": 529.748046875, - "completions/min_length": 180.0, - "completions/min_terminated_length": 180.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1608.0, + "completions/max_terminated_length": 1608.0, + "completions/mean_length": 570.591796875, + "completions/mean_terminated_length": 570.591796875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, "epoch": 0.06268188941123796, - "frac_reward_zero_std": 0.875, - "grad_norm": 7.439993648948339, - "kl": 0.08404541015625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06680192103628023, + "kl": 0.05828857421875, "learning_rate": 4.86375398943021e-06, - "loss": 0.0415, - "num_tokens": 39922430.0, - "reward": 0.09921875596046448, - "reward_std": 0.0031250000465661287, + "loss": 0.002, + "num_tokens": 41723441.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.9921875, - "rewards/format_reward/std": 0.08812850713729858, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 70 }, { @@ -1975,26 +1975,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1456.0, - "completions/max_terminated_length": 1456.0, - "completions/mean_length": 529.509765625, - "completions/mean_terminated_length": 529.509765625, - "completions/min_length": 195.0, - "completions/min_terminated_length": 195.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1842.0, + "completions/max_terminated_length": 1842.0, + "completions/mean_length": 564.57421875, + "completions/mean_terminated_length": 562.3902587890625, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, "epoch": 0.06357734497425566, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.012082899535339748, - "kl": 0.06158447265625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 2.1606000379295507, + "kl": 0.06298828125, "learning_rate": 4.858715086264274e-06, - "loss": 0.0006, - "num_tokens": 40501891.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0042, + "num_tokens": 42320855.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 71 }, { @@ -2004,25 +2004,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1235.0, - "completions/max_terminated_length": 1235.0, - "completions/mean_length": 563.224609375, - "completions/mean_terminated_length": 563.224609375, - "completions/min_length": 210.0, - "completions/min_terminated_length": 210.0, + "completions/max_length": 1384.0, + "completions/max_terminated_length": 1384.0, + "completions/mean_length": 597.32421875, + "completions/mean_terminated_length": 597.32421875, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, "epoch": 0.06447280053727333, - "frac_reward_zero_std": 0.90625, - "grad_norm": 0.10284201346946772, - "kl": 0.05999755859375, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06791730290876942, + "kl": 0.05657958984375, "learning_rate": 4.853587705636646e-06, - "loss": 0.0043, - "num_tokens": 41123062.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": 0.0004, + "num_tokens": 42959485.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 72 }, { @@ -2031,26 +2031,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1747.0, - "completions/max_terminated_length": 1747.0, - "completions/mean_length": 533.79296875, - "completions/mean_terminated_length": 533.79296875, - "completions/min_length": 195.0, - "completions/min_terminated_length": 195.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1228.0, + "completions/max_terminated_length": 1228.0, + "completions/mean_length": 530.748046875, + "completions/mean_terminated_length": 529.383544921875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, "epoch": 0.06536825610029103, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.050213646166851394, - "kl": 0.0599365234375, + "frac_reward_zero_std": 1.0, + "grad_norm": 937.0925069160255, + "kl": 59.2928466796875, "learning_rate": 4.84837206268195e-06, - "loss": 0.0, - "num_tokens": 41681452.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.5913, + "num_tokens": 43516316.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 73 }, { @@ -2059,20 +2059,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1085.0, - "completions/max_terminated_length": 1085.0, - "completions/mean_length": 509.712890625, - "completions/mean_terminated_length": 509.712890625, - "completions/min_length": 160.0, - "completions/min_terminated_length": 160.0, - "epoch": 0.0662637116633087, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1156.0, + "completions/mean_length": 516.6015625, + "completions/mean_terminated_length": 513.6046752929688, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.0662637116633087, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06633850089342387, - "kl": 0.05963134765625, + "grad_norm": 0.10313094352636988, + "kl": 0.056884765625, "learning_rate": 4.8430683762381195e-06, - "loss": 0.0061, - "num_tokens": 42219641.0, + "loss": 0.0177, + "num_tokens": 44058032.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -2088,19 +2088,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1555.0, - "completions/max_terminated_length": 1555.0, - "completions/mean_length": 574.283203125, - "completions/mean_terminated_length": 574.283203125, - "completions/min_length": 200.0, - "completions/min_terminated_length": 200.0, + "completions/max_length": 1437.0, + "completions/max_terminated_length": 1437.0, + "completions/mean_length": 566.34765625, + "completions/mean_terminated_length": 566.34765625, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, "epoch": 0.0671591672263264, "frac_reward_zero_std": 1.0, - "grad_norm": 0.020666631584363913, - "kl": 0.060791015625, + "grad_norm": 0.00747549164256916, + "kl": 0.05645751953125, "learning_rate": 4.837676868837213e-06, "loss": 0.0006, - "num_tokens": 42824586.0, + "num_tokens": 44658914.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -2115,20 +2115,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1942.0, - "completions/max_terminated_length": 1942.0, - "completions/mean_length": 521.0546875, - "completions/mean_terminated_length": 521.0546875, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1354.0, + "completions/mean_length": 527.84375, + "completions/mean_terminated_length": 520.9234008789062, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, "epoch": 0.06805462278934409, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06331685772788986, - "kl": 0.05889892578125, + "grad_norm": 0.06719001887293087, + "kl": 0.06622314453125, "learning_rate": 4.832197766696085e-06, - "loss": -0.0, - "num_tokens": 43379718.0, + "loss": 0.0199, + "num_tokens": 45217522.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -2143,26 +2143,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1598.0, - "completions/max_terminated_length": 1598.0, - "completions/mean_length": 568.814453125, - "completions/mean_terminated_length": 568.814453125, - "completions/min_length": 215.0, - "completions/min_terminated_length": 215.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1606.0, + "completions/max_terminated_length": 1606.0, + "completions/mean_length": 570.078125, + "completions/mean_terminated_length": 568.4344482421875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, "epoch": 0.06895007835236176, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.07366287417422632, - "kl": 0.0562744140625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03159919437264003, + "kl": 0.060546875, "learning_rate": 4.826631299706887e-06, - "loss": -0.0016, - "num_tokens": 43978151.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0006, + "num_tokens": 45816602.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 77 }, { @@ -2172,25 +2172,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1806.0, - "completions/max_terminated_length": 1806.0, - "completions/mean_length": 569.267578125, - "completions/mean_terminated_length": 569.267578125, - "completions/min_length": 220.0, - "completions/min_terminated_length": 220.0, + "completions/max_length": 1261.0, + "completions/max_terminated_length": 1261.0, + "completions/mean_length": 537.560546875, + "completions/mean_terminated_length": 537.560546875, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, "epoch": 0.06984553391537945, - "frac_reward_zero_std": 0.90625, - "grad_norm": 0.11848160599061058, - "kl": 0.058349609375, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0670951539644897, + "kl": 0.05682373046875, "learning_rate": 4.820977701427424e-06, - "loss": 0.0152, - "num_tokens": 44552064.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": 0.0036, + "num_tokens": 46374281.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 78 }, { @@ -2200,19 +2200,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1567.0, - "completions/max_terminated_length": 1567.0, - "completions/mean_length": 557.267578125, - "completions/mean_terminated_length": 557.267578125, - "completions/min_length": 181.0, - "completions/min_terminated_length": 181.0, + "completions/max_length": 1410.0, + "completions/max_terminated_length": 1410.0, + "completions/mean_length": 548.822265625, + "completions/mean_terminated_length": 548.822265625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.07074098947839713, "frac_reward_zero_std": 1.0, - "grad_norm": 0.011448547949088674, - "kl": 0.06097412109375, + "grad_norm": 0.00830534868975707, + "kl": 0.05743408203125, "learning_rate": 4.81523720907136e-06, "loss": 0.0006, - "num_tokens": 45158809.0, + "num_tokens": 46976702.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -2228,25 +2228,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 2009.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 552.33984375, - "completions/mean_terminated_length": 552.33984375, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/max_length": 1503.0, + "completions/max_terminated_length": 1503.0, + "completions/mean_length": 526.49609375, + "completions/mean_terminated_length": 526.49609375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, "epoch": 0.07163644504141482, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.056084177201828476, - "kl": 0.0584716796875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01078274311527768, + "kl": 0.0574951171875, "learning_rate": 4.809410063498254e-06, - "loss": -0.0002, - "num_tokens": 45754263.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0006, + "num_tokens": 47558924.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 80 }, { @@ -2256,25 +2256,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1401.0, - "completions/max_terminated_length": 1401.0, - "completions/mean_length": 543.6796875, - "completions/mean_terminated_length": 543.6796875, - "completions/min_length": 188.0, - "completions/min_terminated_length": 188.0, + "completions/max_length": 1105.0, + "completions/max_terminated_length": 1105.0, + "completions/mean_length": 527.138671875, + "completions/mean_terminated_length": 527.138671875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.0725319006044325, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.10338543772762039, - "kl": 0.0606689453125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011012657853872506, + "kl": 0.0562744140625, "learning_rate": 4.8034965092034656e-06, - "loss": 0.0038, - "num_tokens": 46356163.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0006, + "num_tokens": 48152355.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 81 }, { @@ -2283,20 +2283,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 1176.0, - "completions/max_terminated_length": 1176.0, - "completions/mean_length": 584.32421875, - "completions/mean_terminated_length": 583.5518798828125, - "completions/min_length": 236.0, - "completions/min_terminated_length": 236.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1796.0, + "completions/max_terminated_length": 1796.0, + "completions/mean_length": 570.82421875, + "completions/mean_terminated_length": 570.82421875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, "epoch": 0.07342735616745019, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.0722417078888751, - "kl": 0.0614013671875, + "grad_norm": 0.05913765032733167, + "kl": 0.0567626953125, "learning_rate": 4.797496794307889e-06, - "loss": 0.0038, - "num_tokens": 46960281.0, + "loss": -0.0007, + "num_tokens": 48749561.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -2312,19 +2312,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1328.0, - "completions/mean_length": 599.443359375, - "completions/mean_terminated_length": 596.6085815429688, - "completions/min_length": 209.0, - "completions/min_terminated_length": 209.0, + "completions/max_length": 1441.0, + "completions/max_terminated_length": 1441.0, + "completions/mean_length": 566.939453125, + "completions/mean_terminated_length": 565.4774780273438, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.07432281173046787, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.04419533464180944, - "kl": 0.0595703125, + "grad_norm": 3.166488626408786, + "kl": 0.0667724609375, "learning_rate": 4.791411170547545e-06, - "loss": 0.0122, - "num_tokens": 47594764.0, + "loss": 0.0085, + "num_tokens": 49367402.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -2339,26 +2339,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1370.0, - "completions/max_terminated_length": 1370.0, - "completions/mean_length": 567.396484375, - "completions/mean_terminated_length": 567.396484375, - "completions/min_length": 202.0, - "completions/min_terminated_length": 202.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1595.0, + "completions/mean_length": 560.765625, + "completions/mean_terminated_length": 557.8551635742188, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, "epoch": 0.07521826729348556, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.008048576448064306, - "kl": 0.0582275390625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.08297988092224323, + "kl": 0.05584716796875, "learning_rate": 4.785239893263017e-06, - "loss": 0.0006, - "num_tokens": 48214007.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0134, + "num_tokens": 49983250.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 84 }, { @@ -2368,25 +2368,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1268.0, - "completions/max_terminated_length": 1268.0, - "completions/mean_length": 568.2421875, - "completions/mean_terminated_length": 568.2421875, - "completions/min_length": 188.0, - "completions/min_terminated_length": 188.0, + "completions/max_length": 1030.0, + "completions/max_terminated_length": 1030.0, + "completions/mean_length": 545.970703125, + "completions/mean_terminated_length": 545.970703125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.07611372285650325, - "frac_reward_zero_std": 0.90625, - "grad_norm": 0.13977703925477217, - "kl": 0.06085205078125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007356660460353614, + "kl": 0.05523681640625, "learning_rate": 4.778983221388742e-06, - "loss": 0.0049, - "num_tokens": 48800115.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": 0.0006, + "num_tokens": 50557955.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 85 }, { @@ -2395,26 +2395,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1454.0, - "completions/mean_length": 582.78515625, - "completions/mean_terminated_length": 579.9177856445312, - "completions/min_length": 172.0, - "completions/min_terminated_length": 172.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1424.0, + "completions/max_terminated_length": 1424.0, + "completions/mean_length": 578.482421875, + "completions/mean_terminated_length": 578.482421875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.07700917841952093, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.07934026261985073, - "kl": 0.0589599609375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007809110580486689, + "kl": 0.0545654296875, "learning_rate": 4.77264141744214e-06, - "loss": 0.0097, - "num_tokens": 49419445.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 51175082.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 86 }, { @@ -2424,19 +2424,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1647.0, - "completions/max_terminated_length": 1647.0, - "completions/mean_length": 590.314453125, - "completions/mean_terminated_length": 590.314453125, - "completions/min_length": 230.0, - "completions/min_terminated_length": 230.0, + "completions/max_length": 1490.0, + "completions/max_terminated_length": 1490.0, + "completions/mean_length": 591.57421875, + "completions/mean_terminated_length": 591.57421875, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, "epoch": 0.07790463398253862, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06933720029794796, - "kl": 0.0592041015625, + "grad_norm": 0.04977783029837793, + "kl": 0.05291748046875, "learning_rate": 4.766214747512603e-06, - "loss": 0.001, - "num_tokens": 50031558.0, + "loss": 0.0013, + "num_tokens": 51787840.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -2451,26 +2451,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 1328.0, - "completions/max_terminated_length": 1328.0, - "completions/mean_length": 559.21875, - "completions/mean_terminated_length": 558.4931640625, - "completions/min_length": 208.0, - "completions/min_terminated_length": 208.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1299.0, + "completions/max_terminated_length": 1299.0, + "completions/mean_length": 559.4375, + "completions/mean_terminated_length": 559.4375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, "epoch": 0.0788000895455563, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.15169183218007004, - "kl": 0.07476806640625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.05494814305555876, + "kl": 0.05511474609375, "learning_rate": 4.759703481250331e-06, - "loss": 0.0007, - "num_tokens": 50636678.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": -0.0037, + "num_tokens": 52393072.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 88 }, { @@ -2479,26 +2479,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1333.0, - "completions/max_terminated_length": 1333.0, - "completions/mean_length": 577.947265625, - "completions/mean_terminated_length": 577.947265625, - "completions/min_length": 210.0, - "completions/min_terminated_length": 210.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1503.0, + "completions/max_terminated_length": 1503.0, + "completions/mean_length": 579.380859375, + "completions/mean_terminated_length": 578.4148559570312, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, "epoch": 0.07969554510857399, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.007228029815201077, - "kl": 0.058837890625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 160.94730744375263, + "kl": 30.91864013671875, "learning_rate": 4.753107891855015e-06, - "loss": 0.0006, - "num_tokens": 51286507.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.3115, + "num_tokens": 53043635.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 89 }, { @@ -2508,19 +2508,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1744.0, - "completions/max_terminated_length": 1744.0, - "completions/mean_length": 552.662109375, - "completions/mean_terminated_length": 552.662109375, - "completions/min_length": 150.0, - "completions/min_terminated_length": 150.0, + "completions/max_length": 1789.0, + "completions/max_terminated_length": 1789.0, + "completions/mean_length": 577.939453125, + "completions/mean_terminated_length": 577.939453125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.08059100067159167, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05217256333372253, - "kl": 0.06201171875, + "grad_norm": 0.10489931391093958, + "kl": 0.0546875, "learning_rate": 4.746428256064375e-06, - "loss": -0.001, - "num_tokens": 51888542.0, + "loss": 0.0041, + "num_tokens": 53658612.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -2536,25 +2536,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1383.0, - "completions/max_terminated_length": 1383.0, - "completions/mean_length": 529.599609375, - "completions/mean_terminated_length": 529.599609375, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_length": 1258.0, + "completions/max_terminated_length": 1258.0, + "completions/mean_length": 549.6484375, + "completions/mean_terminated_length": 549.6484375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.08148645623460936, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.07897485826909563, - "kl": 0.0584716796875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008182419427830684, + "kl": 0.05682373046875, "learning_rate": 4.7396648541425534e-06, - "loss": 0.0055, - "num_tokens": 52460673.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0006, + "num_tokens": 54241008.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 91 }, { @@ -2563,26 +2563,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1405.0, - "completions/max_terminated_length": 1405.0, - "completions/mean_length": 534.078125, - "completions/mean_terminated_length": 534.078125, - "completions/min_length": 159.0, - "completions/min_terminated_length": 159.0, - "epoch": 0.08238191179762704, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.009069004256641576, - "kl": 0.058837890625, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1601.0, + "completions/max_terminated_length": 1601.0, + "completions/mean_length": 556.23828125, + "completions/mean_terminated_length": 555.0195922851562, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.08238191179762704, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.05844731713658409, + "kl": 0.05755615234375, "learning_rate": 4.732817969868348e-06, "loss": 0.0006, - "num_tokens": 53053129.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "num_tokens": 54844810.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 92 }, { @@ -2592,19 +2592,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1509.0, - "completions/max_terminated_length": 1509.0, - "completions/mean_length": 581.345703125, - "completions/mean_terminated_length": 581.345703125, - "completions/min_length": 175.0, - "completions/min_terminated_length": 175.0, + "completions/max_length": 1856.0, + "completions/max_terminated_length": 1856.0, + "completions/mean_length": 608.26171875, + "completions/mean_terminated_length": 608.26171875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.08327736736064473, "frac_reward_zero_std": 1.0, - "grad_norm": 0.008041339436883252, - "kl": 0.05926513671875, + "grad_norm": 0.005945148181573863, + "kl": 0.05767822265625, "learning_rate": 4.7258878905233095e-06, "loss": 0.0006, - "num_tokens": 53676490.0, + "num_tokens": 55481952.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -2619,20 +2619,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1690.0, - "completions/max_terminated_length": 1690.0, - "completions/mean_length": 570.1796875, - "completions/mean_terminated_length": 570.1796875, - "completions/min_length": 181.0, - "completions/min_terminated_length": 181.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2026.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 610.75390625, + "completions/mean_terminated_length": 609.74560546875, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, "epoch": 0.08417282292366242, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.08004068477244176, - "kl": 0.06011962890625, + "grad_norm": 0.052266875089248634, + "kl": 0.0565185546875, "learning_rate": 4.718874906879688e-06, - "loss": 0.0048, - "num_tokens": 54286198.0, + "loss": -0.0009, + "num_tokens": 56112434.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -2648,19 +2648,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1286.0, - "completions/max_terminated_length": 1286.0, - "completions/mean_length": 503.283203125, - "completions/mean_terminated_length": 503.283203125, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/max_length": 1580.0, + "completions/max_terminated_length": 1580.0, + "completions/mean_length": 546.06640625, + "completions/mean_terminated_length": 546.06640625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.0850682784866801, "frac_reward_zero_std": 1.0, - "grad_norm": 0.007110972110569579, - "kl": 0.05828857421875, + "grad_norm": 0.006438116887744368, + "kl": 0.0577392578125, "learning_rate": 4.711779313188231e-06, "loss": 0.0006, - "num_tokens": 54808951.0, + "num_tokens": 56657092.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -2675,20 +2675,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1328.0, - "completions/max_terminated_length": 1328.0, - "completions/mean_length": 575.7421875, - "completions/mean_terminated_length": 575.7421875, - "completions/min_length": 194.0, - "completions/min_terminated_length": 194.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1746.0, + "completions/max_terminated_length": 1746.0, + "completions/mean_length": 620.203125, + "completions/mean_terminated_length": 619.3052978515625, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, "epoch": 0.08596373404969779, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06146080952581339, - "kl": 0.058349609375, + "grad_norm": 0.05996351922986414, + "kl": 0.05584716796875, "learning_rate": 4.70460140716584e-06, - "loss": 0.0038, - "num_tokens": 55425011.0, + "loss": -0.0007, + "num_tokens": 57295916.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -2704,19 +2704,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1760.0, - "completions/max_terminated_length": 1760.0, - "completions/mean_length": 510.607421875, - "completions/mean_terminated_length": 510.607421875, - "completions/min_length": 141.0, - "completions/min_terminated_length": 141.0, + "completions/max_length": 1329.0, + "completions/max_terminated_length": 1329.0, + "completions/mean_length": 535.82421875, + "completions/mean_terminated_length": 535.82421875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.08685918961271547, "frac_reward_zero_std": 1.0, - "grad_norm": 0.013320757255101928, - "kl": 0.05645751953125, + "grad_norm": 0.0065402778185692, + "kl": 0.05560302734375, "learning_rate": 4.697341489983076e-06, "loss": 0.0006, - "num_tokens": 55965402.0, + "num_tokens": 57849218.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -2731,26 +2731,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1890.0, - "completions/max_terminated_length": 1890.0, - "completions/mean_length": 565.958984375, - "completions/mean_terminated_length": 565.958984375, - "completions/min_length": 188.0, - "completions/min_terminated_length": 188.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1644.0, + "completions/mean_length": 605.234375, + "completions/mean_terminated_length": 602.4109497070312, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, "epoch": 0.08775464517573316, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.007573822836148033, - "kl": 0.05853271484375, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.055220508680084254, + "kl": 0.05645751953125, "learning_rate": 4.6899998662515215e-06, - "loss": 0.0006, - "num_tokens": 56571765.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0009, + "num_tokens": 58475690.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 98 }, { @@ -2759,20 +2759,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1135.0, - "completions/max_terminated_length": 1135.0, - "completions/mean_length": 503.03125, - "completions/mean_terminated_length": 503.03125, - "completions/min_length": 175.0, - "completions/min_terminated_length": 175.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1649.0, + "completions/max_terminated_length": 1649.0, + "completions/mean_length": 539.849609375, + "completions/mean_terminated_length": 538.8356323242188, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, "epoch": 0.08865010073875083, "frac_reward_zero_std": 1.0, - "grad_norm": 0.007334410202363631, - "kl": 0.05633544921875, + "grad_norm": 2.722021588876746, + "kl": 0.2435302734375, "learning_rate": 4.682576844011007e-06, - "loss": 0.0006, - "num_tokens": 57124213.0, + "loss": 0.0024, + "num_tokens": 59046989.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -2788,19 +2788,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1658.0, - "completions/max_terminated_length": 1658.0, - "completions/mean_length": 536.2421875, - "completions/mean_terminated_length": 536.2421875, - "completions/min_length": 134.0, - "completions/min_terminated_length": 134.0, + "completions/max_length": 1446.0, + "completions/max_terminated_length": 1446.0, + "completions/mean_length": 559.466796875, + "completions/mean_terminated_length": 559.466796875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, "epoch": 0.08954555630176853, "frac_reward_zero_std": 1.0, - "grad_norm": 0.007211066238880028, - "kl": 0.05853271484375, + "grad_norm": 0.006225919234576967, + "kl": 0.0565185546875, "learning_rate": 4.675072734716678e-06, "loss": 0.0006, - "num_tokens": 57682705.0, + "num_tokens": 59617372.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -2816,25 +2816,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1632.0, - "completions/max_terminated_length": 1632.0, - "completions/mean_length": 554.22265625, - "completions/mean_terminated_length": 554.22265625, - "completions/min_length": 199.0, - "completions/min_terminated_length": 199.0, + "completions/max_length": 1832.0, + "completions/max_terminated_length": 1832.0, + "completions/mean_length": 623.28515625, + "completions/mean_terminated_length": 623.28515625, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.09044101186478622, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.006708160382694399, - "kl": 0.05792236328125, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07716472081979199, + "kl": 0.0572509765625, "learning_rate": 4.667487853225931e-06, - "loss": 0.0006, - "num_tokens": 58310755.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0003, + "num_tokens": 60280782.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 101 }, { @@ -2845,18 +2845,18 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1931.0, - "completions/mean_length": 566.173828125, - "completions/mean_terminated_length": 563.2739868164062, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/max_terminated_length": 1512.0, + "completions/mean_length": 593.6640625, + "completions/mean_terminated_length": 590.8179931640625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, "epoch": 0.0913364674278039, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.07622570043853691, - "kl": 0.05584716796875, + "grad_norm": 0.054090751280617605, + "kl": 0.05364990234375, "learning_rate": 4.659822517785203e-06, - "loss": 0.0137, - "num_tokens": 58888860.0, + "loss": 0.0106, + "num_tokens": 60872962.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -2872,25 +2872,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1372.0, - "completions/max_terminated_length": 1372.0, - "completions/mean_length": 553.12109375, - "completions/mean_terminated_length": 553.12109375, - "completions/min_length": 212.0, - "completions/min_terminated_length": 212.0, + "completions/max_length": 1912.0, + "completions/max_terminated_length": 1912.0, + "completions/mean_length": 577.80859375, + "completions/mean_terminated_length": 577.80859375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.09223192299082159, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.049130696414956106, - "kl": 0.05828857421875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005765836397552098, + "kl": 0.055419921875, "learning_rate": 4.6520770500166165e-06, - "loss": -0.0001, - "num_tokens": 59485690.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0006, + "num_tokens": 61482432.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 103 }, { @@ -2899,26 +2899,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 1689.0, - "completions/max_terminated_length": 1689.0, - "completions/mean_length": 592.68359375, - "completions/mean_terminated_length": 591.5635986328125, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1909.0, + "completions/max_terminated_length": 1909.0, + "completions/mean_length": 625.951171875, + "completions/mean_terminated_length": 625.951171875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, "epoch": 0.09312737855383926, - "frac_reward_zero_std": 0.9375, - "grad_norm": 4.817209315659024, - "kl": 0.07928466796875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006272367997240394, + "kl": 0.0543212890625, "learning_rate": 4.644251774904487e-06, - "loss": 0.0076, - "num_tokens": 60135864.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0005, + "num_tokens": 62149639.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 104 }, { @@ -2927,20 +2927,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1234.0, - "completions/max_terminated_length": 1234.0, - "completions/mean_length": 563.57421875, - "completions/mean_terminated_length": 563.57421875, - "completions/min_length": 168.0, - "completions/min_terminated_length": 168.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1764.0, + "completions/max_terminated_length": 1764.0, + "completions/mean_length": 594.787109375, + "completions/mean_terminated_length": 594.0704345703125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, "epoch": 0.09402283411685695, "frac_reward_zero_std": 1.0, - "grad_norm": 0.009712812135983481, - "kl": 0.05499267578125, + "grad_norm": 0.007591245217264678, + "kl": 0.0574951171875, "learning_rate": 4.636347020781684e-06, "loss": 0.0006, - "num_tokens": 60712014.0, + "num_tokens": 62741770.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -2955,26 +2955,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1801.0, - "completions/max_terminated_length": 1801.0, - "completions/mean_length": 551.984375, - "completions/mean_terminated_length": 551.984375, - "completions/min_length": 161.0, - "completions/min_terminated_length": 161.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1339.0, + "completions/mean_length": 554.9921875, + "completions/mean_terminated_length": 552.0704345703125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, "epoch": 0.09491828967987463, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06067404264171204, - "kl": 0.05560302734375, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.09890693315638917, + "kl": 0.0543212890625, "learning_rate": 4.6283631193158605e-06, - "loss": 0.0012, - "num_tokens": 61273174.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0161, + "num_tokens": 63304470.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 106 }, { @@ -2984,25 +2984,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, - "completions/max_length": 1733.0, - "completions/max_terminated_length": 1733.0, - "completions/mean_length": 565.373046875, - "completions/mean_terminated_length": 564.3267822265625, - "completions/min_length": 179.0, - "completions/min_terminated_length": 179.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1571.0, + "completions/mean_length": 574.91015625, + "completions/mean_terminated_length": 572.0274047851562, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, "epoch": 0.09581374524289232, - "frac_reward_zero_std": 1.0, - "grad_norm": 19.400452290618507, - "kl": 3.197265625, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.09008482271620612, + "kl": 0.05413818359375, "learning_rate": 4.620300405495532e-06, - "loss": 0.032, - "num_tokens": 61852677.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0096, + "num_tokens": 63888856.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 107 }, { @@ -3011,26 +3011,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1473.0, - "completions/mean_length": 554.125, - "completions/mean_terminated_length": 551.2015380859375, - "completions/min_length": 159.0, - "completions/min_terminated_length": 159.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1465.0, + "completions/max_terminated_length": 1465.0, + "completions/mean_length": 572.958984375, + "completions/mean_terminated_length": 572.958984375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, "epoch": 0.09670920080591, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.0524692551884981, - "kl": 0.05401611328125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009429951785588081, + "kl": 0.0540771484375, "learning_rate": 4.612159217616022e-06, - "loss": 0.0146, - "num_tokens": 62462309.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 64508131.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 108 }, { @@ -3039,26 +3039,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1408.0, - "completions/max_terminated_length": 1408.0, - "completions/mean_length": 546.9453125, - "completions/mean_terminated_length": 546.9453125, - "completions/min_length": 240.0, - "completions/min_terminated_length": 240.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1404.0, + "completions/max_terminated_length": 1404.0, + "completions/mean_length": 548.35546875, + "completions/mean_terminated_length": 547.6868896484375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.09760465636892769, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.006616016440406459, - "kl": 0.05621337890625, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.09621891008446948, + "kl": 0.057861328125, "learning_rate": 4.603939897265268e-06, - "loss": 0.0006, - "num_tokens": 63050249.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0, + "num_tokens": 65096793.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 109 }, { @@ -3067,20 +3067,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1746.0, - "completions/max_terminated_length": 1746.0, - "completions/mean_length": 540.046875, - "completions/mean_terminated_length": 540.046875, - "completions/min_length": 175.0, - "completions/min_terminated_length": 175.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1719.0, + "completions/max_terminated_length": 1719.0, + "completions/mean_length": 533.857421875, + "completions/mean_terminated_length": 532.8375854492188, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, "epoch": 0.09850011193194538, "frac_reward_zero_std": 1.0, - "grad_norm": 0.006058315501498899, - "kl": 0.0535888671875, + "grad_norm": 0.010383900504147035, + "kl": 0.065673828125, "learning_rate": 4.595642789309492e-06, - "loss": 0.0005, - "num_tokens": 63619009.0, + "loss": 0.0007, + "num_tokens": 65662384.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -3095,20 +3095,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1615.0, - "completions/max_terminated_length": 1615.0, - "completions/mean_length": 546.98046875, - "completions/mean_terminated_length": 546.98046875, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1655.0, + "completions/mean_length": 553.45703125, + "completions/mean_terminated_length": 550.5322875976562, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, "epoch": 0.09939556749496306, "frac_reward_zero_std": 1.0, - "grad_norm": 0.006228639344980519, - "kl": 0.05523681640625, + "grad_norm": 0.006695037361617289, + "kl": 0.0556640625, "learning_rate": 4.587268241878724e-06, "loss": 0.0006, - "num_tokens": 64165991.0, + "num_tokens": 66212682.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -3123,26 +3123,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 1374.0, - "completions/max_terminated_length": 1374.0, - "completions/mean_length": 555.0703125, - "completions/mean_terminated_length": 554.5205688476562, - "completions/min_length": 236.0, - "completions/min_terminated_length": 236.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1947.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 563.484375, + "completions/mean_terminated_length": 563.484375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, "epoch": 0.10029102305798075, - "frac_reward_zero_std": 0.96875, - "grad_norm": 1.2290379791712172, - "kl": 0.05560302734375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006346704211210989, + "kl": 0.05413818359375, "learning_rate": 4.578816606352205e-06, - "loss": 0.0045, - "num_tokens": 64768443.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 66819442.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 112 }, { @@ -3151,20 +3151,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 1258.0, - "completions/max_terminated_length": 1258.0, - "completions/mean_length": 563.681640625, - "completions/mean_terminated_length": 562.8062744140625, - "completions/min_length": 189.0, - "completions/min_terminated_length": 189.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1462.0, + "completions/max_terminated_length": 1462.0, + "completions/mean_length": 587.166015625, + "completions/mean_terminated_length": 587.166015625, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, "epoch": 0.10118647862099843, "frac_reward_zero_std": 0.96875, - "grad_norm": 16.255998645519178, - "kl": 0.74365234375, + "grad_norm": 0.043742302560142485, + "kl": 0.05584716796875, "learning_rate": 4.570288237343632e-06, - "loss": 0.0048, - "num_tokens": 65357352.0, + "loss": -0.0033, + "num_tokens": 67420375.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -3180,25 +3180,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1490.0, - "completions/max_terminated_length": 1490.0, - "completions/mean_length": 586.935546875, - "completions/mean_terminated_length": 586.935546875, - "completions/min_length": 191.0, - "completions/min_terminated_length": 191.0, + "completions/max_length": 1322.0, + "completions/max_terminated_length": 1322.0, + "completions/mean_length": 621.79296875, + "completions/mean_terminated_length": 621.79296875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, "epoch": 0.10208193418401612, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.0774700444571494, - "kl": 0.0572509765625, + "grad_norm": 0.0653506341979416, + "kl": 0.05206298828125, "learning_rate": 4.561683492686289e-06, "loss": 0.0038, - "num_tokens": 65968535.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "num_tokens": 68049405.0, + "reward": 0.09941406548023224, + "reward_std": 0.0012597277527675033, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 0.994140625, + "rewards/format_reward/std": 0.07639661431312561, "step": 114 }, { @@ -3208,25 +3208,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1247.0, - "completions/max_terminated_length": 1247.0, - "completions/mean_length": 541.302734375, - "completions/mean_terminated_length": 541.302734375, - "completions/min_length": 184.0, - "completions/min_terminated_length": 184.0, + "completions/max_length": 1365.0, + "completions/max_terminated_length": 1365.0, + "completions/mean_length": 558.744140625, + "completions/mean_terminated_length": 558.744140625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, "epoch": 0.1029773897470338, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.00852963260021217, - "kl": 0.0596923828125, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.08608131464673477, + "kl": 0.05255126953125, "learning_rate": 4.5530027334180285e-06, - "loss": 0.0006, - "num_tokens": 66535154.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": -0.0009, + "num_tokens": 68624954.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 115 }, { @@ -3235,26 +3235,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1153.0, - "completions/max_terminated_length": 1153.0, - "completions/mean_length": 504.041015625, - "completions/mean_terminated_length": 504.041015625, - "completions/min_length": 142.0, - "completions/min_terminated_length": 142.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1205.0, + "completions/max_terminated_length": 1205.0, + "completions/mean_length": 557.375, + "completions/mean_terminated_length": 556.373779296875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.10387284531005149, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.00898169383540532, - "kl": 0.060791015625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.051558838637773656, + "kl": 0.0572509765625, "learning_rate": 4.544246323766122e-06, - "loss": 0.0006, - "num_tokens": 67104631.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": -0.0009, + "num_tokens": 69221738.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 116 }, { @@ -3263,20 +3263,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1247.0, - "completions/max_terminated_length": 1247.0, - "completions/mean_length": 537.59375, - "completions/mean_terminated_length": 537.59375, - "completions/min_length": 176.0, - "completions/min_terminated_length": 176.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1382.0, + "completions/mean_length": 599.0, + "completions/mean_terminated_length": 593.3176879882812, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, "epoch": 0.10476830087306917, "frac_reward_zero_std": 1.0, - "grad_norm": 0.009282105832995153, - "kl": 0.06085205078125, + "grad_norm": 0.011260686367210828, + "kl": 0.06072998046875, "learning_rate": 4.535414631131983e-06, "loss": 0.0006, - "num_tokens": 67668119.0, + "num_tokens": 69816666.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -3292,25 +3292,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1405.0, - "completions/max_terminated_length": 1405.0, - "completions/mean_length": 516.166015625, - "completions/mean_terminated_length": 516.166015625, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_length": 1427.0, + "completions/max_terminated_length": 1427.0, + "completions/mean_length": 557.9140625, + "completions/mean_terminated_length": 557.9140625, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, "epoch": 0.10566375643608686, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.07276575023389194, - "kl": 0.06048583984375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007377506198813074, + "kl": 0.05279541015625, "learning_rate": 4.526508026075746e-06, - "loss": -0.0023, - "num_tokens": 68238044.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0005, + "num_tokens": 70407966.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 118 }, { @@ -3319,26 +3319,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1659.0, - "completions/max_terminated_length": 1659.0, - "completions/mean_length": 546.119140625, - "completions/mean_terminated_length": 546.119140625, - "completions/min_length": 126.0, - "completions/min_terminated_length": 126.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1868.0, + "completions/mean_length": 604.025390625, + "completions/mean_terminated_length": 601.1995849609375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, "epoch": 0.10655921199910455, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.007709260903345854, - "kl": 0.056884765625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.05981685989580845, + "kl": 0.04998779296875, "learning_rate": 4.517526882300721e-06, - "loss": 0.0006, - "num_tokens": 68832137.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0092, + "num_tokens": 71031707.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 119 }, { @@ -3347,22 +3347,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1518.0, - "completions/max_terminated_length": 1518.0, - "completions/mean_length": 510.16796875, - "completions/mean_terminated_length": 510.16796875, - "completions/min_length": 142.0, - "completions/min_terminated_length": 142.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1818.0, + "completions/mean_length": 584.580078125, + "completions/mean_terminated_length": 578.8411865234375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.10745466756212223, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.12374567038474613, - "kl": 0.05853271484375, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.08493217637882901, + "kl": 0.0518798828125, "learning_rate": 4.508471576637713e-06, - "loss": 0.0052, - "num_tokens": 69416831.0, + "loss": 0.0158, + "num_tokens": 71654500.0, "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "reward_std": 0.0010673906654119492, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.99609375, @@ -3376,19 +3376,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1343.0, - "completions/max_terminated_length": 1343.0, - "completions/mean_length": 517.318359375, - "completions/mean_terminated_length": 517.318359375, - "completions/min_length": 192.0, - "completions/min_terminated_length": 192.0, + "completions/max_length": 1486.0, + "completions/max_terminated_length": 1486.0, + "completions/mean_length": 583.146484375, + "completions/mean_terminated_length": 583.146484375, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, "epoch": 0.10835012312513992, "frac_reward_zero_std": 1.0, - "grad_norm": 0.007982824080905008, - "kl": 0.0570068359375, + "grad_norm": 0.009711429854862424, + "kl": 0.05181884765625, "learning_rate": 4.499342489029211e-06, - "loss": 0.0006, - "num_tokens": 69983394.0, + "loss": 0.0005, + "num_tokens": 72254767.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -3403,26 +3403,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1176.0, - "completions/max_terminated_length": 1176.0, - "completions/mean_length": 503.2109375, - "completions/mean_terminated_length": 503.2109375, - "completions/min_length": 103.0, - "completions/min_terminated_length": 103.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1591.0, + "completions/max_terminated_length": 1591.0, + "completions/mean_length": 591.98046875, + "completions/mean_terminated_length": 590.6927490234375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.1092455786881576, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05137313735665502, - "kl": 0.0572509765625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006688758768353923, + "kl": 0.05718994140625, "learning_rate": 4.490140002513449e-06, - "loss": -0.0022, - "num_tokens": 70554302.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0006, + "num_tokens": 72871125.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 122 }, { @@ -3432,19 +3432,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, - "completions/max_length": 1103.0, - "completions/max_terminated_length": 1103.0, - "completions/mean_length": 492.255859375, - "completions/mean_terminated_length": 491.2739562988281, - "completions/min_length": 90.0, - "completions/min_terminated_length": 90.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1325.0, + "completions/mean_length": 569.203125, + "completions/mean_terminated_length": 566.3092041015625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, "epoch": 0.11014103425117529, "frac_reward_zero_std": 0.96875, - "grad_norm": 6.6575044592016654, - "kl": 0.06549072265625, + "grad_norm": 0.05402023980897781, + "kl": 0.05047607421875, "learning_rate": 4.48086450320833e-06, - "loss": 0.0053, - "num_tokens": 71102657.0, + "loss": 0.0152, + "num_tokens": 73458877.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -3459,26 +3459,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1139.0, - "completions/max_terminated_length": 1139.0, - "completions/mean_length": 501.88671875, - "completions/mean_terminated_length": 501.88671875, - "completions/min_length": 196.0, - "completions/min_terminated_length": 196.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1306.0, + "completions/max_terminated_length": 1306.0, + "completions/mean_length": 571.666015625, + "completions/mean_terminated_length": 569.3275146484375, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, "epoch": 0.11103648981419297, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.13448036932392413, - "kl": 0.05462646484375, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.055716219457509576, + "kl": 0.06829833984375, "learning_rate": 4.4715163802952266e-06, - "loss": 0.0022, - "num_tokens": 71662455.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": -0.001, + "num_tokens": 74054402.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 124 }, { @@ -3488,25 +3488,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1883.0, - "completions/max_terminated_length": 1883.0, - "completions/mean_length": 558.08203125, - "completions/mean_terminated_length": 558.08203125, - "completions/min_length": 210.0, - "completions/min_terminated_length": 210.0, + "completions/max_length": 1784.0, + "completions/max_terminated_length": 1784.0, + "completions/mean_length": 621.005859375, + "completions/mean_terminated_length": 621.005859375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.11193194537721066, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.008863906624678173, - "kl": 0.05322265625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.059224756742812495, + "kl": 0.05059814453125, "learning_rate": 4.462096026002655e-06, - "loss": 0.0005, - "num_tokens": 72236449.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": -0.0017, + "num_tokens": 74660613.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 125 }, { @@ -3516,19 +3516,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1646.0, - "completions/max_terminated_length": 1646.0, - "completions/mean_length": 523.720703125, - "completions/mean_terminated_length": 523.720703125, - "completions/min_length": 183.0, - "completions/min_terminated_length": 183.0, + "completions/max_length": 1711.0, + "completions/max_terminated_length": 1711.0, + "completions/mean_length": 581.55078125, + "completions/mean_terminated_length": 581.55078125, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, "epoch": 0.11282740094022835, "frac_reward_zero_std": 1.0, - "grad_norm": 0.009601045986280915, - "kl": 0.0523681640625, + "grad_norm": 0.006152298759927084, + "kl": 0.0509033203125, "learning_rate": 4.4526038355898144e-06, "loss": 0.0005, - "num_tokens": 72817346.0, + "num_tokens": 75271119.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -3544,25 +3544,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1414.0, - "completions/max_terminated_length": 1414.0, - "completions/mean_length": 511.84375, - "completions/mean_terminated_length": 511.84375, - "completions/min_length": 188.0, - "completions/min_terminated_length": 188.0, + "completions/max_length": 1253.0, + "completions/max_terminated_length": 1253.0, + "completions/mean_length": 575.66015625, + "completions/mean_terminated_length": 575.66015625, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, "epoch": 0.11372285650324603, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.01150702874361092, - "kl": 0.05126953125, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.054134619906622765, + "kl": 0.0501708984375, "learning_rate": 4.4430402073300035e-06, - "loss": 0.0005, - "num_tokens": 73390642.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0006, + "num_tokens": 75877089.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 127 }, { @@ -3572,19 +3572,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1298.0, - "completions/max_terminated_length": 1298.0, - "completions/mean_length": 535.732421875, - "completions/mean_terminated_length": 535.732421875, - "completions/min_length": 228.0, - "completions/min_terminated_length": 228.0, + "completions/max_length": 1672.0, + "completions/max_terminated_length": 1672.0, + "completions/mean_length": 579.533203125, + "completions/mean_terminated_length": 579.533203125, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, "epoch": 0.11461831206626372, "frac_reward_zero_std": 1.0, - "grad_norm": 0.9117448141973743, - "kl": 0.12908935546875, + "grad_norm": 0.005668747851169466, + "kl": 0.0504150390625, "learning_rate": 4.433405542493909e-06, - "loss": 0.0013, - "num_tokens": 73959529.0, + "loss": 0.0005, + "num_tokens": 76468402.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -3599,26 +3599,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1485.0, - "completions/mean_length": 606.228515625, - "completions/mean_terminated_length": 594.8759765625, - "completions/min_length": 275.0, - "completions/min_terminated_length": 275.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1584.0, + "completions/max_terminated_length": 1584.0, + "completions/mean_length": 604.94921875, + "completions/mean_terminated_length": 603.1017456054688, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, "epoch": 0.1155137676292814, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.014323382678470751, - "kl": 0.0703125, + "frac_reward_zero_std": 0.9375, + "grad_norm": 3.323619983813528, + "kl": 0.05792236328125, "learning_rate": 4.4237002453327734e-06, - "loss": 0.0007, - "num_tokens": 74576526.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward/mean": 0.0, + "loss": 0.0121, + "num_tokens": 77084744.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, + "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 129 }, { @@ -3627,26 +3627,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.828125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 693.314453125, - "completions/mean_terminated_length": 663.5708618164062, - "completions/min_length": 261.0, - "completions/min_terminated_length": 261.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1385.0, + "completions/max_terminated_length": 1385.0, + "completions/mean_length": 554.462890625, + "completions/mean_terminated_length": 554.462890625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.11640922319229909, - "frac_reward_zero_std": 0.8125, - "grad_norm": 0.18814500549038302, - "kl": 0.106689453125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005401927989012176, + "kl": 0.0499267578125, "learning_rate": 4.4139247230614245e-06, - "loss": 0.0036, - "num_tokens": 75240063.0, - "reward": 0.09882812201976776, - "reward_std": 0.004687500186264515, + "loss": 0.0005, + "num_tokens": 77677189.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.98828125, - "rewards/format_reward/std": 0.10772226005792618, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 130 }, { @@ -3655,20 +3655,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.53125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2036.0, - "completions/mean_length": 822.12109375, - "completions/mean_terminated_length": 745.8215942382812, - "completions/min_length": 282.0, - "completions/min_terminated_length": 282.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1505.0, + "completions/max_terminated_length": 1505.0, + "completions/mean_length": 559.4375, + "completions/mean_terminated_length": 559.4375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, "epoch": 0.11730467875531676, "frac_reward_zero_std": 1.0, - "grad_norm": 0.02343943908233584, - "kl": 0.1158447265625, + "grad_norm": 0.0061677770684820275, + "kl": 0.05316162109375, "learning_rate": 4.404079385841201e-06, - "loss": 0.0012, - "num_tokens": 75948637.0, + "loss": 0.0005, + "num_tokens": 78251269.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -3683,26 +3683,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.015625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 994.90625, - "completions/mean_terminated_length": 847.144775390625, - "completions/min_length": 234.0, - "completions/min_terminated_length": 234.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1731.0, + "completions/max_terminated_length": 1731.0, + "completions/mean_length": 591.04296875, + "completions/mean_terminated_length": 591.04296875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.11820013431833445, - "frac_reward_zero_std": 0.90625, - "grad_norm": 0.07306244420161156, - "kl": 0.11376953125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005599681736981137, + "kl": 0.052978515625, "learning_rate": 4.394164646762734e-06, - "loss": 0.0054, - "num_tokens": 76751421.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": 0.0005, + "num_tokens": 78847275.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 132 }, { @@ -3711,26 +3711,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -5.84375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 1017.630859375, - "completions/mean_terminated_length": 843.5501708984375, - "completions/min_length": 270.0, - "completions/min_terminated_length": 270.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1358.0, + "completions/max_terminated_length": 1358.0, + "completions/mean_length": 576.123046875, + "completions/mean_terminated_length": 576.123046875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.11909558988135213, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.04774455505526359, - "kl": 0.11669921875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005987513884752594, + "kl": 0.0504150390625, "learning_rate": 4.384180921828618e-06, - "loss": -0.0014, - "num_tokens": 77631184.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 79500986.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 133 }, { @@ -3739,26 +3739,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -5.1875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2044.0, - "completions/mean_length": 1152.490234375, - "completions/mean_terminated_length": 890.169189453125, - "completions/min_length": 266.0, - "completions/min_terminated_length": 266.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1332.0, + "completions/max_terminated_length": 1332.0, + "completions/mean_length": 580.025390625, + "completions/mean_terminated_length": 578.5538330078125, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, "epoch": 0.11999104544436982, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.07160446510060933, - "kl": 0.120361328125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0068151723856892735, + "kl": 0.05657958984375, "learning_rate": 4.374128629935955e-06, "loss": 0.0006, - "num_tokens": 78525467.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "num_tokens": 80102167.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 134 }, { @@ -3767,26 +3767,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -4.59375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1966.0, - "completions/mean_length": 1246.322265625, - "completions/mean_terminated_length": 901.4664306640625, - "completions/min_length": 203.0, - "completions/min_terminated_length": 203.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1683.0, + "completions/max_terminated_length": 1683.0, + "completions/mean_length": 578.212890625, + "completions/mean_terminated_length": 576.4461669921875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.12088650100738751, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.05630071606380346, - "kl": 0.11083984375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03848281665656771, + "kl": 0.0733642578125, "learning_rate": 4.364008192858781e-06, - "loss": -0.003, - "num_tokens": 79475744.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0007, + "num_tokens": 80710372.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 135 }, { @@ -3795,26 +3795,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -4.5625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 1288.12890625, - "completions/mean_terminated_length": 955.1516723632812, - "completions/min_length": 243.0, - "completions/min_terminated_length": 243.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1568.0, + "completions/max_terminated_length": 1568.0, + "completions/mean_length": 587.712890625, + "completions/mean_terminated_length": 585.7944946289062, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, "epoch": 0.12178195657040519, - "frac_reward_zero_std": 0.875, - "grad_norm": 795332.7231700029, - "kl": 14656.082153320312, + "frac_reward_zero_std": 0.90625, + "grad_norm": 1.1813951961066245, + "kl": 0.0977783203125, "learning_rate": 4.353820035230366e-06, - "loss": 145.7659, - "num_tokens": 80433074.0, - "reward": 0.09921875596046448, - "reward_std": 0.0031250000465661287, + "loss": 0.0095, + "num_tokens": 81309089.0, + "reward": 0.09941406548023224, + "reward_std": 0.0023437500931322575, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.9921875, - "rewards/format_reward/std": 0.08812850713729858, + "rewards/format_reward/mean": 0.994140625, + "rewards/format_reward/std": 0.07639661431312561, "step": 136 }, { @@ -3823,26 +3823,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -4.265625, + "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2030.0, - "completions/mean_length": 1294.751953125, - "completions/mean_terminated_length": 903.5994262695312, - "completions/min_length": 226.0, - "completions/min_terminated_length": 226.0, + "completions/max_terminated_length": 1525.0, + "completions/mean_length": 529.837890625, + "completions/mean_terminated_length": 526.866943359375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, "epoch": 0.12267741213342288, - "frac_reward_zero_std": 0.90625, - "grad_norm": 0.1687795917678753, - "kl": 0.126708984375, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.10318370576581831, + "kl": 0.05096435546875, "learning_rate": 4.3435645845254e-06, - "loss": 0.0049, - "num_tokens": 81393939.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": 0.0084, + "num_tokens": 81878318.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 137 }, { @@ -3851,26 +3851,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -4.015625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2025.0, - "completions/mean_length": 1392.58203125, - "completions/mean_terminated_length": 1002.5980834960938, - "completions/min_length": 301.0, - "completions/min_terminated_length": 301.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1481.0, + "completions/max_terminated_length": 1481.0, + "completions/mean_length": 601.44140625, + "completions/mean_terminated_length": 601.44140625, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, "epoch": 0.12357286769644056, - "frac_reward_zero_std": 0.78125, - "grad_norm": 0.09891611716752852, - "kl": 0.1055908203125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005667719990219518, + "kl": 0.05023193359375, "learning_rate": 4.333242271042054e-06, - "loss": -0.0155, - "num_tokens": 82458253.0, - "reward": 0.09843750298023224, - "reward_std": 0.005754890851676464, + "loss": 0.0005, + "num_tokens": 82537568.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.984375, - "rewards/format_reward/std": 0.12414088100194931, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 138 }, { @@ -3879,26 +3879,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -4.3125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 1325.6328125, - "completions/mean_terminated_length": 960.2000122070312, - "completions/min_length": 202.0, - "completions/min_terminated_length": 202.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1317.0, + "completions/max_terminated_length": 1317.0, + "completions/mean_length": 556.82421875, + "completions/mean_terminated_length": 556.82421875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, "epoch": 0.12446832325945825, - "frac_reward_zero_std": 0.75, - "grad_norm": 0.11212612719912256, - "kl": 0.1063232421875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006808014727799348, + "kl": 0.05157470703125, "learning_rate": 4.32285352788393e-06, - "loss": -0.005, - "num_tokens": 83456625.0, - "reward": 0.09824219346046448, - "reward_std": 0.006536140572279692, + "loss": 0.0005, + "num_tokens": 83142310.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.982421875, - "rewards/format_reward/std": 0.13154059648513794, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 139 }, { @@ -3907,26 +3907,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -4.15625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2022.0, - "completions/mean_length": 1346.60546875, - "completions/mean_terminated_length": 959.7757568359375, - "completions/min_length": 272.0, - "completions/min_terminated_length": 272.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1133.0, + "completions/max_terminated_length": 1133.0, + "completions/mean_length": 543.28125, + "completions/mean_terminated_length": 543.28125, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, "epoch": 0.12536377882247593, - "frac_reward_zero_std": 0.78125, - "grad_norm": 12694.24016060589, - "kl": 618.6192626953125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006999711096947045, + "kl": 0.05181884765625, "learning_rate": 4.312398790941882e-06, - "loss": 6.1707, - "num_tokens": 84461159.0, - "reward": 0.09843750298023224, - "reward_std": 0.005754890851676464, + "loss": 0.0005, + "num_tokens": 83735542.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.984375, - "rewards/format_reward/std": 0.12414088100194931, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 140 }, { @@ -3935,22 +3935,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -4.671875, + "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 1241.689453125, - "completions/mean_terminated_length": 910.7245483398438, - "completions/min_length": 230.0, - "completions/min_terminated_length": 230.0, + "completions/max_terminated_length": 1233.0, + "completions/mean_length": 519.869140625, + "completions/mean_terminated_length": 513.8765258789062, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, "epoch": 0.1262592343854936, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.059238726705022914, - "kl": 0.1033935546875, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.12202530599091238, + "kl": 0.0498046875, "learning_rate": 4.301878498875735e-06, - "loss": 0.0002, - "num_tokens": 85383528.0, + "loss": 0.0316, + "num_tokens": 84288339.0, "reward": 0.09941406548023224, - "reward_std": 0.0018486406188458204, + "reward_std": 0.0023437500931322575, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.994140625, @@ -3963,26 +3963,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -4.421875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 1294.9375, - "completions/mean_terminated_length": 936.85302734375, - "completions/min_length": 306.0, - "completions/min_terminated_length": 306.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1405.0, + "completions/max_terminated_length": 1405.0, + "completions/mean_length": 517.1953125, + "completions/mean_terminated_length": 517.1953125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, "epoch": 0.1271546899485113, - "frac_reward_zero_std": 0.84375, - "grad_norm": 0.08422896350695738, - "kl": 0.1038818359375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0060097815761899044, + "kl": 0.051513671875, "learning_rate": 4.291293093095873e-06, - "loss": -0.0012, - "num_tokens": 86330200.0, - "reward": 0.09882812947034836, - "reward_std": 0.004192390479147434, + "loss": 0.0005, + "num_tokens": 84836807.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.98828125, - "rewards/format_reward/std": 0.10772226005792618, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 142 }, { @@ -3991,26 +3991,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -4.203125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 1359.435546875, - "completions/mean_terminated_length": 989.3063354492188, - "completions/min_length": 276.0, - "completions/min_terminated_length": 276.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1405.0, + "completions/max_terminated_length": 1405.0, + "completions/mean_length": 567.349609375, + "completions/mean_terminated_length": 567.349609375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.128050145511529, - "frac_reward_zero_std": 0.84375, - "grad_norm": 1.0100953665705108, - "kl": 0.123291015625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0070273058865257835, + "kl": 0.05145263671875, "learning_rate": 4.280643017744723e-06, - "loss": -0.0056, - "num_tokens": 87369495.0, - "reward": 0.09902344644069672, - "reward_std": 0.00390625, + "loss": 0.0005, + "num_tokens": 85470554.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.990234375, - "rewards/format_reward/std": 0.09843364357948303, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 143 }, { @@ -4019,26 +4019,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -3.859375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 1424.6328125, - "completions/mean_terminated_length": 1021.7492065429688, - "completions/min_length": 348.0, - "completions/min_terminated_length": 348.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1409.0, + "completions/max_terminated_length": 1409.0, + "completions/mean_length": 596.11328125, + "completions/mean_terminated_length": 596.11328125, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.12894560107454667, - "frac_reward_zero_std": 0.6875, - "grad_norm": 0.09935539337282194, - "kl": 0.0885009765625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0067549331197202115, + "kl": 0.05047607421875, "learning_rate": 4.269928719678117e-06, - "loss": 0.0017, - "num_tokens": 88405371.0, - "reward": 0.09765625, - "reward_std": 0.008384780958294868, + "loss": 0.0005, + "num_tokens": 86082228.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.9765625, - "rewards/format_reward/std": 0.15143637359142303, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 144 }, { @@ -4047,26 +4047,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -4.140625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 1348.701171875, - "completions/mean_terminated_length": 959.7294921875, - "completions/min_length": 338.0, - "completions/min_terminated_length": 338.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1662.0, + "completions/max_terminated_length": 1662.0, + "completions/mean_length": 539.474609375, + "completions/mean_terminated_length": 539.474609375, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, "epoch": 0.12984105663756437, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.2595208434649135, - "kl": 0.113037109375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011441613334834384, + "kl": 0.05242919921875, "learning_rate": 4.2591506484465426e-06, - "loss": 0.0059, - "num_tokens": 89399538.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0005, + "num_tokens": 86662071.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 145 }, { @@ -4075,26 +4075,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -4.203125, + "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1366.169921875, - "completions/mean_terminated_length": 999.6607055664062, - "completions/min_length": 247.0, - "completions/min_terminated_length": 247.0, + "completions/max_terminated_length": 1496.0, + "completions/mean_length": 561.123046875, + "completions/mean_terminated_length": 555.2921752929688, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, "epoch": 0.13073651220058205, - "frac_reward_zero_std": 0.875, - "grad_norm": 0.08052216200621251, - "kl": 0.1043701171875, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0589455460705877, + "kl": 0.052734375, "learning_rate": 4.248309256276283e-06, - "loss": 0.006, - "num_tokens": 90401529.0, - "reward": 0.09921875596046448, - "reward_std": 0.0031250000465661287, + "loss": 0.0137, + "num_tokens": 87251878.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.9921875, - "rewards/format_reward/std": 0.08812850713729858, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 146 }, { @@ -4103,26 +4103,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -4.359375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1320.82421875, - "completions/mean_terminated_length": 962.5364990234375, - "completions/min_length": 255.0, - "completions/min_terminated_length": 255.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1170.0, + "completions/max_terminated_length": 1170.0, + "completions/mean_length": 520.42578125, + "completions/mean_terminated_length": 520.42578125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.13163196776359973, - "frac_reward_zero_std": 0.78125, - "grad_norm": 2.41982629512539, - "kl": 0.103759765625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012190047453444887, + "kl": 0.0516357421875, "learning_rate": 4.23740499805044e-06, - "loss": 0.0062, - "num_tokens": 91347151.0, - "reward": 0.0986328125, - "reward_std": 0.00546875037252903, + "loss": 0.0005, + "num_tokens": 87787696.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.986328125, - "rewards/format_reward/std": 0.1162383034825325, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 147 }, { @@ -4131,26 +4131,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -4.515625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 1281.87109375, - "completions/mean_terminated_length": 936.7875366210938, - "completions/min_length": 227.0, - "completions/min_terminated_length": 227.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1788.0, + "completions/max_terminated_length": 1788.0, + "completions/mean_length": 520.943359375, + "completions/mean_terminated_length": 520.943359375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, "epoch": 0.1325274233266174, - "frac_reward_zero_std": 0.6875, - "grad_norm": 1.0229642621220172, - "kl": 0.1658935546875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008192901660929772, + "kl": 0.05194091796875, "learning_rate": 4.22643833128985e-06, - "loss": 0.0131, - "num_tokens": 92310541.0, - "reward": 0.09687500447034836, - "reward_std": 0.009193411096930504, + "loss": 0.0005, + "num_tokens": 88361491.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.96875, - "rewards/format_reward/std": 0.17416280508041382, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 148 }, { @@ -4159,26 +4159,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -4.21875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 1364.658203125, - "completions/mean_terminated_length": 1000.4820556640625, - "completions/min_length": 315.0, - "completions/min_terminated_length": 315.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1529.0, + "completions/max_terminated_length": 1529.0, + "completions/mean_length": 561.27734375, + "completions/mean_terminated_length": 561.27734375, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, "epoch": 0.1334228788896351, - "frac_reward_zero_std": 0.75, - "grad_norm": 0.11093213327135282, - "kl": 0.0858154296875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0059915904846710765, + "kl": 0.05084228515625, "learning_rate": 4.215409716133885e-06, - "loss": -0.0018, - "num_tokens": 93312318.0, - "reward": 0.09824219346046448, - "reward_std": 0.006536140572279692, + "loss": 0.0005, + "num_tokens": 88951937.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.982421875, - "rewards/format_reward/std": 0.13154059648513794, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 149 }, { @@ -4187,26 +4187,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -4.5625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2046.0, - "completions/mean_length": 1338.37890625, - "completions/mean_terminated_length": 1027.42138671875, - "completions/min_length": 209.0, - "completions/min_terminated_length": 209.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1336.0, + "completions/max_terminated_length": 1336.0, + "completions/mean_length": 582.072265625, + "completions/mean_terminated_length": 582.072265625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, "epoch": 0.1343183344526528, - "frac_reward_zero_std": 0.8125, - "grad_norm": 0.3995251198579641, - "kl": 0.0902099609375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006120979141360546, + "kl": 0.04888916015625, "learning_rate": 4.204319615321151e-06, - "loss": 0.0086, - "num_tokens": 94314160.0, - "reward": 0.09882812947034836, - "reward_std": 0.004687500186264515, + "loss": 0.0005, + "num_tokens": 89566550.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.98828125, - "rewards/format_reward/std": 0.10772226005792618, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 150 }, { @@ -4215,26 +4215,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -4.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 1252.2109375, - "completions/mean_terminated_length": 984.1775512695312, - "completions/min_length": 215.0, - "completions/min_terminated_length": 215.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1401.0, + "completions/max_terminated_length": 1401.0, + "completions/mean_length": 570.46875, + "completions/mean_terminated_length": 570.46875, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, "epoch": 0.13521379001567047, - "frac_reward_zero_std": 0.75, - "grad_norm": 0.11590978634626535, - "kl": 0.08642578125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005023995826478019, + "kl": 0.04949951171875, "learning_rate": 4.193168494170065e-06, - "loss": -0.0085, - "num_tokens": 95272492.0, - "reward": 0.09804687649011612, - "reward_std": 0.006822281051427126, + "loss": 0.0005, + "num_tokens": 90175830.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.98046875, - "rewards/format_reward/std": 0.1385180652141571, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 151 }, { @@ -4243,26 +4243,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -5.046875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2045.0, - "completions/mean_length": 1240.875, - "completions/mean_terminated_length": 980.1757202148438, - "completions/min_length": 241.0, - "completions/min_terminated_length": 241.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1614.0, + "completions/max_terminated_length": 1614.0, + "completions/mean_length": 601.76953125, + "completions/mean_terminated_length": 601.76953125, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, "epoch": 0.13610924557868817, - "frac_reward_zero_std": 0.90625, - "grad_norm": 0.06993291127200373, - "kl": 0.120849609375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006632757499928364, + "kl": 0.0469970703125, "learning_rate": 4.181956820559339e-06, - "loss": -0.0054, - "num_tokens": 96201068.0, - "reward": 0.09921875596046448, - "reward_std": 0.0026298905722796917, + "loss": 0.0005, + "num_tokens": 90777184.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.9921875, - "rewards/format_reward/std": 0.08812850713729858, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 152 }, { @@ -4271,26 +4271,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -5.578125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 1163.3671875, - "completions/mean_terminated_length": 972.1520385742188, - "completions/min_length": 219.0, - "completions/min_terminated_length": 219.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1444.0, + "completions/max_terminated_length": 1444.0, + "completions/mean_length": 553.236328125, + "completions/mean_terminated_length": 552.5107421875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, "epoch": 0.13700470114170585, - "frac_reward_zero_std": 0.84375, - "grad_norm": 0.0861319462004464, - "kl": 0.0950927734375, + "frac_reward_zero_std": 1.0, + "grad_norm": 669442.2813001474, + "kl": 103936.03869628906, "learning_rate": 4.170685064908342e-06, - "loss": 0.0035, - "num_tokens": 97089800.0, - "reward": 0.09902343899011612, - "reward_std": 0.00390625, + "loss": 1039.7333, + "num_tokens": 91353529.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.990234375, - "rewards/format_reward/std": 0.09843364357948303, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 153 }, { @@ -4299,26 +4299,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -5.3125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 1177.39453125, - "completions/mean_terminated_length": 944.6583862304688, - "completions/min_length": 238.0, - "completions/min_terminated_length": 238.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1472.0, + "completions/max_terminated_length": 1472.0, + "completions/mean_length": 586.95703125, + "completions/mean_terminated_length": 585.880615234375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, "epoch": 0.13790015670472353, - "frac_reward_zero_std": 0.78125, - "grad_norm": 0.26491805736565843, - "kl": 0.1044921875, + "frac_reward_zero_std": 0.96875, + "grad_norm": 122492.80685143314, + "kl": 38912.03790283203, "learning_rate": 4.159353700157365e-06, - "loss": -0.0087, - "num_tokens": 97982770.0, - "reward": 0.09843750298023224, - "reward_std": 0.005754890851676464, + "loss": 387.657, + "num_tokens": 91944195.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.984375, - "rewards/format_reward/std": 0.12414088100194931, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 154 }, { @@ -4327,26 +4327,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -5.796875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 1122.61328125, - "completions/mean_terminated_length": 958.8092041015625, - "completions/min_length": 274.0, - "completions/min_terminated_length": 274.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1201.0, + "completions/max_terminated_length": 1201.0, + "completions/mean_length": 548.65234375, + "completions/mean_terminated_length": 548.65234375, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, "epoch": 0.1387956122677412, - "frac_reward_zero_std": 0.90625, - "grad_norm": 0.0671134919909452, - "kl": 0.0772705078125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0060252282224972425, + "kl": 0.0513916015625, "learning_rate": 4.14796320174778e-06, - "loss": 0.0031, - "num_tokens": 98844428.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": 0.0005, + "num_tokens": 92511985.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 155 }, { @@ -4355,26 +4355,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -5.6875, + "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 1108.546875, - "completions/mean_terminated_length": 924.168212890625, - "completions/min_length": 198.0, - "completions/min_terminated_length": 198.0, + "completions/max_terminated_length": 1338.0, + "completions/mean_length": 578.60546875, + "completions/mean_terminated_length": 575.7299194335938, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, "epoch": 0.1396910678307589, - "frac_reward_zero_std": 0.8125, - "grad_norm": 0.09175374952691938, - "kl": 0.098876953125, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07969173074262088, + "kl": 0.05120849609375, "learning_rate": 4.136514047602087e-06, - "loss": -0.0086, - "num_tokens": 99727028.0, - "reward": 0.09882812947034836, - "reward_std": 0.004687500186264515, + "loss": 0.0121, + "num_tokens": 93123255.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.98828125, - "rewards/format_reward/std": 0.10772226005792618, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 156 }, { @@ -4383,26 +4383,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -5.84375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2020.0, - "completions/mean_length": 1090.578125, - "completions/mean_terminated_length": 928.8218994140625, - "completions/min_length": 277.0, - "completions/min_terminated_length": 277.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1571.0, + "completions/max_terminated_length": 1571.0, + "completions/mean_length": 583.58203125, + "completions/mean_terminated_length": 583.58203125, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, "epoch": 0.14058652339377659, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.057742259396112945, - "kl": 0.0733642578125, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.052805707699068435, + "kl": 0.0496826171875, "learning_rate": 4.1250067181038635e-06, - "loss": 0.0023, - "num_tokens": 100603164.0, - "reward": 0.09941406548023224, - "reward_std": 0.0018486406188458204, + "loss": -0.0005, + "num_tokens": 93739809.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 157 }, { @@ -4411,26 +4411,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -5.640625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2011.0, - "completions/mean_length": 1118.96875, - "completions/mean_terminated_length": 928.79052734375, - "completions/min_length": 245.0, - "completions/min_terminated_length": 245.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1360.0, + "completions/max_terminated_length": 1360.0, + "completions/mean_length": 596.07421875, + "completions/mean_terminated_length": 596.07421875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, "epoch": 0.14148197895679426, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.009952836592016451, - "kl": 0.0738525390625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07305239980212441, + "kl": 0.05096435546875, "learning_rate": 4.113441696077608e-06, - "loss": 0.0007, - "num_tokens": 101509644.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": -0.0005, + "num_tokens": 94378567.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 158 }, { @@ -4439,26 +4439,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.046875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2047.0, - "completions/mean_length": 1012.00390625, - "completions/mean_terminated_length": 871.8802490234375, - "completions/min_length": 216.0, - "completions/min_terminated_length": 216.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1299.0, + "completions/max_terminated_length": 1299.0, + "completions/mean_length": 601.626953125, + "completions/mean_terminated_length": 601.626953125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, "epoch": 0.14237743451981194, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.14858531404881065, - "kl": 0.076904296875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005014221442121329, + "kl": 0.051513671875, "learning_rate": 4.101819466768484e-06, - "loss": -0.0044, - "num_tokens": 102328510.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0005, + "num_tokens": 94987320.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 159 }, { @@ -4467,26 +4467,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.40625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1991.0, - "completions/mean_length": 921.74609375, - "completions/mean_terminated_length": 831.4556274414062, - "completions/min_length": 223.0, - "completions/min_terminated_length": 223.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1367.0, + "completions/max_terminated_length": 1367.0, + "completions/mean_length": 585.607421875, + "completions/mean_terminated_length": 585.607421875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.14327289008282965, - "frac_reward_zero_std": 0.75, - "grad_norm": 0.1361238385560723, - "kl": 0.11376953125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004656472609840778, + "kl": 0.04998779296875, "learning_rate": 4.0901405178219535e-06, - "loss": -0.0019, - "num_tokens": 103101340.0, - "reward": 0.09843750298023224, - "reward_std": 0.0062500000931322575, + "loss": 0.0005, + "num_tokens": 95588047.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.984375, - "rewards/format_reward/std": 0.12414088100194931, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 160 }, { @@ -4495,26 +4495,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.4375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1997.0, - "completions/mean_length": 858.59765625, - "completions/mean_terminated_length": 768.6428833007812, - "completions/min_length": 174.0, - "completions/min_terminated_length": 174.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1379.0, + "completions/max_terminated_length": 1379.0, + "completions/mean_length": 545.453125, + "completions/mean_terminated_length": 544.9236450195312, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, "epoch": 0.14416834564584732, - "frac_reward_zero_std": 0.90625, - "grad_norm": 0.056481508031488666, - "kl": 0.07568359375, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.11889440151384241, + "kl": 0.06475830078125, "learning_rate": 4.078405339263326e-06, - "loss": -0.0145, - "num_tokens": 103830654.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": -0.0015, + "num_tokens": 96157031.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 161 }, { @@ -4523,26 +4523,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.4375, + "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1971.0, - "completions/mean_length": 945.384765625, - "completions/mean_terminated_length": 861.9937133789062, - "completions/min_length": 234.0, - "completions/min_terminated_length": 234.0, + "completions/max_terminated_length": 1741.0, + "completions/mean_length": 644.041015625, + "completions/mean_terminated_length": 641.2935180664062, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, "epoch": 0.145063801208865, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.05837129013687458, - "kl": 0.06982421875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005252156535586015, + "kl": 0.04974365234375, "learning_rate": 4.06661442347719e-06, - "loss": 0.0009, - "num_tokens": 104648867.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, - "rewards/code_reward/mean": 0.0, + "loss": 0.0005, + "num_tokens": 96820956.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 162 }, { @@ -4551,26 +4551,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.734375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 852.2265625, - "completions/mean_terminated_length": 811.1596069335938, - "completions/min_length": 195.0, - "completions/min_terminated_length": 195.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1574.0, + "completions/max_terminated_length": 1574.0, + "completions/mean_length": 598.158203125, + "completions/mean_terminated_length": 597.5126953125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, "epoch": 0.1459592567718827, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.079316699909495, - "kl": 0.0712890625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01181182986602236, + "kl": 0.04864501953125, "learning_rate": 4.054768265186758e-06, - "loss": 0.0102, - "num_tokens": 105394327.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0005, + "num_tokens": 97436333.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 163 }, { @@ -4579,26 +4579,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.6875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2008.0, - "completions/mean_length": 787.642578125, - "completions/mean_terminated_length": 736.4085083007812, - "completions/min_length": 228.0, - "completions/min_terminated_length": 228.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1857.0, + "completions/max_terminated_length": 1857.0, + "completions/mean_length": 583.1484375, + "completions/mean_terminated_length": 583.1484375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.14685471233490038, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.034540728831484666, - "kl": 0.085693359375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0054554811693594725, + "kl": 0.04931640625, "learning_rate": 4.0428673614331036e-06, - "loss": -0.0013, - "num_tokens": 106085968.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 98023273.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 164 }, { @@ -4607,26 +4607,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.59375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 830.478515625, - "completions/mean_terminated_length": 765.3436279296875, - "completions/min_length": 210.0, - "completions/min_terminated_length": 210.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1654.0, + "completions/max_terminated_length": 1654.0, + "completions/mean_length": 620.806640625, + "completions/mean_terminated_length": 619.1565551757812, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, "epoch": 0.14775016789791806, - "frac_reward_zero_std": 0.90625, - "grad_norm": 0.06310920256066563, - "kl": 0.074951171875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006482515932308206, + "kl": 0.05206298828125, "learning_rate": 4.030912211554316e-06, - "loss": -0.0152, - "num_tokens": 106846837.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": 0.0005, + "num_tokens": 98676790.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 165 }, { @@ -4635,20 +4635,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.640625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1986.0, - "completions/mean_length": 834.1953125, - "completions/mean_terminated_length": 777.104248046875, - "completions/min_length": 221.0, - "completions/min_terminated_length": 221.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1674.0, + "completions/max_terminated_length": 1674.0, + "completions/mean_length": 641.078125, + "completions/mean_terminated_length": 641.078125, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.14864562346093574, "frac_reward_zero_std": 1.0, - "grad_norm": 0.011733653254552742, - "kl": 0.0711669921875, + "grad_norm": 0.005327353948433487, + "kl": 0.04791259765625, "learning_rate": 4.018903317164539e-06, - "loss": 0.0007, - "num_tokens": 107568953.0, + "loss": 0.0005, + "num_tokens": 99300030.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -4663,26 +4663,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.796875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1966.0, - "completions/mean_length": 774.447265625, - "completions/mean_terminated_length": 741.2685546875, - "completions/min_length": 256.0, - "completions/min_terminated_length": 256.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1534.0, + "completions/max_terminated_length": 1534.0, + "completions/mean_length": 614.861328125, + "completions/mean_terminated_length": 614.861328125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.14954107902395344, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.03409817940123074, - "kl": 0.069580078125, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.09522432906751536, + "kl": 0.04833984375, "learning_rate": 4.006841182132932e-06, - "loss": -0.0017, - "num_tokens": 108281694.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0009, + "num_tokens": 99931063.0, + "reward": 0.09941406548023224, + "reward_std": 0.0023437500931322575, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 0.994140625, + "rewards/format_reward/std": 0.07639661431312561, "step": 167 }, { @@ -4691,26 +4691,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.828125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1971.0, - "completions/mean_length": 756.509765625, - "completions/mean_terminated_length": 728.1536865234375, - "completions/min_length": 227.0, - "completions/min_terminated_length": 227.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1574.0, + "completions/max_terminated_length": 1574.0, + "completions/mean_length": 608.65234375, + "completions/mean_terminated_length": 608.65234375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, "epoch": 0.15043653458697112, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.053464405396001835, - "kl": 0.073486328125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0054496948068973525, + "kl": 0.048583984375, "learning_rate": 3.9947263125625195e-06, - "loss": -0.0007, - "num_tokens": 108949523.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 100523189.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 168 }, { @@ -4719,26 +4719,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.671875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2010.0, - "completions/mean_length": 788.6875, - "completions/mean_terminated_length": 734.826904296875, - "completions/min_length": 208.0, - "completions/min_terminated_length": 208.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1655.0, + "completions/max_terminated_length": 1655.0, + "completions/mean_length": 635.2734375, + "completions/mean_terminated_length": 635.2734375, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, "epoch": 0.1513319901499888, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.07949072343589329, - "kl": 0.0740966796875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005337411111325454, + "kl": 0.0496826171875, "learning_rate": 3.982559216768967e-06, - "loss": 0.0034, - "num_tokens": 109670835.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0005, + "num_tokens": 101165953.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 169 }, { @@ -4747,20 +4747,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.796875, + "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 698.345703125, - "completions/mean_terminated_length": 663.1843872070312, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/max_terminated_length": 1283.0, + "completions/mean_length": 585.845703125, + "completions/mean_terminated_length": 582.9843139648438, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, "epoch": 0.1522274457130065, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.045217268440116885, - "kl": 0.0711669921875, + "grad_norm": 0.06625166911677717, + "kl": 0.04742431640625, "learning_rate": 3.970340405259245e-06, - "loss": -0.0016, - "num_tokens": 110321060.0, + "loss": 0.0127, + "num_tokens": 101758578.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -4775,20 +4775,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.890625, + "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1813.0, - "completions/mean_length": 644.583984375, - "completions/mean_terminated_length": 625.1307373046875, - "completions/min_length": 168.0, - "completions/min_terminated_length": 168.0, + "completions/max_terminated_length": 1243.0, + "completions/mean_length": 574.005859375, + "completions/mean_terminated_length": 569.803955078125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.15312290127602418, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05776747187731656, - "kl": 0.0753173828125, + "grad_norm": 0.11944483374125982, + "kl": 0.10235595703125, "learning_rate": 3.958070390710214e-06, - "loss": 0.0006, - "num_tokens": 110936255.0, + "loss": 0.0142, + "num_tokens": 102337637.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -4803,26 +4803,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2021.0, - "completions/mean_length": 692.998046875, - "completions/mean_terminated_length": 682.3287353515625, - "completions/min_length": 236.0, - "completions/min_terminated_length": 236.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1406.0, + "completions/max_terminated_length": 1406.0, + "completions/mean_length": 577.810546875, + "completions/mean_terminated_length": 577.810546875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, "epoch": 0.15401835683904186, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.056419783846101156, - "kl": 0.072021484375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007013422365760943, + "kl": 0.048095703125, "learning_rate": 3.945749687947109e-06, "loss": 0.0005, - "num_tokens": 111589950.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "num_tokens": 102932356.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 172 }, { @@ -4831,20 +4831,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1706.0, - "completions/mean_length": 645.02734375, - "completions/mean_terminated_length": 631.1913452148438, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1270.0, + "completions/max_terminated_length": 1270.0, + "completions/mean_length": 557.375, + "completions/mean_terminated_length": 557.375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, "epoch": 0.15491381240205954, "frac_reward_zero_std": 1.0, - "grad_norm": 0.008793484041353715, - "kl": 0.0721435546875, + "grad_norm": 0.00570724158301358, + "kl": 0.0462646484375, "learning_rate": 3.933378813921942e-06, - "loss": 0.0007, - "num_tokens": 112188604.0, + "loss": 0.0005, + "num_tokens": 103486132.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -4859,26 +4859,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.859375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1917.0, - "completions/mean_length": 708.283203125, - "completions/mean_terminated_length": 684.3120727539062, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1897.0, + "completions/max_terminated_length": 1897.0, + "completions/mean_length": 623.728515625, + "completions/mean_terminated_length": 623.728515625, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, "epoch": 0.15580926796507724, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.041267501649098925, - "kl": 0.0745849609375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005255713212212517, + "kl": 0.048095703125, "learning_rate": 3.920958287691811e-06, - "loss": -0.0033, - "num_tokens": 112857709.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 104111945.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 174 }, { @@ -4887,20 +4887,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 678.421875, - "completions/mean_terminated_length": 664.9152221679688, - "completions/min_length": 200.0, - "completions/min_terminated_length": 200.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1481.0, + "completions/max_terminated_length": 1481.0, + "completions/mean_length": 575.19140625, + "completions/mean_terminated_length": 575.19140625, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.15670472352809492, "frac_reward_zero_std": 0.9375, - "grad_norm": 0.06445929796633103, - "kl": 0.0740966796875, + "grad_norm": 0.09350953931872348, + "kl": 0.04718017578125, "learning_rate": 3.908488630397121e-06, - "loss": -0.0031, - "num_tokens": 113514933.0, + "loss": 0.0081, + "num_tokens": 104716315.0, "reward": 0.099609375, "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, @@ -4915,20 +4915,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.921875, + "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1982.0, - "completions/mean_length": 672.736328125, - "completions/mean_terminated_length": 659.173583984375, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/max_terminated_length": 1441.0, + "completions/mean_length": 592.453125, + "completions/mean_terminated_length": 589.6046752929688, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.1576001790911126, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05116548843844329, - "kl": 0.0716552734375, + "grad_norm": 0.06667660244468013, + "kl": 0.049072265625, "learning_rate": 3.8959703652397175e-06, - "loss": -0.0012, - "num_tokens": 114166094.0, + "loss": 0.0117, + "num_tokens": 105326371.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -4943,26 +4943,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.90625, + "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2028.0, - "completions/mean_length": 710.96875, - "completions/mean_terminated_length": 695.1541748046875, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/max_terminated_length": 1544.0, + "completions/mean_length": 606.212890625, + "completions/mean_terminated_length": 603.391357421875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, "epoch": 0.1584956346541303, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.006938377171807984, - "kl": 0.092529296875, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.04243473871670977, + "kl": 0.0487060546875, "learning_rate": 3.883404017460935e-06, - "loss": 0.0009, - "num_tokens": 114829342.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0141, + "num_tokens": 105935984.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 177 }, { @@ -4971,26 +4971,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1971.0, - "completions/mean_length": 691.232421875, - "completions/mean_terminated_length": 669.6964721679688, - "completions/min_length": 251.0, - "completions/min_terminated_length": 251.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1227.0, + "completions/max_terminated_length": 1227.0, + "completions/mean_length": 572.67578125, + "completions/mean_terminated_length": 571.4833374023438, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, "epoch": 0.15939109021714798, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.0818158362296435, - "kl": 0.0672607421875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006519149950539733, + "kl": 0.05511474609375, "learning_rate": 3.870790114319559e-06, - "loss": 0.0031, - "num_tokens": 115473525.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0006, + "num_tokens": 106519466.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 178 }, { @@ -4999,20 +4999,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.890625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1721.0, - "completions/mean_length": 639.69140625, - "completions/mean_terminated_length": 620.1702880859375, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, - "epoch": 0.16028654578016566, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1574.0, + "completions/max_terminated_length": 1574.0, + "completions/mean_length": 554.39453125, + "completions/mean_terminated_length": 552.3992309570312, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.16028654578016566, "frac_reward_zero_std": 1.0, - "grad_norm": 0.007890322312255954, - "kl": 0.07080078125, + "grad_norm": 0.023331785461308704, + "kl": 0.063720703125, "learning_rate": 3.858129185069701e-06, - "loss": 0.0007, - "num_tokens": 116083991.0, + "loss": 0.0006, + "num_tokens": 107086260.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -5027,26 +5027,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1891.0, - "completions/mean_length": 689.5546875, - "completions/mean_terminated_length": 679.472412109375, - "completions/min_length": 172.0, - "completions/min_terminated_length": 172.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1365.0, + "completions/max_terminated_length": 1365.0, + "completions/mean_length": 587.150390625, + "completions/mean_terminated_length": 586.5479125976562, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, "epoch": 0.16118200134318333, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.07882687764220778, - "kl": 0.11083984375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005382943100030063, + "kl": 0.0528564453125, "learning_rate": 3.845421760938597e-06, - "loss": 0.0029, - "num_tokens": 116761651.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0005, + "num_tokens": 107711489.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 180 }, { @@ -5055,26 +5055,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.953125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 635.724609375, - "completions/mean_terminated_length": 627.4008178710938, - "completions/min_length": 233.0, - "completions/min_terminated_length": 233.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1169.0, + "completions/max_terminated_length": 1169.0, + "completions/mean_length": 538.271484375, + "completions/mean_terminated_length": 538.271484375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, "epoch": 0.16207745690620104, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.0725677041475304, - "kl": 0.070556640625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005548822156078627, + "kl": 0.0509033203125, "learning_rate": 3.832668375104312e-06, - "loss": -0.0004, - "num_tokens": 117383574.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 108283516.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 181 }, { @@ -5083,20 +5083,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2003.0, - "completions/mean_length": 713.12109375, - "completions/mean_terminated_length": 699.9566040039062, - "completions/min_length": 205.0, - "completions/min_terminated_length": 205.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1659.0, + "completions/max_terminated_length": 1659.0, + "completions/mean_length": 586.451171875, + "completions/mean_terminated_length": 586.451171875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.16297291246921872, "frac_reward_zero_std": 1.0, - "grad_norm": 0.014271174770978515, - "kl": 0.0697021484375, + "grad_norm": 0.00603788980728595, + "kl": 0.0499267578125, "learning_rate": 3.8198695626733725e-06, - "loss": 0.0007, - "num_tokens": 118063748.0, + "loss": 0.0005, + "num_tokens": 108898835.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -5111,26 +5111,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.84375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2040.0, - "completions/mean_length": 713.150390625, - "completions/mean_terminated_length": 686.5597534179688, - "completions/min_length": 219.0, - "completions/min_terminated_length": 219.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1504.0, + "completions/max_terminated_length": 1504.0, + "completions/mean_length": 585.361328125, + "completions/mean_terminated_length": 585.361328125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, "epoch": 0.1638683680322364, - "frac_reward_zero_std": 0.90625, - "grad_norm": 0.08507378122083202, - "kl": 0.105224609375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005262098905925186, + "kl": 0.0496826171875, "learning_rate": 3.8070258606583156e-06, - "loss": -0.0037, - "num_tokens": 118727089.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": 0.0005, + "num_tokens": 109496748.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 183 }, { @@ -5139,26 +5139,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1989.0, - "completions/mean_length": 722.576171875, - "completions/mean_terminated_length": 712.1397705078125, - "completions/min_length": 184.0, - "completions/min_terminated_length": 184.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1667.0, + "completions/max_terminated_length": 1667.0, + "completions/mean_length": 594.75, + "completions/mean_terminated_length": 594.75, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, "epoch": 0.16476382359525407, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.07094156347361062, - "kl": 0.0694580078125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0048278578819136685, + "kl": 0.0506591796875, "learning_rate": 3.7941378079551544e-06, - "loss": 0.0022, - "num_tokens": 119412936.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 110117148.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 184 }, { @@ -5167,20 +5167,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.90625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1883.0, - "completions/mean_length": 696.935546875, - "completions/mean_terminated_length": 680.9150390625, - "completions/min_length": 194.0, - "completions/min_terminated_length": 194.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 1774.0, + "completions/max_terminated_length": 1774.0, + "completions/mean_length": 592.947265625, + "completions/mean_terminated_length": 588.956787109375, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, "epoch": 0.16565927915827178, "frac_reward_zero_std": 1.0, - "grad_norm": 0.007349496040761767, - "kl": 0.0682373046875, + "grad_norm": 0.008863827601325813, + "kl": 0.07080078125, "learning_rate": 3.7812059453207677e-06, "loss": 0.0007, - "num_tokens": 120084967.0, + "num_tokens": 110735937.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -5195,26 +5195,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1895.0, - "completions/mean_length": 676.48828125, - "completions/mean_terminated_length": 665.68896484375, - "completions/min_length": 196.0, - "completions/min_terminated_length": 196.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 1491.0, + "completions/max_terminated_length": 1491.0, + "completions/mean_length": 565.056640625, + "completions/mean_terminated_length": 562.4459838867188, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.16655473472128945, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.07706499183432042, - "kl": 0.067626953125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02719947360988251, + "kl": 0.0760498046875, "learning_rate": 3.768230815350213e-06, - "loss": -0.0007, - "num_tokens": 120746113.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0008, + "num_tokens": 111340030.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 186 }, { @@ -5223,20 +5223,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1893.0, - "completions/mean_length": 636.73046875, - "completions/mean_terminated_length": 616.3154907226562, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1201.0, + "completions/max_terminated_length": 1201.0, + "completions/mean_length": 552.83984375, + "completions/mean_terminated_length": 551.7005615234375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, "epoch": 0.16745019028430713, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.04911030768903344, - "kl": 0.099853515625, + "grad_norm": 0.9236121462672159, + "kl": 0.057373046875, "learning_rate": 3.7552129624539557e-06, - "loss": -0.002, - "num_tokens": 121372679.0, + "loss": 0.0075, + "num_tokens": 111923644.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -5251,20 +5251,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, + "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1942.0, - "completions/mean_length": 626.787109375, - "completions/mean_terminated_length": 615.596435546875, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/max_terminated_length": 1403.0, + "completions/mean_length": 558.384765625, + "completions/mean_terminated_length": 555.4696655273438, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, "epoch": 0.16834564584732484, "frac_reward_zero_std": 0.9375, - "grad_norm": 0.06941023633883373, - "kl": 0.0716552734375, + "grad_norm": 0.061955717355027674, + "kl": 0.050537109375, "learning_rate": 3.7421529328350316e-06, - "loss": -0.0033, - "num_tokens": 121996346.0, + "loss": -0.0028, + "num_tokens": 112512289.0, "reward": 0.099609375, "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, @@ -5279,26 +5279,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1995.0, - "completions/mean_length": 690.220703125, - "completions/mean_terminated_length": 676.8303833007812, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1771.0, + "completions/max_terminated_length": 1771.0, + "completions/mean_length": 601.98828125, + "completions/mean_terminated_length": 601.3092041015625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, "epoch": 0.16924110141034251, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.07568288588640923, - "kl": 0.066162109375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005052704969820576, + "kl": 0.0516357421875, "learning_rate": 3.7290512744661274e-06, - "loss": 0.0084, - "num_tokens": 122671259.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0005, + "num_tokens": 113142027.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 189 }, { @@ -5307,26 +5307,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.90625, + "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1983.0, - "completions/mean_length": 697.283203125, - "completions/mean_terminated_length": 681.266845703125, - "completions/min_length": 197.0, - "completions/min_terminated_length": 197.0, + "completions/max_terminated_length": 1692.0, + "completions/mean_length": 589.88671875, + "completions/mean_terminated_length": 581.292724609375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.1701365569733602, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.049509900781209426, - "kl": 0.0650634765625, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2800674868147115, + "kl": 0.85662841796875, "learning_rate": 3.715908537066589e-06, - "loss": 0.001, - "num_tokens": 123326652.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0086, + "num_tokens": 113742433.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 190 }, { @@ -5335,20 +5335,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1783.0, - "completions/mean_length": 626.283203125, - "completions/mean_terminated_length": 615.0885620117188, - "completions/min_length": 185.0, - "completions/min_terminated_length": 185.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1471.0, + "completions/max_terminated_length": 1471.0, + "completions/mean_length": 553.888671875, + "completions/mean_terminated_length": 553.0528564453125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.17103201253637787, "frac_reward_zero_std": 1.0, - "grad_norm": 0.007656796040386732, - "kl": 0.066650390625, + "grad_norm": 0.006196419562863969, + "kl": 0.04998779296875, "learning_rate": 3.7027252720793538e-06, - "loss": 0.0007, - "num_tokens": 123951005.0, + "loss": 0.0005, + "num_tokens": 114329720.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -5363,20 +5363,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.90625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 665.228515625, - "completions/mean_terminated_length": 648.83203125, - "completions/min_length": 138.0, - "completions/min_terminated_length": 138.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1544.0, + "completions/max_terminated_length": 1544.0, + "completions/mean_length": 557.12109375, + "completions/mean_terminated_length": 557.12109375, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, "epoch": 0.17192746809939558, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05792298135309963, - "kl": 0.0677490234375, + "grad_norm": 0.06389277517732068, + "kl": 0.04864501953125, "learning_rate": 3.689502032647817e-06, - "loss": -0.0015, - "num_tokens": 124596706.0, + "loss": 0.0013, + "num_tokens": 114920070.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -5391,20 +5391,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1688.0, - "completions/mean_length": 620.7578125, - "completions/mean_terminated_length": 609.5196533203125, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1318.0, + "completions/max_terminated_length": 1318.0, + "completions/mean_length": 540.427734375, + "completions/mean_terminated_length": 540.427734375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, "epoch": 0.17282292366241325, "frac_reward_zero_std": 1.0, - "grad_norm": 0.007164808257495399, - "kl": 0.06610107421875, + "grad_norm": 0.0056093930735562555, + "kl": 0.0491943359375, "learning_rate": 3.6762393735926245e-06, - "loss": 0.0007, - "num_tokens": 125195014.0, + "loss": 0.0005, + "num_tokens": 115477249.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -5419,20 +5419,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1790.0, - "completions/mean_length": 638.5703125, - "completions/mean_terminated_length": 627.472412109375, - "completions/min_length": 230.0, - "completions/min_terminated_length": 230.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1444.0, + "completions/max_terminated_length": 1444.0, + "completions/mean_length": 549.9765625, + "completions/mean_terminated_length": 549.9765625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.17371837922543093, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.060704628699271526, - "kl": 0.06787109375, + "grad_norm": 0.0797524710726536, + "kl": 0.05047607421875, "learning_rate": 3.6629378513883852e-06, - "loss": 0.0009, - "num_tokens": 125829882.0, + "loss": 0.0031, + "num_tokens": 116066757.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -5447,20 +5447,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1977.0, - "completions/mean_length": 624.65625, - "completions/mean_terminated_length": 619.0745239257812, - "completions/min_length": 194.0, - "completions/min_terminated_length": 194.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1387.0, + "completions/max_terminated_length": 1387.0, + "completions/mean_length": 541.453125, + "completions/mean_terminated_length": 541.453125, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, "epoch": 0.17461383478844864, "frac_reward_zero_std": 1.0, - "grad_norm": 0.007082540003861009, - "kl": 0.066162109375, + "grad_norm": 0.004492696441640513, + "kl": 0.0484619140625, "learning_rate": 3.6495980241403307e-06, - "loss": 0.0007, - "num_tokens": 126427866.0, + "loss": 0.0005, + "num_tokens": 116622141.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -5475,26 +5475,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2033.0, - "completions/mean_length": 701.626953125, - "completions/mean_terminated_length": 698.9921875, - "completions/min_length": 214.0, - "completions/min_terminated_length": 214.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1729.0, + "completions/max_terminated_length": 1729.0, + "completions/mean_length": 606.66015625, + "completions/mean_terminated_length": 606.66015625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.1755092903514663, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.006717488479532777, - "kl": 0.0650634765625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06668454810927989, + "kl": 0.0494384765625, "learning_rate": 3.636220451560896e-06, - "loss": 0.0007, - "num_tokens": 127094715.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0012, + "num_tokens": 117240367.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 196 }, { @@ -5503,26 +5503,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1944.0, - "completions/mean_length": 662.830078125, - "completions/mean_terminated_length": 651.9232177734375, - "completions/min_length": 205.0, - "completions/min_terminated_length": 205.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1500.0, + "completions/max_terminated_length": 1500.0, + "completions/mean_length": 589.2578125, + "completions/mean_terminated_length": 589.2578125, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, "epoch": 0.176404745914484, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.08069331728310686, - "kl": 0.0665283203125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004513425242810063, + "kl": 0.05010986328125, "learning_rate": 3.622805694946235e-06, - "loss": -0.0005, - "num_tokens": 127758052.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0005, + "num_tokens": 117866035.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 197 }, { @@ -5531,26 +5531,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1890.0, - "completions/mean_length": 676.98828125, - "completions/mean_terminated_length": 671.61181640625, - "completions/min_length": 250.0, - "completions/min_terminated_length": 250.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1554.0, + "completions/max_terminated_length": 1554.0, + "completions/mean_length": 572.9609375, + "completions/mean_terminated_length": 571.9628295898438, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, "epoch": 0.17730020147750167, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.07236099664779054, - "kl": 0.068359375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005737804995862872, + "kl": 0.0511474609375, "learning_rate": 3.609354317152667e-06, - "loss": -0.0019, - "num_tokens": 128406366.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0005, + "num_tokens": 118461087.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 198 }, { @@ -5559,26 +5559,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1925.0, - "completions/mean_length": 668.119140625, - "completions/mean_terminated_length": 662.7078857421875, - "completions/min_length": 179.0, - "completions/min_terminated_length": 179.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1371.0, + "completions/max_terminated_length": 1371.0, + "completions/mean_length": 579.970703125, + "completions/mean_terminated_length": 579.365966796875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, "epoch": 0.17819565704051937, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.040604973329934714, - "kl": 0.0650634765625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011689236262334025, + "kl": 0.05712890625, "learning_rate": 3.595866882573063e-06, - "loss": -0.0003, - "num_tokens": 129045771.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0006, + "num_tokens": 119055360.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 199 }, { @@ -5587,26 +5587,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.890625, + "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1728.0, - "completions/mean_length": 622.59765625, - "completions/mean_terminated_length": 602.839599609375, - "completions/min_length": 122.0, - "completions/min_terminated_length": 122.0, + "completions/max_terminated_length": 1640.0, + "completions/mean_length": 555.169921875, + "completions/mean_terminated_length": 552.24853515625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, "epoch": 0.17909111260353705, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.03790785409368292, - "kl": 0.078369140625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005120448524080564, + "kl": 0.0484619140625, "learning_rate": 3.5823439571131675e-06, - "loss": -0.0017, - "num_tokens": 129675389.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 119650455.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 200 }, { @@ -5616,19 +5616,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1954.0, - "completions/mean_length": 647.595703125, - "completions/mean_terminated_length": 642.1039428710938, - "completions/min_length": 184.0, - "completions/min_terminated_length": 184.0, + "completions/max_length": 1604.0, + "completions/max_terminated_length": 1604.0, + "completions/mean_length": 583.546875, + "completions/mean_terminated_length": 579.5451049804688, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, "epoch": 0.17998656816655473, "frac_reward_zero_std": 1.0, - "grad_norm": 0.009018080934348364, - "kl": 0.067138671875, + "grad_norm": 0.02771302136152841, + "kl": 0.06964111328125, "learning_rate": 3.5687861081678477e-06, "loss": 0.0007, - "num_tokens": 130304526.0, + "num_tokens": 120246799.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -5643,26 +5643,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.953125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1711.0, - "completions/mean_length": 616.826171875, - "completions/mean_terminated_length": 608.3909912109375, - "completions/min_length": 212.0, - "completions/min_terminated_length": 212.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1386.0, + "completions/max_terminated_length": 1386.0, + "completions/mean_length": 593.412109375, + "completions/mean_terminated_length": 593.412109375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, "epoch": 0.18088202372957243, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.04772213814585424, - "kl": 0.068115234375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007212915094271475, + "kl": 0.05010986328125, "learning_rate": 3.555193904597291e-06, - "loss": -0.0004, - "num_tokens": 130928037.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 120858322.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 202 }, { @@ -5671,20 +5671,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.875, + "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1809.0, - "completions/mean_length": 642.734375, - "completions/mean_terminated_length": 620.4285888671875, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/max_terminated_length": 1592.0, + "completions/mean_length": 590.431640625, + "completions/mean_terminated_length": 587.5792236328125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, "epoch": 0.1817774792925901, "frac_reward_zero_std": 1.0, - "grad_norm": 0.0082751790561264, - "kl": 0.065185546875, + "grad_norm": 0.005532916229287632, + "kl": 0.04833984375, "learning_rate": 3.541567916703138e-06, - "loss": 0.0007, - "num_tokens": 131538845.0, + "loss": 0.0005, + "num_tokens": 121442351.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -5699,20 +5699,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.921875, + "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1929.0, - "completions/mean_length": 691.69921875, - "completions/mean_terminated_length": 678.323486328125, - "completions/min_length": 159.0, - "completions/min_terminated_length": 159.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 629.58984375, + "completions/mean_terminated_length": 626.8140869140625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, "epoch": 0.1826729348556078, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06259185264923466, - "kl": 0.06427001953125, + "grad_norm": 0.07012025346138279, + "kl": 0.04815673828125, "learning_rate": 3.5279087162045517e-06, - "loss": 0.009, - "num_tokens": 132206467.0, + "loss": 0.0031, + "num_tokens": 122078173.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -5727,26 +5727,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.953125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2009.0, - "completions/mean_length": 672.505859375, - "completions/mean_terminated_length": 664.3988647460938, - "completions/min_length": 245.0, - "completions/min_terminated_length": 245.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1894.0, + "completions/max_terminated_length": 1894.0, + "completions/mean_length": 618.482421875, + "completions/mean_terminated_length": 618.482421875, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, "epoch": 0.18356839041862547, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.07151737136940692, - "kl": 0.0640869140625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005303183352547322, + "kl": 0.04888916015625, "learning_rate": 3.5142168762142265e-06, - "loss": 0.0023, - "num_tokens": 132888022.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 122732068.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 205 }, { @@ -5755,26 +5755,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.859375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1765.0, - "completions/mean_length": 700.201171875, - "completions/mean_terminated_length": 676.08544921875, - "completions/min_length": 193.0, - "completions/min_terminated_length": 193.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1630.0, + "completions/max_terminated_length": 1630.0, + "completions/mean_length": 612.640625, + "completions/mean_terminated_length": 612.640625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, "epoch": 0.18446384598164317, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.08013199443660099, - "kl": 0.067626953125, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06296171173023327, + "kl": 0.05108642578125, "learning_rate": 3.500492971214347e-06, - "loss": 0.0001, - "num_tokens": 133558285.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": -0.0047, + "num_tokens": 123357500.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 206 }, { @@ -5783,26 +5783,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1677.0, - "completions/max_terminated_length": 1677.0, - "completions/mean_length": 591.958984375, - "completions/mean_terminated_length": 591.958984375, - "completions/min_length": 200.0, - "completions/min_terminated_length": 200.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1385.0, + "completions/mean_length": 565.35546875, + "completions/mean_terminated_length": 561.558837890625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, "epoch": 0.18535930154466085, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.07996600764674648, - "kl": 0.06640625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008285648181235224, + "kl": 0.050537109375, "learning_rate": 3.48673757703248e-06, - "loss": -0.0019, - "num_tokens": 134162152.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0005, + "num_tokens": 123947746.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 207 }, { @@ -5811,20 +5811,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1960.0, - "completions/mean_length": 624.052734375, - "completions/mean_terminated_length": 618.4686889648438, - "completions/min_length": 250.0, - "completions/min_terminated_length": 250.0, + "completions/max_terminated_length": 1469.0, + "completions/mean_length": 582.73828125, + "completions/mean_terminated_length": 572.6154174804688, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, "epoch": 0.18625475710767853, "frac_reward_zero_std": 1.0, - "grad_norm": 0.010916793033113123, - "kl": 0.065185546875, + "grad_norm": 0.059277726369991776, + "kl": 0.06085205078125, "learning_rate": 3.472951270817418e-06, - "loss": 0.0007, - "num_tokens": 134795923.0, + "loss": 0.0006, + "num_tokens": 124560364.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -5839,26 +5839,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2023.0, - "completions/mean_length": 617.498046875, - "completions/mean_terminated_length": 606.2342529296875, - "completions/min_length": 188.0, - "completions/min_terminated_length": 188.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1881.0, + "completions/max_terminated_length": 1881.0, + "completions/mean_length": 578.189453125, + "completions/mean_terminated_length": 578.189453125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, "epoch": 0.1871502126706962, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05456360479658491, - "kl": 0.0648193359375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004773000588147992, + "kl": 0.0477294921875, "learning_rate": 3.4591346310149578e-06, - "loss": 0.0019, - "num_tokens": 135414450.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 125158765.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 209 }, { @@ -5867,26 +5867,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.953125, + "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2038.0, - "completions/mean_length": 662.5078125, - "completions/mean_terminated_length": 654.3418579101562, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 632.3125, + "completions/mean_terminated_length": 624.06103515625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, "epoch": 0.1880456682337139, - "frac_reward_zero_std": 0.90625, - "grad_norm": 0.10789941880075726, - "kl": 0.06365966796875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02963143641657182, + "kl": 0.0645751953125, "learning_rate": 3.445288237343632e-06, - "loss": 0.0034, - "num_tokens": 136086934.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": 0.0006, + "num_tokens": 125815789.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 210 }, { @@ -5895,20 +5895,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.953125, + "completions/clipped_ratio": -6.875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2008.0, - "completions/mean_length": 717.3984375, - "completions/mean_terminated_length": 709.5560302734375, - "completions/min_length": 265.0, - "completions/min_terminated_length": 265.0, + "completions/max_terminated_length": 1522.0, + "completions/mean_length": 668.24609375, + "completions/mean_terminated_length": 646.3452758789062, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, "epoch": 0.18894112379673159, "frac_reward_zero_std": 1.0, - "grad_norm": 0.0062856581992966785, - "kl": 0.06195068359375, + "grad_norm": 0.00816300375427543, + "kl": 0.05059814453125, "learning_rate": 3.4314126707703895e-06, - "loss": 0.0006, - "num_tokens": 136787346.0, + "loss": 0.0005, + "num_tokens": 126491035.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -5925,24 +5925,24 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2039.0, - "completions/mean_length": 625.490234375, - "completions/mean_terminated_length": 619.9118041992188, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/max_terminated_length": 1748.0, + "completions/mean_length": 605.109375, + "completions/mean_terminated_length": 600.2549438476562, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, "epoch": 0.18983657935974926, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.04953062525787495, - "kl": 0.0645751953125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00596263048445519, + "kl": 0.05322265625, "learning_rate": 3.4175085134862128e-06, - "loss": -0.0006, - "num_tokens": 137424605.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 127117859.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 212 }, { @@ -5951,26 +5951,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.953125, + "completions/clipped_ratio": -6.84375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 654.0859375, - "completions/mean_terminated_length": 646.8939208984375, - "completions/min_length": 176.0, - "completions/min_terminated_length": 176.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 681.240234375, + "completions/mean_terminated_length": 654.0139770507812, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, "epoch": 0.19073203492276697, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.0655278626205979, - "kl": 0.10150146484375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005312087159167155, + "kl": 0.04901123046875, "learning_rate": 3.4035763488816953e-06, - "loss": 0.0028, - "num_tokens": 138080889.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 127788046.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 213 }, { @@ -5979,26 +5979,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.84375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1689.0, - "completions/mean_length": 611.712890625, - "completions/mean_terminated_length": 608.9021606445312, - "completions/min_length": 195.0, - "completions/min_terminated_length": 195.0, + "completions/max_terminated_length": 1576.0, + "completions/mean_length": 613.779296875, + "completions/mean_terminated_length": 588.962158203125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, "epoch": 0.19162749048578465, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.006893797084836839, - "kl": 0.06103515625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 1042.0108001699423, + "kl": 1.84454345703125, "learning_rate": 3.3896167615225594e-06, - "loss": 0.0006, - "num_tokens": 138714822.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0181, + "num_tokens": 128423037.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 214 }, { @@ -6007,26 +6007,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1959.0, - "completions/mean_length": 565.845703125, - "completions/mean_terminated_length": 562.9451904296875, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/max_terminated_length": 1859.0, + "completions/mean_length": 549.240234375, + "completions/mean_terminated_length": 532.712890625, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, "epoch": 0.19252294604880232, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05962691311018318, - "kl": 0.06121826171875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015176587381927465, + "kl": 0.06927490234375, "learning_rate": 3.375630337125133e-06, - "loss": 0.0004, - "num_tokens": 139281447.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0007, + "num_tokens": 128981160.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 215 }, { @@ -6035,20 +6035,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1919.0, - "completions/max_terminated_length": 1919.0, - "completions/mean_length": 620.06640625, - "completions/mean_terminated_length": 620.06640625, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/clipped_ratio": -6.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1923.0, + "completions/mean_length": 634.53515625, + "completions/mean_terminated_length": 616.1663818359375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, "epoch": 0.19341840161182, "frac_reward_zero_std": 1.0, - "grad_norm": 0.007079063914301492, - "kl": 0.06048583984375, + "grad_norm": 0.011838339376112336, + "kl": 0.05859375, "learning_rate": 3.361617662531772e-06, "loss": 0.0006, - "num_tokens": 139878761.0, + "num_tokens": 129585882.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -6063,26 +6063,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.90625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1923.0, - "completions/mean_length": 627.935546875, - "completions/mean_terminated_length": 622.36669921875, - "completions/min_length": 98.0, - "completions/min_terminated_length": 98.0, + "completions/max_terminated_length": 1946.0, + "completions/mean_length": 626.982421875, + "completions/mean_terminated_length": 610.1324462890625, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, "epoch": 0.1943138571748377, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05517179205307519, - "kl": 0.05999755859375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004774713168803346, + "kl": 0.04888916015625, "learning_rate": 3.347579325686237e-06, - "loss": 0.0037, - "num_tokens": 140503432.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 130210065.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 217 }, { @@ -6091,26 +6091,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.8125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1972.0, - "completions/mean_length": 614.9609375, - "completions/mean_terminated_length": 609.3411865234375, - "completions/min_length": 138.0, - "completions/min_terminated_length": 138.0, + "completions/max_terminated_length": 1875.0, + "completions/mean_length": 628.736328125, + "completions/mean_terminated_length": 594.6740112304688, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.19520931273785538, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06462999787985825, - "kl": 0.06182861328125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009277070324678074, + "kl": 0.05230712890625, "learning_rate": 3.333515915609027e-06, - "loss": 0.0039, - "num_tokens": 141108180.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 130821866.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 218 }, { @@ -6119,26 +6119,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1883.0, - "completions/max_terminated_length": 1883.0, - "completions/mean_length": 640.31640625, - "completions/mean_terminated_length": 640.31640625, - "completions/min_length": 144.0, - "completions/min_terminated_length": 144.0, + "completions/clipped_ratio": -6.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1792.0, + "completions/mean_length": 618.80078125, + "completions/mean_terminated_length": 606.6588134765625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, "epoch": 0.19610476830087306, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.00663515657731032, - "kl": 0.05987548828125, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.056877402143134144, + "kl": 0.0538330078125, "learning_rate": 3.3194280223726616e-06, - "loss": 0.0006, - "num_tokens": 141779206.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0095, + "num_tokens": 131481876.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 219 }, { @@ -6147,26 +6147,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 654.919921875, - "completions/mean_terminated_length": 649.4569091796875, - "completions/min_length": 186.0, - "completions/min_terminated_length": 186.0, + "completions/max_terminated_length": 1790.0, + "completions/mean_length": 641.310546875, + "completions/mean_terminated_length": 628.4141845703125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, "epoch": 0.19700022386389077, - "frac_reward_zero_std": 0.90625, - "grad_norm": 0.09335614343429051, - "kl": 0.0611572265625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00940584007592381, + "kl": 0.05828857421875, "learning_rate": 3.305316237076927e-06, - "loss": -0.0058, - "num_tokens": 142449213.0, - "reward": 0.09921875596046448, - "reward_std": 0.0026298905722796917, + "loss": 0.0006, + "num_tokens": 132144915.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.9921875, - "rewards/format_reward/std": 0.08812850713729858, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 220 }, { @@ -6177,24 +6177,24 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 599.158203125, - "completions/mean_terminated_length": 588.3779296875, - "completions/min_length": 173.0, - "completions/min_terminated_length": 173.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 562.9921875, + "completions/mean_terminated_length": 551.6476440429688, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, "epoch": 0.19789567942690844, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.03817769249302888, - "kl": 0.09942626953125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006685311606671522, + "kl": 0.0562744140625, "learning_rate": 3.291181151824071e-06, - "loss": 0.0014, - "num_tokens": 143030670.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0006, + "num_tokens": 132707855.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 221 }, { @@ -6205,18 +6205,18 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1530.0, - "completions/mean_length": 607.646484375, - "completions/mean_terminated_length": 599.1571655273438, - "completions/min_length": 198.0, - "completions/min_terminated_length": 198.0, + "completions/max_terminated_length": 1332.0, + "completions/mean_length": 571.44921875, + "completions/mean_terminated_length": 562.74658203125, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, "epoch": 0.19879113498992612, "frac_reward_zero_std": 1.0, - "grad_norm": 0.007676197980008601, - "kl": 0.06060791015625, + "grad_norm": 0.00478893835093211, + "kl": 0.04815673828125, "learning_rate": 3.27702335969396e-06, - "loss": 0.0006, - "num_tokens": 143625449.0, + "loss": 0.0005, + "num_tokens": 133284101.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -6231,26 +6231,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.875, + "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1841.0, - "completions/mean_length": 659.720703125, - "completions/mean_terminated_length": 637.6845703125, - "completions/min_length": 186.0, - "completions/min_terminated_length": 186.0, + "completions/max_terminated_length": 1737.0, + "completions/mean_length": 577.771484375, + "completions/mean_terminated_length": 568.0078735351562, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, "epoch": 0.1996865905529438, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.0787303783620633, - "kl": 0.09423828125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006428223247827128, + "kl": 0.0548095703125, "learning_rate": 3.2628434547191985e-06, - "loss": 0.0004, - "num_tokens": 144259370.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0005, + "num_tokens": 133876064.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 223 }, { @@ -6259,26 +6259,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.953125, + "completions/clipped_ratio": -6.921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1851.0, - "completions/mean_length": 631.271484375, - "completions/mean_terminated_length": 622.9214477539062, - "completions/min_length": 186.0, - "completions/min_terminated_length": 186.0, + "completions/max_terminated_length": 1768.0, + "completions/mean_length": 591.037109375, + "completions/mean_terminated_length": 578.5680541992188, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.2005820461159615, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.005402341436416679, - "kl": 0.0595703125, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06631243326090758, + "kl": 0.06134033203125, "learning_rate": 3.2486420318601973e-06, - "loss": 0.0006, - "num_tokens": 144882757.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0131, + "num_tokens": 134478851.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 224 }, { @@ -6287,20 +6287,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.890625, + "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1912.0, - "completions/mean_length": 676.494140625, - "completions/mean_terminated_length": 657.4832153320312, - "completions/min_length": 157.0, - "completions/min_terminated_length": 157.0, + "completions/max_terminated_length": 1559.0, + "completions/mean_length": 597.66015625, + "completions/mean_terminated_length": 592.931396484375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.20147750167897918, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005625148130404837, - "kl": 0.05645751953125, + "grad_norm": 0.008008668550094192, + "kl": 0.05316162109375, "learning_rate": 3.2344196869802187e-06, - "loss": 0.0006, - "num_tokens": 145555714.0, + "loss": 0.0005, + "num_tokens": 135111445.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -6315,20 +6315,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1901.0, - "completions/mean_length": 654.791015625, - "completions/mean_terminated_length": 643.8208618164062, - "completions/min_length": 214.0, - "completions/min_terminated_length": 214.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1460.0, + "completions/max_terminated_length": 1460.0, + "completions/mean_length": 582.05859375, + "completions/mean_terminated_length": 580.8982543945312, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, "epoch": 0.20237295724199686, "frac_reward_zero_std": 1.0, - "grad_norm": 0.006171692676610506, - "kl": 0.06085205078125, + "grad_norm": 0.006453994968670067, + "kl": 0.05267333984375, "learning_rate": 3.2201770168203694e-06, - "loss": 0.0006, - "num_tokens": 146156487.0, + "loss": 0.0005, + "num_tokens": 135674979.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -6343,20 +6343,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1982.0, - "completions/mean_length": 709.08203125, - "completions/mean_terminated_length": 698.5393676757812, - "completions/min_length": 185.0, - "completions/min_terminated_length": 185.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1670.0, + "completions/max_terminated_length": 1670.0, + "completions/mean_length": 626.3046875, + "completions/mean_terminated_length": 626.3046875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, "epoch": 0.20326841280501456, "frac_reward_zero_std": 1.0, - "grad_norm": 0.007539541625438744, - "kl": 0.0589599609375, + "grad_norm": 0.0049162524656169515, + "kl": 0.04791259765625, "learning_rate": 3.205914618974563e-06, - "loss": 0.0006, - "num_tokens": 146838753.0, + "loss": 0.0005, + "num_tokens": 136314863.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -6371,26 +6371,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1851.0, - "completions/max_terminated_length": 1851.0, - "completions/mean_length": 648.322265625, - "completions/mean_terminated_length": 648.322265625, - "completions/min_length": 180.0, - "completions/min_terminated_length": 180.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1508.0, + "completions/mean_length": 588.765625, + "completions/mean_terminated_length": 583.0431518554688, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, "epoch": 0.20416386836803224, - "frac_reward_zero_std": 0.90625, - "grad_norm": 0.09389806559017558, - "kl": 0.05657958984375, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.08725654752259214, + "kl": 0.0467529296875, "learning_rate": 3.1916330918644496e-06, - "loss": -0.0014, - "num_tokens": 147471062.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": 0.0138, + "num_tokens": 136916679.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 228 }, { @@ -6399,26 +6399,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1809.0, - "completions/mean_length": 631.474609375, - "completions/mean_terminated_length": 628.7025146484375, - "completions/min_length": 163.0, - "completions/min_terminated_length": 163.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1188.0, + "completions/max_terminated_length": 1188.0, + "completions/mean_length": 552.72265625, + "completions/mean_terminated_length": 552.72265625, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, "epoch": 0.20505932393104992, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.045648453246142924, - "kl": 0.0579833984375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005918437311628653, + "kl": 0.04791259765625, "learning_rate": 3.177333034714303e-06, "loss": 0.0005, - "num_tokens": 148087913.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "num_tokens": 137493209.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 229 }, { @@ -6427,26 +6427,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.921875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2032.0, - "completions/mean_length": 649.294921875, - "completions/mean_terminated_length": 635.5009765625, - "completions/min_length": 217.0, - "completions/min_terminated_length": 217.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1678.0, + "completions/max_terminated_length": 1678.0, + "completions/mean_length": 581.326171875, + "completions/mean_terminated_length": 581.326171875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.2059547794940676, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05962054355895875, - "kl": 0.058349609375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00490406522911546, + "kl": 0.0498046875, "learning_rate": 3.1630150475258813e-06, - "loss": 0.0154, - "num_tokens": 148741456.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 138111952.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 230 }, { @@ -6455,26 +6455,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1859.0, - "completions/max_terminated_length": 1859.0, - "completions/mean_length": 608.193359375, - "completions/mean_terminated_length": 608.193359375, - "completions/min_length": 168.0, - "completions/min_terminated_length": 168.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1587.0, + "completions/mean_length": 570.556640625, + "completions/mean_terminated_length": 566.9745483398438, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.2068502350570853, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.049023036722908975, - "kl": 0.05743408203125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005622069756397748, + "kl": 0.052001953125, "learning_rate": 3.148679731053252e-06, - "loss": 0.0008, - "num_tokens": 149318099.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 138669325.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 231 }, { @@ -6483,26 +6483,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.953125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1938.0, - "completions/mean_length": 625.474609375, - "completions/mean_terminated_length": 617.0903930664062, - "completions/min_length": 166.0, - "completions/min_terminated_length": 166.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1477.0, + "completions/max_terminated_length": 1477.0, + "completions/mean_length": 561.345703125, + "completions/mean_terminated_length": 560.127197265625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, "epoch": 0.20774569062010298, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.07269872169199251, - "kl": 0.05938720703125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0061702858371463455, + "kl": 0.05303955078125, "learning_rate": 3.1343276867775805e-06, - "loss": -0.0021, - "num_tokens": 149926854.0, - "reward": 0.09941406548023224, - "reward_std": 0.0018486406188458204, + "loss": 0.0005, + "num_tokens": 139245246.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 232 }, { @@ -6511,26 +6511,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1980.0, - "completions/mean_length": 613.642578125, - "completions/mean_terminated_length": 608.929443359375, - "completions/min_length": 177.0, - "completions/min_terminated_length": 177.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1300.0, + "completions/max_terminated_length": 1300.0, + "completions/mean_length": 554.18359375, + "completions/mean_terminated_length": 553.1956787109375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, "epoch": 0.20864114618312066, - "frac_reward_zero_std": 0.90625, - "grad_norm": 0.6311607973790101, - "kl": 0.10137939453125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0076579380750235754, + "kl": 0.0537109375, "learning_rate": 3.1199595168819043e-06, - "loss": 0.0164, - "num_tokens": 150555471.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": 0.0005, + "num_tokens": 139843420.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 233 }, { @@ -6539,26 +6539,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.890625, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1926.0, - "completions/mean_length": 645.9453125, - "completions/mean_terminated_length": 626.5109252929688, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1842.0, + "completions/max_terminated_length": 1842.0, + "completions/mean_length": 567.6640625, + "completions/mean_terminated_length": 566.75537109375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.20953660174613833, - "frac_reward_zero_std": 0.875, - "grad_norm": 0.6851753963574233, - "kl": 0.09783935546875, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06004552744678529, + "kl": 0.0496826171875, "learning_rate": 3.105575824225852e-06, - "loss": 0.0114, - "num_tokens": 151199779.0, - "reward": 0.09921875596046448, - "reward_std": 0.0031250000465661287, + "loss": 0.005, + "num_tokens": 140447648.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.9921875, - "rewards/format_reward/std": 0.08812850713729858, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 234 }, { @@ -6567,26 +6567,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, + "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2006.0, - "completions/mean_length": 680.703125, - "completions/mean_terminated_length": 669.93701171875, - "completions/min_length": 209.0, - "completions/min_terminated_length": 209.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 606.52734375, + "completions/mean_terminated_length": 600.8745727539062, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.21043205730915604, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.008950782233121669, - "kl": 0.05804443359375, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.08033506958705695, + "kl": 0.04827880859375, "learning_rate": 3.091177212320363e-06, - "loss": 0.0006, - "num_tokens": 151869563.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0086, + "num_tokens": 141079454.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 235 }, { @@ -6595,26 +6595,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1773.0, - "completions/mean_length": 600.86328125, - "completions/mean_terminated_length": 598.0313110351562, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1724.0, + "completions/max_terminated_length": 1724.0, + "completions/mean_length": 544.689453125, + "completions/mean_terminated_length": 544.689453125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.21132751287217372, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06847164647361609, - "kl": 0.05633544921875, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.07990215927891466, + "kl": 0.046142578125, "learning_rate": 3.0767642853023538e-06, - "loss": 0.0001, - "num_tokens": 152453125.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 141634255.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 236 }, { @@ -6623,26 +6623,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.953125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1943.0, - "completions/mean_length": 621.181640625, - "completions/mean_terminated_length": 612.7720947265625, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1361.0, + "completions/max_terminated_length": 1361.0, + "completions/mean_length": 563.033203125, + "completions/mean_terminated_length": 561.4774780273438, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, "epoch": 0.2122229684351914, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.055953444725904807, - "kl": 0.05670166015625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006933071789708101, + "kl": 0.05389404296875, "learning_rate": 3.062337647909376e-06, - "loss": 0.0002, - "num_tokens": 153060290.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 142211648.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 237 }, { @@ -6651,26 +6651,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1898.0, - "completions/mean_length": 596.9453125, - "completions/mean_terminated_length": 594.1056518554688, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1348.0, + "completions/max_terminated_length": 1348.0, + "completions/mean_length": 552.03515625, + "completions/mean_terminated_length": 552.03515625, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, "epoch": 0.2131184239982091, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05271028890825897, - "kl": 0.0595703125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005791270028062369, + "kl": 0.04583740234375, "learning_rate": 3.04789790545424e-06, - "loss": 0.0018, - "num_tokens": 153654278.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 142782642.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 238 }, { @@ -6679,20 +6679,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1867.0, - "completions/mean_length": 626.3359375, - "completions/mean_terminated_length": 620.7608032226562, - "completions/min_length": 184.0, - "completions/min_terminated_length": 184.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1458.0, + "completions/max_terminated_length": 1458.0, + "completions/mean_length": 572.533203125, + "completions/mean_terminated_length": 572.533203125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.21401387956122678, "frac_reward_zero_std": 1.0, - "grad_norm": 0.00539813134920801, - "kl": 0.056640625, + "grad_norm": 0.008004719272026003, + "kl": 0.0469970703125, "learning_rate": 3.033445663799621e-06, - "loss": 0.0006, - "num_tokens": 154274802.0, + "loss": 0.0005, + "num_tokens": 143375619.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -6707,20 +6707,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1754.0, - "completions/mean_length": 632.125, - "completions/mean_terminated_length": 626.5725708007812, - "completions/min_length": 184.0, - "completions/min_terminated_length": 184.0, + "completions/max_terminated_length": 1575.0, + "completions/mean_length": 583.48046875, + "completions/mean_terminated_length": 580.614501953125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.21490933512424445, "frac_reward_zero_std": 1.0, - "grad_norm": 0.0055914288499768205, - "kl": 0.05413818359375, + "grad_norm": 0.004638129698711235, + "kl": 0.04461669921875, "learning_rate": 3.018981529332633e-06, - "loss": 0.0005, - "num_tokens": 154885650.0, + "loss": 0.0004, + "num_tokens": 143961561.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -6735,26 +6735,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1966.0, - "completions/mean_length": 654.580078125, - "completions/mean_terminated_length": 644.25390625, - "completions/min_length": 189.0, - "completions/min_terminated_length": 189.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1391.0, + "completions/max_terminated_length": 1391.0, + "completions/mean_length": 585.458984375, + "completions/mean_terminated_length": 585.458984375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.21580479068726213, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06858938507620503, - "kl": 0.08441162109375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018662130861021326, + "kl": 0.04779052734375, "learning_rate": 3.00450610893939e-06, - "loss": 0.0039, - "num_tokens": 155542603.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 144583124.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 241 }, { @@ -6763,26 +6763,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1936.0, - "completions/mean_length": 604.103515625, - "completions/mean_terminated_length": 598.4412231445312, - "completions/min_length": 180.0, - "completions/min_terminated_length": 180.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1732.0, + "completions/max_terminated_length": 1732.0, + "completions/mean_length": 571.75390625, + "completions/mean_terminated_length": 571.75390625, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, "epoch": 0.21670024625027984, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.08157612254136043, - "kl": 0.05743408203125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00517530359345756, + "kl": 0.04498291015625, "learning_rate": 2.9900200099795396e-06, - "loss": 0.0117, - "num_tokens": 156160816.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 145184774.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 242 }, { @@ -6791,26 +6791,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 2035.0, - "completions/mean_length": 573.150390625, - "completions/mean_terminated_length": 570.26416015625, - "completions/min_length": 109.0, - "completions/min_terminated_length": 109.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1316.0, + "completions/max_terminated_length": 1316.0, + "completions/mean_length": 532.478515625, + "completions/mean_terminated_length": 532.478515625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, "epoch": 0.21759570181329752, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06343258576945847, - "kl": 0.05889892578125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005194283804810911, + "kl": 0.0467529296875, "learning_rate": 2.9755238402607826e-06, - "loss": -0.0, - "num_tokens": 156762141.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 145765275.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 243 }, { @@ -6821,18 +6821,18 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1798.0, - "completions/mean_length": 644.193359375, - "completions/mean_terminated_length": 641.4461669921875, - "completions/min_length": 177.0, - "completions/min_terminated_length": 177.0, + "completions/max_terminated_length": 1867.0, + "completions/mean_length": 597.060546875, + "completions/mean_terminated_length": 594.2211303710938, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, "epoch": 0.2184911573763152, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06649190527686924, - "kl": 0.056884765625, + "grad_norm": 0.0654952222523936, + "kl": 0.0472412109375, "learning_rate": 2.961018208013367e-06, - "loss": 0.0051, - "num_tokens": 157430304.0, + "loss": 0.0096, + "num_tokens": 146409306.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -6847,20 +6847,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1929.0, - "completions/mean_length": 661.330078125, - "completions/mean_terminated_length": 655.8922119140625, - "completions/min_length": 173.0, - "completions/min_terminated_length": 173.0, + "completions/max_terminated_length": 1451.0, + "completions/mean_length": 598.9921875, + "completions/mean_terminated_length": 593.8074951171875, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.2193866129393329, "frac_reward_zero_std": 1.0, - "grad_norm": 0.0058189104851992055, - "kl": 0.056396484375, + "grad_norm": 0.006328652943732272, + "kl": 0.059326171875, "learning_rate": 2.9465037218645694e-06, "loss": 0.0006, - "num_tokens": 158107769.0, + "num_tokens": 147054854.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -6876,19 +6876,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1780.0, - "completions/max_terminated_length": 1780.0, - "completions/mean_length": 600.646484375, - "completions/mean_terminated_length": 600.646484375, - "completions/min_length": 102.0, - "completions/min_terminated_length": 102.0, + "completions/max_length": 1251.0, + "completions/max_terminated_length": 1251.0, + "completions/mean_length": 579.97265625, + "completions/mean_terminated_length": 579.97265625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, "epoch": 0.22028206850235058, "frac_reward_zero_std": 1.0, - "grad_norm": 0.006009210073300697, - "kl": 0.060791015625, + "grad_norm": 0.005146548055592316, + "kl": 0.047607421875, "learning_rate": 2.9319809908131604e-06, - "loss": 0.0006, - "num_tokens": 158705812.0, + "loss": 0.0005, + "num_tokens": 147642312.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -6903,20 +6903,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.953125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1928.0, - "completions/mean_length": 621.87890625, - "completions/mean_terminated_length": 613.4735107421875, - "completions/min_length": 194.0, - "completions/min_terminated_length": 194.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1445.0, + "completions/max_terminated_length": 1445.0, + "completions/mean_length": 584.04296875, + "completions/mean_terminated_length": 583.6438598632812, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, "epoch": 0.22117752406536825, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005694321164073396, - "kl": 0.05706787109375, + "grad_norm": 0.005701587911010035, + "kl": 0.05059814453125, "learning_rate": 2.917450624203847e-06, - "loss": 0.0006, - "num_tokens": 159333622.0, + "loss": 0.0005, + "num_tokens": 148250750.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -6931,20 +6931,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1845.0, - "completions/mean_length": 599.318359375, - "completions/mean_terminated_length": 593.6372680664062, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_terminated_length": 1826.0, + "completions/mean_length": 580.466796875, + "completions/mean_terminated_length": 574.662109375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, "epoch": 0.22207297962838593, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005349239849816427, - "kl": 0.05615234375, + "grad_norm": 0.005432057373877413, + "kl": 0.05230712890625, "learning_rate": 2.9029132317017118e-06, - "loss": 0.0006, - "num_tokens": 159950761.0, + "loss": 0.0005, + "num_tokens": 148858237.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -6959,26 +6959,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1974.0, - "completions/max_terminated_length": 1974.0, - "completions/mean_length": 542.564453125, - "completions/mean_terminated_length": 542.564453125, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1624.0, + "completions/mean_length": 544.392578125, + "completions/mean_terminated_length": 541.4500732421875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, "epoch": 0.22296843519140364, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06951417104945298, - "kl": 0.05462646484375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004372778590201045, + "kl": 0.04522705078125, "learning_rate": 2.888369423266629e-06, - "loss": 0.001, - "num_tokens": 160511178.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 149419590.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 249 }, { @@ -6988,19 +6988,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1705.0, - "completions/mean_length": 573.044921875, - "completions/mean_terminated_length": 570.1585083007812, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/max_length": 1348.0, + "completions/max_terminated_length": 1348.0, + "completions/mean_length": 559.509765625, + "completions/mean_terminated_length": 558.8160400390625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.2238638907544213, "frac_reward_zero_std": 1.0, - "grad_norm": 0.006103601784565662, - "kl": 0.05767822265625, + "grad_norm": 0.007646236721940514, + "kl": 0.05322265625, "learning_rate": 2.8738198091276712e-06, - "loss": 0.0006, - "num_tokens": 161116001.0, + "loss": 0.0005, + "num_tokens": 150017483.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -7015,20 +7015,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1891.0, - "completions/mean_length": 566.23828125, - "completions/mean_terminated_length": 563.3385620117188, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/max_terminated_length": 1453.0, + "completions/mean_length": 543.091796875, + "completions/mean_terminated_length": 538.3988647460938, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, "epoch": 0.224759346317439, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005820946741876905, - "kl": 0.05841064453125, + "grad_norm": 0.008486197284828401, + "kl": 0.05853271484375, "learning_rate": 2.859264999757509e-06, "loss": 0.0006, - "num_tokens": 161671451.0, + "num_tokens": 150561082.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -7043,26 +7043,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.953125, + "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1879.0, - "completions/mean_length": 609.8671875, - "completions/mean_terminated_length": 604.1729125976562, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/max_terminated_length": 1325.0, + "completions/mean_length": 582.26171875, + "completions/mean_terminated_length": 574.6023559570312, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, "epoch": 0.2256548018804567, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.9391411364603389, - "kl": 0.6685791015625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07379730684287858, + "kl": 0.0567626953125, "learning_rate": 2.8447056058467928e-06, - "loss": 0.0145, - "num_tokens": 162310423.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0042, + "num_tokens": 151185920.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 252 }, { @@ -7071,20 +7071,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1870.0, - "completions/max_terminated_length": 1870.0, - "completions/mean_length": 606.626953125, - "completions/mean_terminated_length": 606.626953125, - "completions/min_length": 162.0, - "completions/min_terminated_length": 162.0, + "completions/clipped_ratio": -6.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1381.0, + "completions/mean_length": 609.107421875, + "completions/mean_terminated_length": 598.6134033203125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, "epoch": 0.22655025744347437, "frac_reward_zero_std": 1.0, - "grad_norm": 0.006495409905893996, - "kl": 0.05908203125, + "grad_norm": 0.008824669605331369, + "kl": 0.0582275390625, "learning_rate": 2.830142238278531e-06, "loss": 0.0006, - "num_tokens": 162959432.0, + "num_tokens": 151836199.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -7099,20 +7099,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2048.0, - "completions/mean_length": 617.728515625, - "completions/mean_terminated_length": 612.11962890625, - "completions/min_length": 175.0, - "completions/min_terminated_length": 175.0, + "completions/max_terminated_length": 1344.0, + "completions/mean_length": 596.0546875, + "completions/mean_terminated_length": 593.2133178710938, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, "epoch": 0.22744571300649205, "frac_reward_zero_std": 1.0, - "grad_norm": 0.0073227247170333624, - "kl": 0.05572509765625, + "grad_norm": 0.00403469657058676, + "kl": 0.0469970703125, "learning_rate": 2.81557550810246e-06, - "loss": 0.0006, - "num_tokens": 163563917.0, + "loss": 0.0005, + "num_tokens": 152429587.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -7127,20 +7127,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 2001.0, - "completions/max_terminated_length": 2001.0, - "completions/mean_length": 586.6796875, - "completions/mean_terminated_length": 586.6796875, - "completions/min_length": 142.0, - "completions/min_terminated_length": 142.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1476.0, + "completions/mean_length": 585.66015625, + "completions/mean_terminated_length": 580.1296997070312, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.22834116856950973, "frac_reward_zero_std": 1.0, - "grad_norm": 0.00564245091070103, - "kl": 0.05596923828125, + "grad_norm": 0.010358310622908547, + "kl": 0.05926513671875, "learning_rate": 2.8010060265094026e-06, "loss": 0.0006, - "num_tokens": 164157049.0, + "num_tokens": 153022197.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -7157,24 +7157,24 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2027.0, - "completions/mean_length": 594.05078125, - "completions/mean_terminated_length": 589.6470947265625, - "completions/min_length": 157.0, - "completions/min_terminated_length": 157.0, + "completions/max_terminated_length": 1802.0, + "completions/mean_length": 569.18359375, + "completions/mean_terminated_length": 563.3843383789062, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, "epoch": 0.22923662413252743, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05172954991761498, - "kl": 0.056396484375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0050837400422739315, + "kl": 0.0462646484375, "learning_rate": 2.786434404805629e-06, - "loss": 0.0022, - "num_tokens": 164766163.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 153618579.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 256 }, { @@ -7183,20 +7183,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 1935.0, - "completions/max_terminated_length": 1935.0, - "completions/mean_length": 606.1328125, - "completions/mean_terminated_length": 604.4226684570312, - "completions/min_length": 189.0, - "completions/min_terminated_length": 189.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1237.0, + "completions/mean_length": 612.283203125, + "completions/mean_terminated_length": 608.6686401367188, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, "epoch": 0.2301320796955451, "frac_reward_zero_std": 1.0, - "grad_norm": 0.03439221585128269, - "kl": 0.06646728515625, + "grad_norm": 0.004218948614674557, + "kl": 0.04693603515625, "learning_rate": 2.771861254387199e-06, - "loss": 0.0007, - "num_tokens": 165420887.0, + "loss": 0.0005, + "num_tokens": 154276452.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -7211,20 +7211,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1828.0, - "completions/mean_length": 619.390625, - "completions/mean_terminated_length": 613.7882690429688, - "completions/min_length": 230.0, - "completions/min_terminated_length": 230.0, + "completions/max_terminated_length": 1304.0, + "completions/mean_length": 623.123046875, + "completions/mean_terminated_length": 620.3346557617188, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, "epoch": 0.2310275352585628, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005817974948231771, - "kl": 0.05487060546875, + "grad_norm": 0.0038371490056063186, + "kl": 0.044921875, "learning_rate": 2.7572871867143204e-06, - "loss": 0.0005, - "num_tokens": 166037263.0, + "loss": 0.0004, + "num_tokens": 154894739.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -7239,20 +7239,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 2017.0, - "completions/max_terminated_length": 2017.0, - "completions/mean_length": 578.689453125, - "completions/mean_terminated_length": 578.689453125, - "completions/min_length": 207.0, - "completions/min_terminated_length": 207.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1241.0, + "completions/mean_length": 578.615234375, + "completions/mean_terminated_length": 571.785888671875, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.23192299082158047, "frac_reward_zero_std": 1.0, - "grad_norm": 0.006405651142742982, - "kl": 0.05810546875, + "grad_norm": 0.008145436520239484, + "kl": 0.06011962890625, "learning_rate": 2.742712813285681e-06, "loss": 0.0006, - "num_tokens": 166660208.0, + "num_tokens": 155517646.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -7269,18 +7269,18 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1865.0, - "completions/mean_length": 621.69921875, - "completions/mean_terminated_length": 613.292724609375, - "completions/min_length": 193.0, - "completions/min_terminated_length": 193.0, + "completions/max_terminated_length": 1960.0, + "completions/mean_length": 626.619140625, + "completions/mean_terminated_length": 618.24169921875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, "epoch": 0.23281844638459817, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05751604346871723, - "kl": 0.05535888671875, + "grad_norm": 0.0689990067886915, + "kl": 0.055908203125, "learning_rate": 2.7281387456128017e-06, - "loss": 0.0092, - "num_tokens": 167327846.0, + "loss": 0.0071, + "num_tokens": 156187803.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -7295,20 +7295,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1699.0, - "completions/mean_length": 627.65234375, - "completions/mean_terminated_length": 624.872802734375, - "completions/min_length": 153.0, - "completions/min_terminated_length": 153.0, + "completions/max_terminated_length": 1619.0, + "completions/mean_length": 612.373046875, + "completions/mean_terminated_length": 603.91162109375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.23371390194761585, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005698567996800333, - "kl": 0.05517578125, + "grad_norm": 1.1313821813786047, + "kl": 0.098388671875, "learning_rate": 2.7135655951943716e-06, - "loss": 0.0006, - "num_tokens": 167989476.0, + "loss": 0.001, + "num_tokens": 156841610.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -7323,26 +7323,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1988.0, - "completions/max_terminated_length": 1988.0, - "completions/mean_length": 594.044921875, - "completions/mean_terminated_length": 594.044921875, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/clipped_ratio": -6.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1338.0, + "completions/mean_length": 560.53515625, + "completions/mean_terminated_length": 547.2662963867188, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, "epoch": 0.23460935751063353, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.005400704242655025, - "kl": 0.0574951171875, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.09702522439017101, + "kl": 0.05462646484375, "learning_rate": 2.698993973490598e-06, - "loss": 0.0006, - "num_tokens": 168635003.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.015, + "num_tokens": 157469980.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 262 }, { @@ -7351,26 +7351,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.90625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1762.0, - "completions/mean_length": 555.1796875, - "completions/mean_terminated_length": 549.3255004882812, - "completions/min_length": 174.0, - "completions/min_terminated_length": 174.0, + "completions/max_terminated_length": 1909.0, + "completions/mean_length": 535.12109375, + "completions/mean_terminated_length": 522.0059204101562, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, "epoch": 0.23550481307365123, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.08504139270976999, - "kl": 0.05609130859375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010862493459670133, + "kl": 0.06060791015625, "learning_rate": 2.6844244918975416e-06, - "loss": 0.0133, - "num_tokens": 169199639.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0006, + "num_tokens": 158024346.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 263 }, { @@ -7379,26 +7379,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1924.0, - "completions/mean_length": 585.38671875, - "completions/mean_terminated_length": 582.5244750976562, - "completions/min_length": 204.0, - "completions/min_terminated_length": 204.0, + "completions/max_terminated_length": 1640.0, + "completions/mean_length": 558.44140625, + "completions/mean_terminated_length": 543.75146484375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, "epoch": 0.2364002686366689, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.03248029309967152, - "kl": 0.0560302734375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01007907413544423, + "kl": 0.0545654296875, "learning_rate": 2.66985776172147e-06, - "loss": 0.0153, - "num_tokens": 169786493.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 158597404.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 264 }, { @@ -7407,20 +7407,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1704.0, - "completions/mean_length": 587.125, - "completions/mean_terminated_length": 584.26611328125, - "completions/min_length": 190.0, - "completions/min_terminated_length": 190.0, + "completions/max_terminated_length": 1474.0, + "completions/mean_length": 573.693359375, + "completions/mean_terminated_length": 548.4691772460938, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.2372957241996866, "frac_reward_zero_std": 1.0, - "grad_norm": 0.0054535619814076244, - "kl": 0.0555419921875, + "grad_norm": 0.009575151333470916, + "kl": 0.05621337890625, "learning_rate": 2.6552943941532088e-06, "loss": 0.0006, - "num_tokens": 170403005.0, + "num_tokens": 159207039.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -7435,20 +7435,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 2000.0, - "completions/max_terminated_length": 2000.0, - "completions/mean_length": 578.591796875, - "completions/mean_terminated_length": 578.591796875, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/clipped_ratio": -6.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 573.662109375, + "completions/mean_terminated_length": 526.102783203125, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, "epoch": 0.23819117976270426, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005800541107133192, - "kl": 0.0567626953125, + "grad_norm": 0.013645668908412388, + "kl": 0.06549072265625, "learning_rate": 2.6407350002424927e-06, - "loss": 0.0006, - "num_tokens": 171022652.0, + "loss": 0.0007, + "num_tokens": 159824162.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -7463,20 +7463,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.734375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1917.0, - "completions/mean_length": 598.119140625, - "completions/mean_terminated_length": 595.2817993164062, - "completions/min_length": 103.0, - "completions/min_terminated_length": 103.0, + "completions/max_terminated_length": 1562.0, + "completions/mean_length": 574.169921875, + "completions/mean_terminated_length": 523.5535888671875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.23908663532572197, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005198138170323681, - "kl": 0.05340576171875, + "grad_norm": 0.014855709612759045, + "kl": 0.0626220703125, "learning_rate": 2.626180190872329e-06, - "loss": 0.0005, - "num_tokens": 171613625.0, + "loss": 0.0006, + "num_tokens": 160402873.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -7491,26 +7491,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1365.0, - "completions/max_terminated_length": 1365.0, - "completions/mean_length": 574.216796875, - "completions/mean_terminated_length": 574.216796875, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/clipped_ratio": -6.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1247.0, + "completions/mean_length": 545.73828125, + "completions/mean_terminated_length": 512.7545166015625, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.23998209088873965, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.00526669304671622, - "kl": 0.05548095703125, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.052896967313257874, + "kl": 0.06146240234375, "learning_rate": 2.611630576733372e-06, - "loss": 0.0006, - "num_tokens": 172219752.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": -0.0025, + "num_tokens": 160994419.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 268 }, { @@ -7519,20 +7519,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.8125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1925.0, - "completions/mean_length": 590.83984375, - "completions/mean_terminated_length": 587.98828125, - "completions/min_length": 207.0, - "completions/min_terminated_length": 207.0, + "completions/max_terminated_length": 1549.0, + "completions/mean_length": 539.318359375, + "completions/mean_terminated_length": 505.3060302734375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.24087754645175732, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005186941679267824, - "kl": 0.0560302734375, + "grad_norm": 0.011827177223144848, + "kl": 0.0672607421875, "learning_rate": 2.5970867682982885e-06, - "loss": 0.0006, - "num_tokens": 172821350.0, + "loss": 0.0007, + "num_tokens": 161569638.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -7547,20 +7547,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.78125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 2007.0, - "completions/mean_length": 596.86328125, - "completions/mean_terminated_length": 591.172607421875, + "completions/max_terminated_length": 1363.0, + "completions/mean_length": 550.271484375, + "completions/mean_terminated_length": 508.1666564941406, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.24177300201477503, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005093847327967122, - "kl": 0.05621337890625, + "grad_norm": 0.009620558992138992, + "kl": 0.06439208984375, "learning_rate": 2.582549375796154e-06, "loss": 0.0006, - "num_tokens": 173464784.0, + "num_tokens": 162189217.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -7575,26 +7575,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.8125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1905.0, - "completions/mean_length": 593.822265625, - "completions/mean_terminated_length": 588.11962890625, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/max_terminated_length": 1735.0, + "completions/mean_length": 564.26953125, + "completions/mean_terminated_length": 528.6600341796875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, "epoch": 0.2426684575777927, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.060575861215051076, - "kl": 0.05645751953125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011339660484190167, + "kl": 0.06573486328125, "learning_rate": 2.568019009186841e-06, - "loss": -0.0023, - "num_tokens": 174030725.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0007, + "num_tokens": 162740027.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 271 }, { @@ -7603,26 +7603,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1777.0, - "completions/max_terminated_length": 1777.0, - "completions/mean_length": 542.765625, - "completions/mean_terminated_length": 542.765625, - "completions/min_length": 214.0, - "completions/min_terminated_length": 214.0, + "completions/clipped_ratio": -6.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1700.0, + "completions/mean_length": 513.61328125, + "completions/mean_terminated_length": 483.0478210449219, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.24356391314081038, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.04746305553350059, - "kl": 0.0526123046875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009385702347993711, + "kl": 0.0584716796875, "learning_rate": 2.5534962781354317e-06, - "loss": -0.0031, - "num_tokens": 174611821.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0006, + "num_tokens": 163306197.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 272 }, { @@ -7631,26 +7631,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.859375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1549.0, - "completions/mean_length": 583.73046875, - "completions/mean_terminated_length": 577.98828125, - "completions/min_length": 153.0, - "completions/min_terminated_length": 153.0, + "completions/max_terminated_length": 1686.0, + "completions/mean_length": 552.63671875, + "completions/mean_terminated_length": 525.8806762695312, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, "epoch": 0.24445936870382806, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.0045173693793721685, - "kl": 0.0533447265625, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.08002624358034882, + "kl": 0.0601806640625, "learning_rate": 2.538981791986634e-06, - "loss": 0.0005, - "num_tokens": 175206915.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0008, + "num_tokens": 163885371.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 273 }, { @@ -7659,20 +7659,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 2019.0, - "completions/max_terminated_length": 2019.0, - "completions/mean_length": 567.197265625, - "completions/mean_terminated_length": 567.197265625, - "completions/min_length": 106.0, - "completions/min_terminated_length": 106.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1331.0, + "completions/mean_length": 552.349609375, + "completions/mean_terminated_length": 516.4540405273438, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.24535482426684577, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06113247025112339, - "kl": 0.05450439453125, + "grad_norm": 0.06166515634300674, + "kl": 0.073974609375, "learning_rate": 2.524476159739218e-06, - "loss": 0.0005, - "num_tokens": 175822152.0, + "loss": 0.0006, + "num_tokens": 164493006.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -7687,20 +7687,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1987.0, - "completions/max_terminated_length": 1987.0, - "completions/mean_length": 585.8984375, - "completions/mean_terminated_length": 585.8984375, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/clipped_ratio": -6.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1314.0, + "completions/mean_length": 547.078125, + "completions/mean_terminated_length": 526.2732543945312, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, "epoch": 0.24625027982986344, "frac_reward_zero_std": 1.0, - "grad_norm": 0.00573175071522722, - "kl": 0.0528564453125, + "grad_norm": 0.01257928535443048, + "kl": 0.0628662109375, "learning_rate": 2.5099799900204607e-06, - "loss": 0.0005, - "num_tokens": 176426356.0, + "loss": 0.0006, + "num_tokens": 165077334.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -7715,20 +7715,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 1939.0, - "completions/max_terminated_length": 1939.0, - "completions/mean_length": 591.904296875, - "completions/mean_terminated_length": 589.8297119140625, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1098.0, + "completions/mean_length": 537.591796875, + "completions/mean_terminated_length": 531.6686401367188, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.24714573539288112, "frac_reward_zero_std": 1.0, - "grad_norm": 0.006265698499194678, - "kl": 0.0513916015625, + "grad_norm": 0.008330623143216287, + "kl": 0.0562744140625, "learning_rate": 2.4954938910606108e-06, - "loss": 0.0005, - "num_tokens": 177004131.0, + "loss": 0.0006, + "num_tokens": 165627301.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -7743,26 +7743,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1694.0, - "completions/mean_length": 543.6328125, - "completions/mean_terminated_length": 540.6888427734375, - "completions/min_length": 162.0, - "completions/min_terminated_length": 162.0, + "completions/max_terminated_length": 1601.0, + "completions/mean_length": 513.478515625, + "completions/mean_terminated_length": 507.4608154296875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, "epoch": 0.24804119095589883, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06559073582245414, - "kl": 0.05224609375, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.08070272275220955, + "kl": 0.0557861328125, "learning_rate": 2.481018470667368e-06, - "loss": 0.0005, - "num_tokens": 177565431.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0006, + "num_tokens": 166173162.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 277 }, { @@ -7773,24 +7773,24 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1680.0, - "completions/mean_length": 568.951171875, - "completions/mean_terminated_length": 563.87255859375, - "completions/min_length": 185.0, - "completions/min_terminated_length": 185.0, + "completions/max_terminated_length": 1408.0, + "completions/mean_length": 528.388671875, + "completions/mean_terminated_length": 522.429443359375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, "epoch": 0.2489366465189165, - "frac_reward_zero_std": 0.90625, - "grad_norm": 13.405018764621035, - "kl": 0.37408447265625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006969320341412188, + "kl": 0.05340576171875, "learning_rate": 2.4665543362003802e-06, - "loss": 0.0184, - "num_tokens": 178140382.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": 0.0005, + "num_tokens": 166727345.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 278 }, { @@ -7800,25 +7800,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1781.0, - "completions/mean_length": 582.24609375, - "completions/mean_terminated_length": 579.377685546875, - "completions/min_length": 181.0, - "completions/min_terminated_length": 181.0, + "completions/max_length": 1269.0, + "completions/max_terminated_length": 1269.0, + "completions/mean_length": 540.447265625, + "completions/mean_terminated_length": 539.8199462890625, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, "epoch": 0.24983210208193418, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.07793443113322279, - "kl": 0.05450439453125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007011045285617038, + "kl": 0.05621337890625, "learning_rate": 2.4521020945457615e-06, - "loss": 0.0025, - "num_tokens": 178751148.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0006, + "num_tokens": 167316710.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 279 }, { @@ -7827,26 +7827,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1761.0, - "completions/max_terminated_length": 1761.0, - "completions/mean_length": 555.86328125, - "completions/mean_terminated_length": 555.86328125, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1439.0, + "completions/mean_length": 534.302734375, + "completions/mean_terminated_length": 531.3405151367188, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, "epoch": 0.25072755764495186, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.005138666007838519, - "kl": 0.05181884765625, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.07799875756079883, + "kl": 0.0548095703125, "learning_rate": 2.4376623520906255e-06, - "loss": 0.0005, - "num_tokens": 179352646.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": -0.0067, + "num_tokens": 167907169.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, - "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 280 }, { @@ -7856,25 +7856,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1485.0, - "completions/max_terminated_length": 1485.0, - "completions/mean_length": 591.35546875, - "completions/mean_terminated_length": 591.35546875, - "completions/min_length": 207.0, - "completions/min_terminated_length": 207.0, + "completions/max_length": 1438.0, + "completions/max_terminated_length": 1438.0, + "completions/mean_length": 576.05078125, + "completions/mean_terminated_length": 576.05078125, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, "epoch": 0.25162301320796954, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.005950683891807377, - "kl": 0.05291748046875, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.04411135965167851, + "kl": 0.05206298828125, "learning_rate": 2.4232357146976478e-06, - "loss": 0.0005, - "num_tokens": 179970588.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": -0.0027, + "num_tokens": 168517275.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 281 }, { @@ -7883,20 +7883,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 1767.0, - "completions/max_terminated_length": 1767.0, - "completions/mean_length": 615.041015625, - "completions/mean_terminated_length": 613.628173828125, - "completions/min_length": 235.0, - "completions/min_terminated_length": 235.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1842.0, + "completions/max_terminated_length": 1842.0, + "completions/mean_length": 582.052734375, + "completions/mean_terminated_length": 582.052734375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.2525184687709872, "frac_reward_zero_std": 1.0, - "grad_norm": 24.033499681250188, - "kl": 2.96002197265625, + "grad_norm": 0.006361764340800594, + "kl": 0.0521240234375, "learning_rate": 2.408822787679637e-06, - "loss": 0.0295, - "num_tokens": 180573601.0, + "loss": 0.0005, + "num_tokens": 169103398.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -7911,20 +7911,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1644.0, - "completions/max_terminated_length": 1644.0, - "completions/mean_length": 565.318359375, - "completions/mean_terminated_length": 565.318359375, - "completions/min_length": 178.0, - "completions/min_terminated_length": 178.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1212.0, + "completions/mean_length": 554.568359375, + "completions/mean_terminated_length": 548.7117919921875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, "epoch": 0.25341392433400495, "frac_reward_zero_std": 1.0, - "grad_norm": 0.0051856058148775286, - "kl": 0.0518798828125, + "grad_norm": 0.09731232801458174, + "kl": 0.0823974609375, "learning_rate": 2.3944241757741475e-06, - "loss": 0.0005, - "num_tokens": 181160180.0, + "loss": 0.0008, + "num_tokens": 169684473.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -7939,26 +7939,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1993.0, - "completions/mean_length": 617.09375, - "completions/mean_terminated_length": 614.2935180664062, - "completions/min_length": 182.0, - "completions/min_terminated_length": 182.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1220.0, + "completions/max_terminated_length": 1220.0, + "completions/mean_length": 586.662109375, + "completions/mean_terminated_length": 586.662109375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.2543093798970226, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.0068416300076160775, - "kl": 0.05120849609375, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06020667119192139, + "kl": 0.05108642578125, "learning_rate": 2.380040483118097e-06, - "loss": 0.0005, - "num_tokens": 181755748.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": -0.0002, + "num_tokens": 170264460.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 284 }, { @@ -7967,20 +7967,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1894.0, - "completions/max_terminated_length": 1894.0, - "completions/mean_length": 583.033203125, - "completions/mean_terminated_length": 583.033203125, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1498.0, + "completions/mean_length": 568.12890625, + "completions/mean_terminated_length": 565.2328491210938, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, "epoch": 0.2552048354600403, "frac_reward_zero_std": 0.9375, - "grad_norm": 0.07301203946773449, - "kl": 0.051513671875, + "grad_norm": 0.09146996211829635, + "kl": 0.05084228515625, "learning_rate": 2.365672313222419e-06, - "loss": -0.0029, - "num_tokens": 182358469.0, + "loss": 0.0006, + "num_tokens": 170859550.0, "reward": 0.099609375, "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, @@ -7997,24 +7997,24 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1728.0, - "completions/mean_length": 628.9453125, - "completions/mean_terminated_length": 623.3804321289062, - "completions/min_length": 212.0, - "completions/min_terminated_length": 212.0, + "completions/max_terminated_length": 1467.0, + "completions/mean_length": 622.955078125, + "completions/mean_terminated_length": 617.36669921875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.256100291023058, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.0054564530522672825, - "kl": 0.05206298828125, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.048868905799068094, + "kl": 0.0537109375, "learning_rate": 2.351320268946749e-06, - "loss": 0.0005, - "num_tokens": 183015449.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": -0.0049, + "num_tokens": 171513463.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 286 }, { @@ -8023,26 +8023,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1781.0, - "completions/mean_length": 538.4453125, - "completions/mean_terminated_length": 535.4912109375, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/max_terminated_length": 1393.0, + "completions/mean_length": 545.109375, + "completions/mean_terminated_length": 538.8231811523438, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.25699574658607566, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.005346092389195165, - "kl": 0.0521240234375, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0656092790599532, + "kl": 0.0565185546875, "learning_rate": 2.336984952474119e-06, - "loss": 0.0005, - "num_tokens": 183558509.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0027, + "num_tokens": 172059935.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 287 }, { @@ -8051,20 +8051,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1950.0, - "completions/mean_length": 547.73046875, - "completions/mean_terminated_length": 544.7944946289062, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/max_terminated_length": 1208.0, + "completions/mean_length": 561.330078125, + "completions/mean_terminated_length": 555.5000610351562, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, "epoch": 0.25789120214909333, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005394893401423969, - "kl": 0.0499267578125, + "grad_norm": 0.007045410253853639, + "kl": 0.053955078125, "learning_rate": 2.322666965285697e-06, "loss": 0.0005, - "num_tokens": 184104819.0, + "num_tokens": 172613208.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -8081,18 +8081,18 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1585.0, - "completions/mean_length": 588.498046875, - "completions/mean_terminated_length": 582.7745361328125, - "completions/min_length": 199.0, - "completions/min_terminated_length": 199.0, + "completions/max_terminated_length": 1763.0, + "completions/mean_length": 595.86328125, + "completions/mean_terminated_length": 590.1686401367188, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.258786657712111, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004969546618879694, - "kl": 0.05023193359375, + "grad_norm": 0.005338478822253961, + "kl": 0.0498046875, "learning_rate": 2.3083669081355507e-06, "loss": 0.0005, - "num_tokens": 184713954.0, + "num_tokens": 173226114.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -8107,20 +8107,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1976.0, - "completions/max_terminated_length": 1976.0, - "completions/mean_length": 552.771484375, - "completions/mean_terminated_length": 552.771484375, - "completions/min_length": 102.0, - "completions/min_terminated_length": 102.0, + "completions/clipped_ratio": -6.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1233.0, + "completions/mean_length": 578.037109375, + "completions/mean_terminated_length": 562.0119018554688, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.25968211327512875, "frac_reward_zero_std": 1.0, - "grad_norm": 0.008575669078842711, - "kl": 0.0533447265625, + "grad_norm": 0.014065273768047462, + "kl": 0.06695556640625, "learning_rate": 2.2940853810254377e-06, - "loss": 0.0005, - "num_tokens": 185294381.0, + "loss": 0.0007, + "num_tokens": 173819477.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -8135,20 +8135,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1260.0, - "completions/max_terminated_length": 1260.0, - "completions/mean_length": 579.490234375, - "completions/mean_terminated_length": 579.490234375, - "completions/min_length": 226.0, - "completions/min_terminated_length": 226.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1943.0, + "completions/mean_length": 612.31640625, + "completions/mean_terminated_length": 603.8546142578125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, "epoch": 0.2605775688381464, "frac_reward_zero_std": 1.0, - "grad_norm": 0.0052073739337190885, - "kl": 0.05035400390625, + "grad_norm": 0.005526260451771069, + "kl": 0.049072265625, "learning_rate": 2.2798229831796313e-06, "loss": 0.0005, - "num_tokens": 185851064.0, + "num_tokens": 174392967.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -8163,26 +8163,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.90625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1751.0, - "completions/mean_length": 630.0, - "completions/mean_terminated_length": 627.2250366210938, - "completions/min_length": 162.0, - "completions/min_terminated_length": 162.0, + "completions/max_terminated_length": 1816.0, + "completions/mean_length": 638.041015625, + "completions/mean_terminated_length": 621.7806396484375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, "epoch": 0.2614730244011641, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.053543315977731425, - "kl": 0.0494384765625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006603994997144677, + "kl": 0.05377197265625, "learning_rate": 2.2655803130197816e-06, - "loss": -0.0003, - "num_tokens": 186467848.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 175013868.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 292 }, { @@ -8191,26 +8191,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.84375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1816.0, - "completions/mean_length": 568.5859375, - "completions/mean_terminated_length": 565.6907958984375, - "completions/min_length": 189.0, - "completions/min_terminated_length": 189.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 608.197265625, + "completions/mean_terminated_length": 579.5159301757812, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.2623684799641818, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.004634463449738635, - "kl": 0.05108642578125, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07767449708337147, + "kl": 0.06494140625, "learning_rate": 2.2513579681398034e-06, - "loss": 0.0005, - "num_tokens": 187048996.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0111, + "num_tokens": 175615297.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 293 }, { @@ -8219,20 +8219,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.796875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1966.0, - "completions/mean_length": 554.080078125, - "completions/mean_terminated_length": 548.2216186523438, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/max_terminated_length": 1844.0, + "completions/mean_length": 590.447265625, + "completions/mean_terminated_length": 552.4749755859375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, "epoch": 0.26326393552719946, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004680081807285644, - "kl": 0.0513916015625, + "grad_norm": 0.007266034489754845, + "kl": 0.05670166015625, "learning_rate": 2.237156545280803e-06, - "loss": 0.0005, - "num_tokens": 187595901.0, + "loss": 0.0006, + "num_tokens": 176180822.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -8247,20 +8247,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1857.0, - "completions/max_terminated_length": 1857.0, - "completions/mean_length": 592.625, - "completions/mean_terminated_length": 592.625, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1821.0, + "completions/mean_length": 599.375, + "completions/mean_terminated_length": 593.6941528320312, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.26415939109021713, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004679039787165404, - "kl": 0.052001953125, + "grad_norm": 0.005244228890614103, + "kl": 0.05059814453125, "learning_rate": 2.2229766403060403e-06, "loss": 0.0005, - "num_tokens": 188187341.0, + "num_tokens": 176775718.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -8275,26 +8275,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.90625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1792.0, - "completions/mean_length": 614.19140625, - "completions/mean_terminated_length": 610.5235595703125, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/max_terminated_length": 1598.0, + "completions/mean_length": 619.388671875, + "completions/mean_terminated_length": 603.662109375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, "epoch": 0.2650548466532348, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.052447372839830086, - "kl": 0.05328369140625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0067364873550132195, + "kl": 0.05413818359375, "learning_rate": 2.2088188481759305e-06, - "loss": 0.0126, - "num_tokens": 188791503.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 177382541.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 296 }, { @@ -8303,54 +8303,54 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 1818.0, - "completions/max_terminated_length": 1818.0, - "completions/mean_length": 627.5078125, - "completions/mean_terminated_length": 626.383544921875, - "completions/min_length": 225.0, - "completions/min_terminated_length": 225.0, + "completions/clipped_ratio": -6.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1749.0, + "completions/mean_length": 633.080078125, + "completions/mean_terminated_length": 604.8944702148438, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, "epoch": 0.26595030221625254, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.04957490670885095, - "kl": 0.05078125, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.08346497425582332, + "kl": 0.05059814453125, "learning_rate": 2.194683762923073e-06, - "loss": -0.0005, - "num_tokens": 189428755.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0129, + "num_tokens": 178022646.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 297 }, { "clip_ratio/high_max": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1568.0, - "completions/max_terminated_length": 1568.0, - "completions/mean_length": 558.3046875, - "completions/mean_terminated_length": 558.3046875, - "completions/min_length": 157.0, - "completions/min_terminated_length": 157.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1680.0, + "completions/mean_length": 578.904296875, + "completions/mean_terminated_length": 567.3366088867188, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, "epoch": 0.2668457577792702, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.07521000733044311, - "kl": 0.05029296875, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0606077670201604, + "kl": 0.0494384765625, "learning_rate": 2.1805719776273387e-06, - "loss": 0.0034, - "num_tokens": 189996799.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0148, + "num_tokens": 178601237.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 298 }, { @@ -8359,26 +8359,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.90625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1702.0, - "completions/mean_length": 595.80078125, - "completions/mean_terminated_length": 590.1058959960938, - "completions/min_length": 159.0, - "completions/min_terminated_length": 159.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 631.0078125, + "completions/mean_terminated_length": 614.20556640625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, "epoch": 0.2677412133422879, - "frac_reward_zero_std": 0.9375, - "grad_norm": 52.29210967783974, - "kl": 2.88189697265625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005602309576249166, + "kl": 0.049072265625, "learning_rate": 2.166484084390974e-06, - "loss": 0.0381, - "num_tokens": 190597097.0, - "reward": 0.09941406548023224, - "reward_std": 0.0018486406188458204, + "loss": 0.0005, + "num_tokens": 179219561.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 299 }, { @@ -8387,26 +8387,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1919.0, - "completions/max_terminated_length": 1919.0, - "completions/mean_length": 575.779296875, - "completions/mean_terminated_length": 575.779296875, - "completions/min_length": 179.0, - "completions/min_terminated_length": 179.0, + "completions/clipped_ratio": -6.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 597.46875, + "completions/mean_terminated_length": 577.3782348632812, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, "epoch": 0.2686366689053056, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05666066912224791, - "kl": 0.0506591796875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007717796395967091, + "kl": 0.05401611328125, "learning_rate": 2.1524206743137636e-06, - "loss": 0.0024, - "num_tokens": 191182056.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 179815625.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 300 }, { @@ -8415,26 +8415,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.90625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1711.0, - "completions/mean_length": 602.306640625, - "completions/mean_terminated_length": 599.4774780273438, - "completions/min_length": 183.0, - "completions/min_terminated_length": 183.0, + "completions/max_terminated_length": 1439.0, + "completions/mean_length": 618.8203125, + "completions/mean_terminated_length": 601.87353515625, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, "epoch": 0.26953212446832325, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.04902480949298609, - "kl": 0.0518798828125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00549947178848661, + "kl": 0.04949951171875, "learning_rate": 2.1383823374682287e-06, - "loss": -0.0009, - "num_tokens": 191805093.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 180447117.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 301 }, { @@ -8443,26 +8443,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1448.0, - "completions/mean_length": 600.71484375, - "completions/mean_terminated_length": 595.0392456054688, - "completions/min_length": 198.0, - "completions/min_terminated_length": 198.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 599.6796875, + "completions/mean_terminated_length": 581.9069213867188, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, "epoch": 0.27042758003134093, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.004860988243453146, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07205405556316538, "kl": 0.05169677734375, "learning_rate": 2.124369662874868e-06, - "loss": 0.0005, - "num_tokens": 192417747.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0029, + "num_tokens": 181059241.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 302 }, { @@ -8471,26 +8471,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1668.0, - "completions/mean_length": 565.7265625, - "completions/mean_terminated_length": 562.8258056640625, - "completions/min_length": 111.0, - "completions/min_terminated_length": 111.0, + "completions/max_terminated_length": 1763.0, + "completions/mean_length": 581.98828125, + "completions/mean_terminated_length": 570.4448852539062, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, "epoch": 0.2713230355943586, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.054741263922224406, - "kl": 0.05029296875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005610020156050893, + "kl": 0.04766845703125, "learning_rate": 2.110383238477441e-06, - "loss": -0.0018, - "num_tokens": 192973031.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 181622851.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 303 }, { @@ -8499,26 +8499,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 1812.0, - "completions/max_terminated_length": 1812.0, - "completions/mean_length": 627.341796875, - "completions/mean_terminated_length": 625.0234985351562, - "completions/min_length": 166.0, - "completions/min_terminated_length": 166.0, + "completions/clipped_ratio": -6.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1266.0, + "completions/mean_length": 643.609375, + "completions/mean_terminated_length": 620.0357666015625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, "epoch": 0.27221849115737634, - "frac_reward_zero_std": 0.96875, - "grad_norm": 1.1739951576785839, - "kl": 0.18341064453125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0068354809798462845, + "kl": 0.0518798828125, "learning_rate": 2.096423651118305e-06, - "loss": 0.0074, - "num_tokens": 193610566.0, - "reward": 0.099609375, - "reward_std": 0.0010673906654119492, + "loss": 0.0005, + "num_tokens": 182268715.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 304 }, { @@ -8527,20 +8527,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1740.0, - "completions/mean_length": 594.041015625, - "completions/mean_terminated_length": 591.1956787109375, - "completions/min_length": 205.0, - "completions/min_terminated_length": 205.0, + "completions/max_terminated_length": 1517.0, + "completions/mean_length": 582.166015625, + "completions/mean_terminated_length": 575.3182983398438, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, "epoch": 0.273113946720394, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005936683963889181, - "kl": 0.0511474609375, + "grad_norm": 0.007411219035362688, + "kl": 0.05291748046875, "learning_rate": 2.082491486513788e-06, "loss": 0.0005, - "num_tokens": 194198091.0, + "num_tokens": 182850160.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -8555,20 +8555,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1528.0, - "completions/mean_length": 582.513671875, - "completions/mean_terminated_length": 579.6458129882812, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/max_terminated_length": 1792.0, + "completions/mean_length": 579.1015625, + "completions/mean_terminated_length": 563.873291015625, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, "epoch": 0.2740094022834117, "frac_reward_zero_std": 1.0, - "grad_norm": 0.006122848959639277, - "kl": 0.05328369140625, + "grad_norm": 0.009215146072812768, + "kl": 0.05816650390625, "learning_rate": 2.0685873292296116e-06, - "loss": 0.0005, - "num_tokens": 194772946.0, + "loss": 0.0006, + "num_tokens": 183423268.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -8583,26 +8583,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1602.0, - "completions/max_terminated_length": 1602.0, - "completions/mean_length": 591.427734375, - "completions/mean_terminated_length": 591.427734375, - "completions/min_length": 166.0, - "completions/min_terminated_length": 166.0, + "completions/clipped_ratio": -6.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1401.0, + "completions/mean_length": 587.12890625, + "completions/mean_terminated_length": 569.2713012695312, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.2749048578464294, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05947760191798018, - "kl": 0.05084228515625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016584960558114703, + "kl": 0.0501708984375, "learning_rate": 2.054711762656369e-06, - "loss": 0.0041, - "num_tokens": 195386189.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 184034310.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 307 }, { @@ -8611,26 +8611,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.921875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1716.0, - "completions/mean_length": 598.970703125, - "completions/mean_terminated_length": 596.135009765625, - "completions/min_length": 217.0, - "completions/min_terminated_length": 217.0, + "completions/max_terminated_length": 1832.0, + "completions/mean_length": 593.46484375, + "completions/mean_terminated_length": 581.0433959960938, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, "epoch": 0.27580031340944705, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.10573090050455233, - "kl": 0.0560302734375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011959138514659325, + "kl": 0.0601806640625, "learning_rate": 2.040865368985044e-06, - "loss": 0.0109, - "num_tokens": 195976750.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0006, + "num_tokens": 184622052.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 308 }, { @@ -8641,18 +8641,18 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1736.0, - "completions/mean_length": 646.505859375, - "completions/mean_terminated_length": 635.470458984375, - "completions/min_length": 162.0, - "completions/min_terminated_length": 162.0, + "completions/max_terminated_length": 1662.0, + "completions/mean_length": 651.04296875, + "completions/mean_terminated_length": 640.0433349609375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.27669576897246473, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005228529652921396, - "kl": 0.05120849609375, + "grad_norm": 0.004220974278526902, + "kl": 0.04766845703125, "learning_rate": 2.027048729182583e-06, "loss": 0.0005, - "num_tokens": 196618753.0, + "num_tokens": 185266378.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -8667,26 +8667,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1819.0, - "completions/max_terminated_length": 1819.0, - "completions/mean_length": 606.69921875, - "completions/mean_terminated_length": 606.69921875, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1602.0, + "completions/mean_length": 609.353515625, + "completions/mean_terminated_length": 600.874267578125, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, "epoch": 0.2775912245354824, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.0051559766107908914, - "kl": 0.05517578125, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.05299649935908705, + "kl": 0.05035400390625, "learning_rate": 2.0132624229675205e-06, - "loss": 0.0006, - "num_tokens": 197266071.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": -0.0005, + "num_tokens": 185915055.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 310 }, { @@ -8695,26 +8695,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.890625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1437.0, - "completions/mean_length": 596.806640625, - "completions/mean_terminated_length": 593.9667358398438, - "completions/min_length": 201.0, - "completions/min_terminated_length": 201.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 600.80859375, + "completions/mean_terminated_length": 587.051513671875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, "epoch": 0.27848668009850014, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.0479412945784682, - "kl": 0.05206298828125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008478364104235272, + "kl": 0.0601806640625, "learning_rate": 1.9995070287856546e-06, - "loss": 0.0012, - "num_tokens": 197858996.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0006, + "num_tokens": 186510029.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 311 }, { @@ -8723,20 +8723,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.765625, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1617.0, - "completions/mean_length": 578.19140625, - "completions/mean_terminated_length": 575.3150634765625, - "completions/min_length": 186.0, - "completions/min_terminated_length": 186.0, + "completions/max_terminated_length": 1390.0, + "completions/mean_length": 614.634765625, + "completions/mean_terminated_length": 571.3742065429688, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, "epoch": 0.2793821356615178, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05124512126982719, - "kl": 0.05169677734375, + "grad_norm": 0.14229366676561211, + "kl": 0.052978515625, "learning_rate": 1.985783123785774e-06, - "loss": 0.0183, - "num_tokens": 198438822.0, + "loss": 0.0146, + "num_tokens": 187108514.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -8751,20 +8751,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1749.0, - "completions/mean_length": 568.921875, - "completions/mean_terminated_length": 566.0274047851562, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/max_terminated_length": 1280.0, + "completions/mean_length": 570.029296875, + "completions/mean_terminated_length": 565.2745361328125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, "epoch": 0.2802775912245355, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05819859258148276, - "kl": 0.0533447265625, + "grad_norm": 0.07031878190856904, + "kl": 0.048095703125, "learning_rate": 1.9720912837954486e-06, - "loss": -0.0, - "num_tokens": 199030926.0, + "loss": 0.0002, + "num_tokens": 187701185.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -8779,26 +8779,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1491.0, - "completions/max_terminated_length": 1491.0, - "completions/mean_length": 554.88671875, - "completions/mean_terminated_length": 554.88671875, - "completions/min_length": 157.0, - "completions/min_terminated_length": 157.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1490.0, + "completions/mean_length": 554.68359375, + "completions/mean_terminated_length": 551.76123046875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, "epoch": 0.28117304678755317, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.09616545890196271, - "kl": 0.0570068359375, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.05260379600232649, + "kl": 0.04962158203125, "learning_rate": 1.958432083296862e-06, - "loss": 0.0083, - "num_tokens": 199609012.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0008, + "num_tokens": 188279167.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 314 }, { @@ -8808,25 +8808,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1908.0, - "completions/mean_length": 562.427734375, - "completions/mean_terminated_length": 559.5205688476562, - "completions/min_length": 138.0, - "completions/min_terminated_length": 138.0, + "completions/max_length": 1435.0, + "completions/max_terminated_length": 1435.0, + "completions/mean_length": 538.119140625, + "completions/mean_terminated_length": 537.2328491210938, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, "epoch": 0.28206850235057085, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.006128274293792745, - "kl": 0.052490234375, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.03687044756246853, + "kl": 0.05096435546875, "learning_rate": 1.9448060954027093e-06, - "loss": 0.0005, - "num_tokens": 200170303.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0008, + "num_tokens": 188828012.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 315 }, { @@ -8836,25 +8836,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1877.0, - "completions/max_terminated_length": 1877.0, - "completions/mean_length": 551.5390625, - "completions/mean_terminated_length": 551.5390625, - "completions/min_length": 103.0, - "completions/min_terminated_length": 103.0, + "completions/max_length": 1454.0, + "completions/max_terminated_length": 1454.0, + "completions/mean_length": 561.173828125, + "completions/mean_terminated_length": 561.173828125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, "epoch": 0.2829639579135885, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06205881843423876, - "kl": 0.0543212890625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004801073747051007, + "kl": 0.04974365234375, "learning_rate": 1.931213891832153e-06, - "loss": -0.0005, - "num_tokens": 200752611.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 189415253.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 316 }, { @@ -8863,26 +8863,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1282.0, - "completions/mean_length": 569.83984375, - "completions/mean_terminated_length": 564.0431518554688, - "completions/min_length": 131.0, - "completions/min_terminated_length": 131.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1515.0, + "completions/max_terminated_length": 1515.0, + "completions/mean_length": 548.810546875, + "completions/mean_terminated_length": 548.810546875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.2838594134766062, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.07968967539050426, - "kl": 0.0521240234375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004862757981370567, + "kl": 0.046875, "learning_rate": 1.9176560428868336e-06, - "loss": 0.0085, - "num_tokens": 201334673.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0005, + "num_tokens": 189986548.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 317 }, { @@ -8892,19 +8892,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1782.0, - "completions/max_terminated_length": 1782.0, - "completions/mean_length": 618.275390625, - "completions/mean_terminated_length": 618.275390625, - "completions/min_length": 222.0, - "completions/min_terminated_length": 222.0, + "completions/max_length": 1594.0, + "completions/max_terminated_length": 1594.0, + "completions/mean_length": 591.748046875, + "completions/mean_terminated_length": 591.748046875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, "epoch": 0.2847548690396239, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005425161560628047, - "kl": 0.054443359375, + "grad_norm": 0.004813244125939766, + "kl": 0.049072265625, "learning_rate": 1.9041331174269373e-06, "loss": 0.0005, - "num_tokens": 201992350.0, + "num_tokens": 190630643.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -8920,25 +8920,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1876.0, - "completions/mean_length": 564.7734375, - "completions/mean_terminated_length": 561.870849609375, - "completions/min_length": 144.0, - "completions/min_terminated_length": 144.0, + "completions/max_length": 1462.0, + "completions/max_terminated_length": 1462.0, + "completions/mean_length": 549.74609375, + "completions/mean_terminated_length": 547.9608764648438, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.2856503246026416, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.005142824320868143, - "kl": 0.0537109375, + "frac_reward_zero_std": 0.9375, + "grad_norm": 2.327604947622168, + "kl": 0.0538330078125, "learning_rate": 1.8906456828473341e-06, - "loss": 0.0005, - "num_tokens": 202598010.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.011, + "num_tokens": 191228609.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 319 }, { @@ -8947,26 +8947,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1676.0, - "completions/mean_length": 582.58984375, - "completions/mean_terminated_length": 576.8432006835938, - "completions/min_length": 140.0, - "completions/min_terminated_length": 140.0, + "completions/max_terminated_length": 1705.0, + "completions/mean_length": 550.708984375, + "completions/mean_terminated_length": 547.7788696289062, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, "epoch": 0.2865457801656593, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.009530691407977925, - "kl": 0.0511474609375, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.07457068940555031, + "kl": 0.049560546875, "learning_rate": 1.8771943050537656e-06, - "loss": 0.0005, - "num_tokens": 203186424.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0119, + "num_tokens": 191800700.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 320 }, { @@ -8976,19 +8976,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1453.0, - "completions/max_terminated_length": 1453.0, - "completions/mean_length": 580.2578125, - "completions/mean_terminated_length": 580.2578125, - "completions/min_length": 169.0, - "completions/min_terminated_length": 169.0, + "completions/max_length": 1793.0, + "completions/max_terminated_length": 1793.0, + "completions/mean_length": 542.2109375, + "completions/mean_terminated_length": 542.2109375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, "epoch": 0.28744123572867697, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005597161597226462, - "kl": 0.05303955078125, + "grad_norm": 0.0051614526596112445, + "kl": 0.04888916015625, "learning_rate": 1.8637795484391046e-06, "loss": 0.0005, - "num_tokens": 203807692.0, + "num_tokens": 192402488.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -9003,20 +9003,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1518.0, - "completions/mean_length": 560.55859375, - "completions/mean_terminated_length": 556.5628051757812, - "completions/min_length": 190.0, - "completions/min_terminated_length": 190.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1628.0, + "completions/max_terminated_length": 1628.0, + "completions/mean_length": 546.103515625, + "completions/mean_terminated_length": 544.9647827148438, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, "epoch": 0.28833669129169465, "frac_reward_zero_std": 1.0, - "grad_norm": 10.190425237517521, - "kl": 2.46160888671875, + "grad_norm": 1.1883992636807388, + "kl": 0.2384033203125, "learning_rate": 1.8504019758596698e-06, - "loss": 0.0244, - "num_tokens": 204418266.0, + "loss": 0.0024, + "num_tokens": 193005661.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -9032,19 +9032,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1940.0, - "completions/max_terminated_length": 1940.0, - "completions/mean_length": 554.6328125, - "completions/mean_terminated_length": 554.6328125, - "completions/min_length": 104.0, - "completions/min_terminated_length": 104.0, + "completions/max_length": 1251.0, + "completions/max_terminated_length": 1251.0, + "completions/mean_length": 537.029296875, + "completions/mean_terminated_length": 537.029296875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, "epoch": 0.2892321468547123, "frac_reward_zero_std": 1.0, - "grad_norm": 0.007024406654251954, - "kl": 0.0526123046875, + "grad_norm": 0.006500542025853673, + "kl": 0.0484619140625, "learning_rate": 1.8370621486116163e-06, "loss": 0.0005, - "num_tokens": 205002910.0, + "num_tokens": 193581292.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -9060,25 +9060,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1875.0, - "completions/max_terminated_length": 1875.0, - "completions/mean_length": 582.15625, - "completions/mean_terminated_length": 582.15625, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/max_length": 1362.0, + "completions/max_terminated_length": 1362.0, + "completions/mean_length": 576.76171875, + "completions/mean_terminated_length": 576.76171875, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, "epoch": 0.29012760241773, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.007452427305034759, - "kl": 0.0511474609375, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06971423792820254, + "kl": 0.04791259765625, "learning_rate": 1.823760626407377e-06, - "loss": 0.0005, - "num_tokens": 205639790.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0039, + "num_tokens": 194215410.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 324 }, { @@ -9088,19 +9088,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1902.0, - "completions/max_terminated_length": 1902.0, - "completions/mean_length": 606.71484375, - "completions/mean_terminated_length": 606.71484375, - "completions/min_length": 177.0, - "completions/min_terminated_length": 177.0, + "completions/max_length": 1765.0, + "completions/max_terminated_length": 1765.0, + "completions/mean_length": 592.740234375, + "completions/mean_terminated_length": 592.740234375, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.2910230579807477, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005388949503012748, - "kl": 0.0504150390625, + "grad_norm": 0.004864534599712507, + "kl": 0.04632568359375, "learning_rate": 1.8104979673521838e-06, "loss": 0.0005, - "num_tokens": 206277644.0, + "num_tokens": 194846109.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -9116,25 +9116,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1746.0, - "completions/max_terminated_length": 1746.0, - "completions/mean_length": 566.4375, - "completions/mean_terminated_length": 566.4375, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/max_length": 1486.0, + "completions/max_terminated_length": 1486.0, + "completions/mean_length": 552.826171875, + "completions/mean_terminated_length": 552.826171875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.2919185135437654, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06290666297265979, - "kl": 0.05303955078125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007089949043229102, + "kl": 0.04876708984375, "learning_rate": 1.7972747279206482e-06, - "loss": -0.0001, - "num_tokens": 206868460.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 195429956.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 326 }, { @@ -9143,26 +9143,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1924.0, - "completions/max_terminated_length": 1924.0, - "completions/mean_length": 587.607421875, - "completions/mean_terminated_length": 587.607421875, - "completions/min_length": 177.0, - "completions/min_terminated_length": 177.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1858.0, + "completions/mean_length": 556.896484375, + "completions/mean_terminated_length": 553.9784545898438, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, "epoch": 0.2928139691067831, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.005627099408001751, - "kl": 0.05084228515625, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.0776498558557657, + "kl": 0.0474853515625, "learning_rate": 1.7840914629334122e-06, - "loss": 0.0005, - "num_tokens": 207492451.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0082, + "num_tokens": 196038223.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 327 }, { @@ -9173,18 +9173,18 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1831.0, - "completions/mean_length": 575.02734375, - "completions/mean_terminated_length": 572.1448364257812, - "completions/min_length": 169.0, - "completions/min_terminated_length": 169.0, + "completions/max_terminated_length": 1416.0, + "completions/mean_length": 571.099609375, + "completions/mean_terminated_length": 568.2094116210938, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, "epoch": 0.29370942466980077, "frac_reward_zero_std": 1.0, - "grad_norm": 0.006024866616452142, - "kl": 0.05047607421875, + "grad_norm": 0.006748427087785765, + "kl": 0.04718017578125, "learning_rate": 1.7709487255338731e-06, "loss": 0.0005, - "num_tokens": 208115505.0, + "num_tokens": 196659266.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -9199,20 +9199,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1685.0, - "completions/mean_length": 610.88671875, - "completions/mean_terminated_length": 605.2510375976562, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1758.0, + "completions/max_terminated_length": 1758.0, + "completions/mean_length": 575.4453125, + "completions/mean_terminated_length": 575.4453125, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, "epoch": 0.29460488023281844, "frac_reward_zero_std": 1.0, - "grad_norm": 0.00783507654646286, - "kl": 0.0528564453125, + "grad_norm": 0.006442099438506587, + "kl": 0.05078125, "learning_rate": 1.7578470671649684e-06, "loss": 0.0005, - "num_tokens": 208772103.0, + "num_tokens": 197297718.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -9228,25 +9228,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1555.0, - "completions/max_terminated_length": 1555.0, - "completions/mean_length": 544.369140625, - "completions/mean_terminated_length": 544.369140625, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/max_length": 1384.0, + "completions/max_terminated_length": 1384.0, + "completions/mean_length": 544.8359375, + "completions/mean_terminated_length": 544.8359375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, "epoch": 0.2955003357958361, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.08261649735534114, - "kl": 0.0511474609375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007338475781825208, + "kl": 0.049072265625, "learning_rate": 1.744787037546045e-06, - "loss": 0.0142, - "num_tokens": 209375716.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 197901570.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 330 }, { @@ -9254,27 +9254,27 @@ "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, - "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1732.0, - "completions/mean_length": 595.642578125, - "completions/mean_terminated_length": 589.9470825195312, - "completions/min_length": 180.0, - "completions/min_terminated_length": 180.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1936.0, + "completions/max_terminated_length": 1936.0, + "completions/mean_length": 573.591796875, + "completions/mean_terminated_length": 572.878662109375, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, "epoch": 0.2963957913588538, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05439911125987834, - "kl": 0.049560546875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07468248860939418, + "kl": 0.06597900390625, "learning_rate": 1.731769184649788e-06, - "loss": 0.0093, - "num_tokens": 210026557.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0007, + "num_tokens": 198541121.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 331 }, { @@ -9283,26 +9283,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1854.0, - "completions/max_terminated_length": 1854.0, - "completions/mean_length": 576.931640625, - "completions/mean_terminated_length": 576.931640625, - "completions/min_length": 108.0, - "completions/min_terminated_length": 108.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1480.0, + "completions/max_terminated_length": 1480.0, + "completions/mean_length": 557.41796875, + "completions/mean_terminated_length": 556.6849365234375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, "epoch": 0.2972912469218715, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.03824221393460093, - "kl": 0.0489501953125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035730325862680115, + "kl": 0.05535888671875, "learning_rate": 1.7187940546792325e-06, - "loss": -0.0039, - "num_tokens": 210615114.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0006, + "num_tokens": 199119687.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 332 }, { @@ -9311,20 +9311,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1659.0, - "completions/mean_length": 583.22265625, - "completions/mean_terminated_length": 577.4784545898438, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1388.0, + "completions/max_terminated_length": 1388.0, + "completions/mean_length": 569.91015625, + "completions/mean_terminated_length": 569.91015625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, "epoch": 0.2981867024848892, "frac_reward_zero_std": 1.0, - "grad_norm": 0.00502000048821662, - "kl": 0.051513671875, + "grad_norm": 0.0064265463541584265, + "kl": 0.0482177734375, "learning_rate": 1.7058621920448465e-06, "loss": 0.0005, - "num_tokens": 211220828.0, + "num_tokens": 199718585.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -9339,20 +9339,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1862.0, - "completions/mean_length": 604.205078125, - "completions/mean_terminated_length": 601.379638671875, - "completions/min_length": 191.0, - "completions/min_terminated_length": 191.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1220.0, + "completions/max_terminated_length": 1220.0, + "completions/mean_length": 569.783203125, + "completions/mean_terminated_length": 569.783203125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, "epoch": 0.2990821580479069, "frac_reward_zero_std": 1.0, - "grad_norm": 0.009980348313441741, - "kl": 0.0511474609375, + "grad_norm": 0.00432845573715703, + "kl": 0.04718017578125, "learning_rate": 1.6929741393416855e-06, "loss": 0.0005, - "num_tokens": 211842757.0, + "num_tokens": 200322890.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -9367,26 +9367,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1599.0, - "completions/mean_length": 558.705078125, - "completions/mean_terminated_length": 555.7905883789062, - "completions/min_length": 132.0, - "completions/min_terminated_length": 132.0, + "completions/max_terminated_length": 1511.0, + "completions/mean_length": 548.86328125, + "completions/mean_terminated_length": 543.6817626953125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, "epoch": 0.29997761361092457, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.005398017340368072, - "kl": 0.05047607421875, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.10020521096041221, + "kl": 0.06817626953125, "learning_rate": 1.6801304373266286e-06, - "loss": 0.0005, - "num_tokens": 212409118.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0131, + "num_tokens": 200884212.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 335 }, { @@ -9395,20 +9395,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 1582.0, - "completions/max_terminated_length": 1582.0, - "completions/mean_length": 577.64453125, - "completions/mean_terminated_length": 575.9510498046875, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1516.0, + "completions/max_terminated_length": 1516.0, + "completions/mean_length": 551.220703125, + "completions/mean_terminated_length": 551.220703125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.30087306917394224, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.0400953710083368, - "kl": 0.0546875, + "grad_norm": 0.055243589570167055, + "kl": 0.04827880859375, "learning_rate": 1.667331624895689e-06, - "loss": -0.0065, - "num_tokens": 213017944.0, + "loss": 0.001, + "num_tokens": 201479509.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -9425,24 +9425,24 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1681.0, - "completions/mean_length": 585.810546875, - "completions/mean_terminated_length": 582.9490966796875, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/max_terminated_length": 1496.0, + "completions/mean_length": 572.970703125, + "completions/mean_terminated_length": 570.0841674804688, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.3017685247369599, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.005855989237846307, - "kl": 0.05377197265625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.05189289377144097, + "kl": 0.05181884765625, "learning_rate": 1.6545782390614037e-06, - "loss": 0.0005, - "num_tokens": 213613735.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0093, + "num_tokens": 202068726.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 337 }, { @@ -9451,20 +9451,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 1459.0, - "completions/max_terminated_length": 1459.0, - "completions/mean_length": 552.6015625, - "completions/mean_terminated_length": 551.0430297851562, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1363.0, + "completions/max_terminated_length": 1363.0, + "completions/mean_length": 532.65234375, + "completions/mean_terminated_length": 532.65234375, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, "epoch": 0.3026639802999776, "frac_reward_zero_std": 1.0, - "grad_norm": 0.021342755030715328, - "kl": 0.0576171875, + "grad_norm": 0.004464966063305006, + "kl": 0.04925537109375, "learning_rate": 1.6418708149302992e-06, - "loss": 0.0006, - "num_tokens": 214163835.0, + "loss": 0.0005, + "num_tokens": 202608612.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -9479,20 +9479,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1754.0, - "completions/mean_length": 568.935546875, - "completions/mean_terminated_length": 566.0410766601562, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1407.0, + "completions/max_terminated_length": 1407.0, + "completions/mean_length": 561.587890625, + "completions/mean_terminated_length": 561.587890625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, "epoch": 0.3035594358629953, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004817435169613062, - "kl": 0.05023193359375, + "grad_norm": 0.004036873268911823, + "kl": 0.04779052734375, "learning_rate": 1.6292098856804423e-06, "loss": 0.0005, - "num_tokens": 214752170.0, + "num_tokens": 203193185.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -9507,26 +9507,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.953125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1948.0, - "completions/mean_length": 632.53125, - "completions/mean_terminated_length": 624.1885986328125, - "completions/min_length": 163.0, - "completions/min_terminated_length": 163.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1599.0, + "completions/max_terminated_length": 1599.0, + "completions/mean_length": 632.673828125, + "completions/mean_terminated_length": 632.673828125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, "epoch": 0.304454891426013, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.042312900898714305, - "kl": 0.05169677734375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005087884865379063, + "kl": 0.04986572265625, "learning_rate": 1.6165959825390661e-06, - "loss": -0.0005, - "num_tokens": 215456714.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 203897802.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 340 }, { @@ -9535,26 +9535,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 611.076171875, - "completions/mean_terminated_length": 605.4412231445312, - "completions/min_length": 200.0, - "completions/min_terminated_length": 200.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1579.0, + "completions/max_terminated_length": 1579.0, + "completions/mean_length": 591.111328125, + "completions/mean_terminated_length": 590.142822265625, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.3053503469890307, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06514871663082826, - "kl": 0.0489501953125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005377231692265041, + "kl": 0.051513671875, "learning_rate": 1.604029634760284e-06, - "loss": 0.0068, - "num_tokens": 216074449.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 204505315.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 341 }, { @@ -9564,25 +9564,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1917.0, - "completions/mean_length": 607.349609375, - "completions/mean_terminated_length": 604.5303344726562, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/max_length": 1411.0, + "completions/max_terminated_length": 1411.0, + "completions/mean_length": 587.439453125, + "completions/mean_terminated_length": 586.878662109375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, "epoch": 0.30624580255204836, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.004697869051960354, - "kl": 0.04931640625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07341969235932985, + "kl": 0.04986572265625, "learning_rate": 1.59151136960288e-06, - "loss": 0.0005, - "num_tokens": 216714900.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0029, + "num_tokens": 205135572.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 342 }, { @@ -9591,26 +9591,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1952.0, - "completions/mean_length": 585.927734375, - "completions/mean_terminated_length": 583.0665283203125, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1459.0, + "completions/max_terminated_length": 1459.0, + "completions/mean_length": 574.189453125, + "completions/mean_terminated_length": 574.189453125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, "epoch": 0.30714125811506604, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.041363506025014346, - "kl": 0.049560546875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004649311531668215, + "kl": 0.048828125, "learning_rate": 1.5790417123081903e-06, - "loss": -0.0014, - "num_tokens": 217293791.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 205708453.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 343 }, { @@ -9619,26 +9619,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 1563.0, - "completions/max_terminated_length": 1563.0, - "completions/mean_length": 552.130859375, - "completions/mean_terminated_length": 550.9921875, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1230.0, + "completions/max_terminated_length": 1230.0, + "completions/mean_length": 555.431640625, + "completions/mean_terminated_length": 555.431640625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, "epoch": 0.3080367136780837, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.024577605367231227, - "kl": 0.05859375, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.09669618323590906, + "kl": 0.0479736328125, "learning_rate": 1.5666211860780583e-06, - "loss": 0.0006, - "num_tokens": 217848418.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0021, + "num_tokens": 206264770.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 344 }, { @@ -9647,20 +9647,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1901.0, - "completions/max_terminated_length": 1901.0, - "completions/mean_length": 602.701171875, - "completions/mean_terminated_length": 602.701171875, - "completions/min_length": 197.0, - "completions/min_terminated_length": 197.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1325.0, + "completions/max_terminated_length": 1325.0, + "completions/mean_length": 608.814453125, + "completions/mean_terminated_length": 607.5107421875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, "epoch": 0.3089321692411014, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004385742088872913, - "kl": 0.04888916015625, + "grad_norm": 0.006432639585104695, + "kl": 0.0496826171875, "learning_rate": 1.5542503120528918e-06, "loss": 0.0005, - "num_tokens": 218470825.0, + "num_tokens": 206890307.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -9675,20 +9675,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1693.0, - "completions/mean_length": 612.8984375, - "completions/mean_terminated_length": 607.2706298828125, - "completions/min_length": 200.0, - "completions/min_terminated_length": 200.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1633.0, + "completions/max_terminated_length": 1633.0, + "completions/mean_length": 597.21484375, + "completions/mean_terminated_length": 597.21484375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, "epoch": 0.3098276248041191, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004942302367708638, - "kl": 0.04833984375, + "grad_norm": 0.0054548326889689094, + "kl": 0.04541015625, "learning_rate": 1.5419296092897866e-06, "loss": 0.0005, - "num_tokens": 219057701.0, + "num_tokens": 207469153.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -9703,20 +9703,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1459.0, - "completions/mean_length": 554.208984375, - "completions/mean_terminated_length": 551.2857055664062, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1477.0, + "completions/max_terminated_length": 1477.0, + "completions/mean_length": 542.3125, + "completions/mean_terminated_length": 542.3125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, "epoch": 0.3107230803671368, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004833275143594728, - "kl": 0.0494384765625, + "grad_norm": 0.004968550676186407, + "kl": 0.04718017578125, "learning_rate": 1.529659594740755e-06, "loss": 0.0005, - "num_tokens": 219618176.0, + "num_tokens": 208023537.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -9731,26 +9731,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1726.0, - "completions/max_terminated_length": 1726.0, - "completions/mean_length": 615.357421875, - "completions/mean_terminated_length": 615.357421875, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1501.0, + "completions/mean_length": 609.939453125, + "completions/mean_terminated_length": 604.9157104492188, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, "epoch": 0.3116185359301545, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.0541376938239617, - "kl": 0.0521240234375, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.12041308563154017, + "kl": 0.05133056640625, "learning_rate": 1.5174407832310338e-06, - "loss": -0.0004, - "num_tokens": 220263719.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0176, + "num_tokens": 208666306.0, + "reward": 0.09941406548023224, + "reward_std": 0.0023437500931322575, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 0.994140625, + "rewards/format_reward/std": 0.07639661431312561, "step": 348 }, { @@ -9759,26 +9759,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1533.0, - "completions/max_terminated_length": 1533.0, - "completions/mean_length": 606.06640625, - "completions/mean_terminated_length": 606.06640625, - "completions/min_length": 215.0, - "completions/min_terminated_length": 215.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1665.0, + "completions/max_terminated_length": 1665.0, + "completions/mean_length": 606.841796875, + "completions/mean_terminated_length": 605.76708984375, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, "epoch": 0.31251399149317216, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.007838170301132085, - "kl": 0.04876708984375, + "frac_reward_zero_std": 0.9375, + "grad_norm": 1.0907557345293637, + "kl": 0.0474853515625, "learning_rate": 1.5052736874374815e-06, - "loss": 0.0005, - "num_tokens": 220890633.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0043, + "num_tokens": 209293617.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 349 }, { @@ -9787,26 +9787,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1796.0, - "completions/mean_length": 563.53125, - "completions/mean_terminated_length": 557.7098388671875, - "completions/min_length": 177.0, - "completions/min_terminated_length": 177.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1548.0, + "completions/max_terminated_length": 1548.0, + "completions/mean_length": 568.509765625, + "completions/mean_terminated_length": 567.4539794921875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, "epoch": 0.31340944705618984, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.0041665151315056406, - "kl": 0.04742431640625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06738432092092399, + "kl": 0.05059814453125, "learning_rate": 1.4931588178670695e-06, - "loss": 0.0005, - "num_tokens": 221464633.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0038, + "num_tokens": 209870166.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 350 }, { @@ -9816,25 +9816,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1522.0, - "completions/max_terminated_length": 1522.0, - "completions/mean_length": 596.26953125, - "completions/mean_terminated_length": 596.26953125, - "completions/min_length": 218.0, - "completions/min_terminated_length": 218.0, + "completions/max_length": 1246.0, + "completions/max_terminated_length": 1246.0, + "completions/mean_length": 587.666015625, + "completions/mean_terminated_length": 587.666015625, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, "epoch": 0.3143049026192075, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.004236470661639679, - "kl": 0.0465087890625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0667752478450315, + "kl": 0.04486083984375, "learning_rate": 1.4810966828354605e-06, - "loss": 0.0005, - "num_tokens": 222123811.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0002, + "num_tokens": 210524939.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 351 }, { @@ -9843,26 +9843,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1732.0, - "completions/max_terminated_length": 1732.0, - "completions/mean_length": 579.677734375, - "completions/mean_terminated_length": 579.677734375, - "completions/min_length": 107.0, - "completions/min_terminated_length": 107.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1489.0, + "completions/mean_length": 585.203125, + "completions/mean_terminated_length": 582.3405151367188, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, "epoch": 0.3152003581822252, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.004236180586681027, - "kl": 0.0506591796875, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0747582857471383, + "kl": 0.048828125, "learning_rate": 1.469087788445684e-06, - "loss": 0.0005, - "num_tokens": 222724574.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0083, + "num_tokens": 211128531.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 352 }, { @@ -9872,19 +9872,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1603.0, - "completions/max_terminated_length": 1603.0, - "completions/mean_length": 563.109375, - "completions/mean_terminated_length": 563.109375, - "completions/min_length": 191.0, - "completions/min_terminated_length": 191.0, + "completions/max_length": 1240.0, + "completions/max_terminated_length": 1240.0, + "completions/mean_length": 571.16015625, + "completions/mean_terminated_length": 571.16015625, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, "epoch": 0.31609581374524287, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004163722780438454, - "kl": 0.04901123046875, + "grad_norm": 0.0041276490396841056, + "kl": 0.047607421875, "learning_rate": 1.4571326385668965e-06, "loss": 0.0005, - "num_tokens": 223341510.0, + "num_tokens": 211749589.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -9899,26 +9899,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1976.0, - "completions/max_terminated_length": 1976.0, - "completions/mean_length": 594.8515625, - "completions/mean_terminated_length": 594.8515625, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1388.0, + "completions/max_terminated_length": 1388.0, + "completions/mean_length": 576.314453125, + "completions/mean_terminated_length": 575.5655517578125, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, "epoch": 0.3169912693082606, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05815166195072682, - "kl": 0.04833984375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8926123024299515, + "kl": 0.26904296875, "learning_rate": 1.4452317348132434e-06, - "loss": -0.0017, - "num_tokens": 223961466.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0027, + "num_tokens": 212360054.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 354 }, { @@ -9927,26 +9927,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1820.0, - "completions/mean_length": 580.19921875, - "completions/mean_terminated_length": 574.4431762695312, - "completions/min_length": 192.0, - "completions/min_terminated_length": 192.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1667.0, + "completions/max_terminated_length": 1667.0, + "completions/mean_length": 583.8984375, + "completions/mean_terminated_length": 582.5910034179688, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, "epoch": 0.3178867248712783, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.11420371586495477, - "kl": 0.04827880859375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.45238995582403174, + "kl": 0.14990234375, "learning_rate": 1.4333855765228104e-06, - "loss": 0.0157, - "num_tokens": 224564816.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0015, + "num_tokens": 212965298.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 355 }, { @@ -9957,24 +9957,24 @@ "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1698.0, - "completions/mean_length": 630.931640625, - "completions/mean_terminated_length": 628.1585083007812, - "completions/min_length": 221.0, - "completions/min_terminated_length": 221.0, + "completions/max_terminated_length": 1934.0, + "completions/mean_length": 625.888671875, + "completions/mean_terminated_length": 623.1056518554688, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.31878218043429596, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.004304656769130002, - "kl": 0.0465087890625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06558967056937225, + "kl": 0.04473876953125, "learning_rate": 1.421594660736675e-06, - "loss": 0.0005, - "num_tokens": 225199357.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0103, + "num_tokens": 213597257.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 356 }, { @@ -9984,19 +9984,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1453.0, - "completions/max_terminated_length": 1453.0, - "completions/mean_length": 544.44140625, - "completions/mean_terminated_length": 544.44140625, - "completions/min_length": 202.0, - "completions/min_terminated_length": 202.0, + "completions/max_length": 1395.0, + "completions/max_terminated_length": 1395.0, + "completions/mean_length": 560.33203125, + "completions/mean_terminated_length": 560.33203125, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, "epoch": 0.31967763599731364, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004792128064874122, - "kl": 0.05010986328125, + "grad_norm": 0.004939443011093821, + "kl": 0.04925537109375, "learning_rate": 1.4098594821780476e-06, "loss": 0.0005, - "num_tokens": 225752303.0, + "num_tokens": 214158339.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -10012,25 +10012,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1750.0, - "completions/max_terminated_length": 1750.0, - "completions/mean_length": 569.7265625, - "completions/mean_terminated_length": 569.7265625, - "completions/min_length": 206.0, - "completions/min_terminated_length": 206.0, + "completions/max_length": 1255.0, + "completions/max_terminated_length": 1255.0, + "completions/mean_length": 563.447265625, + "completions/mean_terminated_length": 563.447265625, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.3205730915603313, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.07939089082655287, - "kl": 0.04949951171875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004402935055334641, + "kl": 0.04644775390625, "learning_rate": 1.3981805332315174e-06, - "loss": 0.0001, - "num_tokens": 226358003.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0005, + "num_tokens": 214760824.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 358 }, { @@ -10040,19 +10040,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1859.0, - "completions/max_terminated_length": 1859.0, - "completions/mean_length": 586.755859375, - "completions/mean_terminated_length": 586.755859375, - "completions/min_length": 181.0, - "completions/min_terminated_length": 181.0, + "completions/max_length": 1443.0, + "completions/max_terminated_length": 1443.0, + "completions/mean_length": 575.783203125, + "completions/mean_terminated_length": 575.783203125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.321468547123349, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004747531699904538, - "kl": 0.04705810546875, + "grad_norm": 0.004501749654718466, + "kl": 0.0462646484375, "learning_rate": 1.3865583039223929e-06, "loss": 0.0005, - "num_tokens": 226954870.0, + "num_tokens": 215352073.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -10067,26 +10067,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1848.0, - "completions/mean_length": 561.67578125, - "completions/mean_terminated_length": 558.76708984375, - "completions/min_length": 172.0, - "completions/min_terminated_length": 172.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1129.0, + "completions/max_terminated_length": 1129.0, + "completions/mean_length": 541.359375, + "completions/mean_terminated_length": 541.359375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, "epoch": 0.32236400268636667, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05340801461414318, - "kl": 0.0465087890625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004656261138922308, + "kl": 0.04718017578125, "learning_rate": 1.374993281896137e-06, - "loss": 0.0166, - "num_tokens": 227541488.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 215928289.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 360 }, { @@ -10096,19 +10096,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1943.0, - "completions/max_terminated_length": 1943.0, - "completions/mean_length": 549.7890625, - "completions/mean_terminated_length": 549.7890625, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/max_length": 1584.0, + "completions/max_terminated_length": 1584.0, + "completions/mean_length": 542.564453125, + "completions/mean_terminated_length": 542.564453125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, "epoch": 0.3232594582493844, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004837436430436778, - "kl": 0.0484619140625, + "grad_norm": 0.004741245952791287, + "kl": 0.04730224609375, "learning_rate": 1.3634859523979134e-06, "loss": 0.0005, - "num_tokens": 228105924.0, + "num_tokens": 216489026.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -10124,19 +10124,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1845.0, - "completions/max_terminated_length": 1845.0, - "completions/mean_length": 572.677734375, - "completions/mean_terminated_length": 572.677734375, - "completions/min_length": 126.0, - "completions/min_terminated_length": 126.0, + "completions/max_length": 1602.0, + "completions/max_terminated_length": 1602.0, + "completions/mean_length": 566.744140625, + "completions/mean_terminated_length": 566.744140625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.3241549138124021, "frac_reward_zero_std": 1.0, - "grad_norm": 0.0056382167382188035, - "kl": 0.04815673828125, + "grad_norm": 0.004922462540278737, + "kl": 0.04833984375, "learning_rate": 1.3520367982522208e-06, "loss": 0.0005, - "num_tokens": 228684975.0, + "num_tokens": 217065039.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -10151,20 +10151,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1624.0, - "completions/mean_length": 548.544921875, - "completions/mean_terminated_length": 545.6105346679688, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1338.0, + "completions/max_terminated_length": 1338.0, + "completions/mean_length": 547.189453125, + "completions/mean_terminated_length": 547.189453125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, "epoch": 0.32505036937541976, "frac_reward_zero_std": 1.0, - "grad_norm": 0.0067283143933235, - "kl": 0.04949951171875, + "grad_norm": 0.004862974742261159, + "kl": 0.04705810546875, "learning_rate": 1.3406462998426358e-06, "loss": 0.0005, - "num_tokens": 229248662.0, + "num_tokens": 217628032.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -10180,25 +10180,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1774.0, - "completions/mean_length": 621.30078125, - "completions/mean_terminated_length": 615.7059326171875, - "completions/min_length": 235.0, - "completions/min_terminated_length": 235.0, + "completions/max_length": 1460.0, + "completions/max_terminated_length": 1460.0, + "completions/mean_length": 609.470703125, + "completions/mean_terminated_length": 607.2078857421875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.32594582493843743, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.08503371465958129, - "kl": 0.04840087890625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 2.0311803594047575, + "kl": 0.0655517578125, "learning_rate": 1.3293149350916595e-06, - "loss": 0.0103, - "num_tokens": 229883920.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0072, + "num_tokens": 218257233.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 364 }, { @@ -10208,19 +10208,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1562.0, - "completions/max_terminated_length": 1562.0, - "completions/mean_length": 595.625, - "completions/mean_terminated_length": 595.625, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/max_length": 1472.0, + "completions/max_terminated_length": 1472.0, + "completions/mean_length": 584.74609375, + "completions/mean_terminated_length": 584.74609375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, "epoch": 0.3268412805014551, "frac_reward_zero_std": 1.0, - "grad_norm": 0.0068623792592217995, - "kl": 0.04730224609375, + "grad_norm": 0.004975922076955785, + "kl": 0.04620361328125, "learning_rate": 1.3180431794406623e-06, "loss": 0.0005, - "num_tokens": 230488288.0, + "num_tokens": 218856031.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -10235,20 +10235,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1877.0, - "completions/max_terminated_length": 1877.0, - "completions/mean_length": 582.458984375, - "completions/mean_terminated_length": 582.458984375, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1364.0, + "completions/max_terminated_length": 1364.0, + "completions/mean_length": 577.599609375, + "completions/mean_terminated_length": 576.2211303710938, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, "epoch": 0.3277367360644728, "frac_reward_zero_std": 1.0, - "grad_norm": 0.018560148818948925, - "kl": 0.04766845703125, + "grad_norm": 0.048253010828955015, + "kl": 0.057373046875, "learning_rate": 1.3068315058299358e-06, - "loss": 0.0005, - "num_tokens": 231086187.0, + "loss": 0.0006, + "num_tokens": 219451442.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -10263,20 +10263,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1877.0, - "completions/mean_length": 613.73046875, - "completions/mean_terminated_length": 608.1058959960938, - "completions/min_length": 226.0, - "completions/min_terminated_length": 226.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1321.0, + "completions/max_terminated_length": 1321.0, + "completions/mean_length": 594.275390625, + "completions/mean_terminated_length": 592.8532104492188, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, "epoch": 0.32863219162749047, "frac_reward_zero_std": 1.0, - "grad_norm": 0.006036312025660227, - "kl": 0.0472412109375, + "grad_norm": 0.0489489060192923, + "kl": 0.05865478515625, "learning_rate": 1.2956803846788503e-06, - "loss": 0.0005, - "num_tokens": 231702705.0, + "loss": 0.0006, + "num_tokens": 220057999.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -10291,20 +10291,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1561.0, - "completions/mean_length": 575.67578125, - "completions/mean_terminated_length": 572.7944946289062, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1641.0, + "completions/max_terminated_length": 1641.0, + "completions/mean_length": 571.3046875, + "completions/mean_terminated_length": 571.3046875, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, "epoch": 0.32952764719050814, "frac_reward_zero_std": 1.0, - "grad_norm": 0.010626533398767141, - "kl": 0.04766845703125, + "grad_norm": 0.004236000922588133, + "kl": 0.0469970703125, "learning_rate": 1.284590283866116e-06, "loss": 0.0005, - "num_tokens": 232314187.0, + "num_tokens": 220667243.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -10320,19 +10320,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1877.0, - "completions/max_terminated_length": 1877.0, - "completions/mean_length": 638.849609375, - "completions/mean_terminated_length": 638.849609375, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/max_length": 1322.0, + "completions/max_terminated_length": 1322.0, + "completions/mean_length": 646.376953125, + "completions/mean_terminated_length": 646.376953125, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, "epoch": 0.3304231027535259, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005441410501154126, - "kl": 0.04730224609375, + "grad_norm": 0.004577538862500448, + "kl": 0.046875, "learning_rate": 1.2735616687101518e-06, "loss": 0.0005, - "num_tokens": 233017150.0, + "num_tokens": 221374060.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -10347,26 +10347,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.953125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1908.0, - "completions/mean_length": 603.595703125, - "completions/mean_terminated_length": 595.08251953125, - "completions/min_length": 161.0, - "completions/min_terminated_length": 161.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1172.0, + "completions/max_terminated_length": 1172.0, + "completions/mean_length": 565.533203125, + "completions/mean_terminated_length": 564.5929565429688, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, "epoch": 0.33131855831654355, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.04062997694360191, - "kl": 0.04681396484375, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.09863928059149049, + "kl": 0.0606689453125, "learning_rate": 1.2625950019495614e-06, - "loss": 0.0005, - "num_tokens": 233633391.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0009, + "num_tokens": 221970813.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 370 }, { @@ -10376,19 +10376,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1448.0, - "completions/max_terminated_length": 1448.0, - "completions/mean_length": 572.203125, - "completions/mean_terminated_length": 572.203125, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/max_length": 1147.0, + "completions/max_terminated_length": 1147.0, + "completions/mean_length": 564.126953125, + "completions/mean_terminated_length": 564.126953125, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.33221401387956123, "frac_reward_zero_std": 1.0, - "grad_norm": 0.00508915950543525, - "kl": 0.04736328125, + "grad_norm": 0.005033013139063627, + "kl": 0.047607421875, "learning_rate": 1.251690743723718e-06, "loss": 0.0005, - "num_tokens": 234234375.0, + "num_tokens": 222567662.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -10404,19 +10404,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1783.0, - "completions/max_terminated_length": 1783.0, - "completions/mean_length": 571.640625, - "completions/mean_terminated_length": 571.640625, - "completions/min_length": 180.0, - "completions/min_terminated_length": 180.0, + "completions/max_length": 1359.0, + "completions/max_terminated_length": 1359.0, + "completions/mean_length": 571.357421875, + "completions/mean_terminated_length": 571.357421875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.3331094694425789, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005649723189142395, - "kl": 0.04681396484375, + "grad_norm": 0.004486081414495797, + "kl": 0.0452880859375, "learning_rate": 1.2408493515534581e-06, "loss": 0.0005, - "num_tokens": 234813247.0, + "num_tokens": 223146389.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -10431,26 +10431,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1921.0, - "completions/max_terminated_length": 1921.0, - "completions/mean_length": 582.796875, - "completions/mean_terminated_length": 582.796875, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1588.0, + "completions/max_terminated_length": 1588.0, + "completions/mean_length": 593.37109375, + "completions/mean_terminated_length": 592.3013916015625, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, "epoch": 0.3340049250055966, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.07817677753397509, - "kl": 0.04779052734375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6676414497649207, + "kl": 0.23779296875, "learning_rate": 1.2300712803218834e-06, - "loss": 0.0039, - "num_tokens": 235400759.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0024, + "num_tokens": 223739315.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 373 }, { @@ -10459,20 +10459,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1730.0, - "completions/mean_length": 561.732421875, - "completions/mean_terminated_length": 558.8238525390625, - "completions/min_length": 117.0, - "completions/min_terminated_length": 117.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 1376.0, + "completions/max_terminated_length": 1376.0, + "completions/mean_length": 561.052734375, + "completions/mean_terminated_length": 558.0452270507812, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, "epoch": 0.33490038056861426, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005750730935584701, - "kl": 0.04840087890625, + "grad_norm": 6.352807716641198, + "kl": 1.9359130859375, "learning_rate": 1.2193569822552772e-06, - "loss": 0.0005, - "num_tokens": 236002254.0, + "loss": 0.0193, + "num_tokens": 224340462.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -10487,26 +10487,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1538.0, - "completions/max_terminated_length": 1538.0, - "completions/mean_length": 581.84765625, - "completions/mean_terminated_length": 581.84765625, - "completions/min_length": 169.0, - "completions/min_terminated_length": 169.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1526.0, + "completions/max_terminated_length": 1526.0, + "completions/mean_length": 559.923828125, + "completions/mean_terminated_length": 559.0567626953125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, "epoch": 0.33579583613163194, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.005148892348158003, - "kl": 0.0478515625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07663783762398667, + "kl": 0.0635986328125, "learning_rate": 1.2087069069041268e-06, - "loss": 0.0005, - "num_tokens": 236597552.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": -0.0018, + "num_tokens": 224924535.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 375 }, { @@ -10515,20 +10515,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1951.0, - "completions/mean_length": 613.890625, - "completions/mean_terminated_length": 608.7745361328125, - "completions/min_length": 213.0, - "completions/min_terminated_length": 213.0, + "completions/max_terminated_length": 1792.0, + "completions/mean_length": 594.92578125, + "completions/mean_terminated_length": 592.0822143554688, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.3366912916946497, "frac_reward_zero_std": 0.96875, - "grad_norm": 6.1250219766271465, - "kl": 0.0712890625, + "grad_norm": 0.059619881056307, + "kl": 0.04766845703125, "learning_rate": 1.1981215011242654e-06, - "loss": 0.0117, - "num_tokens": 237222104.0, + "loss": 0.0094, + "num_tokens": 225539377.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -10543,26 +10543,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1886.0, - "completions/max_terminated_length": 1886.0, - "completions/mean_length": 586.263671875, - "completions/mean_terminated_length": 586.263671875, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1601.0, + "completions/max_terminated_length": 1601.0, + "completions/mean_length": 569.505859375, + "completions/mean_terminated_length": 568.4813842773438, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, "epoch": 0.33758674725766735, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.00577756812887239, - "kl": 0.04656982421875, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.09423306788440591, + "kl": 0.0728759765625, "learning_rate": 1.1876012090581184e-06, - "loss": 0.0005, - "num_tokens": 237836223.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0034, + "num_tokens": 226144916.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 377 }, { @@ -10572,19 +10572,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, - "completions/max_length": 1996.0, - "completions/max_terminated_length": 1996.0, - "completions/mean_length": 609.84765625, - "completions/mean_terminated_length": 607.878662109375, - "completions/min_length": 184.0, - "completions/min_terminated_length": 184.0, + "completions/max_length": 1744.0, + "completions/max_terminated_length": 1744.0, + "completions/mean_length": 585.783203125, + "completions/mean_terminated_length": 584.75537109375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, "epoch": 0.33848220282068503, "frac_reward_zero_std": 1.0, - "grad_norm": 0.02365896656045564, - "kl": 0.0654296875, + "grad_norm": 0.015044538467711114, + "kl": 0.053955078125, "learning_rate": 1.177146472116071e-06, - "loss": 0.0007, - "num_tokens": 238463233.0, + "loss": 0.0005, + "num_tokens": 226759605.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -10600,25 +10600,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1808.0, - "completions/max_terminated_length": 1808.0, - "completions/mean_length": 585.064453125, - "completions/mean_terminated_length": 585.064453125, - "completions/min_length": 153.0, - "completions/min_terminated_length": 153.0, + "completions/max_length": 1211.0, + "completions/max_terminated_length": 1211.0, + "completions/mean_length": 563.88671875, + "completions/mean_terminated_length": 563.88671875, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, "epoch": 0.3393776583837027, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.07355594175930143, - "kl": 0.050048828125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00500054399657516, + "kl": 0.04815673828125, "learning_rate": 1.1667577289579462e-06, - "loss": 0.0049, - "num_tokens": 239076322.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0005, + "num_tokens": 227361851.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 379 }, { @@ -10627,20 +10627,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1374.0, - "completions/mean_length": 589.796875, - "completions/mean_terminated_length": 586.9432373046875, - "completions/min_length": 197.0, - "completions/min_terminated_length": 197.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1342.0, + "completions/max_terminated_length": 1342.0, + "completions/mean_length": 572.224609375, + "completions/mean_terminated_length": 572.224609375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.3402731139467204, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005923305178076053, - "kl": 0.04888916015625, + "grad_norm": 0.005051424307733384, + "kl": 0.0479736328125, "learning_rate": 1.1564354154746007e-06, "loss": 0.0005, - "num_tokens": 239686714.0, + "num_tokens": 227963246.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -10655,20 +10655,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1704.0, - "completions/max_terminated_length": 1704.0, - "completions/mean_length": 565.86328125, - "completions/mean_terminated_length": 565.86328125, - "completions/min_length": 159.0, - "completions/min_terminated_length": 159.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1781.0, + "completions/max_terminated_length": 1781.0, + "completions/mean_length": 543.689453125, + "completions/mean_terminated_length": 541.678466796875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.34116856950973806, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06055954764239639, - "kl": 0.04974365234375, + "grad_norm": 1.2170304837306363, + "kl": 0.05426025390625, "learning_rate": 1.146179964769635e-06, - "loss": 0.0006, - "num_tokens": 240269876.0, + "loss": 0.0025, + "num_tokens": 228535055.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -10683,26 +10683,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1780.0, - "completions/mean_length": 604.6328125, - "completions/mean_terminated_length": 601.8082275390625, - "completions/min_length": 158.0, - "completions/min_terminated_length": 158.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1515.0, + "completions/max_terminated_length": 1515.0, + "completions/mean_length": 574.69921875, + "completions/mean_terminated_length": 574.69921875, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, "epoch": 0.34206402507275574, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.057122471938020274, - "kl": 0.0457763671875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004686795250678084, + "kl": 0.04437255859375, "learning_rate": 1.1359918071412195e-06, - "loss": 0.0114, - "num_tokens": 240867560.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0004, + "num_tokens": 229117413.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 382 }, { @@ -10711,26 +10711,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1869.0, - "completions/mean_length": 574.115234375, - "completions/mean_terminated_length": 568.3353271484375, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/max_terminated_length": 1461.0, + "completions/mean_length": 581.048828125, + "completions/mean_terminated_length": 576.4617309570312, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, "epoch": 0.3429594806357735, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.0652873493200889, - "kl": 0.04638671875, + "frac_reward_zero_std": 0.9375, + "grad_norm": 1.1783978547689393, + "kl": 0.05865478515625, "learning_rate": 1.1258713700640456e-06, "loss": 0.014, - "num_tokens": 241445155.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "num_tokens": 229698558.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 383 }, { @@ -10739,20 +10739,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1976.0, - "completions/max_terminated_length": 1976.0, - "completions/mean_length": 588.0546875, - "completions/mean_terminated_length": 588.0546875, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1769.0, + "completions/max_terminated_length": 1769.0, + "completions/mean_length": 570.1640625, + "completions/mean_terminated_length": 569.1859130859375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, "epoch": 0.34385493619879115, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004754868273249351, - "kl": 0.047607421875, + "grad_norm": 0.01024322824980028, + "kl": 0.0523681640625, "learning_rate": 1.115819078171383e-06, "loss": 0.0005, - "num_tokens": 242033343.0, + "num_tokens": 230277586.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -10768,19 +10768,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1922.0, - "completions/max_terminated_length": 1922.0, - "completions/mean_length": 574.859375, - "completions/mean_terminated_length": 574.859375, - "completions/min_length": 137.0, - "completions/min_terminated_length": 137.0, + "completions/max_length": 1455.0, + "completions/max_terminated_length": 1455.0, + "completions/mean_length": 583.599609375, + "completions/mean_terminated_length": 583.599609375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, "epoch": 0.3447503917618088, "frac_reward_zero_std": 1.0, - "grad_norm": 0.0056589735897611725, - "kl": 0.05059814453125, + "grad_norm": 0.00435591430534266, + "kl": 0.04876708984375, "learning_rate": 1.1058353532372667e-06, "loss": 0.0005, - "num_tokens": 242619959.0, + "num_tokens": 230868677.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -10796,25 +10796,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1501.0, - "completions/mean_length": 573.388671875, - "completions/mean_terminated_length": 563.56103515625, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/max_length": 1716.0, + "completions/max_terminated_length": 1716.0, + "completions/mean_length": 563.646484375, + "completions/mean_terminated_length": 559.092529296875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, "epoch": 0.3456458473248265, - "frac_reward_zero_std": 0.90625, - "grad_norm": 0.11311586362691683, - "kl": 0.0516357421875, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0644270242005122, + "kl": 0.04962158203125, "learning_rate": 1.0959206141587998e-06, - "loss": 0.0303, - "num_tokens": 243228654.0, - "reward": 0.09941406548023224, - "reward_std": 0.0023437500931322575, + "loss": 0.0125, + "num_tokens": 231472384.0, + "reward": 0.09921875596046448, + "reward_std": 0.0013975425390526652, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08812850713729858, "step": 386 }, { @@ -10824,19 +10824,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 2034.0, - "completions/max_terminated_length": 2034.0, - "completions/mean_length": 582.330078125, - "completions/mean_terminated_length": 582.330078125, - "completions/min_length": 182.0, - "completions/min_terminated_length": 182.0, + "completions/max_length": 1587.0, + "completions/max_terminated_length": 1587.0, + "completions/mean_length": 573.7109375, + "completions/mean_terminated_length": 573.7109375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.3465413028878442, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005083694085424717, - "kl": 0.04852294921875, + "grad_norm": 0.0043769719826980726, + "kl": 0.0465087890625, "learning_rate": 1.0860752769385766e-06, "loss": 0.0005, - "num_tokens": 243815575.0, + "num_tokens": 232054892.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -10851,26 +10851,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1655.0, - "completions/max_terminated_length": 1655.0, - "completions/mean_length": 608.525390625, - "completions/mean_terminated_length": 608.525390625, - "completions/min_length": 180.0, - "completions/min_terminated_length": 180.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1421.0, + "completions/max_terminated_length": 1421.0, + "completions/mean_length": 625.736328125, + "completions/mean_terminated_length": 625.0215454101562, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.34743675845086186, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05526627161129057, - "kl": 0.0477294921875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004764854587022044, + "kl": 0.04669189453125, "learning_rate": 1.0762997546672279e-06, - "loss": 0.0015, - "num_tokens": 244427892.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 232676021.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 388 }, { @@ -10879,26 +10879,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1761.0, - "completions/max_terminated_length": 1761.0, - "completions/mean_length": 562.423828125, - "completions/mean_terminated_length": 562.423828125, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1553.0, + "completions/max_terminated_length": 1553.0, + "completions/mean_length": 556.107421875, + "completions/mean_terminated_length": 554.7964477539062, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, "epoch": 0.34833221401387954, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.08080507072024797, - "kl": 0.048583984375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019412253728347505, + "kl": 0.055419921875, "learning_rate": 1.0665944575060914e-06, - "loss": 0.004, - "num_tokens": 245024445.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0006, + "num_tokens": 233269340.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 389 }, { @@ -10907,20 +10907,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1832.0, - "completions/max_terminated_length": 1832.0, - "completions/mean_length": 586.203125, - "completions/mean_terminated_length": 586.203125, - "completions/min_length": 163.0, - "completions/min_terminated_length": 163.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1777.0, + "completions/mean_length": 598.857421875, + "completions/mean_terminated_length": 596.0215454101562, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.34922766957689727, "frac_reward_zero_std": 1.0, - "grad_norm": 0.009200464557926759, - "kl": 0.0494384765625, + "grad_norm": 0.004042710509275122, + "kl": 0.04693603515625, "learning_rate": 1.056959792669997e-06, "loss": 0.0005, - "num_tokens": 245641717.0, + "num_tokens": 233893091.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -10936,25 +10936,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1634.0, - "completions/max_terminated_length": 1634.0, - "completions/mean_length": 597.11328125, - "completions/mean_terminated_length": 597.11328125, - "completions/min_length": 209.0, - "completions/min_terminated_length": 209.0, + "completions/max_length": 1739.0, + "completions/max_terminated_length": 1739.0, + "completions/mean_length": 600.2421875, + "completions/mean_terminated_length": 600.2421875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, "epoch": 0.35012312513991495, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05477299161437103, - "kl": 0.04791259765625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004273522818328221, + "kl": 0.046630859375, "learning_rate": 1.0473961644101856e-06, - "loss": 0.0018, - "num_tokens": 246288895.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 234541871.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 391 }, { @@ -10964,25 +10964,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1455.0, - "completions/mean_length": 610.310546875, - "completions/mean_terminated_length": 607.4970703125, - "completions/min_length": 184.0, - "completions/min_terminated_length": 184.0, + "completions/max_length": 1491.0, + "completions/max_terminated_length": 1491.0, + "completions/mean_length": 619.369140625, + "completions/mean_terminated_length": 618.6966552734375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.3510185807029326, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.04103458864719703, - "kl": 0.047607421875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0040486666283392125, + "kl": 0.04632568359375, "learning_rate": 1.037903973997345e-06, - "loss": 0.0002, - "num_tokens": 246940366.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 235197980.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 392 }, { @@ -10991,26 +10991,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1687.0, - "completions/max_terminated_length": 1687.0, - "completions/mean_length": 571.6171875, - "completions/mean_terminated_length": 571.6171875, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1881.0, + "completions/mean_length": 592.013671875, + "completions/mean_terminated_length": 586.303955078125, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, "epoch": 0.3519140362659503, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.0045559010772451055, - "kl": 0.04620361328125, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.07832944286109411, + "kl": 0.044677734375, "learning_rate": 1.0284836197047737e-06, - "loss": 0.0005, - "num_tokens": 247554154.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0094, + "num_tokens": 235822211.0, + "reward": 0.09941406548023224, + "reward_std": 0.0018486406188458204, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.994140625, + "rewards/format_reward/std": 0.07639661431312561, "step": 393 }, { @@ -11020,19 +11020,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1482.0, - "completions/max_terminated_length": 1482.0, - "completions/mean_length": 529.73046875, - "completions/mean_terminated_length": 529.73046875, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/max_length": 1323.0, + "completions/max_terminated_length": 1323.0, + "completions/mean_length": 555.65625, + "completions/mean_terminated_length": 555.65625, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.352809491828968, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005620991916216765, - "kl": 0.05078125, + "grad_norm": 0.010148834998323347, + "kl": 0.04815673828125, "learning_rate": 1.0191354967916712e-06, "loss": 0.0005, - "num_tokens": 248125712.0, + "num_tokens": 236407043.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -11048,25 +11048,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1438.0, - "completions/mean_length": 577.833984375, - "completions/mean_terminated_length": 574.9569702148438, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/max_length": 1373.0, + "completions/max_terminated_length": 1373.0, + "completions/mean_length": 587.875, + "completions/mean_terminated_length": 586.9177856445312, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, "epoch": 0.35370494739198566, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.005897057155196266, - "kl": 0.05084228515625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 1.9869550657623536, + "kl": 0.27435302734375, "learning_rate": 1.0098599974865515e-06, - "loss": 0.0005, - "num_tokens": 248716379.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.013, + "num_tokens": 237002851.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 395 }, { @@ -11076,19 +11076,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1838.0, - "completions/max_terminated_length": 1838.0, - "completions/mean_length": 607.990234375, - "completions/mean_terminated_length": 607.990234375, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/max_length": 1436.0, + "completions/max_terminated_length": 1436.0, + "completions/mean_length": 601.15234375, + "completions/mean_terminated_length": 601.15234375, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, "epoch": 0.35460040295500334, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005227169861081531, - "kl": 0.04913330078125, + "grad_norm": 0.004630268582371048, + "kl": 0.0477294921875, "learning_rate": 1.0006575109707898e-06, "loss": 0.0005, - "num_tokens": 249353110.0, + "num_tokens": 237636081.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -11104,19 +11104,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1525.0, - "completions/max_terminated_length": 1525.0, - "completions/mean_length": 534.1484375, - "completions/mean_terminated_length": 534.1484375, - "completions/min_length": 97.0, - "completions/min_terminated_length": 97.0, + "completions/max_length": 1576.0, + "completions/max_terminated_length": 1576.0, + "completions/mean_length": 572.310546875, + "completions/mean_terminated_length": 572.310546875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, "epoch": 0.35549585851802107, "frac_reward_zero_std": 1.0, - "grad_norm": 0.009091379046012489, - "kl": 0.04925537109375, + "grad_norm": 0.004840189114307491, + "kl": 0.04644775390625, "learning_rate": 9.915284233622877e-07, "loss": 0.0005, - "num_tokens": 249943106.0, + "num_tokens": 238245616.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -11132,19 +11132,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1687.0, - "completions/max_terminated_length": 1687.0, - "completions/mean_length": 554.1640625, - "completions/mean_terminated_length": 554.1640625, - "completions/min_length": 110.0, - "completions/min_terminated_length": 110.0, + "completions/max_length": 1507.0, + "completions/max_terminated_length": 1507.0, + "completions/mean_length": 587.345703125, + "completions/mean_terminated_length": 587.345703125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, "epoch": 0.35639131408103875, "frac_reward_zero_std": 1.0, - "grad_norm": 0.00576347525077593, - "kl": 0.04864501953125, + "grad_norm": 0.003941142747216441, + "kl": 0.0457763671875, "learning_rate": 9.824731176992796e-07, "loss": 0.0005, - "num_tokens": 250492534.0, + "num_tokens": 238812033.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -11160,19 +11160,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1398.0, - "completions/max_terminated_length": 1398.0, - "completions/mean_length": 580.962890625, - "completions/mean_terminated_length": 580.962890625, - "completions/min_length": 166.0, - "completions/min_terminated_length": 166.0, + "completions/max_length": 1750.0, + "completions/max_terminated_length": 1750.0, + "completions/mean_length": 608.267578125, + "completions/mean_terminated_length": 608.267578125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, "epoch": 0.3572867696440564, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005684393564410186, - "kl": 0.05072021484375, + "grad_norm": 0.0051519468305786, + "kl": 0.04791259765625, "learning_rate": 9.734919739242543e-07, "loss": 0.0005, - "num_tokens": 251093283.0, + "num_tokens": 239426762.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -11187,26 +11187,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1882.0, - "completions/mean_length": 578.701171875, - "completions/mean_terminated_length": 572.9392700195312, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1857.0, + "completions/max_terminated_length": 1857.0, + "completions/mean_length": 608.357421875, + "completions/mean_terminated_length": 608.357421875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.3581822252070741, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.09024918016321634, - "kl": 0.04998779296875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0039636086255187275, + "kl": 0.04779052734375, "learning_rate": 9.645853688680177e-07, - "loss": 0.0057, - "num_tokens": 251716346.0, - "reward": 0.09941406548023224, - "reward_std": 0.0018486406188458204, + "loss": 0.0005, + "num_tokens": 240065009.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.994140625, - "rewards/format_reward/std": 0.07639661431312561, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 400 }, { @@ -11216,25 +11216,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1275.0, - "completions/max_terminated_length": 1275.0, - "completions/mean_length": 533.146484375, - "completions/mean_terminated_length": 533.146484375, - "completions/min_length": 121.0, - "completions/min_terminated_length": 121.0, + "completions/max_length": 1195.0, + "completions/max_terminated_length": 1195.0, + "completions/mean_length": 566.44921875, + "completions/mean_terminated_length": 566.44921875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, "epoch": 0.3590776807700918, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.07662408160218846, - "kl": 0.04638671875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0038272234301605793, + "kl": 0.044677734375, "learning_rate": 9.557536762338786e-07, - "loss": 0.0079, - "num_tokens": 252276213.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0004, + "num_tokens": 240641927.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 401 }, { @@ -11243,20 +11243,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1603.0, - "completions/max_terminated_length": 1603.0, - "completions/mean_length": 539.779296875, - "completions/mean_terminated_length": 539.779296875, - "completions/min_length": 138.0, - "completions/min_terminated_length": 138.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1314.0, + "completions/max_terminated_length": 1314.0, + "completions/mean_length": 572.609375, + "completions/mean_terminated_length": 571.1585083007812, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, "epoch": 0.35997313633310946, "frac_reward_zero_std": 1.0, - "grad_norm": 0.0047643547604445105, - "kl": 0.04693603515625, + "grad_norm": 2.830266598053953, + "kl": 0.78790283203125, "learning_rate": 9.46997266581973e-07, - "loss": 0.0005, - "num_tokens": 252837172.0, + "loss": 0.0079, + "num_tokens": 241219695.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -11271,20 +11271,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1944.0, - "completions/max_terminated_length": 1944.0, - "completions/mean_length": 597.201171875, - "completions/mean_terminated_length": 597.201171875, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1621.0, + "completions/max_terminated_length": 1621.0, + "completions/mean_length": 601.609375, + "completions/mean_terminated_length": 600.4070434570312, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.36086859189612713, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004834734143787087, - "kl": 0.0479736328125, + "grad_norm": 2.0982466909243627, + "kl": 0.50506591796875, "learning_rate": 9.383165073137115e-07, - "loss": 0.0005, - "num_tokens": 253443979.0, + "loss": 0.0051, + "num_tokens": 241828759.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -11300,19 +11300,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1626.0, - "completions/max_terminated_length": 1626.0, - "completions/mean_length": 566.322265625, - "completions/mean_terminated_length": 566.322265625, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/max_length": 1362.0, + "completions/max_terminated_length": 1362.0, + "completions/mean_length": 592.77734375, + "completions/mean_terminated_length": 592.77734375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.36176404745914487, "frac_reward_zero_std": 1.0, - "grad_norm": 0.00579926871259698, - "kl": 0.04705810546875, + "grad_norm": 0.00595576175026029, + "kl": 0.04461669921875, "learning_rate": 9.297117626563687e-07, - "loss": 0.0005, - "num_tokens": 254029248.0, + "loss": 0.0004, + "num_tokens": 242427573.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -11327,26 +11327,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1328.0, - "completions/max_terminated_length": 1328.0, - "completions/mean_length": 586.23046875, - "completions/mean_terminated_length": 586.23046875, - "completions/min_length": 207.0, - "completions/min_terminated_length": 207.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1312.0, + "completions/mean_length": 591.677734375, + "completions/mean_terminated_length": 585.5462036132812, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, "epoch": 0.36265950302216254, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.005531919514784843, - "kl": 0.048583984375, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.15349545116743918, + "kl": 0.0850830078125, "learning_rate": 9.211833936477957e-07, - "loss": 0.0005, - "num_tokens": 254609734.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0262, + "num_tokens": 243010848.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 405 }, { @@ -11355,20 +11355,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1538.0, - "completions/max_terminated_length": 1538.0, - "completions/mean_length": 562.662109375, - "completions/mean_terminated_length": 562.662109375, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1596.0, + "completions/max_terminated_length": 1596.0, + "completions/mean_length": 592.9375, + "completions/mean_terminated_length": 591.60986328125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.3635549585851802, "frac_reward_zero_std": 1.0, - "grad_norm": 0.00421872727224393, - "kl": 0.04925537109375, + "grad_norm": 0.16264218504894148, + "kl": 0.112060546875, "learning_rate": 9.127317581212753e-07, - "loss": 0.0005, - "num_tokens": 255193529.0, + "loss": 0.0011, + "num_tokens": 243610144.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -11383,26 +11383,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1312.0, - "completions/mean_length": 546.47265625, - "completions/mean_terminated_length": 542.1961059570312, - "completions/min_length": 175.0, - "completions/min_terminated_length": 175.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1240.0, + "completions/max_terminated_length": 1240.0, + "completions/mean_length": 571.162109375, + "completions/mean_terminated_length": 570.0997924804688, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, "epoch": 0.3644504141481979, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.10098327848008554, - "kl": 0.05133056640625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11354577971589774, + "kl": 0.08062744140625, "learning_rate": 9.043572106905084e-07, - "loss": 0.013, - "num_tokens": 255796619.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0008, + "num_tokens": 244225875.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 407 }, { @@ -11411,26 +11411,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1818.0, - "completions/mean_length": 574.74609375, - "completions/mean_terminated_length": 571.863037109375, - "completions/min_length": 194.0, - "completions/min_terminated_length": 194.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1684.0, + "completions/max_terminated_length": 1684.0, + "completions/mean_length": 583.146484375, + "completions/mean_terminated_length": 583.146484375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, "epoch": 0.3653458697112156, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.07221448169919696, - "kl": 0.04937744140625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008619318026623175, + "kl": 0.04681396484375, "learning_rate": 8.960601027347321e-07, - "loss": 0.0011, - "num_tokens": 256396969.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 244830526.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 408 }, { @@ -11439,26 +11439,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 1660.0, - "completions/max_terminated_length": 1660.0, - "completions/mean_length": 531.146484375, - "completions/mean_terminated_length": 530.3033447265625, - "completions/min_length": 162.0, - "completions/min_terminated_length": 162.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1284.0, + "completions/max_terminated_length": 1284.0, + "completions/mean_length": 559.591796875, + "completions/mean_terminated_length": 559.591796875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, "epoch": 0.36624132527423325, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06674911247595358, - "kl": 0.04931640625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004150486122063207, + "kl": 0.04461669921875, "learning_rate": 8.878407823839788e-07, - "loss": 0.0029, - "num_tokens": 256954724.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0004, + "num_tokens": 245402845.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 409 }, { @@ -11467,26 +11467,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2013.0, - "completions/max_terminated_length": 2013.0, - "completions/mean_length": 568.6484375, - "completions/mean_terminated_length": 567.4246826171875, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1663.0, + "completions/mean_length": 618.41796875, + "completions/mean_terminated_length": 614.60986328125, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, "epoch": 0.36713678083725093, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.010132848557988017, - "kl": 0.05450439453125, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.21832381676817567, + "kl": 0.13763427734375, "learning_rate": 8.796995945044689e-07, - "loss": 0.0005, - "num_tokens": 257552176.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0153, + "num_tokens": 246025779.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 410 }, { @@ -11495,26 +11495,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1686.0, - "completions/max_terminated_length": 1686.0, - "completions/mean_length": 581.15234375, - "completions/mean_terminated_length": 581.15234375, - "completions/min_length": 159.0, - "completions/min_terminated_length": 159.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1534.0, + "completions/mean_length": 598.365234375, + "completions/mean_terminated_length": 592.680419921875, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, "epoch": 0.36803223640026866, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.00432008518918132, - "kl": 0.0484619140625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.25126353697075565, + "kl": 0.16729736328125, "learning_rate": 8.716368806841405e-07, - "loss": 0.0005, - "num_tokens": 258158942.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0146, + "num_tokens": 246641358.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 411 }, { @@ -11524,25 +11524,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1425.0, - "completions/max_terminated_length": 1425.0, - "completions/mean_length": 561.341796875, - "completions/mean_terminated_length": 561.341796875, - "completions/min_length": 162.0, - "completions/min_terminated_length": 162.0, + "completions/max_length": 1309.0, + "completions/max_terminated_length": 1309.0, + "completions/mean_length": 574.302734375, + "completions/mean_terminated_length": 574.302734375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.36892769196328634, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06693413618462407, - "kl": 0.0494384765625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004008716988509208, + "kl": 0.0467529296875, "learning_rate": 8.636529792183171e-07, - "loss": -0.001, - "num_tokens": 258739869.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 247228921.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 412 }, { @@ -11551,26 +11551,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1556.0, - "completions/max_terminated_length": 1556.0, - "completions/mean_length": 560.208984375, - "completions/mean_terminated_length": 560.208984375, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1640.0, + "completions/mean_length": 584.1484375, + "completions/mean_terminated_length": 581.2837524414062, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, "epoch": 0.369823147526304, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.004847298445441672, - "kl": 0.04901123046875, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.04699799709615979, + "kl": 0.0460205078125, "learning_rate": 8.557482250955144e-07, - "loss": 0.0005, - "num_tokens": 259334584.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": -0.0042, + "num_tokens": 247835893.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 413 }, { @@ -11579,20 +11579,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1558.0, - "completions/max_terminated_length": 1558.0, - "completions/mean_length": 611.892578125, - "completions/mean_terminated_length": 611.892578125, - "completions/min_length": 198.0, - "completions/min_terminated_length": 198.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1638.0, + "completions/max_terminated_length": 1638.0, + "completions/mean_length": 612.640625, + "completions/mean_terminated_length": 611.1076049804688, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.3707186030893217, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005198359727407881, - "kl": 0.04888916015625, + "grad_norm": 0.004123288560446476, + "kl": 0.048583984375, "learning_rate": 8.479229499833844e-07, "loss": 0.0005, - "num_tokens": 259972241.0, + "num_tokens": 248473933.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -11607,20 +11607,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1332.0, - "completions/max_terminated_length": 1332.0, - "completions/mean_length": 529.185546875, - "completions/mean_terminated_length": 529.185546875, - "completions/min_length": 135.0, - "completions/min_terminated_length": 135.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1199.0, + "completions/max_terminated_length": 1199.0, + "completions/mean_length": 560.51953125, + "completions/mean_terminated_length": 559.4755249023438, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, "epoch": 0.3716140586523394, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005096616784905531, - "kl": 0.04815673828125, + "grad_norm": 0.011763473343105632, + "kl": 0.05010986328125, "learning_rate": 8.401774822147976e-07, "loss": 0.0005, - "num_tokens": 260477904.0, + "num_tokens": 248995639.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -11636,25 +11636,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1589.0, - "completions/max_terminated_length": 1589.0, - "completions/mean_length": 574.904296875, - "completions/mean_terminated_length": 574.904296875, - "completions/min_length": 155.0, - "completions/min_terminated_length": 155.0, + "completions/max_length": 1477.0, + "completions/max_terminated_length": 1477.0, + "completions/mean_length": 575.220703125, + "completions/mean_terminated_length": 575.220703125, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.37250951421535705, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06747598747854207, - "kl": 0.04888916015625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004097140634106235, + "kl": 0.04595947265625, "learning_rate": 8.325121467740695e-07, - "loss": 0.001, - "num_tokens": 261093247.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 249611144.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 416 }, { @@ -11663,20 +11663,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1710.0, - "completions/max_terminated_length": 1710.0, - "completions/mean_length": 573.78515625, - "completions/mean_terminated_length": 573.78515625, - "completions/min_length": 122.0, - "completions/min_terminated_length": 122.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1443.0, + "completions/mean_length": 585.287109375, + "completions/mean_terminated_length": 582.4246826171875, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, "epoch": 0.37340496977837473, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004344603358267009, - "kl": 0.0465087890625, + "grad_norm": 0.004884809695062555, + "kl": 0.0440673828125, "learning_rate": 8.249272652833226e-07, - "loss": 0.0005, - "num_tokens": 261713041.0, + "loss": 0.0004, + "num_tokens": 250236827.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -11692,19 +11692,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1463.0, - "completions/max_terminated_length": 1463.0, - "completions/mean_length": 533.013671875, - "completions/mean_terminated_length": 533.013671875, - "completions/min_length": 175.0, - "completions/min_terminated_length": 175.0, + "completions/max_length": 1809.0, + "completions/max_terminated_length": 1809.0, + "completions/mean_length": 554.861328125, + "completions/mean_terminated_length": 554.861328125, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, "epoch": 0.3743004253413924, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004507059752267276, - "kl": 0.0506591796875, + "grad_norm": 0.004598838734909383, + "kl": 0.0479736328125, "learning_rate": 8.174231559889931e-07, "loss": 0.0005, - "num_tokens": 262281848.0, + "num_tokens": 250816820.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -11720,19 +11720,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1396.0, - "completions/max_terminated_length": 1396.0, - "completions/mean_length": 517.865234375, - "completions/mean_terminated_length": 517.865234375, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/max_length": 1482.0, + "completions/max_terminated_length": 1482.0, + "completions/mean_length": 541.486328125, + "completions/mean_terminated_length": 541.486328125, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, "epoch": 0.37519588090441014, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005351908786787514, - "kl": 0.04852294921875, + "grad_norm": 0.004336303560681598, + "kl": 0.04498291015625, "learning_rate": 8.100001337484787e-07, - "loss": 0.0005, - "num_tokens": 262811091.0, + "loss": 0.0004, + "num_tokens": 251358157.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -11747,20 +11747,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1760.0, - "completions/max_terminated_length": 1760.0, - "completions/mean_length": 561.669921875, - "completions/mean_terminated_length": 561.669921875, - "completions/min_length": 185.0, - "completions/min_terminated_length": 185.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1565.0, + "completions/max_terminated_length": 1565.0, + "completions/mean_length": 575.189453125, + "completions/mean_terminated_length": 573.25244140625, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, "epoch": 0.3760913364674278, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.058773060593634845, - "kl": 0.0477294921875, + "grad_norm": 0.04522967734559762, + "kl": 0.05145263671875, "learning_rate": 8.026585100169251e-07, - "loss": -0.0004, - "num_tokens": 263420570.0, + "loss": 0.0003, + "num_tokens": 251974558.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -11775,26 +11775,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1659.0, - "completions/max_terminated_length": 1659.0, - "completions/mean_length": 601.13671875, - "completions/mean_terminated_length": 601.13671875, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1519.0, + "completions/max_terminated_length": 1519.0, + "completions/mean_length": 600.025390625, + "completions/mean_terminated_length": 599.1056518554688, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, "epoch": 0.3769867920304455, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.04780470323490276, - "kl": 0.04840087890625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14856992304676514, + "kl": 0.12725830078125, "learning_rate": 7.953985928341601e-07, - "loss": -0.0011, - "num_tokens": 264049296.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0013, + "num_tokens": 252602715.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 421 }, { @@ -11804,19 +11804,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1901.0, - "completions/max_terminated_length": 1901.0, - "completions/mean_length": 587.703125, - "completions/mean_terminated_length": 587.703125, - "completions/min_length": 122.0, - "completions/min_terminated_length": 122.0, + "completions/max_length": 1414.0, + "completions/max_terminated_length": 1414.0, + "completions/mean_length": 600.701171875, + "completions/mean_terminated_length": 600.701171875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, "epoch": 0.37788224759346317, "frac_reward_zero_std": 1.0, - "grad_norm": 0.015584951802210437, - "kl": 0.0504150390625, + "grad_norm": 0.004152663083082165, + "kl": 0.04595947265625, "learning_rate": 7.882206868117693e-07, "loss": 0.0005, - "num_tokens": 264649528.0, + "num_tokens": 253209602.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -11832,25 +11832,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1931.0, - "completions/max_terminated_length": 1931.0, - "completions/mean_length": 575.669921875, - "completions/mean_terminated_length": 575.669921875, - "completions/min_length": 145.0, - "completions/min_terminated_length": 145.0, + "completions/max_length": 1494.0, + "completions/max_terminated_length": 1494.0, + "completions/mean_length": 585.802734375, + "completions/mean_terminated_length": 585.802734375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, "epoch": 0.37877770315648085, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.008202181121387101, - "kl": 0.04742431640625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06489189022575756, + "kl": 0.0447998046875, "learning_rate": 7.81125093120313e-07, - "loss": 0.0005, - "num_tokens": 265265791.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0013, + "num_tokens": 253831053.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 423 }, { @@ -11859,26 +11859,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1773.0, - "completions/max_terminated_length": 1773.0, - "completions/mean_length": 554.826171875, - "completions/mean_terminated_length": 554.826171875, - "completions/min_length": 153.0, - "completions/min_terminated_length": 153.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1164.0, + "completions/max_terminated_length": 1164.0, + "completions/mean_length": 569.109375, + "completions/mean_terminated_length": 568.2015380859375, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, "epoch": 0.3796731587194985, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.07411872102018555, - "kl": 0.04827880859375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04129359182029064, + "kl": 0.06884765625, "learning_rate": 7.741121094766916e-07, - "loss": 0.0036, - "num_tokens": 265876198.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0007, + "num_tokens": 254448773.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 424 }, { @@ -11888,19 +11888,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1806.0, - "completions/max_terminated_length": 1806.0, - "completions/mean_length": 531.36328125, - "completions/mean_terminated_length": 531.36328125, - "completions/min_length": 116.0, - "completions/min_terminated_length": 116.0, + "completions/max_length": 1438.0, + "completions/max_terminated_length": 1438.0, + "completions/mean_length": 552.451171875, + "completions/mean_terminated_length": 552.451171875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, "epoch": 0.3805686142825162, "frac_reward_zero_std": 1.0, - "grad_norm": 0.00483447721173187, - "kl": 0.049072265625, + "grad_norm": 0.004063387730342484, + "kl": 0.04522705078125, "learning_rate": 7.671820301316532e-07, "loss": 0.0005, - "num_tokens": 266424928.0, + "num_tokens": 255008300.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -11916,25 +11916,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1398.0, - "completions/max_terminated_length": 1398.0, - "completions/mean_length": 547.166015625, - "completions/mean_terminated_length": 547.166015625, - "completions/min_length": 161.0, - "completions/min_terminated_length": 161.0, + "completions/max_length": 1368.0, + "completions/max_terminated_length": 1368.0, + "completions/mean_length": 570.951171875, + "completions/mean_terminated_length": 570.951171875, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, "epoch": 0.38146406984553394, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.004354827538356667, - "kl": 0.04656982421875, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.05431180203096875, + "kl": 0.04412841796875, "learning_rate": 7.603351458574474e-07, - "loss": 0.0005, - "num_tokens": 266996373.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": -0.0001, + "num_tokens": 255591923.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 426 }, { @@ -11943,26 +11943,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1578.0, - "completions/mean_length": 560.0390625, - "completions/mean_terminated_length": 557.127197265625, - "completions/min_length": 162.0, - "completions/min_terminated_length": 162.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1706.0, + "completions/max_terminated_length": 1706.0, + "completions/mean_length": 592.3515625, + "completions/mean_terminated_length": 592.3515625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, "epoch": 0.3823595254085516, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.052963479074057004, - "kl": 0.05413818359375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021289388295280785, + "kl": 0.051513671875, "learning_rate": 7.535717439356255e-07, - "loss": 0.0009, - "num_tokens": 267662393.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 256274487.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 427 }, { @@ -11971,20 +11971,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1869.0, - "completions/max_terminated_length": 1869.0, - "completions/mean_length": 532.46484375, - "completions/mean_terminated_length": 532.46484375, - "completions/min_length": 199.0, - "completions/min_terminated_length": 199.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1369.0, + "completions/max_terminated_length": 1369.0, + "completions/mean_length": 569.8515625, + "completions/mean_terminated_length": 569.1956787109375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, "epoch": 0.3832549809715693, "frac_reward_zero_std": 1.0, - "grad_norm": 0.14959490402999703, - "kl": 0.0740966796875, + "grad_norm": 0.0046493196129765504, + "kl": 0.047119140625, "learning_rate": 7.46892108144986e-07, - "loss": 0.0007, - "num_tokens": 268222999.0, + "loss": 0.0005, + "num_tokens": 256854235.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -12000,19 +12000,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1312.0, - "completions/mean_length": 572.41015625, - "completions/mean_terminated_length": 569.5225219726562, - "completions/min_length": 181.0, - "completions/min_terminated_length": 181.0, + "completions/max_length": 1685.0, + "completions/max_terminated_length": 1685.0, + "completions/mean_length": 594.58203125, + "completions/mean_terminated_length": 592.876708984375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.38415043653458697, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005582201559752547, - "kl": 0.05035400390625, + "grad_norm": 0.030024377459490645, + "kl": 0.06475830078125, "learning_rate": 7.402965187496697e-07, - "loss": 0.0005, - "num_tokens": 268835369.0, + "loss": 0.0006, + "num_tokens": 257477957.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -12027,26 +12027,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1578.0, - "completions/max_terminated_length": 1578.0, - "completions/mean_length": 572.984375, - "completions/mean_terminated_length": 572.984375, - "completions/min_length": 197.0, - "completions/min_terminated_length": 197.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1847.0, + "completions/mean_length": 612.123046875, + "completions/mean_terminated_length": 607.6902465820312, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.38504589209760465, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05576338510401166, - "kl": 0.0494384765625, + "grad_norm": 0.059584893596399784, + "kl": 0.0477294921875, "learning_rate": 7.337852524873974e-07, - "loss": 0.0035, - "num_tokens": 269440033.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0081, + "num_tokens": 258102660.0, + "reward": 0.099609375, + "reward_std": 0.0010673906654119492, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 430 }, { @@ -12056,25 +12056,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1423.0, - "completions/max_terminated_length": 1423.0, - "completions/mean_length": 562.912109375, - "completions/mean_terminated_length": 562.912109375, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/max_length": 1323.0, + "completions/max_terminated_length": 1323.0, + "completions/mean_length": 572.529296875, + "completions/mean_terminated_length": 572.529296875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, "epoch": 0.3859413476606223, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.08650265338397081, - "kl": 0.04925537109375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003820837122162048, + "kl": 0.04449462890625, "learning_rate": 7.273585825578608e-07, - "loss": 0.0027, - "num_tokens": 269999428.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0004, + "num_tokens": 258666979.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 431 }, { @@ -12083,20 +12083,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1740.0, - "completions/max_terminated_length": 1740.0, - "completions/mean_length": 583.6328125, - "completions/mean_terminated_length": 583.6328125, - "completions/min_length": 204.0, - "completions/min_terminated_length": 204.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1434.0, + "completions/max_terminated_length": 1434.0, + "completions/mean_length": 601.86328125, + "completions/mean_terminated_length": 600.2348022460938, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, "epoch": 0.38683680322364, "frac_reward_zero_std": 1.0, - "grad_norm": 0.0041924114517845655, - "kl": 0.04949951171875, + "grad_norm": 0.005050065975834865, + "kl": 0.050537109375, "learning_rate": 7.21016778611259e-07, "loss": 0.0005, - "num_tokens": 270609576.0, + "num_tokens": 259286461.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -12112,19 +12112,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1501.0, - "completions/max_terminated_length": 1501.0, - "completions/mean_length": 555.6171875, - "completions/mean_terminated_length": 555.6171875, - "completions/min_length": 119.0, - "completions/min_terminated_length": 119.0, + "completions/max_length": 1626.0, + "completions/max_terminated_length": 1626.0, + "completions/mean_length": 583.451171875, + "completions/mean_terminated_length": 583.451171875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, "epoch": 0.38773225878665774, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004051015296435293, - "kl": 0.04632568359375, + "grad_norm": 0.003638010107531803, + "kl": 0.0433349609375, "learning_rate": 7.147601067369835e-07, - "loss": 0.0005, - "num_tokens": 271192932.0, + "loss": 0.0004, + "num_tokens": 259884068.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -12139,20 +12139,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1703.0, - "completions/max_terminated_length": 1703.0, - "completions/mean_length": 600.853515625, - "completions/mean_terminated_length": 600.853515625, - "completions/min_length": 199.0, - "completions/min_terminated_length": 199.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1436.0, + "completions/max_terminated_length": 1436.0, + "completions/mean_length": 620.447265625, + "completions/mean_terminated_length": 618.9706420898438, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, "epoch": 0.3886277143496754, "frac_reward_zero_std": 1.0, - "grad_norm": 0.003873089336388401, - "kl": 0.047607421875, + "grad_norm": 0.0049178729245381745, + "kl": 0.04931640625, "learning_rate": 7.085888294524561e-07, "loss": 0.0005, - "num_tokens": 271807193.0, + "num_tokens": 260508361.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -12168,25 +12168,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1880.0, - "completions/max_terminated_length": 1880.0, - "completions/mean_length": 589.169921875, - "completions/mean_terminated_length": 589.169921875, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_length": 1796.0, + "completions/max_terminated_length": 1796.0, + "completions/mean_length": 614.025390625, + "completions/mean_terminated_length": 614.025390625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, "epoch": 0.3895231699126931, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06598911812763071, - "kl": 0.04876708984375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0053380131336369435, + "kl": 0.04571533203125, "learning_rate": 7.025032056921117e-07, - "loss": 0.0017, - "num_tokens": 272413184.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 261127078.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 435 }, { @@ -12196,19 +12196,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1621.0, - "completions/max_terminated_length": 1621.0, - "completions/mean_length": 554.048828125, - "completions/mean_terminated_length": 554.048828125, - "completions/min_length": 176.0, - "completions/min_terminated_length": 176.0, + "completions/max_length": 1784.0, + "completions/max_terminated_length": 1784.0, + "completions/mean_length": 594.244140625, + "completions/mean_terminated_length": 594.244140625, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, "epoch": 0.39041862547571077, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005011629822964105, - "kl": 0.04913330078125, + "grad_norm": 0.003692024494965669, + "kl": 0.0433349609375, "learning_rate": 6.965034907965349e-07, - "loss": 0.0005, - "num_tokens": 273018809.0, + "loss": 0.0004, + "num_tokens": 261753283.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -12223,26 +12223,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1509.0, - "completions/mean_length": 546.236328125, - "completions/mean_terminated_length": 540.3471069335938, - "completions/min_length": 138.0, - "completions/min_terminated_length": 138.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1207.0, + "completions/max_terminated_length": 1207.0, + "completions/mean_length": 555.828125, + "completions/mean_terminated_length": 554.5538330078125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.39131408103872845, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05460977307454807, - "kl": 0.0478515625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09019199102303466, + "kl": 0.09649658203125, "learning_rate": 6.905899365017462e-07, - "loss": 0.0158, - "num_tokens": 273584962.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.001, + "num_tokens": 262324347.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 437 }, { @@ -12251,26 +12251,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1624.0, - "completions/mean_length": 561.583984375, - "completions/mean_terminated_length": 558.6751708984375, - "completions/min_length": 157.0, - "completions/min_terminated_length": 157.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1715.0, + "completions/max_terminated_length": 1433.0, + "completions/mean_length": 584.7890625, + "completions/mean_terminated_length": 581.9176635742188, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, "epoch": 0.3922095366017461, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.0062424353732860646, - "kl": 0.05084228515625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07482831067859354, + "kl": 0.05682373046875, "learning_rate": 6.847627909286409e-07, - "loss": 0.0005, - "num_tokens": 274214125.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.009, + "num_tokens": 262965391.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 438 }, { @@ -12280,19 +12280,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1474.0, - "completions/max_terminated_length": 1474.0, - "completions/mean_length": 580.083984375, - "completions/mean_terminated_length": 580.083984375, - "completions/min_length": 133.0, - "completions/min_terminated_length": 133.0, + "completions/max_length": 1697.0, + "completions/max_terminated_length": 1697.0, + "completions/mean_length": 603.103515625, + "completions/mean_terminated_length": 603.103515625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.3931049921647638, "frac_reward_zero_std": 1.0, - "grad_norm": 0.007684600815592728, - "kl": 0.0478515625, + "grad_norm": 0.0036304390520191508, + "kl": 0.04461669921875, "learning_rate": 6.790222985725761e-07, - "loss": 0.0005, - "num_tokens": 274811064.0, + "loss": 0.0004, + "num_tokens": 263574116.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -12307,20 +12307,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.75, - "completions/max_length": 1687.0, - "completions/max_terminated_length": 1687.0, - "completions/mean_length": 562.736328125, - "completions/mean_terminated_length": 551.4375, - "completions/min_length": 189.0, - "completions/min_terminated_length": 189.0, + "completions/clipped_ratio": -6.71875, + "completions/max_length": 1304.0, + "completions/max_terminated_length": 1304.0, + "completions/mean_length": 592.533203125, + "completions/mean_terminated_length": 567.8157958984375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, "epoch": 0.39400044772778153, "frac_reward_zero_std": 1.0, - "grad_norm": 105.4794447937914, - "kl": 12.5390625, + "grad_norm": 1.19342053013638, + "kl": 0.71484375, "learning_rate": 6.733687002931141e-07, - "loss": 0.1253, - "num_tokens": 275402609.0, + "loss": 0.0072, + "num_tokens": 264180917.0, "reward": 0.09687499701976776, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -12335,26 +12335,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1720.0, - "completions/mean_length": 574.7421875, - "completions/mean_terminated_length": 568.9647216796875, - "completions/min_length": 154.0, - "completions/min_terminated_length": 154.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1393.0, + "completions/max_terminated_length": 1393.0, + "completions/mean_length": 589.984375, + "completions/mean_terminated_length": 588.74169921875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, "epoch": 0.3948959032907992, - "frac_reward_zero_std": 0.875, - "grad_norm": 0.15188148089603382, - "kl": 0.0477294921875, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07111130745860231, + "kl": 0.04693603515625, "learning_rate": 6.678022333039158e-07, - "loss": 0.024, - "num_tokens": 275982381.0, - "reward": 0.09921875596046448, - "reward_std": 0.0031250000465661287, + "loss": 0.0022, + "num_tokens": 264768493.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.9921875, - "rewards/format_reward/std": 0.08812850713729858, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 441 }, { @@ -12364,19 +12364,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1923.0, - "completions/max_terminated_length": 1923.0, - "completions/mean_length": 606.69921875, - "completions/mean_terminated_length": 606.69921875, - "completions/min_length": 161.0, - "completions/min_terminated_length": 161.0, + "completions/max_length": 1582.0, + "completions/max_terminated_length": 1582.0, + "completions/mean_length": 626.0546875, + "completions/mean_terminated_length": 626.0546875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, "epoch": 0.3957913588538169, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06622735966778687, - "kl": 0.04669189453125, + "grad_norm": 0.0522103422216749, + "kl": 0.0443115234375, "learning_rate": 6.623231311627876e-07, - "loss": 0.0043, - "num_tokens": 276612803.0, + "loss": 0.0023, + "num_tokens": 265408825.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -12392,25 +12392,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1895.0, - "completions/max_terminated_length": 1895.0, - "completions/mean_length": 589.787109375, - "completions/mean_terminated_length": 589.787109375, - "completions/min_length": 163.0, - "completions/min_terminated_length": 163.0, + "completions/max_length": 1747.0, + "completions/max_terminated_length": 1747.0, + "completions/mean_length": 606.671875, + "completions/mean_terminated_length": 606.671875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, "epoch": 0.39668681441683457, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.004054937917658486, - "kl": 0.04730224609375, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.04443605509556234, + "kl": 0.044677734375, "learning_rate": 6.569316237618811e-07, - "loss": 0.0005, - "num_tokens": 277203622.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0006, + "num_tokens": 266008289.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 443 }, { @@ -12419,20 +12419,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 1854.0, - "completions/max_terminated_length": 1660.0, - "completions/mean_length": 602.287109375, - "completions/mean_terminated_length": 597.3784790039062, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1765.0, + "completions/mean_length": 629.169921875, + "completions/mean_terminated_length": 626.3933715820312, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, "epoch": 0.39758226997985224, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.07219201096985092, - "kl": 0.04864501953125, + "grad_norm": 0.079977521203583, + "kl": 0.048095703125, "learning_rate": 6.516279373180499e-07, - "loss": 0.0084, - "num_tokens": 277847913.0, + "loss": 0.0147, + "num_tokens": 266666344.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -12447,20 +12447,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1784.0, - "completions/mean_length": 554.794921875, - "completions/mean_terminated_length": 551.872802734375, - "completions/min_length": 138.0, - "completions/min_terminated_length": 138.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1862.0, + "completions/max_terminated_length": 1862.0, + "completions/mean_length": 570.20703125, + "completions/mean_terminated_length": 570.20703125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.3984777255428699, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004333441625319936, - "kl": 0.0489501953125, + "grad_norm": 0.00374445862358689, + "kl": 0.04571533203125, "learning_rate": 6.464122943633543e-07, "loss": 0.0005, - "num_tokens": 278461056.0, + "num_tokens": 267287378.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -12475,26 +12475,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1661.0, - "completions/mean_length": 592.8828125, - "completions/mean_terminated_length": 590.0352172851562, - "completions/min_length": 174.0, - "completions/min_terminated_length": 174.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1577.0, + "completions/max_terminated_length": 1577.0, + "completions/mean_length": 603.232421875, + "completions/mean_terminated_length": 603.232421875, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, "epoch": 0.3993731811058876, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.07587832755649311, - "kl": 0.0469970703125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004544902573679008, + "kl": 0.0443115234375, "learning_rate": 6.412849137357271e-07, - "loss": -0.0016, - "num_tokens": 279083252.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0004, + "num_tokens": 267914873.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 446 }, { @@ -12503,26 +12503,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 1835.0, - "completions/max_terminated_length": 1835.0, - "completions/mean_length": 558.845703125, - "completions/mean_terminated_length": 558.0822143554688, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1172.0, + "completions/max_terminated_length": 1172.0, + "completions/mean_length": 569.06640625, + "completions/mean_terminated_length": 569.06640625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.40026863666890533, - "frac_reward_zero_std": 0.9375, - "grad_norm": 5.034003120090257, - "kl": 0.0579833984375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006639340072597727, + "kl": 0.045654296875, "learning_rate": 6.3624601056979e-07, - "loss": 0.0124, - "num_tokens": 279681589.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0005, + "num_tokens": 268518443.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 447 }, { @@ -12531,26 +12531,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1953.0, - "completions/mean_length": 614.783203125, - "completions/mean_terminated_length": 611.9784545898438, - "completions/min_length": 248.0, - "completions/min_terminated_length": 248.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1834.0, + "completions/max_terminated_length": 1834.0, + "completions/mean_length": 592.236328125, + "completions/mean_terminated_length": 592.236328125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.401164092231923, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.04124554822003232, - "kl": 0.048583984375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0037418093289070475, + "kl": 0.04522705078125, "learning_rate": 6.312957962878278e-07, - "loss": -0.0048, - "num_tokens": 280269110.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 269094420.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 448 }, { @@ -12560,19 +12560,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1602.0, - "completions/max_terminated_length": 1602.0, - "completions/mean_length": 570.455078125, - "completions/mean_terminated_length": 570.455078125, - "completions/min_length": 169.0, - "completions/min_terminated_length": 169.0, + "completions/max_length": 1501.0, + "completions/max_terminated_length": 1501.0, + "completions/mean_length": 577.109375, + "completions/mean_terminated_length": 577.109375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, "epoch": 0.4020595477949407, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004252061047808049, - "kl": 0.0467529296875, + "grad_norm": 0.004226852697815755, + "kl": 0.043701171875, "learning_rate": 6.264344785909181e-07, - "loss": 0.0005, - "num_tokens": 280854959.0, + "loss": 0.0004, + "num_tokens": 269683676.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -12587,26 +12587,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1850.0, - "completions/mean_length": 615.158203125, - "completions/mean_terminated_length": 610.9019775390625, - "completions/min_length": 225.0, - "completions/min_terminated_length": 225.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1750.0, + "completions/max_terminated_length": 1750.0, + "completions/mean_length": 608.45703125, + "completions/mean_terminated_length": 606.75927734375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, "epoch": 0.40295500335795836, - "frac_reward_zero_std": 0.96875, - "grad_norm": 4.155482563277789, - "kl": 0.0538330078125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004989757138908895, + "kl": 0.04827880859375, "learning_rate": 6.216622614502149e-07, - "loss": 0.0073, - "num_tokens": 281508256.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 270333542.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 450 }, { @@ -12616,25 +12616,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1883.0, - "completions/mean_length": 618.771484375, - "completions/mean_terminated_length": 615.9745483398438, - "completions/min_length": 186.0, - "completions/min_terminated_length": 186.0, + "completions/max_length": 1637.0, + "completions/max_terminated_length": 1637.0, + "completions/mean_length": 630.955078125, + "completions/mean_terminated_length": 628.9862670898438, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, "epoch": 0.40385045892097604, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.069402280555639, - "kl": 0.0484619140625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0045917782708162535, + "kl": 0.05010986328125, "learning_rate": 6.169793450983916e-07, - "loss": -0.0053, - "num_tokens": 282139451.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0005, + "num_tokens": 270970975.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 451 }, { @@ -12643,26 +12643,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1601.0, - "completions/mean_length": 629.6171875, - "completions/mean_terminated_length": 624.054931640625, - "completions/min_length": 181.0, - "completions/min_terminated_length": 181.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1564.0, + "completions/max_terminated_length": 1564.0, + "completions/mean_length": 619.875, + "completions/mean_terminated_length": 618.6692504882812, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, "epoch": 0.4047459144839937, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05619296797430902, - "kl": 0.04974365234375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004445821112373912, + "kl": 0.04986572265625, "learning_rate": 6.123859260212393e-07, - "loss": -0.0006, - "num_tokens": 282805479.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 271632015.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 452 }, { @@ -12672,25 +12672,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1345.0, - "completions/max_terminated_length": 1345.0, - "completions/mean_length": 551.099609375, - "completions/mean_terminated_length": 551.099609375, - "completions/min_length": 164.0, - "completions/min_terminated_length": 164.0, + "completions/max_length": 1271.0, + "completions/max_terminated_length": 1271.0, + "completions/mean_length": 571.25, + "completions/mean_terminated_length": 571.25, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, "epoch": 0.4056413700470114, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06716769755390013, - "kl": 0.047607421875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0035904161477165997, + "kl": 0.045654296875, "learning_rate": 6.07882196949423e-07, - "loss": 0.0002, - "num_tokens": 283388314.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 272225167.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 453 }, { @@ -12699,26 +12699,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1887.0, - "completions/mean_length": 585.65625, - "completions/mean_terminated_length": 582.7944946289062, - "completions/min_length": 209.0, - "completions/min_terminated_length": 209.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1623.0, + "completions/max_terminated_length": 1623.0, + "completions/mean_length": 607.998046875, + "completions/mean_terminated_length": 607.998046875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, "epoch": 0.40653682561002913, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.004697335407404541, - "kl": 0.04742431640625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.047268762318116685, + "kl": 0.043701171875, "learning_rate": 6.034683468503948e-07, - "loss": 0.0005, - "num_tokens": 283968202.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": -0.0001, + "num_tokens": 272816494.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 454 }, { @@ -12728,19 +12728,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.75, - "completions/max_length": 1486.0, - "completions/max_terminated_length": 1486.0, - "completions/mean_length": 585.796875, - "completions/mean_terminated_length": 574.6612548828125, - "completions/min_length": 163.0, - "completions/min_terminated_length": 163.0, + "completions/max_length": 1386.0, + "completions/max_terminated_length": 1386.0, + "completions/mean_length": 616.38671875, + "completions/mean_terminated_length": 591.7217407226562, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.4074322811730468, "frac_reward_zero_std": 1.0, - "grad_norm": 1.027380515611281, - "kl": 0.306640625, + "grad_norm": 1.0018677373618705, + "kl": 0.2841796875, "learning_rate": 5.991445609204641e-07, - "loss": 0.0031, - "num_tokens": 284611314.0, + "loss": 0.0028, + "num_tokens": 273475268.0, "reward": 0.09687499701976776, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -12755,26 +12755,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1545.0, - "completions/max_terminated_length": 1545.0, - "completions/mean_length": 546.5234375, - "completions/mean_terminated_length": 546.5234375, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1678.0, + "completions/max_terminated_length": 1678.0, + "completions/mean_length": 574.8203125, + "completions/mean_terminated_length": 573.5812377929688, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, "epoch": 0.4083277367360645, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.0556622348002105, - "kl": 0.04693603515625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005519132621378801, + "kl": 0.05157470703125, "learning_rate": 5.949110205770292e-07, - "loss": -0.0011, - "num_tokens": 285224430.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 274102872.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 456 }, { @@ -12783,26 +12783,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1539.0, - "completions/max_terminated_length": 1539.0, - "completions/mean_length": 569.07421875, - "completions/mean_terminated_length": 569.07421875, - "completions/min_length": 179.0, - "completions/min_terminated_length": 179.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1485.0, + "completions/max_terminated_length": 1485.0, + "completions/mean_length": 590.720703125, + "completions/mean_terminated_length": 588.6372680664062, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.40922319229908216, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.038169107742384197, - "kl": 0.0482177734375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00569307441138671, + "kl": 0.05108642578125, "learning_rate": 5.90767903450964e-07, - "loss": -0.0023, - "num_tokens": 285823204.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 274712729.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 457 }, { @@ -12811,26 +12811,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1509.0, - "completions/mean_length": 579.447265625, - "completions/mean_terminated_length": 576.5733642578125, - "completions/min_length": 188.0, - "completions/min_terminated_length": 188.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 1336.0, + "completions/max_terminated_length": 1336.0, + "completions/mean_length": 597.15234375, + "completions/mean_terminated_length": 593.3929443359375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, "epoch": 0.41011864786209984, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.005065842696717293, - "kl": 0.047607421875, + "frac_reward_zero_std": 0.96875, + "grad_norm": 1.6059039024411552, + "kl": 0.0643310546875, "learning_rate": 5.867153833791652e-07, - "loss": 0.0005, - "num_tokens": 286402665.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0072, + "num_tokens": 275301255.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 458 }, { @@ -12839,20 +12839,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.953125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1992.0, - "completions/mean_length": 614.25, - "completions/mean_terminated_length": 605.7996215820312, - "completions/min_length": 112.0, - "completions/min_terminated_length": 112.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1331.0, + "completions/max_terminated_length": 1331.0, + "completions/mean_length": 591.708984375, + "completions/mean_terminated_length": 591.708984375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, "epoch": 0.4110141034251175, "frac_reward_zero_std": 1.0, - "grad_norm": 0.003912700365631719, - "kl": 0.04583740234375, + "grad_norm": 0.003911482325395383, + "kl": 0.04339599609375, "learning_rate": 5.827536303972587e-07, - "loss": 0.0005, - "num_tokens": 286987241.0, + "loss": 0.0004, + "num_tokens": 275874290.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -12867,26 +12867,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1813.0, - "completions/max_terminated_length": 1813.0, - "completions/mean_length": 560.013671875, - "completions/mean_terminated_length": 560.013671875, - "completions/min_length": 69.0, - "completions/min_terminated_length": 69.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1864.0, + "completions/max_terminated_length": 1864.0, + "completions/mean_length": 587.716796875, + "completions/mean_terminated_length": 586.3072509765625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, "epoch": 0.4119095589881352, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.004495489669199859, - "kl": 0.04718017578125, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.05776806822294325, + "kl": 0.0504150390625, "learning_rate": 5.78882810732465e-07, - "loss": 0.0005, - "num_tokens": 287555536.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": -0.0014, + "num_tokens": 276456769.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 460 }, { @@ -12895,26 +12895,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, + "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1580.0, - "completions/mean_length": 558.630859375, - "completions/mean_terminated_length": 555.7162475585938, - "completions/min_length": 130.0, - "completions/min_terminated_length": 130.0, + "completions/max_terminated_length": 1766.0, + "completions/mean_length": 578.732421875, + "completions/mean_terminated_length": 575.305908203125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, "epoch": 0.4128050145511529, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.0046506916985020715, - "kl": 0.0484619140625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07438159135182086, + "kl": 0.0528564453125, "learning_rate": 5.75103086796625e-07, - "loss": 0.0005, - "num_tokens": 288162883.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0161, + "num_tokens": 277074408.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 461 }, { @@ -12923,26 +12923,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1877.0, - "completions/max_terminated_length": 1877.0, - "completions/mean_length": 624.595703125, - "completions/mean_terminated_length": 624.595703125, - "completions/min_length": 239.0, - "completions/min_terminated_length": 239.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1618.0, + "completions/max_terminated_length": 1618.0, + "completions/mean_length": 642.39453125, + "completions/mean_terminated_length": 640.9647827148438, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, "epoch": 0.4137004701141706, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.0049626766542558935, - "kl": 0.0458984375, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06372747249738105, + "kl": 0.05255126953125, "learning_rate": 5.714146171793846e-07, - "loss": 0.0005, - "num_tokens": 288826548.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0026, + "num_tokens": 277747186.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 462 }, { @@ -12951,20 +12951,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1807.0, - "completions/mean_length": 675.462890625, - "completions/mean_terminated_length": 670.0804443359375, - "completions/min_length": 169.0, - "completions/min_terminated_length": 169.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1767.0, + "completions/max_terminated_length": 1767.0, + "completions/mean_length": 682.73828125, + "completions/mean_terminated_length": 681.872802734375, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, "epoch": 0.4145959256771883, "frac_reward_zero_std": 1.0, - "grad_norm": 0.008119931529559445, - "kl": 0.049072265625, + "grad_norm": 0.004279885734629871, + "kl": 0.05035400390625, "learning_rate": 5.678175566415422e-07, "loss": 0.0005, - "num_tokens": 289538001.0, + "num_tokens": 278462364.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -12979,20 +12979,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1913.0, - "completions/max_terminated_length": 1913.0, - "completions/mean_length": 582.626953125, - "completions/mean_terminated_length": 582.626953125, - "completions/min_length": 129.0, - "completions/min_terminated_length": 129.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1377.0, + "completions/max_terminated_length": 1377.0, + "completions/mean_length": 605.052734375, + "completions/mean_terminated_length": 603.4569091796875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, "epoch": 0.41549138124020596, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004928378991447442, - "kl": 0.049072265625, + "grad_norm": 0.004562879684523193, + "kl": 0.0518798828125, "learning_rate": 5.643120561085528e-07, "loss": 0.0005, - "num_tokens": 290164018.0, + "num_tokens": 279099863.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -13007,26 +13007,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1680.0, - "completions/max_terminated_length": 1680.0, - "completions/mean_length": 601.583984375, - "completions/mean_terminated_length": 601.583984375, - "completions/min_length": 217.0, - "completions/min_terminated_length": 217.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 1470.0, + "completions/max_terminated_length": 1470.0, + "completions/mean_length": 597.873046875, + "completions/mean_terminated_length": 595.24169921875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, "epoch": 0.41638683680322364, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.008088857980013029, - "kl": 0.04791259765625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 1.532391883408515, + "kl": 0.059814453125, "learning_rate": 5.608982626641991e-07, - "loss": 0.0005, - "num_tokens": 290800349.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0091, + "num_tokens": 279734294.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 465 }, { @@ -13035,20 +13035,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1930.0, - "completions/max_terminated_length": 1930.0, - "completions/mean_length": 568.7578125, - "completions/mean_terminated_length": 568.7578125, - "completions/min_length": 81.0, - "completions/min_terminated_length": 81.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1271.0, + "completions/max_terminated_length": 1271.0, + "completions/mean_length": 556.6953125, + "completions/mean_terminated_length": 556.0665283203125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, "epoch": 0.4172822923662413, "frac_reward_zero_std": 1.0, - "grad_norm": 0.024017793326594178, - "kl": 0.05133056640625, + "grad_norm": 0.004546148613238247, + "kl": 0.04937744140625, "learning_rate": 5.575763195444166e-07, "loss": 0.0005, - "num_tokens": 291339153.0, + "num_tokens": 280266922.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -13063,20 +13063,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1584.0, - "completions/max_terminated_length": 1584.0, - "completions/mean_length": 581.87109375, - "completions/mean_terminated_length": 581.87109375, - "completions/min_length": 147.0, - "completions/min_terminated_length": 147.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1401.0, + "completions/max_terminated_length": 1401.0, + "completions/mean_length": 585.609375, + "completions/mean_terminated_length": 584.6829833984375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, "epoch": 0.418177747929259, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004137094950964047, - "kl": 0.04852294921875, + "grad_norm": 0.009347005608886688, + "kl": 0.05615234375, "learning_rate": 5.543463661312847e-07, - "loss": 0.0005, - "num_tokens": 291955615.0, + "loss": 0.0006, + "num_tokens": 280885298.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -13091,20 +13091,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.9375, + "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1926.0, - "completions/mean_length": 632.26171875, - "completions/mean_terminated_length": 621.1141967773438, - "completions/min_length": 189.0, - "completions/min_terminated_length": 189.0, + "completions/max_terminated_length": 1803.0, + "completions/mean_length": 650.8828125, + "completions/mean_terminated_length": 645.8843383789062, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, "epoch": 0.41907320349227667, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06436363037629371, - "kl": 0.04949951171875, + "grad_norm": 0.044976200251507704, + "kl": 0.0523681640625, "learning_rate": 5.512085379471808e-07, - "loss": 0.0011, - "num_tokens": 292623845.0, + "loss": 0.0119, + "num_tokens": 281563062.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -13119,20 +13119,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 1338.0, - "completions/max_terminated_length": 1338.0, - "completions/mean_length": 540.55859375, - "completions/mean_terminated_length": 539.2700805664062, - "completions/min_length": 200.0, - "completions/min_terminated_length": 200.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1638.0, + "completions/max_terminated_length": 1638.0, + "completions/mean_length": 569.3828125, + "completions/mean_terminated_length": 565.9019775390625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, "epoch": 0.4199686590552944, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.9091175781068817, - "kl": 0.0504150390625, + "grad_norm": 0.060898844246720894, + "kl": 0.06231689453125, "learning_rate": 5.481629666490903e-07, - "loss": 0.006, - "num_tokens": 293211251.0, + "loss": -0.0039, + "num_tokens": 282165226.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -13148,25 +13148,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1309.0, - "completions/max_terminated_length": 1309.0, - "completions/mean_length": 571.876953125, - "completions/mean_terminated_length": 571.876953125, - "completions/min_length": 209.0, - "completions/min_terminated_length": 209.0, + "completions/max_length": 1286.0, + "completions/max_terminated_length": 1286.0, + "completions/mean_length": 584.701171875, + "completions/mean_terminated_length": 584.701171875, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, "epoch": 0.4208641146183121, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.08882810422258497, - "kl": 0.04718017578125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003834240543409795, + "kl": 0.04595947265625, "learning_rate": 5.452097800230853e-07, - "loss": 0.0039, - "num_tokens": 293818340.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 282778881.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 470 }, { @@ -13175,26 +13175,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1920.0, - "completions/mean_length": 618.740234375, - "completions/mean_terminated_length": 615.9432373046875, - "completions/min_length": 156.0, - "completions/min_terminated_length": 156.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1541.0, + "completions/max_terminated_length": 1541.0, + "completions/mean_length": 611.36328125, + "completions/mean_terminated_length": 611.36328125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, "epoch": 0.42175957018132976, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.08441184034971604, - "kl": 0.0458984375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003375383832642503, + "kl": 0.043701171875, "learning_rate": 5.423491019789623e-07, - "loss": 0.0047, - "num_tokens": 294451039.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0004, + "num_tokens": 283407803.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 471 }, { @@ -13204,19 +13204,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1859.0, - "completions/max_terminated_length": 1859.0, - "completions/mean_length": 571.236328125, - "completions/mean_terminated_length": 571.236328125, - "completions/min_length": 181.0, - "completions/min_terminated_length": 181.0, + "completions/max_length": 1437.0, + "completions/max_terminated_length": 1437.0, + "completions/mean_length": 588.123046875, + "completions/mean_terminated_length": 588.123046875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.42265502574434743, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004151029586595816, - "kl": 0.047607421875, + "grad_norm": 0.007763949257554806, + "kl": 0.04583740234375, "learning_rate": 5.395810525450425e-07, "loss": 0.0005, - "num_tokens": 295049912.0, + "num_tokens": 284015322.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -13231,26 +13231,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1622.0, - "completions/max_terminated_length": 1622.0, - "completions/mean_length": 585.880859375, - "completions/mean_terminated_length": 585.880859375, - "completions/min_length": 171.0, - "completions/min_terminated_length": 171.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1981.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 589.7265625, + "completions/mean_terminated_length": 588.6594848632812, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, "epoch": 0.4235504813073651, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.006773189374311078, - "kl": 0.0477294921875, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06245932963948891, + "kl": 0.0523681640625, "learning_rate": 5.369057478631359e-07, - "loss": 0.0005, - "num_tokens": 295687771.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.003, + "num_tokens": 284655150.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 473 }, { @@ -13259,26 +13259,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 1519.0, - "completions/max_terminated_length": 1519.0, - "completions/mean_length": 559.310546875, - "completions/mean_terminated_length": 555.547119140625, - "completions/min_length": 141.0, - "completions/min_terminated_length": 141.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1275.0, + "completions/mean_length": 588.333984375, + "completions/mean_terminated_length": 573.5711669921875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.4244459368703828, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05238104977419995, - "kl": 0.0484619140625, + "frac_reward_zero_std": 0.9375, + "grad_norm": 1.393684754940159, + "kl": 0.079345703125, "learning_rate": 5.343233001836694e-07, - "loss": 0.0077, - "num_tokens": 296294970.0, - "reward": 0.099609375, - "reward_std": 0.0010673906654119492, + "loss": 0.0239, + "num_tokens": 285277209.0, + "reward": 0.09902343899011612, + "reward_std": 0.0021787926089018583, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 0.990234375, + "rewards/format_reward/std": 0.09843364357948303, "step": 474 }, { @@ -13287,26 +13287,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.953125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1596.0, - "completions/mean_length": 597.630859375, - "completions/mean_terminated_length": 591.0942993164062, - "completions/min_length": 136.0, - "completions/min_terminated_length": 136.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1996.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 620.279296875, + "completions/mean_terminated_length": 616.6882934570312, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, "epoch": 0.42534139243340047, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.04626310549442998, - "kl": 0.0478515625, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.08588302090778102, + "kl": 0.04718017578125, "learning_rate": 5.318338178609754e-07, - "loss": 0.0124, - "num_tokens": 296916957.0, - "reward": 0.099609375, - "reward_std": 0.0010673906654119492, + "loss": 0.014, + "num_tokens": 285910792.0, + "reward": 0.09941406548023224, + "reward_std": 0.0018486406188458204, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 0.994140625, + "rewards/format_reward/std": 0.07639661431312561, "step": 475 }, { @@ -13315,20 +13315,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1873.0, - "completions/mean_length": 556.599609375, - "completions/mean_terminated_length": 550.7510375976562, - "completions/min_length": 177.0, - "completions/min_terminated_length": 177.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1860.0, + "completions/max_terminated_length": 1860.0, + "completions/mean_length": 572.791015625, + "completions/mean_terminated_length": 571.6007690429688, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, "epoch": 0.4262368479964182, "frac_reward_zero_std": 1.0, - "grad_norm": 0.0053069795960833605, - "kl": 0.04669189453125, + "grad_norm": 0.02767124078956891, + "kl": 0.06396484375, "learning_rate": 5.294374053487459e-07, - "loss": 0.0005, - "num_tokens": 297485056.0, + "loss": 0.0006, + "num_tokens": 286487181.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -13343,20 +13343,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1817.0, - "completions/max_terminated_length": 1817.0, - "completions/mean_length": 534.4140625, - "completions/mean_terminated_length": 534.4140625, - "completions/min_length": 167.0, - "completions/min_terminated_length": 167.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1404.0, + "completions/max_terminated_length": 1404.0, + "completions/mean_length": 537.857421875, + "completions/mean_terminated_length": 536.6569213867188, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, "epoch": 0.4271323035594359, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004148165474532734, - "kl": 0.04693603515625, + "grad_norm": 0.01657319794506783, + "kl": 0.0655517578125, "learning_rate": 5.271341631956511e-07, - "loss": 0.0005, - "num_tokens": 298039460.0, + "loss": 0.0007, + "num_tokens": 287043348.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -13371,26 +13371,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1695.0, - "completions/mean_length": 564.353515625, - "completions/mean_terminated_length": 561.4500732421875, - "completions/min_length": 143.0, - "completions/min_terminated_length": 143.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1387.0, + "completions/max_terminated_length": 1387.0, + "completions/mean_length": 567.2109375, + "completions/mean_terminated_length": 567.2109375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, "epoch": 0.42802775912245355, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.0361604804039374, - "kl": 0.04638671875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004291858410065178, + "kl": 0.04486083984375, "learning_rate": 5.249241880411181e-07, - "loss": 0.0153, - "num_tokens": 298646553.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0004, + "num_tokens": 287651904.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 478 }, { @@ -13399,26 +13399,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1789.0, - "completions/mean_length": 595.029296875, - "completions/mean_terminated_length": 592.1859130859375, - "completions/min_length": 174.0, - "completions/min_terminated_length": 174.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 1304.0, + "completions/max_terminated_length": 1304.0, + "completions/mean_length": 596.017578125, + "completions/mean_terminated_length": 592.281494140625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, "epoch": 0.42892321468547123, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.07503338366587108, - "kl": 0.048583984375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007262425387433753, + "kl": 0.0606689453125, "learning_rate": 5.228075726112785e-07, - "loss": 0.0007, - "num_tokens": 299286024.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0006, + "num_tokens": 288291881.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 479 }, { @@ -13427,26 +13427,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1694.0, - "completions/max_terminated_length": 1694.0, - "completions/mean_length": 589.59375, - "completions/mean_terminated_length": 589.59375, - "completions/min_length": 173.0, - "completions/min_terminated_length": 173.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1754.0, + "completions/mean_length": 625.6953125, + "completions/mean_terminated_length": 620.923583984375, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, "epoch": 0.4298186702484889, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.003932622151853824, - "kl": 0.048095703125, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07604608796303157, + "kl": 0.0860595703125, "learning_rate": 5.207844057150768e-07, - "loss": 0.0005, - "num_tokens": 299916216.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0169, + "num_tokens": 288940557.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 480 }, { @@ -13455,20 +13455,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1600.0, - "completions/mean_length": 556.908203125, - "completions/mean_terminated_length": 553.990234375, - "completions/min_length": 144.0, - "completions/min_terminated_length": 144.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1301.0, + "completions/max_terminated_length": 1301.0, + "completions/mean_length": 566.443359375, + "completions/mean_terminated_length": 566.443359375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, "epoch": 0.4307141258115066, "frac_reward_zero_std": 1.0, - "grad_norm": 0.0046160433450423335, - "kl": 0.05029296875, + "grad_norm": 0.0056663762582022026, + "kl": 0.04766845703125, "learning_rate": 5.188547722405437e-07, "loss": 0.0005, - "num_tokens": 300529225.0, + "num_tokens": 289558448.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -13484,19 +13484,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1835.0, - "completions/max_terminated_length": 1835.0, - "completions/mean_length": 611.91796875, - "completions/mean_terminated_length": 611.91796875, - "completions/min_length": 211.0, - "completions/min_terminated_length": 211.0, + "completions/max_length": 1752.0, + "completions/max_terminated_length": 1752.0, + "completions/mean_length": 601.892578125, + "completions/mean_terminated_length": 601.892578125, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, "epoch": 0.43160958137452426, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004081618380481847, - "kl": 0.04681396484375, + "grad_norm": 0.0035106948516688734, + "kl": 0.04449462890625, "learning_rate": 5.170187531512351e-07, - "loss": 0.0005, - "num_tokens": 301174607.0, + "loss": 0.0004, + "num_tokens": 290198697.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -13511,20 +13511,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1538.0, - "completions/max_terminated_length": 1538.0, - "completions/mean_length": 561.107421875, - "completions/mean_terminated_length": 561.107421875, - "completions/min_length": 127.0, - "completions/min_terminated_length": 127.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1490.0, + "completions/max_terminated_length": 1490.0, + "completions/mean_length": 573.759765625, + "completions/mean_terminated_length": 572.25048828125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.432505036937542, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005272474133392292, - "kl": 0.05059814453125, + "grad_norm": 0.03392016125236342, + "kl": 0.0640869140625, "learning_rate": 5.152764254828348e-07, - "loss": 0.0005, - "num_tokens": 301783334.0, + "loss": 0.0006, + "num_tokens": 290813902.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -13540,25 +13540,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1796.0, - "completions/max_terminated_length": 1796.0, - "completions/mean_length": 575.103515625, - "completions/mean_terminated_length": 575.103515625, - "completions/min_length": 176.0, - "completions/min_terminated_length": 176.0, + "completions/max_length": 1524.0, + "completions/max_terminated_length": 1524.0, + "completions/mean_length": 576.0390625, + "completions/mean_terminated_length": 576.0390625, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.4334004925005597, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.004371253746007476, - "kl": 0.04931640625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.04602480512840631, + "kl": 0.0469970703125, "learning_rate": 5.136278623399225e-07, - "loss": 0.0005, - "num_tokens": 302405147.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": -0.0029, + "num_tokens": 291436194.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 484 }, { @@ -13567,26 +13567,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 1587.0, - "completions/max_terminated_length": 1587.0, - "completions/mean_length": 557.376953125, - "completions/mean_terminated_length": 555.73779296875, - "completions/min_length": 149.0, - "completions/min_terminated_length": 149.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 1310.0, + "completions/max_terminated_length": 1310.0, + "completions/mean_length": 566.806640625, + "completions/mean_terminated_length": 563.4970703125, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, "epoch": 0.43429594806357735, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.00631760392876256, - "kl": 0.04669189453125, + "frac_reward_zero_std": 0.9375, + "grad_norm": 1.4297514894292327, + "kl": 0.0731201171875, "learning_rate": 5.120731328929058e-07, - "loss": 0.0005, - "num_tokens": 302999388.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0004, + "num_tokens": 292035263.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 485 }, { @@ -13595,20 +13595,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 1474.0, - "completions/max_terminated_length": 1474.0, - "completions/mean_length": 579.818359375, - "completions/mean_terminated_length": 578.4951171875, - "completions/min_length": 201.0, - "completions/min_terminated_length": 201.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1402.0, + "completions/mean_length": 593.1875, + "completions/mean_terminated_length": 587.4823608398438, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, "epoch": 0.43519140362659503, "frac_reward_zero_std": 0.96875, - "grad_norm": 0.7675240642119238, - "kl": 0.05047607421875, + "grad_norm": 0.2048485997684941, + "kl": 0.08758544921875, "learning_rate": 5.106123023751187e-07, - "loss": 0.0069, - "num_tokens": 303594319.0, + "loss": 0.0123, + "num_tokens": 292637039.0, "reward": 0.09980468451976776, "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, @@ -13623,20 +13623,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, + "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, - "completions/max_terminated_length": 1803.0, - "completions/mean_length": 610.142578125, - "completions/mean_terminated_length": 604.5039672851562, - "completions/min_length": 199.0, - "completions/min_terminated_length": 199.0, + "completions/max_terminated_length": 1409.0, + "completions/mean_length": 620.24609375, + "completions/mean_terminated_length": 617.4520263671875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, "epoch": 0.4360868591896127, "frac_reward_zero_std": 1.0, - "grad_norm": 0.003966772271342612, - "kl": 0.04791259765625, + "grad_norm": 0.004771668900955531, + "kl": 0.046630859375, "learning_rate": 5.092454320800833e-07, "loss": 0.0005, - "num_tokens": 304248552.0, + "num_tokens": 293296445.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -13651,26 +13651,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.953125, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1912.0, - "completions/mean_length": 583.4921875, - "completions/mean_terminated_length": 574.8605346679688, - "completions/min_length": 159.0, - "completions/min_terminated_length": 159.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1690.0, + "completions/max_terminated_length": 1690.0, + "completions/mean_length": 562.4921875, + "completions/mean_terminated_length": 562.4921875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, "epoch": 0.4369823147526304, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06718584348509721, - "kl": 0.04815673828125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004316334400510252, + "kl": 0.04510498046875, "learning_rate": 5.079725793589405e-07, - "loss": 0.0119, - "num_tokens": 304857716.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 293894857.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 488 }, { @@ -13679,26 +13679,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1631.0, - "completions/mean_length": 587.619140625, - "completions/mean_terminated_length": 584.76123046875, - "completions/min_length": 151.0, - "completions/min_terminated_length": 151.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1488.0, + "completions/max_terminated_length": 1488.0, + "completions/mean_length": 586.365234375, + "completions/mean_terminated_length": 586.365234375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, "epoch": 0.43787777031564806, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.06044390352276465, - "kl": 0.04400634765625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0038624710298466006, + "kl": 0.0440673828125, "learning_rate": 5.067937976180407e-07, - "loss": 0.0005, - "num_tokens": 305463137.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0004, + "num_tokens": 294499636.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 489 }, { @@ -13707,20 +13707,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1466.0, - "completions/max_terminated_length": 1466.0, - "completions/mean_length": 587.826171875, - "completions/mean_terminated_length": 587.826171875, - "completions/min_length": 148.0, - "completions/min_terminated_length": 148.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2036.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 591.830078125, + "completions/mean_terminated_length": 589.8392333984375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, "epoch": 0.4387732258786658, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004340416142239811, - "kl": 0.04534912109375, + "grad_norm": 0.08591994390512282, + "kl": 0.093994140625, "learning_rate": 5.057091363167046e-07, - "loss": 0.0005, - "num_tokens": 306081032.0, + "loss": 0.0009, + "num_tokens": 295119581.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -13735,20 +13735,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1335.0, - "completions/max_terminated_length": 1335.0, - "completions/mean_length": 544.626953125, - "completions/mean_terminated_length": 544.626953125, - "completions/min_length": 165.0, - "completions/min_terminated_length": 165.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1211.0, + "completions/max_terminated_length": 1211.0, + "completions/mean_length": 550.6796875, + "completions/mean_terminated_length": 548.62158203125, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, "epoch": 0.4396686814416835, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005717450638553965, - "kl": 0.0474853515625, + "grad_norm": 0.05506883519186736, + "kl": 0.0794677734375, "learning_rate": 5.047186409651489e-07, - "loss": 0.0005, - "num_tokens": 306656489.0, + "loss": 0.0008, + "num_tokens": 295698137.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -13763,26 +13763,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1721.0, - "completions/mean_length": 613.29296875, - "completions/mean_terminated_length": 607.6666870117188, - "completions/min_length": 170.0, - "completions/min_terminated_length": 170.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1543.0, + "completions/max_terminated_length": 1543.0, + "completions/mean_length": 608.87109375, + "completions/mean_terminated_length": 608.0391235351562, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, "epoch": 0.44056413700470115, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05273242178002334, - "kl": 0.0477294921875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0038651946743403155, + "kl": 0.04791259765625, "learning_rate": 5.038223531225742e-07, - "loss": 0.0115, - "num_tokens": 307264911.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 296304295.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 492 }, { @@ -13792,19 +13792,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1694.0, - "completions/max_terminated_length": 1694.0, - "completions/mean_length": 554.16796875, - "completions/mean_terminated_length": 554.16796875, - "completions/min_length": 117.0, - "completions/min_terminated_length": 117.0, + "completions/max_length": 1739.0, + "completions/max_terminated_length": 1739.0, + "completions/mean_length": 569.330078125, + "completions/mean_terminated_length": 569.330078125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, "epoch": 0.44145959256771883, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004061493509151464, - "kl": 0.04656982421875, + "grad_norm": 0.003558131893094743, + "kl": 0.04473876953125, "learning_rate": 5.030203103954232e-07, - "loss": 0.0005, - "num_tokens": 307848181.0, + "loss": 0.0004, + "num_tokens": 296895328.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -13820,19 +13820,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1524.0, - "completions/max_terminated_length": 1524.0, - "completions/mean_length": 558.9453125, - "completions/mean_terminated_length": 558.9453125, - "completions/min_length": 161.0, - "completions/min_terminated_length": 161.0, + "completions/max_length": 1505.0, + "completions/max_terminated_length": 1505.0, + "completions/mean_length": 579.6875, + "completions/mean_terminated_length": 579.6875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, "epoch": 0.4423550481307365, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004367225471207651, - "kl": 0.04486083984375, + "grad_norm": 0.01139014607119812, + "kl": 0.04290771484375, "learning_rate": 5.023125464358026e-07, "loss": 0.0004, - "num_tokens": 308417529.0, + "num_tokens": 297475296.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -13847,26 +13847,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1677.0, - "completions/max_terminated_length": 1677.0, - "completions/mean_length": 551.27734375, - "completions/mean_terminated_length": 551.27734375, - "completions/min_length": 133.0, - "completions/min_terminated_length": 133.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1456.0, + "completions/max_terminated_length": 1456.0, + "completions/mean_length": 560.001953125, + "completions/mean_terminated_length": 559.4677124023438, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, "epoch": 0.4432505036937542, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.00418532184825478, - "kl": 0.047119140625, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.09503865862602394, + "kl": 0.06842041015625, "learning_rate": 5.016990909400709e-07, - "loss": 0.0005, - "num_tokens": 308975255.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.003, + "num_tokens": 298037489.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, "step": 495 }, { @@ -13876,19 +13876,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1497.0, - "completions/max_terminated_length": 1497.0, - "completions/mean_length": 601.00390625, - "completions/mean_terminated_length": 601.00390625, - "completions/min_length": 179.0, - "completions/min_terminated_length": 179.0, + "completions/max_length": 1940.0, + "completions/max_terminated_length": 1940.0, + "completions/mean_length": 616.166015625, + "completions/mean_terminated_length": 616.166015625, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, "epoch": 0.44414595925677186, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004602536366235129, - "kl": 0.04852294921875, + "grad_norm": 0.0038706228148763185, + "kl": 0.04656982421875, "learning_rate": 5.011799696475915e-07, "loss": 0.0005, - "num_tokens": 309577561.0, + "num_tokens": 298647558.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -13903,20 +13903,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1624.0, - "completions/mean_length": 531.177734375, - "completions/mean_terminated_length": 528.2094116210938, - "completions/min_length": 181.0, - "completions/min_terminated_length": 181.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1132.0, + "completions/max_terminated_length": 1132.0, + "completions/mean_length": 553.35546875, + "completions/mean_terminated_length": 553.35546875, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, "epoch": 0.4450414148197896, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004030212878424051, - "kl": 0.04644775390625, + "grad_norm": 0.004194278776945329, + "kl": 0.0443115234375, "learning_rate": 5.007552043396547e-07, - "loss": 0.0005, - "num_tokens": 310151220.0, + "loss": 0.0004, + "num_tokens": 299232572.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -13932,25 +13932,25 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1416.0, - "completions/mean_length": 545.96875, - "completions/mean_terminated_length": 543.0293579101562, - "completions/min_length": 152.0, - "completions/min_terminated_length": 152.0, + "completions/max_length": 1570.0, + "completions/max_terminated_length": 1570.0, + "completions/mean_length": 569.85546875, + "completions/mean_terminated_length": 568.876708984375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, "epoch": 0.44593687038280727, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.04970801239600626, - "kl": 0.047119140625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004197022412970387, + "kl": 0.0450439453125, "learning_rate": 5.004248128385618e-07, - "loss": -0.0006, - "num_tokens": 310755700.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0005, + "num_tokens": 299849282.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 498 }, { @@ -13959,20 +13959,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1445.0, - "completions/max_terminated_length": 1445.0, - "completions/mean_length": 556.638671875, - "completions/mean_terminated_length": 556.638671875, - "completions/min_length": 185.0, - "completions/min_terminated_length": 185.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1234.0, + "completions/max_terminated_length": 1234.0, + "completions/mean_length": 557.01953125, + "completions/mean_terminated_length": 555.427490234375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, "epoch": 0.44683232594582495, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004571033839926497, - "kl": 0.0477294921875, + "grad_norm": 0.06481738260450282, + "kl": 0.08477783203125, "learning_rate": 5.001888090068784e-07, - "loss": 0.0005, - "num_tokens": 311324955.0, + "loss": 0.0008, + "num_tokens": 300418732.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -13988,19 +13988,19 @@ "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, - "completions/max_length": 1470.0, - "completions/max_terminated_length": 1470.0, - "completions/mean_length": 580.17578125, - "completions/mean_terminated_length": 580.17578125, - "completions/min_length": 204.0, - "completions/min_terminated_length": 204.0, + "completions/max_length": 1447.0, + "completions/max_terminated_length": 1447.0, + "completions/mean_length": 601.1328125, + "completions/mean_terminated_length": 601.1328125, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, "epoch": 0.4477277815088426, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004267414031586906, - "kl": 0.04730224609375, + "grad_norm": 0.009343548268136184, + "kl": 0.046630859375, "learning_rate": 5.000472027468528e-07, "loss": 0.0005, - "num_tokens": 311967237.0, + "num_tokens": 301071744.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -14015,26 +14015,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1609.0, - "completions/max_terminated_length": 1609.0, - "completions/mean_length": 606.591796875, - "completions/mean_terminated_length": 606.591796875, - "completions/min_length": 146.0, - "completions/min_terminated_length": 146.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1460.0, + "completions/max_terminated_length": 1460.0, + "completions/mean_length": 603.708984375, + "completions/mean_terminated_length": 602.6790771484375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.4486232370718603, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.05086077162905058, - "kl": 0.0474853515625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034444504126124674, + "kl": 0.065673828125, "learning_rate": 5.000000000000001e-07, - "loss": 0.0011, - "num_tokens": 312581300.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0007, + "num_tokens": 301684331.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 501 }, { @@ -14043,26 +14043,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1779.0, - "completions/mean_length": 599.373046875, - "completions/mean_terminated_length": 596.5381469726562, - "completions/min_length": 108.0, - "completions/min_terminated_length": 108.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1435.0, + "completions/max_terminated_length": 1435.0, + "completions/mean_length": 607.1640625, + "completions/mean_terminated_length": 607.1640625, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, "epoch": 0.449518692634878, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.056542864188258934, - "kl": 0.0482177734375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004137901914690784, + "kl": 0.044921875, "learning_rate": 5.000472027468528e-07, - "loss": 0.0146, - "num_tokens": 313236387.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0004, + "num_tokens": 302343407.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 502 }, { @@ -14071,26 +14071,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1390.0, - "completions/max_terminated_length": 1390.0, - "completions/mean_length": 565.26171875, - "completions/mean_terminated_length": 565.26171875, - "completions/min_length": 186.0, - "completions/min_terminated_length": 186.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1381.0, + "completions/mean_length": 597.3046875, + "completions/mean_terminated_length": 591.61572265625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, "epoch": 0.45041414819789566, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.003944884798198514, - "kl": 0.04736328125, + "frac_reward_zero_std": 0.96875, + "grad_norm": 2.232073607669622, + "kl": 0.076171875, "learning_rate": 5.001888090068784e-07, - "loss": 0.0005, - "num_tokens": 313811833.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0171, + "num_tokens": 302935259.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 503 }, { @@ -14099,20 +14099,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1529.0, - "completions/max_terminated_length": 1529.0, - "completions/mean_length": 568.341796875, - "completions/mean_terminated_length": 568.341796875, - "completions/min_length": 203.0, - "completions/min_terminated_length": 203.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1283.0, + "completions/max_terminated_length": 1283.0, + "completions/mean_length": 577.80078125, + "completions/mean_terminated_length": 577.1917724609375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, "epoch": 0.4513096037609134, "frac_reward_zero_std": 1.0, - "grad_norm": 0.005105667807983009, - "kl": 0.0458984375, + "grad_norm": 0.004036384939905687, + "kl": 0.04498291015625, "learning_rate": 5.004248128385618e-07, - "loss": 0.0005, - "num_tokens": 314399896.0, + "loss": 0.0004, + "num_tokens": 303528165.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -14127,26 +14127,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1788.0, - "completions/max_terminated_length": 1788.0, - "completions/mean_length": 609.244140625, - "completions/mean_terminated_length": 609.244140625, - "completions/min_length": 175.0, - "completions/min_terminated_length": 175.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1884.0, + "completions/mean_length": 626.9296875, + "completions/mean_terminated_length": 622.58154296875, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, "epoch": 0.45220505932393107, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.0043507543970794085, - "kl": 0.0474853515625, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.056029796429855026, + "kl": 0.04736328125, "learning_rate": 5.007552043396547e-07, - "loss": 0.0005, - "num_tokens": 315061797.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "loss": 0.0122, + "num_tokens": 304199121.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 505 }, { @@ -14155,26 +14155,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.984375, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1824.0, - "completions/mean_length": 574.73828125, - "completions/mean_terminated_length": 571.8551635742188, - "completions/min_length": 139.0, - "completions/min_terminated_length": 139.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1688.0, + "completions/max_terminated_length": 1688.0, + "completions/mean_length": 585.638671875, + "completions/mean_terminated_length": 583.049072265625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, "epoch": 0.45310051488694875, - "frac_reward_zero_std": 0.96875, - "grad_norm": 0.08176921037298442, - "kl": 0.0460205078125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16384123749604673, + "kl": 0.14178466796875, "learning_rate": 5.011799696475915e-07, - "loss": 0.0138, - "num_tokens": 315676863.0, - "reward": 0.09980468451976776, - "reward_std": 0.0007812500116415322, + "loss": 0.0014, + "num_tokens": 304819768.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.998046875, - "rewards/format_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 506 }, { @@ -14183,20 +14183,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -6.96875, - "completions/max_length": 2048.0, - "completions/max_terminated_length": 1696.0, - "completions/mean_length": 594.41796875, - "completions/mean_terminated_length": 588.7176513671875, - "completions/min_length": 176.0, - "completions/min_terminated_length": 176.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1327.0, + "completions/max_terminated_length": 1327.0, + "completions/mean_length": 600.677734375, + "completions/mean_terminated_length": 599.6751708984375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, "epoch": 0.4539959704499664, "frac_reward_zero_std": 1.0, - "grad_norm": 0.004202191868376494, - "kl": 0.04669189453125, + "grad_norm": 0.02131276823537405, + "kl": 0.06005859375, "learning_rate": 5.016990909400706e-07, - "loss": 0.0005, - "num_tokens": 316299381.0, + "loss": 0.0006, + "num_tokens": 305445491.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, @@ -14211,26 +14211,26 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1558.0, - "completions/max_terminated_length": 1558.0, - "completions/mean_length": 567.14453125, - "completions/mean_terminated_length": 567.14453125, - "completions/min_length": 209.0, - "completions/min_terminated_length": 209.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1673.0, + "completions/max_terminated_length": 1673.0, + "completions/mean_length": 571.474609375, + "completions/mean_terminated_length": 570.0215454101562, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, "epoch": 0.4548914260129841, - "frac_reward_zero_std": 0.9375, - "grad_norm": 0.06400276605298859, - "kl": 0.0452880859375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8224324394461862, + "kl": 0.51495361328125, "learning_rate": 5.023125464358026e-07, - "loss": -0.0009, - "num_tokens": 316861775.0, - "reward": 0.099609375, - "reward_std": 0.0015625000232830644, + "loss": 0.0051, + "num_tokens": 306010102.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 0.99609375, - "rewards/format_reward/std": 0.06243881583213806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, "step": 508 }, { @@ -14239,41 +14239,41 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": -7.0, - "completions/max_length": 1631.0, - "completions/max_terminated_length": 1631.0, - "completions/mean_length": 544.9765625, - "completions/mean_terminated_length": 544.9765625, - "completions/min_length": 194.0, - "completions/min_terminated_length": 194.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1119.0, + "completions/max_terminated_length": 1119.0, + "completions/mean_length": 553.423828125, + "completions/mean_terminated_length": 552.639892578125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, "epoch": 0.4557868815760018, - "frac_reward_zero_std": 1.0, - "grad_norm": 0.004374618149752765, - "kl": 0.04608154296875, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06463336869087515, + "kl": 0.046630859375, "learning_rate": 5.03020310395423e-07, "loss": 0.0005, - "num_tokens": 317450899.0, - "reward": 0.10000000149011612, - "reward_std": 0.0, + "num_tokens": 306603551.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, - "rewards/format_reward/mean": 1.0, - "rewards/format_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, "step": 509 }, { "epoch": 0.4557868815760018, "step": 509, "total_flos": 0.0, - "train_loss": 9.052564064315237e-07, - "train_runtime": 75.296, - "train_samples_per_second": 3399.916, - "train_steps_per_second": 6.64 + "train_loss": 1.0038919752364893e-06, + "train_runtime": 66.6209, + "train_samples_per_second": 3842.636, + "train_steps_per_second": 7.505 } ], "logging_steps": 1, "max_steps": 500, - "num_input_tokens_seen": 317450899, + "num_input_tokens_seen": 306603551, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": {