diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,7 +2,7 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.56, + "epoch": 0.28, "eval_steps": 500, "global_step": 140, "is_hyper_param_search": false, @@ -15,22 +15,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 1261.0, - "completions/max_terminated_length": 1261.0, - "completions/mean_length": 411.5, - "completions/mean_terminated_length": 470.2857142857143, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 1005.0, + "completions/max_terminated_length": 1005.0, + "completions/mean_length": 442.6666666666667, + "completions/mean_terminated_length": 482.90909090909093, "completions/min_length": 0.0, - "completions/min_terminated_length": 165.0, - "epoch": 0.004, + "completions/min_terminated_length": 212.0, + "epoch": 0.002, "format_failures": 0.0, - "grad_norm": 0.3164481222629547, + "grad_norm": 0.3274489641189575, "kl": 0.0, "learning_rate": 0.0, - "loss": 0.0574, - "num_tokens": 20912.0, - "reward": 0.10000000149011612, - "reward_std": 0.19272480905056, + "loss": 0.048, + "num_tokens": 21804.0, + "reward": 0.26185137033462524, + "reward_std": 0.28920137882232666, "step": 1 }, { @@ -39,22 +39,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 125.0, - "completions/max_terminated_length": 125.0, - "completions/mean_length": 93.625, - "completions/mean_terminated_length": 107.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 136.5, + "completions/mean_terminated_length": 148.9090909090909, "completions/min_length": 0.0, - "completions/min_terminated_length": 30.0, - "epoch": 0.008, + "completions/min_terminated_length": 60.0, + "epoch": 0.004, "format_failures": 0.0, - "grad_norm": 3.300063133239746, + "grad_norm": 1.2693145275115967, "kl": 0.0, "learning_rate": 1e-06, - "loss": -0.032, - "num_tokens": 28472.0, - "reward": 0.5, - "reward_std": 0.5345224738121033, + "loss": 0.0962, + "num_tokens": 42324.0, + "reward": 0.38461539149284363, + "reward_std": 0.3770364224910736, "step": 2 }, { @@ -63,20 +63,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 228.0, - "completions/max_terminated_length": 228.0, - "completions/mean_length": 170.375, - "completions/mean_terminated_length": 194.71428571428572, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 217.83333333333334, + "completions/mean_terminated_length": 237.63636363636363, "completions/min_length": 0.0, - "completions/min_terminated_length": 138.0, - "epoch": 0.012, + "completions/min_terminated_length": 124.0, + "epoch": 0.006, "format_failures": 0.0, - "grad_norm": 0.426563024520874, - "kl": 0.19075269997119904, + "grad_norm": 0.3044165074825287, + "kl": 0.19029825925827026, "learning_rate": 1e-06, - "loss": 0.0011, - "num_tokens": 38272.0, + "loss": 0.0009, + "num_tokens": 58980.0, "reward": 0.0, "reward_std": 0.0, "step": 3 @@ -87,22 +87,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 302.0, - "completions/max_terminated_length": 302.0, - "completions/mean_length": 215.75, - "completions/mean_terminated_length": 246.57142857142858, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 896.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 321.0833333333333, + "completions/mean_terminated_length": 350.27272727272725, "completions/min_length": 0.0, - "completions/min_terminated_length": 191.0, - "epoch": 0.016, + "completions/min_terminated_length": 103.0, + "epoch": 0.008, "format_failures": 1.0, - "grad_norm": 0.3638526201248169, - "kl": 0.0030522841261699796, + "grad_norm": 0.3372040390968323, + "kl": 0.029289670288562775, "learning_rate": 1e-06, - "loss": 0.0265, - "num_tokens": 44880.0, - "reward": 0.17291666567325592, - "reward_std": 0.16665178537368774, + "loss": 0.1107, + "num_tokens": 81756.0, + "reward": 0.23689448833465576, + "reward_std": 0.2267814427614212, "step": 4 }, { @@ -111,22 +111,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 129.0, - "completions/max_terminated_length": 129.0, - "completions/mean_length": 88.625, - "completions/mean_terminated_length": 101.28571428571429, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 119.08333333333333, + "completions/mean_terminated_length": 129.9090909090909, "completions/min_length": 0.0, "completions/min_terminated_length": 66.0, - "epoch": 0.02, - "format_failures": 1.0, - "grad_norm": 12.54277515411377, - "kl": 1.5523776412010193, + "epoch": 0.01, + "format_failures": 0.0, + "grad_norm": 10.779764175415039, + "kl": 3.1303787231445312, "learning_rate": 1e-06, - "loss": 0.0192, - "num_tokens": 54104.0, - "reward": 0.0, - "reward_std": 0.0, + "loss": 0.0311, + "num_tokens": 96360.0, + "reward": 0.1666666716337204, + "reward_std": 0.30772873759269714, "step": 5 }, { @@ -135,22 +135,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 388.0, - "completions/max_terminated_length": 388.0, - "completions/mean_length": 271.75, - "completions/mean_terminated_length": 310.57142857142856, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 420.6666666666667, + "completions/mean_terminated_length": 458.90909090909093, "completions/min_length": 0.0, - "completions/min_terminated_length": 238.0, - "epoch": 0.024, - "format_failures": 0.0, - "grad_norm": 0.432476669549942, - "kl": 0.0021984531776979566, + "completions/min_terminated_length": 329.0, + "epoch": 0.012, + "format_failures": 1.0, + "grad_norm": 0.2519327402114868, + "kl": 0.016291129169985652, "learning_rate": 1e-06, - "loss": -0.088, - "num_tokens": 66888.0, - "reward": 0.2569444477558136, - "reward_std": 0.27688348293304443, + "loss": 0.0559, + "num_tokens": 119712.0, + "reward": 0.34878918528556824, + "reward_std": 0.2739146649837494, "step": 6 }, { @@ -159,20 +159,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 200.0, - "completions/max_terminated_length": 200.0, - "completions/mean_length": 82.625, - "completions/mean_terminated_length": 94.42857142857143, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 67.33333333333333, + "completions/mean_terminated_length": 73.45454545454545, "completions/min_length": 0.0, - "completions/min_terminated_length": 48.0, - "epoch": 0.028, + "completions/min_terminated_length": 36.0, + "epoch": 0.014, "format_failures": 0.0, - "grad_norm": 0.0007834668504074216, - "kl": 0.000487034791149199, + "grad_norm": 2531.101806640625, + "kl": 562.2636108398438, "learning_rate": 1e-06, - "loss": 0.0, - "num_tokens": 87976.0, + "loss": 5.4405, + "num_tokens": 128772.0, "reward": 0.0, "reward_std": 0.0, "step": 7 @@ -183,22 +183,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 182.0, - "completions/max_terminated_length": 182.0, - "completions/mean_length": 111.75, - "completions/mean_terminated_length": 127.71428571428571, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 186.41666666666666, + "completions/mean_terminated_length": 203.36363636363637, "completions/min_length": 0.0, - "completions/min_terminated_length": 94.0, - "epoch": 0.032, + "completions/min_terminated_length": 85.0, + "epoch": 0.016, "format_failures": 0.0, - "grad_norm": 0.2904910445213318, - "kl": 0.0784255713224411, + "grad_norm": 0.7023671865463257, + "kl": 0.0004708967899205163, "learning_rate": 1e-06, - "loss": 0.0007, - "num_tokens": 97376.0, - "reward": 0.0, - "reward_std": 0.0, + "loss": -0.1143, + "num_tokens": 164100.0, + "reward": 0.06388889253139496, + "reward_std": 0.1274919956922531, "step": 8 }, { @@ -207,22 +207,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 643.0, - "completions/max_terminated_length": 643.0, - "completions/mean_length": 297.25, - "completions/mean_terminated_length": 339.7142857142857, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 253.41666666666666, + "completions/mean_terminated_length": 276.45454545454544, "completions/min_length": 0.0, - "completions/min_terminated_length": 200.0, - "epoch": 0.036, + "completions/min_terminated_length": 165.0, + "epoch": 0.018, "format_failures": 0.0, - "grad_norm": 0.5291862487792969, - "kl": 0.006049621384590864, + "grad_norm": 1.1911135911941528, + "kl": 0.0012580148177221417, "learning_rate": 1e-06, - "loss": 0.046, - "num_tokens": 110264.0, - "reward": 0.2834821343421936, - "reward_std": 0.3961408734321594, + "loss": -0.3277, + "num_tokens": 197808.0, + "reward": 0.1118159219622612, + "reward_std": 0.2614404261112213, "step": 9 }, { @@ -231,22 +231,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.25, - "completions/max_length": 62.0, - "completions/max_terminated_length": 62.0, - "completions/mean_length": 37.625, - "completions/mean_terminated_length": 50.166666666666664, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 64.83333333333333, + "completions/mean_terminated_length": 70.72727272727273, "completions/min_length": 0.0, - "completions/min_terminated_length": 42.0, - "epoch": 0.04, + "completions/min_terminated_length": 35.0, + "epoch": 0.02, "format_failures": 0.0, - "grad_norm": 1.7151610851287842, - "kl": 0.2360311597585678, + "grad_norm": 1.324984073638916, + "kl": 0.2648707218468189, "learning_rate": 1e-06, - "loss": 0.004, - "num_tokens": 115504.0, - "reward": 0.0, - "reward_std": 0.0, + "loss": -0.0221, + "num_tokens": 207000.0, + "reward": 0.01666666753590107, + "reward_std": 0.057735029608011246, "step": 10 }, { @@ -255,22 +255,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 154.0, - "completions/max_terminated_length": 154.0, - "completions/mean_length": 109.5, - "completions/mean_terminated_length": 125.14285714285714, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 126.33333333333333, + "completions/mean_terminated_length": 137.8181818181818, "completions/min_length": 0.0, "completions/min_terminated_length": 62.0, - "epoch": 0.044, + "epoch": 0.022, "format_failures": 0.0, - "grad_norm": 1.5465294122695923, - "kl": 0.01262557739391923, + "grad_norm": 0.5873882174491882, + "kl": 0.017587594222277403, "learning_rate": 1e-06, - "loss": 0.1145, - "num_tokens": 125936.0, - "reward": 0.6499999761581421, - "reward_std": 0.4869731664657593, + "loss": 0.0197, + "num_tokens": 221808.0, + "reward": 0.1805555671453476, + "reward_std": 0.3134874999523163, "step": 11 }, { @@ -279,22 +279,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 385.0, - "completions/max_terminated_length": 385.0, - "completions/mean_length": 278.625, - "completions/mean_terminated_length": 318.42857142857144, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 2049.0, + "completions/max_terminated_length": 2049.0, + "completions/mean_length": 541.25, + "completions/mean_terminated_length": 649.5, "completions/min_length": 0.0, - "completions/min_terminated_length": 261.0, - "epoch": 0.048, + "completions/min_terminated_length": 137.0, + "epoch": 0.024, "format_failures": 0.0, - "grad_norm": 0.49245283007621765, - "kl": 0.01833944395184517, + "grad_norm": 0.48546102643013, + "kl": 0.002345994464121759, "learning_rate": 1e-06, - "loss": 0.0385, - "num_tokens": 134920.0, - "reward": 0.543181836605072, - "reward_std": 0.3499283194541931, + "loss": 0.0336, + "num_tokens": 255132.0, + "reward": 0.4682539701461792, + "reward_std": 0.4320843815803528, "step": 12 }, { @@ -303,20 +303,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 69.0, - "completions/max_terminated_length": 69.0, - "completions/mean_length": 39.5, - "completions/mean_terminated_length": 45.142857142857146, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 29.666666666666668, + "completions/mean_terminated_length": 32.36363636363637, "completions/min_length": 0.0, - "completions/min_terminated_length": 20.0, - "epoch": 0.052, + "completions/min_terminated_length": 22.0, + "epoch": 0.026, "format_failures": 0.0, - "grad_norm": 0.0073767416179180145, - "kl": 0.0018603539792820811, + "grad_norm": 0.186175137758255, + "kl": 0.041642000898718834, "learning_rate": 1e-06, - "loss": 0.0, - "num_tokens": 155632.0, + "loss": 0.0008, + "num_tokens": 265092.0, "reward": 0.0, "reward_std": 0.0, "step": 13 @@ -327,22 +327,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 1250.0, - "completions/max_terminated_length": 1250.0, - "completions/mean_length": 381.25, - "completions/mean_terminated_length": 435.7142857142857, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 381.6666666666667, + "completions/mean_terminated_length": 416.3636363636364, "completions/min_length": 0.0, - "completions/min_terminated_length": 148.0, - "epoch": 0.056, + "completions/min_terminated_length": 188.0, + "epoch": 0.028, "format_failures": 0.0, - "grad_norm": 0.4092012047767639, - "kl": 0.0037365095922723413, + "grad_norm": 0.20345070958137512, + "kl": 0.009796573780477047, "learning_rate": 1e-06, - "loss": 0.0532, - "num_tokens": 178856.0, - "reward": 0.08141025900840759, - "reward_std": 0.17304366827011108, + "loss": 0.0257, + "num_tokens": 294096.0, + "reward": 0.29761505126953125, + "reward_std": 0.16453009843826294, "step": 14 }, { @@ -351,22 +351,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 480.0, - "completions/max_terminated_length": 480.0, - "completions/mean_length": 282.25, - "completions/mean_terminated_length": 322.57142857142856, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 1034.0, + "completions/max_terminated_length": 1034.0, + "completions/mean_length": 332.25, + "completions/mean_terminated_length": 362.45454545454544, "completions/min_length": 0.0, - "completions/min_terminated_length": 155.0, - "epoch": 0.06, - "format_failures": 0.0, - "grad_norm": 0.4555729627609253, - "kl": 0.03388933837413788, + "completions/min_terminated_length": 125.0, + "epoch": 0.03, + "format_failures": 1.0, + "grad_norm": 0.5157941579818726, + "kl": 0.004433898604474962, "learning_rate": 1e-06, - "loss": -0.0102, - "num_tokens": 189944.0, - "reward": 0.4369778633117676, - "reward_std": 0.3217828869819641, + "loss": -0.0103, + "num_tokens": 325368.0, + "reward": 0.2917824387550354, + "reward_std": 0.3325340151786804, "step": 15 }, { @@ -375,20 +375,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.375, - "completions/max_length": 184.0, - "completions/max_terminated_length": 184.0, - "completions/mean_length": 87.625, - "completions/mean_terminated_length": 140.2, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 150.16666666666666, + "completions/mean_terminated_length": 163.8181818181818, "completions/min_length": 0.0, - "completions/min_terminated_length": 51.0, - "epoch": 0.064, + "completions/min_terminated_length": 30.0, + "epoch": 0.032, "format_failures": 0.0, - "grad_norm": 8.791972160339355, - "kl": 1.302387694362551, + "grad_norm": 0.05657627806067467, + "kl": 0.0326845021918416, "learning_rate": 1e-06, - "loss": 0.0127, - "num_tokens": 197912.0, + "loss": 0.0002, + "num_tokens": 341196.0, "reward": 0.0, "reward_std": 0.0, "step": 16 @@ -399,22 +399,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 321.0, - "completions/max_terminated_length": 321.0, - "completions/mean_length": 260.0, - "completions/mean_terminated_length": 297.14285714285717, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 228.41666666666666, + "completions/mean_terminated_length": 249.1818181818182, "completions/min_length": 0.0, - "completions/min_terminated_length": 281.0, - "epoch": 0.068, + "completions/min_terminated_length": 26.0, + "epoch": 0.034, "format_failures": 0.0, - "grad_norm": 0.5435929298400879, - "kl": 0.016751494258642197, + "grad_norm": 1.8653935194015503, + "kl": 0.8598212422803044, "learning_rate": 1e-06, - "loss": 0.0027, - "num_tokens": 207184.0, - "reward": 0.0833333358168602, - "reward_std": 0.2357022762298584, + "loss": 0.014, + "num_tokens": 354228.0, + "reward": 0.01666666753590107, + "reward_std": 0.05773502588272095, "step": 17 }, { @@ -423,20 +423,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 69.0, - "completions/max_terminated_length": 69.0, - "completions/mean_length": 49.5, - "completions/mean_terminated_length": 56.57142857142857, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 48.333333333333336, + "completions/mean_terminated_length": 52.72727272727273, "completions/min_length": 0.0, - "completions/min_terminated_length": 27.0, - "epoch": 0.072, - "format_failures": 0.0, - "grad_norm": 0.19489726424217224, - "kl": 0.061227064579725266, + "completions/min_terminated_length": 25.0, + "epoch": 0.036, + "format_failures": 1.0, + "grad_norm": 0.018069056794047356, + "kl": 0.023271435871720314, "learning_rate": 1e-06, - "loss": 0.0006, - "num_tokens": 211464.0, + "loss": 0.0002, + "num_tokens": 381468.0, "reward": 0.0, "reward_std": 0.0, "step": 18 @@ -447,22 +447,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 354.0, - "completions/max_terminated_length": 354.0, - "completions/mean_length": 126.375, - "completions/mean_terminated_length": 144.42857142857142, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 228.91666666666666, + "completions/mean_terminated_length": 249.72727272727272, "completions/min_length": 0.0, "completions/min_terminated_length": 16.0, - "epoch": 0.076, + "epoch": 0.038, "format_failures": 0.0, - "grad_norm": 0.6644909381866455, - "kl": 0.010538576170802116, + "grad_norm": 1.073132872581482, + "kl": 0.003063492476940155, "learning_rate": 1e-06, - "loss": -0.0572, - "num_tokens": 228504.0, - "reward": 0.125, - "reward_std": 0.3535533845424652, + "loss": 0.0334, + "num_tokens": 415356.0, + "reward": 0.1666666716337204, + "reward_std": 0.38924944400787354, "step": 19 }, { @@ -471,22 +471,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 92.0, - "completions/max_terminated_length": 92.0, - "completions/mean_length": 66.375, - "completions/mean_terminated_length": 75.85714285714286, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 153.0, + "completions/max_terminated_length": 153.0, + "completions/mean_length": 84.58333333333333, + "completions/mean_terminated_length": 92.27272727272727, "completions/min_length": 0.0, - "completions/min_terminated_length": 58.0, - "epoch": 0.08, + "completions/min_terminated_length": 66.0, + "epoch": 0.04, "format_failures": 0.0, - "grad_norm": 1.534692645072937, - "kl": 0.03320205491036177, + "grad_norm": 1.1736811399459839, + "kl": 0.018741012550890446, "learning_rate": 1e-06, - "loss": -0.0001, - "num_tokens": 236336.0, - "reward": 0.0625, - "reward_std": 0.1767766922712326, + "loss": 0.0962, + "num_tokens": 442596.0, + "reward": 0.1041666716337204, + "reward_std": 0.22508415579795837, "step": 20 }, { @@ -495,22 +495,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 155.0, - "completions/max_terminated_length": 155.0, - "completions/mean_length": 95.75, - "completions/mean_terminated_length": 109.42857142857143, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 89.58333333333333, + "completions/mean_terminated_length": 97.72727272727273, "completions/min_length": 0.0, "completions/min_terminated_length": 69.0, - "epoch": 0.084, + "epoch": 0.042, "format_failures": 0.0, - "grad_norm": 1.6011440753936768, - "kl": 0.028395552188158035, + "grad_norm": 0.960914671421051, + "kl": 0.03209133446216583, "learning_rate": 1e-06, - "loss": 0.0329, - "num_tokens": 243416.0, - "reward": 0.28125, - "reward_std": 0.45193037390708923, + "loss": -0.0169, + "num_tokens": 453252.0, + "reward": 0.2708333432674408, + "reward_std": 0.4454101026058197, "step": 21 }, { @@ -519,22 +519,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 292.0, - "completions/max_terminated_length": 292.0, - "completions/mean_length": 157.0, - "completions/mean_terminated_length": 179.42857142857142, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 124.33333333333333, + "completions/mean_terminated_length": 135.63636363636363, "completions/min_length": 0.0, "completions/min_terminated_length": 74.0, - "epoch": 0.088, + "epoch": 0.044, "format_failures": 0.0, - "grad_norm": 0.8286527991294861, - "kl": 0.05863172188401222, + "grad_norm": 1.0618880987167358, + "kl": 0.03219995368272066, "learning_rate": 1e-06, - "loss": -0.0402, - "num_tokens": 251376.0, - "reward": 0.1830357164144516, - "reward_std": 0.28149792551994324, + "loss": -0.3593, + "num_tokens": 481656.0, + "reward": 0.09444444626569748, + "reward_std": 0.17164288461208344, "step": 22 }, { @@ -543,22 +543,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 479.0, - "completions/max_terminated_length": 479.0, - "completions/mean_length": 293.625, - "completions/mean_terminated_length": 335.57142857142856, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 299.5, + "completions/mean_terminated_length": 326.72727272727275, "completions/min_length": 0.0, - "completions/min_terminated_length": 233.0, - "epoch": 0.092, + "completions/min_terminated_length": 148.0, + "epoch": 0.046, "format_failures": 0.0, - "grad_norm": 0.3510456383228302, - "kl": 0.04268372617661953, + "grad_norm": 0.3598278760910034, + "kl": 0.031054741702973843, "learning_rate": 1e-06, - "loss": -0.0068, - "num_tokens": 262824.0, - "reward": 0.29113247990608215, - "reward_std": 0.2665640711784363, + "loss": 0.0131, + "num_tokens": 505704.0, + "reward": 0.4847402572631836, + "reward_std": 0.25003767013549805, "step": 23 }, { @@ -567,22 +567,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 403.0, - "completions/max_terminated_length": 403.0, - "completions/mean_length": 276.25, - "completions/mean_terminated_length": 315.7142857142857, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 497.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 297.5, + "completions/mean_terminated_length": 324.54545454545456, "completions/min_length": 0.0, - "completions/min_terminated_length": 66.0, - "epoch": 0.096, + "completions/min_terminated_length": 211.0, + "epoch": 0.048, "format_failures": 0.0, - "grad_norm": 0.6299352645874023, - "kl": 0.09684642031788826, + "grad_norm": 0.27960336208343506, + "kl": 0.04240706283599138, "learning_rate": 1e-06, - "loss": 0.0879, - "num_tokens": 273192.0, - "reward": 0.45770204067230225, - "reward_std": 0.4340135455131531, + "loss": -0.0398, + "num_tokens": 523500.0, + "reward": 0.2615740895271301, + "reward_std": 0.219794362783432, "step": 24 }, { @@ -591,22 +591,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 170.0, - "completions/max_terminated_length": 170.0, - "completions/mean_length": 123.125, - "completions/mean_terminated_length": 140.71428571428572, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 179.16666666666666, + "completions/mean_terminated_length": 195.45454545454547, "completions/min_length": 0.0, - "completions/min_terminated_length": 119.0, - "epoch": 0.1, + "completions/min_terminated_length": 114.0, + "epoch": 0.05, "format_failures": 0.0, - "grad_norm": 1.0716724395751953, - "kl": 0.08026151731610298, + "grad_norm": 1.2980320453643799, + "kl": 0.0048073166981339455, "learning_rate": 1e-06, - "loss": 0.0274, - "num_tokens": 279688.0, - "reward": 0.5770493149757385, - "reward_std": 0.2756548523902893, + "loss": -0.3887, + "num_tokens": 555300.0, + "reward": 0.5003399848937988, + "reward_std": 0.39150455594062805, "step": 25 }, { @@ -615,22 +615,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 395.0, - "completions/max_terminated_length": 395.0, - "completions/mean_length": 241.0, - "completions/mean_terminated_length": 275.42857142857144, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 699.0, + "completions/max_terminated_length": 699.0, + "completions/mean_length": 315.9166666666667, + "completions/mean_terminated_length": 344.6363636363636, "completions/min_length": 0.0, - "completions/min_terminated_length": 232.0, - "epoch": 0.104, + "completions/min_terminated_length": 239.0, + "epoch": 0.052, "format_failures": 0.0, - "grad_norm": 0.46685901284217834, - "kl": 0.06300827860832214, + "grad_norm": 0.2552706003189087, + "kl": 0.027493927627801895, "learning_rate": 1e-06, - "loss": -0.0169, - "num_tokens": 288160.0, - "reward": 0.4475490152835846, - "reward_std": 0.30980169773101807, + "loss": 0.0567, + "num_tokens": 576000.0, + "reward": 0.43729767203330994, + "reward_std": 0.18975813686847687, "step": 26 }, { @@ -639,22 +639,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 147.0, - "completions/max_terminated_length": 147.0, - "completions/mean_length": 79.75, - "completions/mean_terminated_length": 91.14285714285714, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 140.0, + "completions/max_terminated_length": 140.0, + "completions/mean_length": 72.91666666666667, + "completions/mean_terminated_length": 79.54545454545455, "completions/min_length": 0.0, - "completions/min_terminated_length": 52.0, - "epoch": 0.108, + "completions/min_terminated_length": 65.0, + "epoch": 0.054, "format_failures": 0.0, - "grad_norm": 1.3687446117401123, - "kl": 0.04298516921699047, + "grad_norm": 1.1299240589141846, + "kl": 0.0332061443477869, "learning_rate": 1e-06, - "loss": -0.0143, - "num_tokens": 293328.0, - "reward": 0.2083333432674408, - "reward_std": 0.39591163396835327, + "loss": -0.057, + "num_tokens": 584712.0, + "reward": 0.33095240592956543, + "reward_std": 0.444376677274704, "step": 27 }, { @@ -663,20 +663,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 184.0, - "completions/max_terminated_length": 184.0, - "completions/mean_length": 93.75, - "completions/mean_terminated_length": 107.14285714285714, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 91.16666666666667, + "completions/mean_terminated_length": 99.45454545454545, "completions/min_length": 0.0, - "completions/min_terminated_length": 70.0, - "epoch": 0.112, + "completions/min_terminated_length": 56.0, + "epoch": 0.056, "format_failures": 0.0, - "grad_norm": 2.618457794189453, - "kl": 0.7109708972275257, + "grad_norm": 0.044371046125888824, + "kl": 0.03765446413308382, "learning_rate": 1e-06, - "loss": 0.008, - "num_tokens": 301896.0, + "loss": 0.0004, + "num_tokens": 598032.0, "reward": 0.0, "reward_std": 0.0, "step": 28 @@ -687,22 +687,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 1037.0, - "completions/max_terminated_length": 1037.0, - "completions/mean_length": 347.0, - "completions/mean_terminated_length": 396.57142857142856, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 304.5, + "completions/mean_terminated_length": 332.1818181818182, "completions/min_length": 0.0, - "completions/min_terminated_length": 164.0, - "epoch": 0.116, + "completions/min_terminated_length": 212.0, + "epoch": 0.058, "format_failures": 0.0, - "grad_norm": 0.8099527955055237, - "kl": 0.004772833781316876, + "grad_norm": 0.5104940533638, + "kl": 0.03451683558523655, "learning_rate": 1e-06, - "loss": 0.1672, - "num_tokens": 323048.0, - "reward": 0.3559523820877075, - "reward_std": 0.38564079999923706, + "loss": -0.0274, + "num_tokens": 615204.0, + "reward": 0.4068452715873718, + "reward_std": 0.37161099910736084, "step": 29 }, { @@ -711,22 +711,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 178.0, - "completions/max_terminated_length": 178.0, - "completions/mean_length": 144.5, - "completions/mean_terminated_length": 165.14285714285714, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 162.91666666666666, + "completions/mean_terminated_length": 177.72727272727272, "completions/min_length": 0.0, - "completions/min_terminated_length": 129.0, - "epoch": 0.12, + "completions/min_terminated_length": 59.0, + "epoch": 0.06, "format_failures": 0.0, - "grad_norm": 0.3579946756362915, - "kl": 0.02128867618739605, + "grad_norm": 1.2335582971572876, + "kl": 0.007039119256660342, "learning_rate": 1e-06, - "loss": -0.0092, - "num_tokens": 329880.0, - "reward": 0.125, - "reward_std": 0.3535533845424652, + "loss": 0.2673, + "num_tokens": 647892.0, + "reward": 0.3291666805744171, + "reward_std": 0.4266456663608551, "step": 30 }, { @@ -735,22 +735,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 300.0, - "completions/max_terminated_length": 300.0, - "completions/mean_length": 211.375, - "completions/mean_terminated_length": 241.57142857142858, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 812.0, + "completions/max_terminated_length": 812.0, + "completions/mean_length": 332.5, + "completions/mean_terminated_length": 362.72727272727275, "completions/min_length": 0.0, - "completions/min_terminated_length": 188.0, - "epoch": 0.124, - "format_failures": 0.0, - "grad_norm": 1.7018321752548218, - "kl": 0.014225509017705917, + "completions/min_terminated_length": 222.0, + "epoch": 0.062, + "format_failures": 2.0, + "grad_norm": 0.3000166416168213, + "kl": 0.03664882015436888, "learning_rate": 1e-06, - "loss": -0.7754, - "num_tokens": 352120.0, - "reward": 0.2201923131942749, - "reward_std": 0.4099963307380676, + "loss": 0.0306, + "num_tokens": 670860.0, + "reward": 0.6458902955055237, + "reward_std": 0.26038500666618347, "step": 31 }, { @@ -759,22 +759,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 1071.0, - "completions/max_terminated_length": 1071.0, - "completions/mean_length": 291.375, - "completions/mean_terminated_length": 333.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 218.66666666666666, + "completions/mean_terminated_length": 238.54545454545453, "completions/min_length": 0.0, - "completions/min_terminated_length": 113.0, - "epoch": 0.128, + "completions/min_terminated_length": 180.0, + "epoch": 0.064, "format_failures": 0.0, - "grad_norm": 0.8998605608940125, - "kl": 0.0065889437682926655, + "grad_norm": 0.37272748351097107, + "kl": 0.07015270553529263, "learning_rate": 1e-06, - "loss": 0.4427, - "num_tokens": 373128.0, - "reward": 0.3992924690246582, - "reward_std": 0.34711551666259766, + "loss": 0.0169, + "num_tokens": 682212.0, + "reward": 0.43658646941185, + "reward_std": 0.24143192172050476, "step": 32 }, { @@ -783,22 +783,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 62.0, - "completions/max_terminated_length": 62.0, - "completions/mean_length": 47.125, - "completions/mean_terminated_length": 53.857142857142854, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 53.25, + "completions/mean_terminated_length": 58.09090909090909, "completions/min_length": 0.0, - "completions/min_terminated_length": 45.0, - "epoch": 0.132, + "completions/min_terminated_length": 47.0, + "epoch": 0.066, "format_failures": 0.0, - "grad_norm": 0.3835119307041168, - "kl": 0.1347430720925331, + "grad_norm": 1.1589769124984741, + "kl": 0.03555137664079666, "learning_rate": 1e-06, - "loss": 0.002, - "num_tokens": 379176.0, - "reward": 0.0, - "reward_std": 0.0, + "loss": -0.0651, + "num_tokens": 692040.0, + "reward": 0.11666666716337204, + "reward_std": 0.301008403301239, "step": 33 }, { @@ -807,22 +807,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 2049.0, - "completions/max_terminated_length": 2049.0, - "completions/mean_length": 523.25, - "completions/mean_terminated_length": 598.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 336.0, + "completions/mean_terminated_length": 366.54545454545456, "completions/min_length": 0.0, - "completions/min_terminated_length": 250.0, - "epoch": 0.136, + "completions/min_terminated_length": 292.0, + "epoch": 0.068, "format_failures": 0.0, - "grad_norm": 0.5971964597702026, - "kl": 0.013881782768294215, + "grad_norm": 0.42152470350265503, + "kl": 0.19683832861483097, "learning_rate": 1e-06, - "loss": 0.3237, - "num_tokens": 400040.0, - "reward": 0.36666667461395264, - "reward_std": 0.4086368978023529, + "loss": -0.0173, + "num_tokens": 704484.0, + "reward": 0.5136784911155701, + "reward_std": 0.38917282223701477, "step": 34 }, { @@ -831,22 +831,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 134.0, - "completions/max_terminated_length": 134.0, - "completions/mean_length": 70.0, - "completions/mean_terminated_length": 80.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 93.0, + "completions/max_terminated_length": 93.0, + "completions/mean_length": 59.166666666666664, + "completions/mean_terminated_length": 64.54545454545455, "completions/min_length": 0.0, "completions/min_terminated_length": 40.0, - "epoch": 0.14, + "epoch": 0.07, "format_failures": 0.0, - "grad_norm": 1.4775596857070923, - "kl": 0.1210218146443367, + "grad_norm": 1.729435682296753, + "kl": 0.055947478860616684, "learning_rate": 1e-06, - "loss": 0.0368, - "num_tokens": 404552.0, - "reward": 0.4513888955116272, - "reward_std": 0.4428916275501251, + "loss": 0.0028, + "num_tokens": 710520.0, + "reward": 0.5611110925674438, + "reward_std": 0.45256468653678894, "step": 35 }, { @@ -855,20 +855,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 256.0, - "completions/max_terminated_length": 256.0, - "completions/mean_length": 112.75, - "completions/mean_terminated_length": 128.85714285714286, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 91.91666666666667, + "completions/mean_terminated_length": 100.27272727272727, "completions/min_length": 0.0, "completions/min_terminated_length": 59.0, - "epoch": 0.144, + "epoch": 0.072, "format_failures": 0.0, - "grad_norm": 0.09341330826282501, - "kl": 0.08837828040122986, + "grad_norm": 0.7297618389129639, + "kl": 0.28226011246442795, "learning_rate": 1e-06, - "loss": 0.0006, - "num_tokens": 411528.0, + "loss": 0.0022, + "num_tokens": 720588.0, "reward": 0.0, "reward_std": 0.0, "step": 36 @@ -879,22 +879,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 259.0, - "completions/max_terminated_length": 259.0, - "completions/mean_length": 193.375, - "completions/mean_terminated_length": 221.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 184.66666666666666, + "completions/mean_terminated_length": 201.45454545454547, "completions/min_length": 0.0, - "completions/min_terminated_length": 189.0, - "epoch": 0.148, + "completions/min_terminated_length": 152.0, + "epoch": 0.074, "format_failures": 0.0, - "grad_norm": 0.7524359822273254, - "kl": 0.04169362783432007, + "grad_norm": 0.1786535382270813, + "kl": 0.05143214017152786, "learning_rate": 1e-06, - "loss": 0.0556, - "num_tokens": 419160.0, - "reward": 0.5170454382896423, - "reward_std": 0.3414821922779083, + "loss": 0.001, + "num_tokens": 731112.0, + "reward": 0.5931217074394226, + "reward_std": 0.15197694301605225, "step": 37 }, { @@ -903,22 +903,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 64.0, - "completions/max_terminated_length": 64.0, - "completions/mean_length": 53.375, - "completions/mean_terminated_length": 61.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 61.416666666666664, + "completions/mean_terminated_length": 67.0, "completions/min_length": 0.0, - "completions/min_terminated_length": 56.0, - "epoch": 0.152, - "format_failures": 0.0, - "grad_norm": 0.3160454034805298, - "kl": 0.07513360120356083, + "completions/min_terminated_length": 36.0, + "epoch": 0.076, + "format_failures": 1.0, + "grad_norm": 2.560441732406616, + "kl": 0.061069367453455925, "learning_rate": 1e-06, - "loss": 0.0008, - "num_tokens": 423768.0, - "reward": 0.0, - "reward_std": 0.0, + "loss": 0.1107, + "num_tokens": 758340.0, + "reward": 0.0833333358168602, + "reward_std": 0.28867512941360474, "step": 38 }, { @@ -927,22 +927,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 485.0, - "completions/max_terminated_length": 485.0, - "completions/mean_length": 301.125, - "completions/mean_terminated_length": 344.14285714285717, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 2050.0, + "completions/max_terminated_length": 2050.0, + "completions/mean_length": 715.0, + "completions/mean_terminated_length": 780.0, "completions/min_length": 0.0, - "completions/min_terminated_length": 215.0, - "epoch": 0.156, + "completions/min_terminated_length": 357.0, + "epoch": 0.078, "format_failures": 0.0, - "grad_norm": 1.2346574068069458, - "kl": 0.27855822443962097, + "grad_norm": 0.41932860016822815, + "kl": 0.01548363408073783, "learning_rate": 1e-06, - "loss": -0.0911, - "num_tokens": 435552.0, - "reward": 0.5062500238418579, - "reward_std": 0.43001821637153625, + "loss": 0.0106, + "num_tokens": 790968.0, + "reward": 0.25740742683410645, + "reward_std": 0.32573264837265015, "step": 39 }, { @@ -951,22 +951,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 983.0, - "completions/max_terminated_length": 983.0, - "completions/mean_length": 453.0, - "completions/mean_terminated_length": 517.7142857142857, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 1162.0, + "completions/max_terminated_length": 1162.0, + "completions/mean_length": 471.75, + "completions/mean_terminated_length": 514.6363636363636, "completions/min_length": 0.0, - "completions/min_terminated_length": 296.0, - "epoch": 0.16, + "completions/min_terminated_length": 113.0, + "epoch": 0.08, "format_failures": 0.0, - "grad_norm": 0.14271600544452667, - "kl": 0.011493591591715813, + "grad_norm": 0.8145480155944824, + "kl": 0.016389482654631138, "learning_rate": 1e-06, - "loss": -0.0636, - "num_tokens": 460960.0, - "reward": 0.5462301969528198, - "reward_std": 0.12065710872411728, + "loss": 0.154, + "num_tokens": 829104.0, + "reward": 0.43334314227104187, + "reward_std": 0.3763042986392975, "step": 40 }, { @@ -975,22 +975,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 106.0, - "completions/max_terminated_length": 106.0, - "completions/mean_length": 61.75, - "completions/mean_terminated_length": 70.57142857142857, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 99.91666666666667, + "completions/mean_terminated_length": 109.0, "completions/min_length": 0.0, - "completions/min_terminated_length": 30.0, - "epoch": 0.164, + "completions/min_terminated_length": 47.0, + "epoch": 0.082, "format_failures": 0.0, - "grad_norm": 3.143950939178467, - "kl": 0.05912626534700394, + "grad_norm": 18.232030868530273, + "kl": 1.717683531343937, "learning_rate": 1e-06, - "loss": -0.0781, - "num_tokens": 469456.0, - "reward": 0.32499998807907104, - "reward_std": 0.46521884202957153, + "loss": 0.197, + "num_tokens": 850716.0, + "reward": 0.2430555671453476, + "reward_std": 0.4042987823486328, "step": 41 }, { @@ -999,22 +999,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 218.0, - "completions/max_terminated_length": 218.0, - "completions/mean_length": 81.5, - "completions/mean_terminated_length": 93.14285714285714, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 77.33333333333333, + "completions/mean_terminated_length": 84.36363636363636, "completions/min_length": 0.0, - "completions/min_terminated_length": 61.0, - "epoch": 0.168, + "completions/min_terminated_length": 63.0, + "epoch": 0.084, "format_failures": 0.0, - "grad_norm": 0.4773140251636505, - "kl": 0.2549777179956436, + "grad_norm": 0.5794758796691895, + "kl": 0.21323725581169128, "learning_rate": 1e-06, - "loss": 0.0019, - "num_tokens": 475288.0, - "reward": 0.0, - "reward_std": 0.0, + "loss": -0.0344, + "num_tokens": 859644.0, + "reward": 0.0476190522313118, + "reward_std": 0.1649572253227234, "step": 42 }, { @@ -1023,22 +1023,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 172.0, - "completions/max_terminated_length": 172.0, - "completions/mean_length": 111.375, - "completions/mean_terminated_length": 127.28571428571429, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 136.66666666666666, + "completions/mean_terminated_length": 149.0909090909091, "completions/min_length": 0.0, - "completions/min_terminated_length": 42.0, - "epoch": 0.172, + "completions/min_terminated_length": 57.0, + "epoch": 0.086, "format_failures": 0.0, - "grad_norm": 3.081820487976074, - "kl": 0.05859908275306225, + "grad_norm": 2.507535934448242, + "kl": 0.2139158956706524, "learning_rate": 1e-06, - "loss": -0.132, - "num_tokens": 482848.0, - "reward": 0.375, - "reward_std": 0.5175491571426392, + "loss": -0.0282, + "num_tokens": 871596.0, + "reward": 0.3333333432674408, + "reward_std": 0.4923659861087799, "step": 43 }, { @@ -1047,20 +1047,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.5, + "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, - "completions/mean_length": 22.5, - "completions/mean_terminated_length": 45.0, + "completions/mean_length": 28.25, + "completions/mean_terminated_length": 42.375, "completions/min_length": 0.0, "completions/min_terminated_length": 21.0, - "epoch": 0.176, - "format_failures": 1.0, - "grad_norm": 0.5130624175071716, - "kl": 0.034313835203647614, + "epoch": 0.088, + "format_failures": 0.0, + "grad_norm": 0.33207282423973083, + "kl": 0.035286733880639076, "learning_rate": 1e-06, - "loss": 0.001, - "num_tokens": 488336.0, + "loss": 0.0008, + "num_tokens": 879828.0, "reward": 0.0, "reward_std": 0.0, "step": 44 @@ -1071,20 +1071,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 145.0, - "completions/max_terminated_length": 145.0, - "completions/mean_length": 80.125, - "completions/mean_terminated_length": 91.57142857142857, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 117.83333333333333, + "completions/mean_terminated_length": 128.54545454545453, "completions/min_length": 0.0, "completions/min_terminated_length": 68.0, - "epoch": 0.18, + "epoch": 0.09, "format_failures": 0.0, - "grad_norm": 4.689250946044922, - "kl": 0.8184864521026611, + "grad_norm": 0.2761678099632263, + "kl": 0.15724625438451767, "learning_rate": 1e-06, - "loss": 0.0107, - "num_tokens": 497608.0, + "loss": 0.0015, + "num_tokens": 899448.0, "reward": 0.0, "reward_std": 0.0, "step": 45 @@ -1095,22 +1095,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 128.0, - "completions/max_terminated_length": 128.0, - "completions/mean_length": 94.75, - "completions/mean_terminated_length": 108.28571428571429, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 105.16666666666667, + "completions/mean_terminated_length": 114.72727272727273, "completions/min_length": 0.0, - "completions/min_terminated_length": 78.0, - "epoch": 0.184, + "completions/min_terminated_length": 26.0, + "epoch": 0.092, "format_failures": 0.0, - "grad_norm": 1.715865135192871, - "kl": 0.04798049572855234, + "grad_norm": 1.1471128463745117, + "kl": 0.12899010255932808, "learning_rate": 1e-06, - "loss": 0.0176, - "num_tokens": 506248.0, - "reward": 0.3333333432674408, - "reward_std": 0.35634833574295044, + "loss": 0.0117, + "num_tokens": 914760.0, + "reward": 0.1666666716337204, + "reward_std": 0.30151134729385376, "step": 46 }, { @@ -1119,22 +1119,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 328.0, - "completions/max_terminated_length": 328.0, - "completions/mean_length": 235.75, - "completions/mean_terminated_length": 269.42857142857144, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 233.66666666666666, + "completions/mean_terminated_length": 254.9090909090909, "completions/min_length": 0.0, - "completions/min_terminated_length": 208.0, - "epoch": 0.188, + "completions/min_terminated_length": 153.0, + "epoch": 0.094, "format_failures": 0.0, - "grad_norm": 0.770552933216095, - "kl": 0.026089726015925407, + "grad_norm": 0.5467153191566467, + "kl": 0.2796362675726414, "learning_rate": 1e-06, - "loss": -0.1374, - "num_tokens": 526432.0, - "reward": 0.615674614906311, - "reward_std": 0.2741951644420624, + "loss": -0.0318, + "num_tokens": 925212.0, + "reward": 0.549458920955658, + "reward_std": 0.3676450848579407, "step": 47 }, { @@ -1143,22 +1143,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 170.0, - "completions/max_terminated_length": 170.0, - "completions/mean_length": 131.875, - "completions/mean_terminated_length": 150.71428571428572, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 166.25, + "completions/mean_terminated_length": 181.36363636363637, "completions/min_length": 0.0, - "completions/min_terminated_length": 136.0, - "epoch": 0.192, + "completions/min_terminated_length": 142.0, + "epoch": 0.096, "format_failures": 0.0, - "grad_norm": 0.5728135108947754, - "kl": 0.08094584196805954, + "grad_norm": 0.78724205493927, + "kl": 0.49516983330249786, "learning_rate": 1e-06, - "loss": -0.0257, - "num_tokens": 534408.0, - "reward": 0.09375, - "reward_std": 0.2651650309562683, + "loss": -0.0104, + "num_tokens": 938424.0, + "reward": 0.02083333395421505, + "reward_std": 0.07216878235340118, "step": 48 }, { @@ -1167,22 +1167,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 156.0, - "completions/max_terminated_length": 156.0, - "completions/mean_length": 116.25, - "completions/mean_terminated_length": 132.85714285714286, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 106.08333333333333, + "completions/mean_terminated_length": 115.72727272727273, "completions/min_length": 0.0, - "completions/min_terminated_length": 96.0, - "epoch": 0.196, + "completions/min_terminated_length": 29.0, + "epoch": 0.098, "format_failures": 1.0, - "grad_norm": 1.1056642532348633, - "kl": 0.5166730880737305, + "grad_norm": 1.7356528043746948, + "kl": 0.389555960893631, "learning_rate": 1e-06, - "loss": 0.0039, - "num_tokens": 542224.0, - "reward": 0.0, - "reward_std": 0.0, + "loss": -0.0599, + "num_tokens": 950172.0, + "reward": 0.1944444626569748, + "reward_std": 0.38816672563552856, "step": 49 }, { @@ -1191,22 +1191,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 275.0, - "completions/max_terminated_length": 275.0, - "completions/mean_length": 102.375, - "completions/mean_terminated_length": 117.0, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 1127.0, + "completions/max_terminated_length": 1127.0, + "completions/mean_length": 186.58333333333334, + "completions/mean_terminated_length": 223.9, "completions/min_length": 0.0, "completions/min_terminated_length": 49.0, - "epoch": 0.2, + "epoch": 0.1, "format_failures": 0.0, - "grad_norm": 2.430076837539673, - "kl": 0.04860229790210724, + "grad_norm": 1.3811311721801758, + "kl": 0.0656690001487732, "learning_rate": 1e-06, - "loss": 1.0757, - "num_tokens": 557760.0, - "reward": 0.7159091234207153, - "reward_std": 0.41142913699150085, + "loss": 0.949, + "num_tokens": 981816.0, + "reward": 0.5007641911506653, + "reward_std": 0.4272591173648834, "step": 50 }, { @@ -1215,22 +1215,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.25, - "completions/max_length": 80.0, - "completions/max_terminated_length": 80.0, - "completions/mean_length": 48.25, - "completions/mean_terminated_length": 64.33333333333333, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 74.75, + "completions/mean_terminated_length": 81.54545454545455, "completions/min_length": 0.0, - "completions/min_terminated_length": 35.0, - "epoch": 0.204, + "completions/min_terminated_length": 67.0, + "epoch": 0.102, "format_failures": 0.0, - "grad_norm": 62.65098190307617, - "kl": 8.405316352844238, + "grad_norm": 3.630605697631836, + "kl": 0.11415744014084339, "learning_rate": 1e-06, - "loss": 0.0048, - "num_tokens": 565040.0, - "reward": 0.75, - "reward_std": 0.4629100561141968, + "loss": 0.1083, + "num_tokens": 994800.0, + "reward": 0.4722222685813904, + "reward_std": 0.4596514403820038, "step": 51 }, { @@ -1239,22 +1239,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 2051.0, - "completions/max_terminated_length": 2051.0, - "completions/mean_length": 493.75, - "completions/mean_terminated_length": 564.2857142857143, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 292.9166666666667, + "completions/mean_terminated_length": 319.54545454545456, "completions/min_length": 0.0, - "completions/min_terminated_length": 36.0, - "epoch": 0.208, + "completions/min_terminated_length": 230.0, + "epoch": 0.104, "format_failures": 0.0, - "grad_norm": 1.2885843515396118, - "kl": 0.019100312143564224, + "grad_norm": 0.664616048336029, + "kl": 0.024851050227880478, "learning_rate": 1e-06, - "loss": -0.1233, - "num_tokens": 587416.0, - "reward": 0.22045454382896423, - "reward_std": 0.5661183595657349, + "loss": -0.0988, + "num_tokens": 1028352.0, + "reward": 0.5121031999588013, + "reward_std": 0.26174625754356384, "step": 52 }, { @@ -1263,22 +1263,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 321.0, - "completions/max_terminated_length": 321.0, - "completions/mean_length": 195.375, - "completions/mean_terminated_length": 223.28571428571428, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 267.4166666666667, + "completions/mean_terminated_length": 291.72727272727275, "completions/min_length": 0.0, - "completions/min_terminated_length": 71.0, - "epoch": 0.212, + "completions/min_terminated_length": 158.0, + "epoch": 0.106, "format_failures": 0.0, - "grad_norm": 1.2086297273635864, - "kl": 0.17173044383525848, + "grad_norm": 0.3362949788570404, + "kl": 0.09099859930574894, "learning_rate": 1e-06, - "loss": -0.0158, - "num_tokens": 598440.0, - "reward": 0.25, - "reward_std": 0.4629100561141968, + "loss": 0.0303, + "num_tokens": 1053264.0, + "reward": 0.0625, + "reward_std": 0.21650634706020355, "step": 53 }, { @@ -1287,22 +1287,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 367.0, - "completions/max_terminated_length": 367.0, - "completions/mean_length": 270.375, - "completions/mean_terminated_length": 309.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 292.0833333333333, + "completions/mean_terminated_length": 318.6363636363636, "completions/min_length": 0.0, - "completions/min_terminated_length": 186.0, - "epoch": 0.216, + "completions/min_terminated_length": 190.0, + "epoch": 0.108, "format_failures": 0.0, - "grad_norm": 0.35032373666763306, - "kl": 0.040101515129208565, + "grad_norm": 0.17621153593063354, + "kl": 0.03119577933102846, "learning_rate": 1e-06, - "loss": 0.0421, - "num_tokens": 608128.0, - "reward": 0.5280122756958008, - "reward_std": 0.23830601572990417, + "loss": 0.0012, + "num_tokens": 1068108.0, + "reward": 0.4200083613395691, + "reward_std": 0.194437637925148, "step": 54 }, { @@ -1311,22 +1311,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 69.0, - "completions/max_terminated_length": 69.0, - "completions/mean_length": 57.5, - "completions/mean_terminated_length": 65.71428571428571, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 88.75, + "completions/mean_terminated_length": 96.81818181818181, "completions/min_length": 0.0, - "completions/min_terminated_length": 46.0, - "epoch": 0.22, + "completions/min_terminated_length": 69.0, + "epoch": 0.11, "format_failures": 0.0, - "grad_norm": 1.1529607772827148, - "kl": 0.18720119446516037, + "grad_norm": 0.6367191672325134, + "kl": 0.03671593498438597, "learning_rate": 1e-06, - "loss": -0.0008, - "num_tokens": 614320.0, - "reward": 0.1875, - "reward_std": 0.1157275140285492, + "loss": 0.0088, + "num_tokens": 1079820.0, + "reward": 0.19027778506278992, + "reward_std": 0.15930061042308807, "step": 55 }, { @@ -1335,22 +1335,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 199.0, - "completions/max_terminated_length": 199.0, - "completions/mean_length": 160.625, - "completions/mean_terminated_length": 183.57142857142858, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 163.58333333333334, + "completions/mean_terminated_length": 178.45454545454547, "completions/min_length": 0.0, - "completions/min_terminated_length": 161.0, - "epoch": 0.224, + "completions/min_terminated_length": 113.0, + "epoch": 0.112, "format_failures": 0.0, - "grad_norm": 1.1264208555221558, - "kl": 0.08833763748407364, + "grad_norm": 2.1606733798980713, + "kl": 0.20935122203081846, "learning_rate": 1e-06, - "loss": 0.0454, - "num_tokens": 623424.0, - "reward": 0.2916666567325592, - "reward_std": 0.4520675837993622, + "loss": -0.0277, + "num_tokens": 1091832.0, + "reward": 0.5777778029441833, + "reward_std": 0.4515592157840729, "step": 56 }, { @@ -1359,22 +1359,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 242.0, - "completions/max_terminated_length": 242.0, - "completions/mean_length": 184.625, - "completions/mean_terminated_length": 211.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 288.4166666666667, + "completions/mean_terminated_length": 314.6363636363636, "completions/min_length": 0.0, - "completions/min_terminated_length": 170.0, - "epoch": 0.228, + "completions/min_terminated_length": 169.0, + "epoch": 0.114, "format_failures": 0.0, - "grad_norm": 0.03325602412223816, - "kl": 0.04312510974705219, + "grad_norm": 0.32393601536750793, + "kl": 0.031358057633042336, "learning_rate": 1e-06, - "loss": 0.0002, - "num_tokens": 631976.0, - "reward": 0.0, - "reward_std": 0.0, + "loss": -0.044, + "num_tokens": 1105608.0, + "reward": 0.1666666716337204, + "reward_std": 0.24984844028949738, "step": 57 }, { @@ -1383,20 +1383,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 95.0, - "completions/max_terminated_length": 95.0, - "completions/mean_length": 69.5, - "completions/mean_terminated_length": 79.42857142857143, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 96.0, + "completions/max_terminated_length": 96.0, + "completions/mean_length": 65.5, + "completions/mean_terminated_length": 71.45454545454545, "completions/min_length": 0.0, - "completions/min_terminated_length": 59.0, - "epoch": 0.232, + "completions/min_terminated_length": 55.0, + "epoch": 0.116, "format_failures": 0.0, - "grad_norm": 0.04106176272034645, - "kl": 0.026903850957751274, + "grad_norm": 0.021954922005534172, + "kl": 0.018348069861531258, "learning_rate": 1e-06, "loss": 0.0002, - "num_tokens": 637776.0, + "num_tokens": 1113168.0, "reward": 0.0, "reward_std": 0.0, "step": 58 @@ -1407,22 +1407,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 508.0, - "completions/max_terminated_length": 508.0, - "completions/mean_length": 273.5, - "completions/mean_terminated_length": 312.57142857142856, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 224.41666666666666, + "completions/mean_terminated_length": 244.8181818181818, "completions/min_length": 0.0, - "completions/min_terminated_length": 148.0, - "epoch": 0.236, + "completions/min_terminated_length": 92.0, + "epoch": 0.118, "format_failures": 0.0, - "grad_norm": 0.11601117998361588, - "kl": 0.0859757624566555, + "grad_norm": 1.1990734338760376, + "kl": 0.3062889650464058, "learning_rate": 1e-06, - "loss": 0.0004, - "num_tokens": 650616.0, - "reward": 0.0, - "reward_std": 0.0, + "loss": 0.0431, + "num_tokens": 1136832.0, + "reward": 0.2395833432674408, + "reward_std": 0.25259074568748474, "step": 59 }, { @@ -1431,22 +1431,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 395.0, - "completions/max_terminated_length": 395.0, - "completions/mean_length": 298.75, - "completions/mean_terminated_length": 341.42857142857144, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 238.0, + "completions/mean_terminated_length": 259.6363636363636, "completions/min_length": 0.0, - "completions/min_terminated_length": 194.0, - "epoch": 0.24, + "completions/min_terminated_length": 80.0, + "epoch": 0.12, "format_failures": 0.0, - "grad_norm": 0.3067856729030609, - "kl": 0.1856345497071743, + "grad_norm": 0.5170612931251526, + "kl": 0.03292474150657654, "learning_rate": 1e-06, - "loss": 0.033, - "num_tokens": 659168.0, - "reward": 0.5159090757369995, - "reward_std": 0.1970113068819046, + "loss": 0.0251, + "num_tokens": 1150536.0, + "reward": 0.39345240592956543, + "reward_std": 0.3553503155708313, "step": 60 }, { @@ -1455,22 +1455,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 71.0, - "completions/max_terminated_length": 71.0, - "completions/mean_length": 58.75, - "completions/mean_terminated_length": 67.14285714285714, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 82.16666666666667, + "completions/mean_terminated_length": 89.63636363636364, "completions/min_length": 0.0, - "completions/min_terminated_length": 53.0, - "epoch": 0.244, + "completions/min_terminated_length": 65.0, + "epoch": 0.122, "format_failures": 0.0, - "grad_norm": 1.8961628675460815, - "kl": 0.0375029481947422, + "grad_norm": 1.1562092304229736, + "kl": 0.023061166517436504, "learning_rate": 1e-06, - "loss": -0.0261, - "num_tokens": 663984.0, - "reward": 0.8500000238418579, - "reward_std": 0.3505098223686218, + "loss": 0.1452, + "num_tokens": 1158984.0, + "reward": 0.7333333492279053, + "reward_std": 0.3639269173145294, "step": 61 }, { @@ -1479,22 +1479,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 428.0, - "completions/max_terminated_length": 428.0, - "completions/mean_length": 251.625, - "completions/mean_terminated_length": 287.57142857142856, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 296.1666666666667, + "completions/mean_terminated_length": 323.09090909090907, "completions/min_length": 0.0, - "completions/min_terminated_length": 163.0, - "epoch": 0.248, + "completions/min_terminated_length": 201.0, + "epoch": 0.124, "format_failures": 0.0, - "grad_norm": 0.2780621349811554, - "kl": 0.05490433797240257, + "grad_norm": 0.32044336199760437, + "kl": 0.06375124305486679, "learning_rate": 1e-06, - "loss": 0.0335, - "num_tokens": 672792.0, - "reward": 0.5874999761581421, - "reward_std": 0.16226325929164886, + "loss": 0.0015, + "num_tokens": 1173504.0, + "reward": 0.43736547231674194, + "reward_std": 0.25956276059150696, "step": 62 }, { @@ -1503,22 +1503,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 318.0, - "completions/max_terminated_length": 318.0, - "completions/mean_length": 208.25, - "completions/mean_terminated_length": 238.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 2051.0, + "completions/max_terminated_length": 2051.0, + "completions/mean_length": 586.4166666666666, + "completions/mean_terminated_length": 639.7272727272727, "completions/min_length": 0.0, - "completions/min_terminated_length": 162.0, - "epoch": 0.252, + "completions/min_terminated_length": 38.0, + "epoch": 0.126, "format_failures": 0.0, - "grad_norm": 1.4749125242233276, - "kl": 0.11917952820658684, + "grad_norm": 0.6462875008583069, + "kl": 0.023477558977901936, "learning_rate": 1e-06, - "loss": 0.5038, - "num_tokens": 692808.0, - "reward": 0.5503472089767456, - "reward_std": 0.4739660620689392, + "loss": 0.0492, + "num_tokens": 1206840.0, + "reward": 0.501884937286377, + "reward_std": 0.5706992149353027, "step": 63 }, { @@ -1527,22 +1527,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 209.0, - "completions/max_terminated_length": 209.0, - "completions/mean_length": 121.625, - "completions/mean_terminated_length": 139.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 150.66666666666666, + "completions/mean_terminated_length": 164.36363636363637, "completions/min_length": 0.0, - "completions/min_terminated_length": 80.0, - "epoch": 0.256, + "completions/min_terminated_length": 97.0, + "epoch": 0.128, "format_failures": 0.0, - "grad_norm": 0.9009966850280762, - "kl": 0.04829751141369343, + "grad_norm": 0.4827415347099304, + "kl": 0.11513948068022728, "learning_rate": 1e-06, - "loss": -0.0333, - "num_tokens": 713192.0, - "reward": 0.4819444417953491, - "reward_std": 0.13385315239429474, + "loss": 0.2183, + "num_tokens": 1230888.0, + "reward": 0.3715476393699646, + "reward_std": 0.17215265333652496, "step": 64 }, { @@ -1551,22 +1551,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 299.0, - "completions/max_terminated_length": 299.0, - "completions/mean_length": 202.875, - "completions/mean_terminated_length": 231.85714285714286, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 1340.0, + "completions/max_terminated_length": 1340.0, + "completions/mean_length": 277.5833333333333, + "completions/mean_terminated_length": 302.8181818181818, "completions/min_length": 0.0, - "completions/min_terminated_length": 126.0, - "epoch": 0.26, + "completions/min_terminated_length": 20.0, + "epoch": 0.13, "format_failures": 0.0, - "grad_norm": 0.6346305012702942, - "kl": 0.08024599775671959, + "grad_norm": 0.46889665722846985, + "kl": 0.9275694619864225, "learning_rate": 1e-06, - "loss": 0.067, - "num_tokens": 720472.0, - "reward": 0.25189393758773804, - "reward_std": 0.2742690443992615, + "loss": 0.2754, + "num_tokens": 1262100.0, + "reward": 0.3917522430419922, + "reward_std": 0.2266404628753662, "step": 65 }, { @@ -1575,22 +1575,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 527.0, - "completions/max_terminated_length": 527.0, - "completions/mean_length": 361.875, - "completions/mean_terminated_length": 413.57142857142856, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 366.25, + "completions/mean_terminated_length": 399.54545454545456, "completions/min_length": 0.0, - "completions/min_terminated_length": 186.0, - "epoch": 0.264, - "format_failures": 0.0, - "grad_norm": 0.3846381604671478, - "kl": 0.03228219784796238, + "completions/min_terminated_length": 212.0, + "epoch": 0.132, + "format_failures": 1.0, + "grad_norm": 0.30657899379730225, + "kl": 0.16883518174290657, "learning_rate": 1e-06, - "loss": 0.0465, - "num_tokens": 732112.0, - "reward": 0.3159722089767456, - "reward_std": 0.2696826457977295, + "loss": 0.0155, + "num_tokens": 1278012.0, + "reward": 0.34761905670166016, + "reward_std": 0.2757572531700134, "step": 66 }, { @@ -1599,22 +1599,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 652.0, - "completions/max_terminated_length": 652.0, - "completions/mean_length": 350.25, - "completions/mean_terminated_length": 400.2857142857143, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 300.9166666666667, + "completions/mean_terminated_length": 361.1, "completions/min_length": 0.0, - "completions/min_terminated_length": 272.0, - "epoch": 0.268, + "completions/min_terminated_length": 224.0, + "epoch": 0.134, "format_failures": 0.0, - "grad_norm": 0.2731687128543854, - "kl": 0.02399719413369894, + "grad_norm": 0.6152874231338501, + "kl": 0.10999106336385012, "learning_rate": 1e-06, - "loss": 0.0932, - "num_tokens": 752768.0, - "reward": 0.48750001192092896, - "reward_std": 0.17878557741641998, + "loss": 0.3303, + "num_tokens": 1308996.0, + "reward": 0.32609128952026367, + "reward_std": 0.23752012848854065, "step": 67 }, { @@ -1623,22 +1623,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 115.0, - "completions/max_terminated_length": 115.0, - "completions/mean_length": 57.5, - "completions/mean_terminated_length": 65.71428571428571, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 137.5, + "completions/mean_terminated_length": 150.0, "completions/min_length": 0.0, - "completions/min_terminated_length": 43.0, - "epoch": 0.272, + "completions/min_terminated_length": 59.0, + "epoch": 0.136, "format_failures": 0.0, - "grad_norm": 0.5548056960105896, - "kl": 0.3331392854452133, + "grad_norm": 1.7395364046096802, + "kl": 0.7087040841579437, "learning_rate": 1e-06, - "loss": 0.0037, - "num_tokens": 758312.0, - "reward": 0.0, - "reward_std": 0.0, + "loss": -0.0121, + "num_tokens": 1321020.0, + "reward": 0.20873016119003296, + "reward_std": 0.34043052792549133, "step": 68 }, { @@ -1647,22 +1647,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 236.0, - "completions/max_terminated_length": 236.0, - "completions/mean_length": 96.0, - "completions/mean_terminated_length": 109.71428571428571, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 129.83333333333334, + "completions/mean_terminated_length": 141.63636363636363, "completions/min_length": 0.0, - "completions/min_terminated_length": 44.0, - "epoch": 0.276, + "completions/min_terminated_length": 28.0, + "epoch": 0.138, "format_failures": 0.0, - "grad_norm": 5.602732181549072, - "kl": 1.5559703707695007, + "grad_norm": 0.902642548084259, + "kl": 0.7902000248432159, "learning_rate": 1e-06, - "loss": -0.0333, - "num_tokens": 765560.0, - "reward": 0.0625, - "reward_std": 0.1767766922712326, + "loss": 0.0035, + "num_tokens": 1332492.0, + "reward": 0.0877976268529892, + "reward_std": 0.20928393304347992, "step": 69 }, { @@ -1671,22 +1671,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 505.0, - "completions/max_terminated_length": 505.0, - "completions/mean_length": 288.25, - "completions/mean_terminated_length": 329.42857142857144, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1172.0, + "completions/max_terminated_length": 1172.0, + "completions/mean_length": 333.1666666666667, + "completions/mean_terminated_length": 444.22222222222223, "completions/min_length": 0.0, - "completions/min_terminated_length": 259.0, - "epoch": 0.28, + "completions/min_terminated_length": 133.0, + "epoch": 0.14, "format_failures": 0.0, - "grad_norm": 0.28595268726348877, - "kl": 0.05494564212858677, + "grad_norm": 0.22367094457149506, + "kl": 0.03544241935014725, "learning_rate": 1e-06, - "loss": 0.0934, - "num_tokens": 782656.0, - "reward": 0.2406907081604004, - "reward_std": 0.2288402020931244, + "loss": 0.0442, + "num_tokens": 1363812.0, + "reward": 0.22601282596588135, + "reward_std": 0.1535530686378479, "step": 70 }, { @@ -1695,22 +1695,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 389.0, - "completions/max_terminated_length": 389.0, - "completions/mean_length": 306.375, - "completions/mean_terminated_length": 350.14285714285717, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 368.5833333333333, + "completions/mean_terminated_length": 402.09090909090907, "completions/min_length": 0.0, - "completions/min_terminated_length": 280.0, - "epoch": 0.284, + "completions/min_terminated_length": 205.0, + "epoch": 0.142, "format_failures": 0.0, - "grad_norm": 0.4375990033149719, - "kl": 0.15084227919578552, + "grad_norm": 0.25884878635406494, + "kl": 0.0446395231410861, "learning_rate": 1e-06, - "loss": -0.0137, - "num_tokens": 792064.0, - "reward": 0.6625000238418579, - "reward_std": 0.31139087677001953, + "loss": 0.0091, + "num_tokens": 1396788.0, + "reward": 0.6545634865760803, + "reward_std": 0.2292691022157669, "step": 71 }, { @@ -1719,22 +1719,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.875, - "completions/max_length": 126.0, - "completions/max_terminated_length": 126.0, - "completions/mean_length": 15.75, - "completions/mean_terminated_length": 126.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 127.75, + "completions/mean_terminated_length": 139.36363636363637, "completions/min_length": 0.0, - "completions/min_terminated_length": 126.0, - "epoch": 0.288, + "completions/min_terminated_length": 62.0, + "epoch": 0.144, "format_failures": 0.0, - "grad_norm": 4.322110652923584, - "kl": 0.025789054110646248, + "grad_norm": 2.139310121536255, + "kl": 0.2615228593349457, "learning_rate": 1e-06, - "loss": 0.0076, - "num_tokens": 800160.0, - "reward": 0.875, - "reward_std": 0.3535533845424652, + "loss": 0.0935, + "num_tokens": 1411512.0, + "reward": 0.625, + "reward_std": 0.4826536476612091, "step": 72 }, { @@ -1743,22 +1743,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 433.0, - "completions/max_terminated_length": 433.0, - "completions/mean_length": 276.75, - "completions/mean_terminated_length": 316.2857142857143, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 644.0, + "completions/max_terminated_length": 644.0, + "completions/mean_length": 321.1666666666667, + "completions/mean_terminated_length": 350.3636363636364, "completions/min_length": 0.0, - "completions/min_terminated_length": 191.0, - "epoch": 0.292, + "completions/min_terminated_length": 194.0, + "epoch": 0.146, "format_failures": 0.0, - "grad_norm": 0.6268705725669861, - "kl": 0.08498941920697689, + "grad_norm": 0.7009347081184387, + "kl": 0.13678913563489914, "learning_rate": 1e-06, - "loss": -0.0614, - "num_tokens": 810104.0, - "reward": 0.41130954027175903, - "reward_std": 0.3625659644603729, + "loss": 0.0771, + "num_tokens": 1436532.0, + "reward": 0.3439815044403076, + "reward_std": 0.27971503138542175, "step": 73 }, { @@ -1767,22 +1767,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 277.0, - "completions/max_terminated_length": 277.0, - "completions/mean_length": 221.125, - "completions/mean_terminated_length": 252.71428571428572, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 253.08333333333334, + "completions/mean_terminated_length": 276.09090909090907, "completions/min_length": 0.0, - "completions/min_terminated_length": 117.0, - "epoch": 0.296, + "completions/min_terminated_length": 271.0, + "epoch": 0.148, "format_failures": 0.0, - "grad_norm": 0.8162993788719177, - "kl": 0.0225818594917655, + "grad_norm": 1.2899372577667236, + "kl": 0.10085960477590561, "learning_rate": 1e-06, - "loss": 0.0523, - "num_tokens": 822920.0, - "reward": 0.7083333730697632, - "reward_std": 0.4520675837993622, + "loss": 0.3862, + "num_tokens": 1471704.0, + "reward": 0.7222222685813904, + "reward_std": 0.4457052946090698, "step": 74 }, { @@ -1791,22 +1791,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 493.0, - "completions/max_terminated_length": 493.0, - "completions/mean_length": 249.625, - "completions/mean_terminated_length": 285.2857142857143, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 196.5, + "completions/mean_terminated_length": 214.36363636363637, "completions/min_length": 0.0, - "completions/min_terminated_length": 127.0, - "epoch": 0.3, + "completions/min_terminated_length": 54.0, + "epoch": 0.15, "format_failures": 0.0, - "grad_norm": 0.3638235032558441, - "kl": 0.10483588464558125, + "grad_norm": 0.4177331328392029, + "kl": 0.026733385398983955, "learning_rate": 1e-06, - "loss": 0.093, - "num_tokens": 833608.0, - "reward": 0.2819444537162781, - "reward_std": 0.2347228229045868, + "loss": 0.0579, + "num_tokens": 1485468.0, + "reward": 0.2735119163990021, + "reward_std": 0.30911651253700256, "step": 75 }, { @@ -1815,22 +1815,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 315.0, - "completions/max_terminated_length": 315.0, - "completions/mean_length": 92.75, - "completions/mean_terminated_length": 106.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 200.41666666666666, + "completions/mean_terminated_length": 218.63636363636363, "completions/min_length": 0.0, - "completions/min_terminated_length": 58.0, - "epoch": 0.304, + "completions/min_terminated_length": 62.0, + "epoch": 0.152, "format_failures": 0.0, - "grad_norm": 0.005022191442549229, - "kl": 0.01964521873742342, + "grad_norm": 0.8074631094932556, + "kl": 0.45791861414909363, "learning_rate": 1e-06, - "loss": 0.0, - "num_tokens": 855560.0, - "reward": 0.0, - "reward_std": 0.0, + "loss": -0.0476, + "num_tokens": 1500636.0, + "reward": 0.17129629850387573, + "reward_std": 0.19502559304237366, "step": 76 }, { @@ -1839,22 +1839,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 1057.0, - "completions/max_terminated_length": 1057.0, - "completions/mean_length": 300.25, - "completions/mean_terminated_length": 343.14285714285717, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 144.08333333333334, + "completions/mean_terminated_length": 157.1818181818182, "completions/min_length": 0.0, - "completions/min_terminated_length": 10.0, - "epoch": 0.308, + "completions/min_terminated_length": 21.0, + "epoch": 0.154, "format_failures": 0.0, - "grad_norm": 0.6531589031219482, - "kl": 0.1464357189834118, + "grad_norm": 1.8004605770111084, + "kl": 0.32159996032714844, "learning_rate": 1e-06, - "loss": -0.0231, - "num_tokens": 873712.0, - "reward": 0.24836310744285583, - "reward_std": 0.23662379384040833, + "loss": -0.0603, + "num_tokens": 1512264.0, + "reward": 0.5055555701255798, + "reward_std": 0.29963788390159607, "step": 77 }, { @@ -1863,22 +1863,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 468.0, - "completions/max_terminated_length": 468.0, - "completions/mean_length": 271.125, - "completions/mean_terminated_length": 309.85714285714283, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 312.1666666666667, + "completions/mean_terminated_length": 340.54545454545456, "completions/min_length": 0.0, - "completions/min_terminated_length": 65.0, - "epoch": 0.312, + "completions/min_terminated_length": 170.0, + "epoch": 0.156, "format_failures": 0.0, - "grad_norm": 0.585955023765564, - "kl": 0.0404690857976675, + "grad_norm": 0.3055727481842041, + "kl": 0.03414521459490061, "learning_rate": 1e-06, - "loss": 0.0946, - "num_tokens": 882408.0, - "reward": 0.6073564291000366, - "reward_std": 0.39037758111953735, + "loss": -0.0067, + "num_tokens": 1526292.0, + "reward": 0.5897321701049805, + "reward_std": 0.2986750900745392, "step": 78 }, { @@ -1887,22 +1887,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.5, - "completions/max_length": 174.0, - "completions/max_terminated_length": 174.0, - "completions/mean_length": 75.25, - "completions/mean_terminated_length": 150.5, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 167.5, + "completions/mean_terminated_length": 182.72727272727272, "completions/min_length": 0.0, - "completions/min_terminated_length": 140.0, - "epoch": 0.316, + "completions/min_terminated_length": 165.0, + "epoch": 0.158, "format_failures": 0.0, - "grad_norm": 4.991185188293457, - "kl": 0.13191331177949905, + "grad_norm": 2.3401753902435303, + "kl": 0.03888106718659401, "learning_rate": 1e-06, - "loss": -0.1159, - "num_tokens": 891768.0, - "reward": 0.5, - "reward_std": 0.5345224738121033, + "loss": -0.0218, + "num_tokens": 1540416.0, + "reward": 0.6666666865348816, + "reward_std": 0.4923659861087799, "step": 79 }, { @@ -1911,22 +1911,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 290.0, - "completions/max_terminated_length": 290.0, - "completions/mean_length": 244.0, - "completions/mean_terminated_length": 278.85714285714283, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 210.91666666666666, + "completions/mean_terminated_length": 253.1, "completions/min_length": 0.0, - "completions/min_terminated_length": 263.0, - "epoch": 0.32, + "completions/min_terminated_length": 137.0, + "epoch": 0.16, "format_failures": 0.0, - "grad_norm": 1.556532621383667, - "kl": 0.30473417043685913, + "grad_norm": 28.73111343383789, + "kl": 15.663371562957764, "learning_rate": 1e-06, - "loss": -0.0131, - "num_tokens": 900480.0, - "reward": 0.375, - "reward_std": 0.5175491571426392, + "loss": 0.0445, + "num_tokens": 1553580.0, + "reward": 0.4305555820465088, + "reward_std": 0.4738534092903137, "step": 80 }, { @@ -1935,22 +1935,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 56.0, - "completions/max_terminated_length": 56.0, - "completions/mean_length": 37.25, - "completions/mean_terminated_length": 42.57142857142857, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 43.166666666666664, + "completions/mean_terminated_length": 47.09090909090909, "completions/min_length": 0.0, - "completions/min_terminated_length": 31.0, - "epoch": 0.324, + "completions/min_terminated_length": 32.0, + "epoch": 0.162, "format_failures": 0.0, - "grad_norm": 6.271825790405273, - "kl": 1.4292120337486267, + "grad_norm": 13.234149932861328, + "kl": 2.6492202281951904, "learning_rate": 1e-06, - "loss": -0.0008, - "num_tokens": 905384.0, - "reward": 0.09375, - "reward_std": 0.2651650309562683, + "loss": -0.0385, + "num_tokens": 1560816.0, + "reward": 0.27916666865348816, + "reward_std": 0.42504456639289856, "step": 81 }, { @@ -1959,22 +1959,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 155.0, - "completions/max_terminated_length": 155.0, - "completions/mean_length": 106.375, - "completions/mean_terminated_length": 121.57142857142857, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 189.66666666666666, + "completions/mean_terminated_length": 206.9090909090909, "completions/min_length": 0.0, - "completions/min_terminated_length": 62.0, - "epoch": 0.328, + "completions/min_terminated_length": 142.0, + "epoch": 0.164, "format_failures": 0.0, - "grad_norm": 1.0847584009170532, - "kl": 0.4334397315979004, + "grad_norm": 1.0555896759033203, + "kl": 0.060676803812384605, "learning_rate": 1e-06, - "loss": 0.0061, - "num_tokens": 912488.0, - "reward": 0.1875, - "reward_std": 0.2642374634742737, + "loss": -0.0432, + "num_tokens": 1573524.0, + "reward": 0.39722225069999695, + "reward_std": 0.2684729993343353, "step": 82 }, { @@ -1983,22 +1983,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 518.0, - "completions/max_terminated_length": 518.0, - "completions/mean_length": 445.625, - "completions/mean_terminated_length": 509.2857142857143, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 482.1666666666667, + "completions/mean_terminated_length": 526.0, "completions/min_length": 0.0, - "completions/min_terminated_length": 482.0, - "epoch": 0.332, + "completions/min_terminated_length": 479.0, + "epoch": 0.166, "format_failures": 0.0, - "grad_norm": 0.3242776393890381, - "kl": 0.028012586757540703, + "grad_norm": 0.27017322182655334, + "kl": 0.013310576789081097, "learning_rate": 1e-06, - "loss": -0.0109, - "num_tokens": 927144.0, - "reward": 0.6354166865348816, - "reward_std": 0.41770702600479126, + "loss": -0.0023, + "num_tokens": 1595796.0, + "reward": 0.8000000715255737, + "reward_std": 0.39080336689949036, "step": 83 }, { @@ -2007,22 +2007,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 162.0, - "completions/max_terminated_length": 162.0, - "completions/mean_length": 80.75, - "completions/mean_terminated_length": 92.28571428571429, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 144.91666666666666, + "completions/mean_terminated_length": 158.0909090909091, "completions/min_length": 0.0, - "completions/min_terminated_length": 60.0, - "epoch": 0.336, + "completions/min_terminated_length": 83.0, + "epoch": 0.168, "format_failures": 0.0, - "grad_norm": 2.415727376937866, - "kl": 1.3026588559150696, + "grad_norm": 1.0021555423736572, + "kl": 0.2212899848818779, "learning_rate": 1e-06, - "loss": -0.0519, - "num_tokens": 933024.0, - "reward": 0.2708333432674408, - "reward_std": 0.39778655767440796, + "loss": 0.0304, + "num_tokens": 1606284.0, + "reward": 0.2957010865211487, + "reward_std": 0.2737172842025757, "step": 84 }, { @@ -2031,22 +2031,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 509.0, - "completions/max_terminated_length": 509.0, - "completions/mean_length": 287.25, - "completions/mean_terminated_length": 328.2857142857143, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 2050.0, + "completions/max_terminated_length": 2050.0, + "completions/mean_length": 510.0833333333333, + "completions/mean_terminated_length": 556.4545454545455, "completions/min_length": 0.0, - "completions/min_terminated_length": 177.0, - "epoch": 0.34, + "completions/min_terminated_length": 216.0, + "epoch": 0.17, "format_failures": 0.0, - "grad_norm": 0.5015918612480164, - "kl": 0.07602308504283428, + "grad_norm": 0.3675689399242401, + "kl": 0.2206931747496128, "learning_rate": 1e-06, - "loss": 0.0013, - "num_tokens": 943576.0, - "reward": 0.35555556416511536, - "reward_std": 0.330837219953537, + "loss": 0.1278, + "num_tokens": 1639152.0, + "reward": 0.43888890743255615, + "reward_std": 0.2596941888332367, "step": 85 }, { @@ -2055,22 +2055,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 147.0, - "completions/max_terminated_length": 147.0, - "completions/mean_length": 92.875, - "completions/mean_terminated_length": 106.14285714285714, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 152.25, + "completions/mean_terminated_length": 166.0909090909091, "completions/min_length": 0.0, - "completions/min_terminated_length": 30.0, - "epoch": 0.344, + "completions/min_terminated_length": 117.0, + "epoch": 0.172, "format_failures": 0.0, - "grad_norm": 3162.383056640625, - "kl": 592.9862050414085, + "grad_norm": 2.8949317932128906, + "kl": 1.413679599761963, "learning_rate": 1e-06, - "loss": 4.4073, - "num_tokens": 950272.0, - "reward": 0.125, - "reward_std": 0.3535533845424652, + "loss": 0.0356, + "num_tokens": 1652364.0, + "reward": 0.4761905074119568, + "reward_std": 0.5035434365272522, "step": 86 }, { @@ -2079,22 +2079,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.5, - "completions/max_length": 159.0, - "completions/max_terminated_length": 159.0, - "completions/mean_length": 78.875, - "completions/mean_terminated_length": 157.75, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 152.91666666666666, + "completions/mean_terminated_length": 166.8181818181818, "completions/min_length": 0.0, - "completions/min_terminated_length": 154.0, - "epoch": 0.348, + "completions/min_terminated_length": 58.0, + "epoch": 0.174, "format_failures": 0.0, - "grad_norm": 5.812924385070801, - "kl": 0.03395126201212406, + "grad_norm": 1.7609695196151733, + "kl": 0.07055489160120487, "learning_rate": 1e-06, - "loss": 0.001, - "num_tokens": 956992.0, - "reward": 0.625, - "reward_std": 0.5175491571426392, + "loss": 0.3366, + "num_tokens": 1685136.0, + "reward": 0.33750003576278687, + "reward_std": 0.43647608160972595, "step": 87 }, { @@ -2103,22 +2103,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 122.0, - "completions/max_terminated_length": 122.0, - "completions/mean_length": 85.5, - "completions/mean_terminated_length": 97.71428571428571, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 135.25, + "completions/mean_terminated_length": 147.54545454545453, "completions/min_length": 0.0, - "completions/min_terminated_length": 75.0, - "epoch": 0.352, + "completions/min_terminated_length": 77.0, + "epoch": 0.176, "format_failures": 0.0, - "grad_norm": 1.021047830581665, - "kl": 0.18108929693698883, + "grad_norm": 0.6215497255325317, + "kl": 0.08650689758360386, "learning_rate": 1e-06, - "loss": 0.0333, - "num_tokens": 962232.0, - "reward": 0.5613095164299011, - "reward_std": 0.23917356133460999, + "loss": 0.0112, + "num_tokens": 1693764.0, + "reward": 0.5745911598205566, + "reward_std": 0.1768045872449875, "step": 88 }, { @@ -2127,22 +2127,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 364.0, - "completions/max_terminated_length": 364.0, - "completions/mean_length": 119.625, - "completions/mean_terminated_length": 136.71428571428572, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 140.0, + "completions/max_terminated_length": 140.0, + "completions/mean_length": 73.25, + "completions/mean_terminated_length": 79.9090909090909, "completions/min_length": 0.0, "completions/min_terminated_length": 61.0, - "epoch": 0.356, - "format_failures": 0.0, - "grad_norm": 1.7294161319732666, - "kl": 0.41833993047475815, + "epoch": 0.178, + "format_failures": 1.0, + "grad_norm": 0.8421996235847473, + "kl": 0.016213122755289078, "learning_rate": 1e-06, - "loss": 0.1262, - "num_tokens": 975584.0, - "reward": 0.109375, - "reward_std": 0.14250017702579498, + "loss": 0.0149, + "num_tokens": 1707588.0, + "reward": 0.06666667014360428, + "reward_std": 0.1775250881910324, "step": 89 }, { @@ -2151,22 +2151,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 215.0, - "completions/max_terminated_length": 215.0, - "completions/mean_length": 138.0, - "completions/mean_terminated_length": 157.71428571428572, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 178.16666666666666, + "completions/mean_terminated_length": 194.36363636363637, "completions/min_length": 0.0, - "completions/min_terminated_length": 56.0, - "epoch": 0.36, + "completions/min_terminated_length": 101.0, + "epoch": 0.18, "format_failures": 0.0, - "grad_norm": 64.84632873535156, - "kl": 29.09031867980957, + "grad_norm": 0.4202212691307068, + "kl": 0.3119240030646324, "learning_rate": 1e-06, - "loss": 0.188, - "num_tokens": 981256.0, - "reward": 0.7403273582458496, - "reward_std": 0.17907913029193878, + "loss": 0.0093, + "num_tokens": 1716792.0, + "reward": 0.6381944417953491, + "reward_std": 0.22775352001190186, "step": 90 }, { @@ -2175,22 +2175,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.25, - "completions/max_length": 195.0, - "completions/max_terminated_length": 195.0, - "completions/mean_length": 91.125, - "completions/mean_terminated_length": 121.5, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 165.58333333333334, + "completions/mean_terminated_length": 180.63636363636363, "completions/min_length": 0.0, - "completions/min_terminated_length": 64.0, - "epoch": 0.364, + "completions/min_terminated_length": 56.0, + "epoch": 0.182, "format_failures": 0.0, - "grad_norm": 84280.7421875, - "kl": 6375.002594873309, + "grad_norm": 3.5526509284973145, + "kl": 0.04295740742236376, "learning_rate": 1e-06, - "loss": 96.8935, - "num_tokens": 992408.0, - "reward": 0.1666666716337204, - "reward_std": 0.35634833574295044, + "loss": -0.007, + "num_tokens": 1735188.0, + "reward": 0.6666666865348816, + "reward_std": 0.4923659861087799, "step": 91 }, { @@ -2199,22 +2199,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 405.0, - "completions/max_terminated_length": 405.0, - "completions/mean_length": 301.625, - "completions/mean_terminated_length": 344.7142857142857, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 554.0, + "completions/max_terminated_length": 554.0, + "completions/mean_length": 296.3333333333333, + "completions/mean_terminated_length": 323.27272727272725, "completions/min_length": 0.0, - "completions/min_terminated_length": 233.0, - "epoch": 0.368, + "completions/min_terminated_length": 142.0, + "epoch": 0.184, "format_failures": 0.0, - "grad_norm": 1.2784438133239746, - "kl": 0.7278856039047241, + "grad_norm": 0.7098760008811951, + "kl": 0.14585042744874954, "learning_rate": 1e-06, - "loss": 0.1895, - "num_tokens": 1010848.0, - "reward": 0.39940476417541504, - "reward_std": 0.3344242572784424, + "loss": -0.052, + "num_tokens": 1748808.0, + "reward": 0.4570105969905853, + "reward_std": 0.29787296056747437, "step": 92 }, { @@ -2223,22 +2223,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 435.0, - "completions/max_terminated_length": 435.0, - "completions/mean_length": 275.375, - "completions/mean_terminated_length": 314.7142857142857, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 325.1666666666667, + "completions/mean_terminated_length": 354.72727272727275, "completions/min_length": 0.0, - "completions/min_terminated_length": 176.0, - "epoch": 0.372, + "completions/min_terminated_length": 233.0, + "epoch": 0.186, "format_failures": 0.0, - "grad_norm": 0.2864258289337158, - "kl": 0.08188853040337563, + "grad_norm": 4.00807523727417, + "kl": 2.2327868938446045, "learning_rate": 1e-06, - "loss": 0.0434, - "num_tokens": 1020488.0, - "reward": 0.5244791507720947, - "reward_std": 0.16294103860855103, + "loss": 0.0328, + "num_tokens": 1763196.0, + "reward": 0.37762749195098877, + "reward_std": 0.2510078251361847, "step": 93 }, { @@ -2247,22 +2247,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 105.0, - "completions/max_terminated_length": 105.0, - "completions/mean_length": 58.25, - "completions/mean_terminated_length": 66.57142857142857, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 78.66666666666667, + "completions/mean_terminated_length": 85.81818181818181, "completions/min_length": 0.0, - "completions/min_terminated_length": 43.0, - "epoch": 0.376, + "completions/min_terminated_length": 60.0, + "epoch": 0.188, "format_failures": 0.0, - "grad_norm": 22.611703872680664, - "kl": 1.769313856959343, + "grad_norm": 4.166850566864014, + "kl": 0.4828091114759445, "learning_rate": 1e-06, - "loss": 0.0731, - "num_tokens": 1028496.0, - "reward": 0.5, - "reward_std": 0.37796446681022644, + "loss": -0.0043, + "num_tokens": 1775700.0, + "reward": 0.41428571939468384, + "reward_std": 0.20157082378864288, "step": 94 }, { @@ -2271,22 +2271,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 252.0, - "completions/max_terminated_length": 252.0, - "completions/mean_length": 136.625, - "completions/mean_terminated_length": 156.14285714285714, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 163.0, + "completions/mean_terminated_length": 177.8181818181818, "completions/min_length": 0.0, - "completions/min_terminated_length": 94.0, - "epoch": 0.38, + "completions/min_terminated_length": 86.0, + "epoch": 0.19, "format_failures": 0.0, - "grad_norm": 3.4661715030670166, - "kl": 0.3033728860318661, + "grad_norm": 2.0013251304626465, + "kl": 0.3356290655210614, "learning_rate": 1e-06, - "loss": 0.0753, - "num_tokens": 1038480.0, - "reward": 0.4479166567325592, - "reward_std": 0.41052013635635376, + "loss": -0.0532, + "num_tokens": 1790064.0, + "reward": 0.4275793731212616, + "reward_std": 0.3848039209842682, "step": 95 }, { @@ -2295,22 +2295,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 142.0, - "completions/max_terminated_length": 142.0, - "completions/mean_length": 109.75, - "completions/mean_terminated_length": 125.42857142857143, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 128.33333333333334, + "completions/mean_terminated_length": 140.0, "completions/min_length": 0.0, - "completions/min_terminated_length": 66.0, - "epoch": 0.384, + "completions/min_terminated_length": 134.0, + "epoch": 0.192, "format_failures": 0.0, - "grad_norm": 2.9459471702575684, - "kl": 0.8582945615053177, + "grad_norm": 6.922305107116699, + "kl": 3.5449295742437243, "learning_rate": 1e-06, - "loss": -0.0371, - "num_tokens": 1048240.0, - "reward": 0.6180555820465088, - "reward_std": 0.42537203431129456, + "loss": 0.0385, + "num_tokens": 1803036.0, + "reward": 0.6979166865348816, + "reward_std": 0.31738603115081787, "step": 96 }, { @@ -2319,22 +2319,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 76.0, - "completions/max_terminated_length": 76.0, - "completions/mean_length": 64.0, - "completions/mean_terminated_length": 73.14285714285714, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 94.33333333333333, + "completions/mean_terminated_length": 102.9090909090909, "completions/min_length": 0.0, "completions/min_terminated_length": 72.0, - "epoch": 0.388, + "epoch": 0.194, "format_failures": 0.0, - "grad_norm": 0.43100497126579285, - "kl": 0.06553871184587479, + "grad_norm": 1.4514728784561157, + "kl": 0.1412234902381897, "learning_rate": 1e-06, - "loss": 0.0368, - "num_tokens": 1054728.0, - "reward": 0.9642857313156128, - "reward_std": 0.10101523995399475, + "loss": 0.3157, + "num_tokens": 1816092.0, + "reward": 0.8380953073501587, + "reward_std": 0.30834609270095825, "step": 97 }, { @@ -2343,20 +2343,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 66.0, - "completions/max_terminated_length": 66.0, - "completions/mean_length": 40.25, - "completions/mean_terminated_length": 46.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 82.0, + "completions/max_terminated_length": 82.0, + "completions/mean_length": 43.5, + "completions/mean_terminated_length": 47.45454545454545, "completions/min_length": 0.0, - "completions/min_terminated_length": 39.0, - "epoch": 0.392, + "completions/min_terminated_length": 31.0, + "epoch": 0.196, "format_failures": 0.0, - "grad_norm": 6.787167072296143, - "kl": 1.8237296342849731, + "grad_norm": 2.004136085510254, + "kl": 0.6110408902168274, "learning_rate": 1e-06, - "loss": 0.0307, - "num_tokens": 1060304.0, + "loss": 0.0095, + "num_tokens": 1827024.0, "reward": 0.0, "reward_std": 0.0, "step": 98 @@ -2367,22 +2367,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 344.0, - "completions/max_terminated_length": 344.0, - "completions/mean_length": 179.125, - "completions/mean_terminated_length": 204.71428571428572, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 212.08333333333334, + "completions/mean_terminated_length": 231.36363636363637, "completions/min_length": 0.0, - "completions/min_terminated_length": 150.0, - "epoch": 0.396, + "completions/min_terminated_length": 102.0, + "epoch": 0.198, "format_failures": 0.0, - "grad_norm": 1.4021135568618774, - "kl": 0.06424028240144253, + "grad_norm": 0.8370314240455627, + "kl": 0.09233395755290985, "learning_rate": 1e-06, - "loss": -0.0136, - "num_tokens": 1082672.0, - "reward": 0.490579217672348, - "reward_std": 0.34001559019088745, + "loss": 0.1438, + "num_tokens": 1860576.0, + "reward": 0.2782828211784363, + "reward_std": 0.2644941210746765, "step": 99 }, { @@ -2391,22 +2391,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.625, - "completions/max_length": 189.0, - "completions/max_terminated_length": 189.0, - "completions/mean_length": 52.5, - "completions/mean_terminated_length": 140.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 163.25, + "completions/mean_terminated_length": 178.0909090909091, "completions/min_length": 0.0, "completions/min_terminated_length": 101.0, - "epoch": 0.4, + "epoch": 0.2, "format_failures": 0.0, - "grad_norm": 4.928287982940674, - "kl": 0.6296049430966377, + "grad_norm": 1.565374732017517, + "kl": 0.391565203666687, "learning_rate": 1e-06, - "loss": -0.2632, - "num_tokens": 1090648.0, - "reward": 0.75, - "reward_std": 0.4629100561141968, + "loss": -0.0497, + "num_tokens": 1872996.0, + "reward": 0.5944445133209229, + "reward_std": 0.47775429487228394, "step": 100 }, { @@ -2415,22 +2415,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 104.0, - "completions/max_terminated_length": 104.0, - "completions/mean_length": 73.125, - "completions/mean_terminated_length": 83.57142857142857, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 150.16666666666666, + "completions/mean_terminated_length": 163.8181818181818, "completions/min_length": 0.0, - "completions/min_terminated_length": 70.0, - "epoch": 0.404, - "format_failures": 1.0, - "grad_norm": 1.0927364826202393, - "kl": 0.4457448348402977, + "completions/min_terminated_length": 94.0, + "epoch": 0.202, + "format_failures": 0.0, + "grad_norm": 1.6569881439208984, + "kl": 0.24375841114670038, "learning_rate": 1e-06, - "loss": 0.0054, - "num_tokens": 1099240.0, - "reward": 0.0, - "reward_std": 0.0, + "loss": 0.0387, + "num_tokens": 1892856.0, + "reward": 0.3499999940395355, + "reward_std": 0.36666667461395264, "step": 101 }, { @@ -2439,22 +2439,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 141.0, - "completions/max_terminated_length": 141.0, - "completions/mean_length": 100.625, - "completions/mean_terminated_length": 115.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 107.66666666666667, + "completions/mean_terminated_length": 117.45454545454545, "completions/min_length": 0.0, - "completions/min_terminated_length": 90.0, - "epoch": 0.408, + "completions/min_terminated_length": 93.0, + "epoch": 0.204, "format_failures": 0.0, - "grad_norm": 0.3484640419483185, - "kl": 0.014615435153245926, + "grad_norm": 0.9490823745727539, + "kl": 0.010788497282192111, "learning_rate": 1e-06, - "loss": -0.0011, - "num_tokens": 1106512.0, - "reward": 0.8717262148857117, - "reward_std": 0.1315489560365677, + "loss": 0.0193, + "num_tokens": 1903992.0, + "reward": 0.7714947462081909, + "reward_std": 0.2890874743461609, "step": 102 }, { @@ -2463,22 +2463,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 120.0, - "completions/max_terminated_length": 120.0, - "completions/mean_length": 79.0, - "completions/mean_terminated_length": 90.28571428571429, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 110.0, + "completions/max_terminated_length": 110.0, + "completions/mean_length": 66.0, + "completions/mean_terminated_length": 72.0, "completions/min_length": 0.0, - "completions/min_terminated_length": 69.0, - "epoch": 0.412, + "completions/min_terminated_length": 24.0, + "epoch": 0.206, "format_failures": 0.0, - "grad_norm": 2.578859329223633, - "kl": 0.05575744202360511, + "grad_norm": 1.482935905456543, + "kl": 0.03114949818700552, "learning_rate": 1e-06, - "loss": -0.1115, - "num_tokens": 1113224.0, - "reward": 0.53125, - "reward_std": 0.41052013635635376, + "loss": -0.0754, + "num_tokens": 1913640.0, + "reward": 0.3333333432674408, + "reward_std": 0.32566946744918823, "step": 103 }, { @@ -2487,22 +2487,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 299.0, - "completions/max_terminated_length": 299.0, - "completions/mean_length": 224.125, - "completions/mean_terminated_length": 256.14285714285717, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 260.5833333333333, + "completions/mean_terminated_length": 284.27272727272725, "completions/min_length": 0.0, - "completions/min_terminated_length": 180.0, - "epoch": 0.416, + "completions/min_terminated_length": 197.0, + "epoch": 0.208, "format_failures": 0.0, - "grad_norm": 0.1279614269733429, - "kl": 0.008564054034650326, + "grad_norm": 0.4501963257789612, + "kl": 0.011977697955444455, "learning_rate": 1e-06, - "loss": -0.0026, - "num_tokens": 1120832.0, - "reward": 0.6416666507720947, - "reward_std": 0.08864051848649979, + "loss": -0.0496, + "num_tokens": 1932468.0, + "reward": 0.37487921118736267, + "reward_std": 0.29262858629226685, "step": 104 }, { @@ -2511,22 +2511,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, + "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, - "completions/mean_length": 120.75, - "completions/mean_terminated_length": 138.0, + "completions/mean_length": 113.91666666666667, + "completions/mean_terminated_length": 136.7, "completions/min_length": 0.0, - "completions/min_terminated_length": 125.0, - "epoch": 0.42, + "completions/min_terminated_length": 120.0, + "epoch": 0.21, "format_failures": 0.0, - "grad_norm": 2.8721704483032227, - "kl": 0.028846602886915207, + "grad_norm": 3.2958946228027344, + "kl": 0.024902154691517353, "learning_rate": 1e-06, - "loss": 0.1163, - "num_tokens": 1129032.0, - "reward": 0.5833333730697632, - "reward_std": 0.49601587653160095, + "loss": 0.0181, + "num_tokens": 1942992.0, + "reward": 0.5, + "reward_std": 0.5222329497337341, "step": 105 }, { @@ -2535,22 +2535,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 92.0, - "completions/max_terminated_length": 92.0, - "completions/mean_length": 61.0, - "completions/mean_terminated_length": 69.71428571428571, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 166.0, + "completions/mean_terminated_length": 181.0909090909091, "completions/min_length": 0.0, - "completions/min_terminated_length": 56.0, - "epoch": 0.424, + "completions/min_terminated_length": 57.0, + "epoch": 0.212, "format_failures": 0.0, - "grad_norm": 0.4012051820755005, - "kl": 0.13534526526927948, + "grad_norm": 1.3716078996658325, + "kl": 1.098541870713234, "learning_rate": 1e-06, - "loss": 0.0023, - "num_tokens": 1137584.0, - "reward": 0.0, - "reward_std": 0.0, + "loss": 0.0299, + "num_tokens": 1964208.0, + "reward": 0.07500000298023224, + "reward_std": 0.17645499110221863, "step": 106 }, { @@ -2559,22 +2559,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 266.0, - "completions/max_terminated_length": 266.0, - "completions/mean_length": 169.0, - "completions/mean_terminated_length": 193.14285714285714, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 171.58333333333334, + "completions/mean_terminated_length": 187.1818181818182, "completions/min_length": 0.0, - "completions/min_terminated_length": 87.0, - "epoch": 0.428, - "format_failures": 0.0, - "grad_norm": 0.34922441840171814, - "kl": 0.014531925320625305, + "completions/min_terminated_length": 100.0, + "epoch": 0.214, + "format_failures": 2.0, + "grad_norm": 0.27850034832954407, + "kl": 0.020487794652581215, "learning_rate": 1e-06, - "loss": 0.0412, - "num_tokens": 1144344.0, - "reward": 0.3263888955116272, - "reward_std": 0.19911068677902222, + "loss": 0.0329, + "num_tokens": 1974972.0, + "reward": 0.4126984477043152, + "reward_std": 0.18834668397903442, "step": 107 }, { @@ -2583,22 +2583,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, + "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, - "completions/mean_length": 44.75, - "completions/mean_terminated_length": 51.142857142857146, + "completions/mean_length": 45.416666666666664, + "completions/mean_terminated_length": 49.54545454545455, "completions/min_length": 0.0, - "completions/min_terminated_length": 50.0, - "epoch": 0.432, + "completions/min_terminated_length": 34.0, + "epoch": 0.216, "format_failures": 0.0, - "grad_norm": 0.8536809682846069, - "kl": 0.01497908541932702, + "grad_norm": 2.118313789367676, + "kl": 0.03025034721940756, "learning_rate": 1e-06, - "loss": 0.0128, - "num_tokens": 1148760.0, - "reward": 0.90625, - "reward_std": 0.2651650309562683, + "loss": 0.0001, + "num_tokens": 1981716.0, + "reward": 0.8333333730697632, + "reward_std": 0.38924944400787354, "step": 108 }, { @@ -2607,22 +2607,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 110.0, - "completions/max_terminated_length": 110.0, - "completions/mean_length": 85.375, - "completions/mean_terminated_length": 97.57142857142857, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 117.5, + "completions/mean_terminated_length": 128.1818181818182, "completions/min_length": 0.0, - "completions/min_terminated_length": 79.0, - "epoch": 0.436, + "completions/min_terminated_length": 88.0, + "epoch": 0.218, "format_failures": 0.0, - "grad_norm": 3.196063995361328, - "kl": 0.09259714558720589, + "grad_norm": 1.9193243980407715, + "kl": 0.04295819811522961, "learning_rate": 1e-06, - "loss": -0.0254, - "num_tokens": 1154816.0, - "reward": 0.375, - "reward_std": 0.5175491571426392, + "loss": 0.009, + "num_tokens": 1992420.0, + "reward": 0.701388955116272, + "reward_std": 0.38302528858184814, "step": 109 }, { @@ -2631,22 +2631,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 122.0, - "completions/max_terminated_length": 122.0, - "completions/mean_length": 93.125, - "completions/mean_terminated_length": 106.42857142857143, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 131.0, + "completions/max_terminated_length": 131.0, + "completions/mean_length": 108.66666666666667, + "completions/mean_terminated_length": 118.54545454545455, "completions/min_length": 0.0, - "completions/min_terminated_length": 28.0, - "epoch": 0.44, + "completions/min_terminated_length": 92.0, + "epoch": 0.22, "format_failures": 0.0, - "grad_norm": 2.7271082401275635, - "kl": 0.04449745221063495, + "grad_norm": 4.0581183433532715, + "kl": 0.34252697695046663, "learning_rate": 1e-06, - "loss": -0.0126, - "num_tokens": 1163208.0, - "reward": 0.4166666865348816, - "reward_std": 0.34503278136253357, + "loss": -0.014, + "num_tokens": 2004288.0, + "reward": 0.479166716337204, + "reward_std": 0.30592837929725647, "step": 110 }, { @@ -2655,20 +2655,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 147.0, - "completions/max_terminated_length": 147.0, - "completions/mean_length": 114.5, - "completions/mean_terminated_length": 130.85714285714286, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 129.0, + "completions/mean_terminated_length": 140.72727272727272, "completions/min_length": 0.0, - "completions/min_terminated_length": 62.0, - "epoch": 0.444, + "completions/min_terminated_length": 112.0, + "epoch": 0.222, "format_failures": 0.0, - "grad_norm": 0.10579583793878555, - "kl": 0.055604600347578526, + "grad_norm": 2.901212692260742, + "kl": 0.451558455824852, "learning_rate": 1e-06, - "loss": 0.0006, - "num_tokens": 1174056.0, + "loss": 0.0047, + "num_tokens": 2021400.0, "reward": 0.0, "reward_std": 0.0, "step": 111 @@ -2679,22 +2679,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 173.0, - "completions/max_terminated_length": 173.0, - "completions/mean_length": 146.375, - "completions/mean_terminated_length": 167.28571428571428, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 147.08333333333334, + "completions/mean_terminated_length": 160.45454545454547, "completions/min_length": 0.0, - "completions/min_terminated_length": 164.0, - "epoch": 0.448, + "completions/min_terminated_length": 65.0, + "epoch": 0.224, "format_failures": 0.0, - "grad_norm": 0.41507479548454285, - "kl": 0.019602006301283836, + "grad_norm": 3.0557456016540527, + "kl": 0.1749698342755437, "learning_rate": 1e-06, - "loss": 0.0037, - "num_tokens": 1181760.0, - "reward": 0.9750000238418579, - "reward_std": 0.0707106813788414, + "loss": 0.0461, + "num_tokens": 2033580.0, + "reward": 0.7708333730697632, + "reward_std": 0.32784304022789, "step": 112 }, { @@ -2703,22 +2703,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 59.0, - "completions/max_terminated_length": 59.0, - "completions/mean_length": 46.75, - "completions/mean_terminated_length": 53.42857142857143, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 81.75, + "completions/mean_terminated_length": 89.18181818181819, "completions/min_length": 0.0, "completions/min_terminated_length": 41.0, - "epoch": 0.452, + "epoch": 0.226, "format_failures": 0.0, - "grad_norm": 2.7538251876831055, - "kl": 0.05537968873977661, + "grad_norm": 2.929105281829834, + "kl": 1.0704956352710724, "learning_rate": 1e-06, - "loss": -0.0324, - "num_tokens": 1187360.0, - "reward": 0.5416666865348816, - "reward_std": 0.5019802451133728, + "loss": -0.1432, + "num_tokens": 2065740.0, + "reward": 0.6625000238418579, + "reward_std": 0.3711928129196167, "step": 113 }, { @@ -2727,22 +2727,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 654.0, - "completions/max_terminated_length": 654.0, - "completions/mean_length": 341.875, - "completions/mean_terminated_length": 390.7142857142857, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 420.5, + "completions/mean_terminated_length": 458.72727272727275, "completions/min_length": 0.0, - "completions/min_terminated_length": 190.0, - "epoch": 0.456, + "completions/min_terminated_length": 171.0, + "epoch": 0.228, "format_failures": 0.0, - "grad_norm": 0.6517180800437927, - "kl": 0.01990941632539034, + "grad_norm": 0.966941237449646, + "kl": 0.012734876945614815, "learning_rate": 1e-06, - "loss": -0.0628, - "num_tokens": 1200928.0, - "reward": 0.3812499940395355, - "reward_std": 0.4225243031978607, + "loss": -0.0432, + "num_tokens": 2101236.0, + "reward": 0.6500000357627869, + "reward_std": 0.40886637568473816, "step": 114 }, { @@ -2751,22 +2751,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.25, - "completions/max_length": 284.0, - "completions/max_terminated_length": 284.0, - "completions/mean_length": 212.625, - "completions/mean_terminated_length": 283.5, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 263.75, + "completions/mean_terminated_length": 287.72727272727275, "completions/min_length": 0.0, - "completions/min_terminated_length": 281.0, - "epoch": 0.46, + "completions/min_terminated_length": 280.0, + "epoch": 0.23, "format_failures": 0.0, - "grad_norm": 2.6183741092681885, - "kl": 0.3156433766707778, + "grad_norm": 7.276376247406006, + "kl": 2.2721076011657715, "learning_rate": 1e-06, - "loss": -0.0008, - "num_tokens": 1209616.0, - "reward": 0.875, - "reward_std": 0.3535533845424652, + "loss": 0.0151, + "num_tokens": 2114484.0, + "reward": 0.7777778506278992, + "reward_std": 0.3576955795288086, "step": 115 }, { @@ -2775,22 +2775,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 187.0, - "completions/max_terminated_length": 187.0, - "completions/mean_length": 127.625, - "completions/mean_terminated_length": 145.85714285714286, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 167.41666666666666, + "completions/mean_terminated_length": 182.63636363636363, "completions/min_length": 0.0, - "completions/min_terminated_length": 24.0, - "epoch": 0.464, + "completions/min_terminated_length": 147.0, + "epoch": 0.232, "format_failures": 0.0, - "grad_norm": 1.40922212600708, - "kl": 0.3603953216224909, + "grad_norm": 0.6819717884063721, + "kl": 0.020047412253916264, "learning_rate": 1e-06, - "loss": -0.1, - "num_tokens": 1216840.0, - "reward": 0.375, - "reward_std": 0.4520675837993622, + "loss": 0.0179, + "num_tokens": 2125992.0, + "reward": 0.8819445371627808, + "reward_std": 0.2524084150791168, "step": 116 }, { @@ -2799,22 +2799,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 240.0, - "completions/max_terminated_length": 240.0, - "completions/mean_length": 172.875, - "completions/mean_terminated_length": 197.57142857142858, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 211.33333333333334, + "completions/mean_terminated_length": 230.54545454545453, "completions/min_length": 0.0, - "completions/min_terminated_length": 143.0, - "epoch": 0.468, + "completions/min_terminated_length": 147.0, + "epoch": 0.234, "format_failures": 0.0, - "grad_norm": 0.5828225612640381, - "kl": 0.013718126341700554, + "grad_norm": 0.19310350716114044, + "kl": 0.019224281422793865, "learning_rate": 1e-06, - "loss": -0.0188, - "num_tokens": 1223464.0, - "reward": 0.3083333373069763, - "reward_std": 0.2980092763900757, + "loss": 0.012, + "num_tokens": 2137692.0, + "reward": 0.585936427116394, + "reward_std": 0.09784586727619171, "step": 117 }, { @@ -2823,22 +2823,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 296.0, - "completions/max_terminated_length": 296.0, - "completions/mean_length": 113.0, - "completions/mean_terminated_length": 129.14285714285714, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 142.16666666666666, + "completions/mean_terminated_length": 155.0909090909091, "completions/min_length": 0.0, - "completions/min_terminated_length": 59.0, - "epoch": 0.472, + "completions/min_terminated_length": 110.0, + "epoch": 0.236, "format_failures": 0.0, - "grad_norm": 1.907884120941162, - "kl": 0.16990539245307446, + "grad_norm": 2.085691213607788, + "kl": 0.09273007325828075, "learning_rate": 1e-06, - "loss": 0.1637, - "num_tokens": 1231632.0, - "reward": 0.265625, - "reward_std": 0.45531338453292847, + "loss": 0.0139, + "num_tokens": 2148816.0, + "reward": 0.319444477558136, + "reward_std": 0.2289450317621231, "step": 118 }, { @@ -2847,22 +2847,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 289.0, - "completions/max_terminated_length": 289.0, - "completions/mean_length": 139.125, - "completions/mean_terminated_length": 159.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 317.0833333333333, + "completions/mean_terminated_length": 345.90909090909093, "completions/min_length": 0.0, - "completions/min_terminated_length": 89.0, - "epoch": 0.476, + "completions/min_terminated_length": 140.0, + "epoch": 0.238, "format_failures": 0.0, - "grad_norm": 0.5671705007553101, - "kl": 0.0328083336353302, + "grad_norm": 0.37083595991134644, + "kl": 0.0630851686000824, "learning_rate": 1e-06, - "loss": 0.1641, - "num_tokens": 1242688.0, - "reward": 0.6208333373069763, - "reward_std": 0.3646862208843231, + "loss": 0.0918, + "num_tokens": 2168256.0, + "reward": 0.37870368361473083, + "reward_std": 0.2895275950431824, "step": 119 }, { @@ -2871,22 +2871,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 123.0, - "completions/max_terminated_length": 123.0, - "completions/mean_length": 74.375, - "completions/mean_terminated_length": 85.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 126.66666666666667, + "completions/mean_terminated_length": 138.1818181818182, "completions/min_length": 0.0, - "completions/min_terminated_length": 20.0, - "epoch": 0.48, + "completions/min_terminated_length": 58.0, + "epoch": 0.24, "format_failures": 0.0, - "grad_norm": 6.129162788391113, - "kl": 2.631644606590271, + "grad_norm": 6.606923580169678, + "kl": 3.8295647501945496, "learning_rate": 1e-06, - "loss": -0.0072, - "num_tokens": 1250712.0, - "reward": 0.0833333358168602, - "reward_std": 0.2357022762298584, + "loss": 0.1365, + "num_tokens": 2183124.0, + "reward": 0.4027777910232544, + "reward_std": 0.3723955750465393, "step": 120 }, { @@ -2895,22 +2895,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.5, + "completions/clipped_ratio": 0.5833333333333333, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, - "completions/mean_length": 33.25, - "completions/mean_terminated_length": 66.5, + "completions/mean_length": 32.083333333333336, + "completions/mean_terminated_length": 77.0, "completions/min_length": 0.0, - "completions/min_terminated_length": 35.0, - "epoch": 0.484, + "completions/min_terminated_length": 77.0, + "epoch": 0.242, "format_failures": 0.0, - "grad_norm": 2.2025108337402344, - "kl": 0.009498461615294218, + "grad_norm": 0.08047831058502197, + "kl": 0.013985397294163704, "learning_rate": 1e-06, - "loss": -0.1436, - "num_tokens": 1255560.0, - "reward": 0.875, - "reward_std": 0.3535533845424652, + "loss": 0.0003, + "num_tokens": 2190396.0, + "reward": 1.0, + "reward_std": 0.0, "step": 121 }, { @@ -2919,20 +2919,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.25, - "completions/max_length": 582.0, - "completions/max_terminated_length": 582.0, - "completions/mean_length": 367.875, - "completions/mean_terminated_length": 490.5, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 595.0, + "completions/max_terminated_length": 595.0, + "completions/mean_length": 431.0833333333333, + "completions/mean_terminated_length": 470.27272727272725, "completions/min_length": 0.0, "completions/min_terminated_length": 109.0, - "epoch": 0.488, + "epoch": 0.244, "format_failures": 0.0, - "grad_norm": 0.6904863715171814, - "kl": 0.20124347042292356, + "grad_norm": 0.019394446164369583, + "kl": 0.01961024198681116, "learning_rate": 1e-06, - "loss": 0.0011, - "num_tokens": 1273976.0, + "loss": 0.0001, + "num_tokens": 2218320.0, "reward": 0.0, "reward_std": 0.0, "step": 122 @@ -2943,22 +2943,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 346.0, - "completions/max_terminated_length": 346.0, - "completions/mean_length": 230.25, - "completions/mean_terminated_length": 263.14285714285717, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 284.9166666666667, + "completions/mean_terminated_length": 310.8181818181818, "completions/min_length": 0.0, - "completions/min_terminated_length": 135.0, - "epoch": 0.492, + "completions/min_terminated_length": 118.0, + "epoch": 0.246, "format_failures": 0.0, - "grad_norm": 0.813983142375946, - "kl": 0.09101713076233864, + "grad_norm": 1.5184653997421265, + "kl": 1.0404187738895416, "learning_rate": 1e-06, - "loss": 0.0777, - "num_tokens": 1282880.0, - "reward": 0.6432539820671082, - "reward_std": 0.3272421360015869, + "loss": -0.0335, + "num_tokens": 2231256.0, + "reward": 0.4014219641685486, + "reward_std": 0.31073111295700073, "step": 123 }, { @@ -2967,22 +2967,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 52.0, - "completions/max_terminated_length": 52.0, - "completions/mean_length": 41.75, - "completions/mean_terminated_length": 47.714285714285715, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 64.75, + "completions/mean_terminated_length": 70.63636363636364, "completions/min_length": 0.0, "completions/min_terminated_length": 34.0, - "epoch": 0.496, + "epoch": 0.248, "format_failures": 0.0, - "grad_norm": 4.916449546813965, - "kl": 0.6664696265943348, + "grad_norm": 1.6326740980148315, + "kl": 0.3745545968413353, "learning_rate": 1e-06, - "loss": -0.0205, - "num_tokens": 1288032.0, - "reward": 0.8125, - "reward_std": 0.3720118999481201, + "loss": 0.0517, + "num_tokens": 2240424.0, + "reward": 0.8037037253379822, + "reward_std": 0.3365945816040039, "step": 124 }, { @@ -2991,22 +2991,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.625, - "completions/max_length": 111.0, - "completions/max_terminated_length": 111.0, - "completions/mean_length": 37.0, - "completions/mean_terminated_length": 98.66666666666667, + "completions/clipped_ratio": 0.6666666666666667, + "completions/max_length": 117.0, + "completions/max_terminated_length": 117.0, + "completions/mean_length": 37.75, + "completions/mean_terminated_length": 113.25, "completions/min_length": 0.0, - "completions/min_terminated_length": 87.0, - "epoch": 0.5, + "completions/min_terminated_length": 102.0, + "epoch": 0.25, "format_failures": 0.0, - "grad_norm": 11.985437393188477, - "kl": 2.047822058200836, + "grad_norm": 10.052517890930176, + "kl": 1.53599963337183, "learning_rate": 1e-06, - "loss": -0.0949, - "num_tokens": 1294032.0, - "reward": 0.625, - "reward_std": 0.5175491571426392, + "loss": -0.0049, + "num_tokens": 2249424.0, + "reward": 0.9166666865348816, + "reward_std": 0.28867512941360474, "step": 125 }, { @@ -3015,20 +3015,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.375, - "completions/max_length": 226.0, - "completions/max_terminated_length": 226.0, - "completions/mean_length": 139.5, - "completions/mean_terminated_length": 223.2, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 199.5, + "completions/mean_terminated_length": 217.63636363636363, "completions/min_length": 0.0, - "completions/min_terminated_length": 221.0, - "epoch": 0.504, + "completions/min_terminated_length": 160.0, + "epoch": 0.252, "format_failures": 0.0, - "grad_norm": 0.06410921365022659, - "kl": 0.024711698293685913, + "grad_norm": 1.1388990879058838, + "kl": 0.24531831266358495, "learning_rate": 1e-06, - "loss": 0.0002, - "num_tokens": 1302656.0, + "loss": 0.0013, + "num_tokens": 2263584.0, "reward": 0.0, "reward_std": 0.0, "step": 126 @@ -3039,22 +3039,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.625, - "completions/max_length": 134.0, - "completions/max_terminated_length": 134.0, - "completions/mean_length": 50.25, - "completions/mean_terminated_length": 134.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 125.0, + "completions/mean_terminated_length": 136.36363636363637, "completions/min_length": 0.0, - "completions/min_terminated_length": 134.0, - "epoch": 0.508, + "completions/min_terminated_length": 123.0, + "epoch": 0.254, "format_failures": 0.0, - "grad_norm": 5.750870704650879, - "kl": 0.32033737003803253, + "grad_norm": 2.392914056777954, + "kl": 0.9988721050322056, "learning_rate": 1e-06, - "loss": 0.0008, - "num_tokens": 1311200.0, - "reward": 0.78125, - "reward_std": 0.33905068039894104, + "loss": -0.0025, + "num_tokens": 2276520.0, + "reward": 0.7291666865348816, + "reward_std": 0.3608439266681671, "step": 127 }, { @@ -3063,22 +3063,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 187.0, - "completions/max_terminated_length": 187.0, - "completions/mean_length": 140.0, - "completions/mean_terminated_length": 160.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 134.08333333333334, + "completions/mean_terminated_length": 146.27272727272728, "completions/min_length": 0.0, - "completions/min_terminated_length": 130.0, - "epoch": 0.512, + "completions/min_terminated_length": 106.0, + "epoch": 0.256, "format_failures": 0.0, - "grad_norm": 2.3594982624053955, - "kl": 0.27750419452786446, + "grad_norm": 0.5191885828971863, + "kl": 0.20999768376350403, "learning_rate": 1e-06, - "loss": 0.1238, - "num_tokens": 1319216.0, - "reward": 0.6676406860351562, - "reward_std": 0.22850170731544495, + "loss": 0.0146, + "num_tokens": 2286408.0, + "reward": 0.717815101146698, + "reward_std": 0.14373189210891724, "step": 128 }, { @@ -3087,22 +3087,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 154.0, - "completions/max_terminated_length": 154.0, - "completions/mean_length": 109.625, - "completions/mean_terminated_length": 125.28571428571429, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 137.75, + "completions/mean_terminated_length": 150.27272727272728, "completions/min_length": 0.0, - "completions/min_terminated_length": 90.0, - "epoch": 0.516, + "completions/min_terminated_length": 98.0, + "epoch": 0.258, "format_failures": 0.0, - "grad_norm": 0.7231677174568176, - "kl": 0.06682828813791275, + "grad_norm": 1.204528570175171, + "kl": 0.08800000417977571, "learning_rate": 1e-06, - "loss": 0.0391, - "num_tokens": 1325216.0, - "reward": 0.6453869342803955, - "reward_std": 0.17804734408855438, + "loss": 0.0511, + "num_tokens": 2296044.0, + "reward": 0.5675595998764038, + "reward_std": 0.2289842963218689, "step": 129 }, { @@ -3111,22 +3111,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 131.0, - "completions/max_terminated_length": 131.0, - "completions/mean_length": 87.25, - "completions/mean_terminated_length": 99.71428571428571, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 124.58333333333333, + "completions/mean_terminated_length": 135.9090909090909, "completions/min_length": 0.0, - "completions/min_terminated_length": 79.0, - "epoch": 0.52, + "completions/min_terminated_length": 54.0, + "epoch": 0.26, "format_failures": 0.0, - "grad_norm": 5.297896385192871, - "kl": 0.6651033144444227, + "grad_norm": 0.44312867522239685, + "kl": 0.07202759943902493, "learning_rate": 1e-06, - "loss": 0.0091, - "num_tokens": 1330968.0, - "reward": 0.5601190328598022, - "reward_std": 0.13645371794700623, + "loss": 0.0475, + "num_tokens": 2305644.0, + "reward": 0.5101972222328186, + "reward_std": 0.19489067792892456, "step": 130 }, { @@ -3135,22 +3135,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.75, - "completions/max_length": 311.0, - "completions/max_terminated_length": 311.0, - "completions/mean_length": 77.25, - "completions/mean_terminated_length": 309.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 281.1666666666667, + "completions/mean_terminated_length": 306.72727272727275, "completions/min_length": 0.0, - "completions/min_terminated_length": 307.0, - "epoch": 0.524, - "format_failures": 0.0, - "grad_norm": 0.8595375418663025, - "kl": 0.06659615971148014, + "completions/min_terminated_length": 253.0, + "epoch": 0.262, + "format_failures": 1.0, + "grad_norm": 1.5526983737945557, + "kl": 0.06795010529458523, "learning_rate": 1e-06, - "loss": 0.0004, - "num_tokens": 1339728.0, - "reward": 0.9583333730697632, - "reward_std": 0.11785111576318741, + "loss": -0.0019, + "num_tokens": 2319192.0, + "reward": 0.75, + "reward_std": 0.3217690885066986, "step": 131 }, { @@ -3159,22 +3159,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.75, - "completions/max_length": 175.0, - "completions/max_terminated_length": 175.0, - "completions/mean_length": 41.875, - "completions/mean_terminated_length": 167.5, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 162.83333333333334, + "completions/mean_terminated_length": 177.63636363636363, "completions/min_length": 0.0, - "completions/min_terminated_length": 160.0, - "epoch": 0.528, + "completions/min_terminated_length": 175.0, + "epoch": 0.264, "format_failures": 0.0, - "grad_norm": 347.5839538574219, - "kl": 61.65154816582799, + "grad_norm": 2.740288257598877, + "kl": 0.7462278339080513, "learning_rate": 1e-06, - "loss": 1.1776, - "num_tokens": 1346528.0, - "reward": 0.875, - "reward_std": 0.3535533845424652, + "loss": 0.0045, + "num_tokens": 2329488.0, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, "step": 132 }, { @@ -3183,22 +3183,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 425.0, - "completions/max_terminated_length": 425.0, - "completions/mean_length": 296.125, - "completions/mean_terminated_length": 338.42857142857144, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 315.5, + "completions/mean_terminated_length": 344.1818181818182, "completions/min_length": 0.0, - "completions/min_terminated_length": 224.0, - "epoch": 0.532, + "completions/min_terminated_length": 233.0, + "epoch": 0.266, "format_failures": 0.0, - "grad_norm": 0.43614092469215393, - "kl": 0.10557529516518116, + "grad_norm": 0.11069951951503754, + "kl": 0.01982728624716401, "learning_rate": 1e-06, - "loss": -0.0366, - "num_tokens": 1355920.0, - "reward": 0.5107142925262451, - "reward_std": 0.21715763211250305, + "loss": -0.034, + "num_tokens": 2358276.0, + "reward": 0.5852844715118408, + "reward_std": 0.12080158293247223, "step": 133 }, { @@ -3207,22 +3207,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 257.0, - "completions/max_terminated_length": 257.0, - "completions/mean_length": 149.625, - "completions/mean_terminated_length": 171.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 161.16666666666666, + "completions/mean_terminated_length": 175.8181818181818, "completions/min_length": 0.0, - "completions/min_terminated_length": 116.0, - "epoch": 0.536, + "completions/min_terminated_length": 104.0, + "epoch": 0.268, "format_failures": 0.0, - "grad_norm": 2.284273624420166, - "kl": 0.06671860627830029, + "grad_norm": 0.8276861906051636, + "kl": 0.09472572058439255, "learning_rate": 1e-06, - "loss": 0.0794, - "num_tokens": 1363104.0, - "reward": 0.663690447807312, - "reward_std": 0.2778385877609253, + "loss": 0.0149, + "num_tokens": 2368980.0, + "reward": 0.6518849730491638, + "reward_std": 0.2886110842227936, "step": 134 }, { @@ -3231,22 +3231,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 358.0, - "completions/max_terminated_length": 358.0, - "completions/mean_length": 215.625, - "completions/mean_terminated_length": 246.42857142857142, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 227.08333333333334, + "completions/mean_terminated_length": 247.72727272727272, "completions/min_length": 0.0, - "completions/min_terminated_length": 162.0, - "epoch": 0.54, + "completions/min_terminated_length": 136.0, + "epoch": 0.27, "format_failures": 0.0, - "grad_norm": 0.8071838021278381, - "kl": 0.046601174399256706, + "grad_norm": 0.5550012588500977, + "kl": 0.02074157353490591, "learning_rate": 1e-06, - "loss": -0.0342, - "num_tokens": 1370928.0, - "reward": 0.6100694537162781, - "reward_std": 0.3873949348926544, + "loss": -0.0841, + "num_tokens": 2379828.0, + "reward": 0.6243386268615723, + "reward_std": 0.3905191719532013, "step": 135 }, { @@ -3255,22 +3255,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.625, - "completions/max_length": 228.0, - "completions/max_terminated_length": 228.0, - "completions/mean_length": 85.5, - "completions/mean_terminated_length": 228.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 210.0, + "completions/mean_terminated_length": 229.0909090909091, "completions/min_length": 0.0, "completions/min_terminated_length": 228.0, - "epoch": 0.544, + "epoch": 0.272, "format_failures": 0.0, - "grad_norm": 0.14378634095191956, - "kl": 0.04612975288182497, + "grad_norm": 1.019722580909729, + "kl": 0.13905800506472588, "learning_rate": 1e-06, - "loss": 0.0006, - "num_tokens": 1380216.0, - "reward": 1.0, - "reward_std": 0.0, + "loss": 0.0123, + "num_tokens": 2394360.0, + "reward": 0.949999988079071, + "reward_std": 0.17320507764816284, "step": 136 }, { @@ -3279,22 +3279,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 228.0, - "completions/max_terminated_length": 228.0, - "completions/mean_length": 137.0, - "completions/mean_terminated_length": 156.57142857142858, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 215.0, + "completions/mean_terminated_length": 234.54545454545453, "completions/min_length": 0.0, - "completions/min_terminated_length": 112.0, - "epoch": 0.548, + "completions/min_terminated_length": 145.0, + "epoch": 0.274, "format_failures": 0.0, - "grad_norm": 0.8862031698226929, - "kl": 0.07590018585324287, + "grad_norm": 0.32402342557907104, + "kl": 0.014864406548440456, "learning_rate": 1e-06, - "loss": -0.0377, - "num_tokens": 1386992.0, - "reward": 0.5406250357627869, - "reward_std": 0.23373425006866455, + "loss": -0.0012, + "num_tokens": 2406096.0, + "reward": 0.6149470806121826, + "reward_std": 0.19829140603542328, "step": 137 }, { @@ -3303,22 +3303,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.125, - "completions/max_length": 148.0, - "completions/max_terminated_length": 148.0, - "completions/mean_length": 73.5, - "completions/mean_terminated_length": 84.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 136.58333333333334, + "completions/mean_terminated_length": 149.0, "completions/min_length": 0.0, "completions/min_terminated_length": 58.0, - "epoch": 0.552, + "epoch": 0.276, "format_failures": 0.0, - "grad_norm": 813.8618774414062, - "kl": 83.35732051730156, + "grad_norm": 1.005679965019226, + "kl": 0.023909798823297024, "learning_rate": 1e-06, - "loss": 1.2934, - "num_tokens": 1396808.0, - "reward": 0.4833333492279053, - "reward_std": 0.4804098606109619, + "loss": -0.0608, + "num_tokens": 2423568.0, + "reward": 0.5231481790542603, + "reward_std": 0.3425479829311371, "step": 138 }, { @@ -3327,22 +3327,22 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.5, - "completions/max_length": 181.0, - "completions/max_terminated_length": 181.0, - "completions/mean_length": 68.875, - "completions/mean_terminated_length": 137.75, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 165.58333333333334, + "completions/mean_terminated_length": 180.63636363636363, "completions/min_length": 0.0, - "completions/min_terminated_length": 8.0, - "epoch": 0.556, + "completions/min_terminated_length": 75.0, + "epoch": 0.278, "format_failures": 0.0, - "grad_norm": 2.3204505443573, - "kl": 0.11221980676054955, + "grad_norm": 3.9986395835876465, + "kl": 2.975656658411026, "learning_rate": 1e-06, - "loss": -0.2757, - "num_tokens": 1405448.0, - "reward": 0.875, - "reward_std": 0.3535533845424652, + "loss": -0.0003, + "num_tokens": 2437320.0, + "reward": 0.7277778387069702, + "reward_std": 0.4172621965408325, "step": 139 }, { @@ -3351,20 +3351,20 @@ "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, - "completions/clipped_ratio": 0.625, + "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, - "completions/mean_length": 20.625, - "completions/mean_terminated_length": 55.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 54.75, "completions/min_length": 0.0, - "completions/min_terminated_length": 55.0, - "epoch": 0.56, + "completions/min_terminated_length": 53.0, + "epoch": 0.28, "format_failures": 0.0, - "grad_norm": 0.12126144766807556, - "kl": 0.013866727240383625, + "grad_norm": 0.04945458099246025, + "kl": 0.008955058641731739, "learning_rate": 1e-06, - "loss": 0.0007, - "num_tokens": 1413312.0, + "loss": 0.0002, + "num_tokens": 2449116.0, "reward": 1.0, "reward_std": 0.0, "step": 140 @@ -3372,8 +3372,8 @@ ], "logging_steps": 1, "max_steps": 1000, - "num_input_tokens_seen": 1413312, - "num_train_epochs": 4, + "num_input_tokens_seen": 2449116, + "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { @@ -3388,7 +3388,7 @@ } }, "total_flos": 0.0, - "train_batch_size": 2, + "train_batch_size": 1, "trial_name": null, "trial_params": null }