diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3394 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.56, + "eval_steps": 500, + "global_step": 140, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1261.0, + "completions/max_terminated_length": 1261.0, + "completions/mean_length": 411.5, + "completions/mean_terminated_length": 470.2857142857143, + "completions/min_length": 0.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.004, + "format_failures": 0.0, + "grad_norm": 0.3164481222629547, + "kl": 0.0, + "learning_rate": 0.0, + "loss": 0.0574, + "num_tokens": 20912.0, + "reward": 0.10000000149011612, + "reward_std": 0.19272480905056, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 93.625, + "completions/mean_terminated_length": 107.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.008, + "format_failures": 0.0, + "grad_norm": 3.300063133239746, + "kl": 0.0, + "learning_rate": 1e-06, + "loss": -0.032, + "num_tokens": 28472.0, + "reward": 0.5, + "reward_std": 0.5345224738121033, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 170.375, + "completions/mean_terminated_length": 194.71428571428572, + "completions/min_length": 0.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.012, + "format_failures": 0.0, + "grad_norm": 0.426563024520874, + "kl": 0.19075269997119904, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 38272.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 215.75, + "completions/mean_terminated_length": 246.57142857142858, + "completions/min_length": 0.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.016, + "format_failures": 1.0, + "grad_norm": 0.3638526201248169, + "kl": 0.0030522841261699796, + "learning_rate": 1e-06, + "loss": 0.0265, + "num_tokens": 44880.0, + "reward": 0.17291666567325592, + "reward_std": 0.16665178537368774, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 88.625, + "completions/mean_terminated_length": 101.28571428571429, + "completions/min_length": 0.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.02, + "format_failures": 1.0, + "grad_norm": 12.54277515411377, + "kl": 1.5523776412010193, + "learning_rate": 1e-06, + "loss": 0.0192, + "num_tokens": 54104.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 310.57142857142856, + "completions/min_length": 0.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.024, + "format_failures": 0.0, + "grad_norm": 0.432476669549942, + "kl": 0.0021984531776979566, + "learning_rate": 1e-06, + "loss": -0.088, + "num_tokens": 66888.0, + "reward": 0.2569444477558136, + "reward_std": 0.27688348293304443, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 82.625, + "completions/mean_terminated_length": 94.42857142857143, + "completions/min_length": 0.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.028, + "format_failures": 0.0, + "grad_norm": 0.0007834668504074216, + "kl": 0.000487034791149199, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 87976.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 111.75, + "completions/mean_terminated_length": 127.71428571428571, + "completions/min_length": 0.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.032, + "format_failures": 0.0, + "grad_norm": 0.2904910445213318, + "kl": 0.0784255713224411, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 97376.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 297.25, + "completions/mean_terminated_length": 339.7142857142857, + "completions/min_length": 0.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.036, + "format_failures": 0.0, + "grad_norm": 0.5291862487792969, + "kl": 0.006049621384590864, + "learning_rate": 1e-06, + "loss": 0.046, + "num_tokens": 110264.0, + "reward": 0.2834821343421936, + "reward_std": 0.3961408734321594, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 37.625, + "completions/mean_terminated_length": 50.166666666666664, + "completions/min_length": 0.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.04, + "format_failures": 0.0, + "grad_norm": 1.7151610851287842, + "kl": 0.2360311597585678, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 115504.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 109.5, + "completions/mean_terminated_length": 125.14285714285714, + "completions/min_length": 0.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.044, + "format_failures": 0.0, + "grad_norm": 1.5465294122695923, + "kl": 0.01262557739391923, + "learning_rate": 1e-06, + "loss": 0.1145, + "num_tokens": 125936.0, + "reward": 0.6499999761581421, + "reward_std": 0.4869731664657593, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 278.625, + "completions/mean_terminated_length": 318.42857142857144, + "completions/min_length": 0.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.048, + "format_failures": 0.0, + "grad_norm": 0.49245283007621765, + "kl": 0.01833944395184517, + "learning_rate": 1e-06, + "loss": 0.0385, + "num_tokens": 134920.0, + "reward": 0.543181836605072, + "reward_std": 0.3499283194541931, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 39.5, + "completions/mean_terminated_length": 45.142857142857146, + "completions/min_length": 0.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.052, + "format_failures": 0.0, + "grad_norm": 0.0073767416179180145, + "kl": 0.0018603539792820811, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 155632.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1250.0, + "completions/max_terminated_length": 1250.0, + "completions/mean_length": 381.25, + "completions/mean_terminated_length": 435.7142857142857, + "completions/min_length": 0.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.056, + "format_failures": 0.0, + "grad_norm": 0.4092012047767639, + "kl": 0.0037365095922723413, + "learning_rate": 1e-06, + "loss": 0.0532, + "num_tokens": 178856.0, + "reward": 0.08141025900840759, + "reward_std": 0.17304366827011108, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 282.25, + "completions/mean_terminated_length": 322.57142857142856, + "completions/min_length": 0.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.06, + "format_failures": 0.0, + "grad_norm": 0.4555729627609253, + "kl": 0.03388933837413788, + "learning_rate": 1e-06, + "loss": -0.0102, + "num_tokens": 189944.0, + "reward": 0.4369778633117676, + "reward_std": 0.3217828869819641, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 87.625, + "completions/mean_terminated_length": 140.2, + "completions/min_length": 0.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.064, + "format_failures": 0.0, + "grad_norm": 8.791972160339355, + "kl": 1.302387694362551, + "learning_rate": 1e-06, + "loss": 0.0127, + "num_tokens": 197912.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 260.0, + "completions/mean_terminated_length": 297.14285714285717, + "completions/min_length": 0.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.068, + "format_failures": 0.0, + "grad_norm": 0.5435929298400879, + "kl": 0.016751494258642197, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 207184.0, + "reward": 0.0833333358168602, + "reward_std": 0.2357022762298584, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 49.5, + "completions/mean_terminated_length": 56.57142857142857, + "completions/min_length": 0.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.072, + "format_failures": 0.0, + "grad_norm": 0.19489726424217224, + "kl": 0.061227064579725266, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 211464.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 126.375, + "completions/mean_terminated_length": 144.42857142857142, + "completions/min_length": 0.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.076, + "format_failures": 0.0, + "grad_norm": 0.6644909381866455, + "kl": 0.010538576170802116, + "learning_rate": 1e-06, + "loss": -0.0572, + "num_tokens": 228504.0, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 92.0, + "completions/max_terminated_length": 92.0, + "completions/mean_length": 66.375, + "completions/mean_terminated_length": 75.85714285714286, + "completions/min_length": 0.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.08, + "format_failures": 0.0, + "grad_norm": 1.534692645072937, + "kl": 0.03320205491036177, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 236336.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 95.75, + "completions/mean_terminated_length": 109.42857142857143, + "completions/min_length": 0.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.084, + "format_failures": 0.0, + "grad_norm": 1.6011440753936768, + "kl": 0.028395552188158035, + "learning_rate": 1e-06, + "loss": 0.0329, + "num_tokens": 243416.0, + "reward": 0.28125, + "reward_std": 0.45193037390708923, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 157.0, + "completions/mean_terminated_length": 179.42857142857142, + "completions/min_length": 0.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.088, + "format_failures": 0.0, + "grad_norm": 0.8286527991294861, + "kl": 0.05863172188401222, + "learning_rate": 1e-06, + "loss": -0.0402, + "num_tokens": 251376.0, + "reward": 0.1830357164144516, + "reward_std": 0.28149792551994324, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 293.625, + "completions/mean_terminated_length": 335.57142857142856, + "completions/min_length": 0.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.092, + "format_failures": 0.0, + "grad_norm": 0.3510456383228302, + "kl": 0.04268372617661953, + "learning_rate": 1e-06, + "loss": -0.0068, + "num_tokens": 262824.0, + "reward": 0.29113247990608215, + "reward_std": 0.2665640711784363, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 276.25, + "completions/mean_terminated_length": 315.7142857142857, + "completions/min_length": 0.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.096, + "format_failures": 0.0, + "grad_norm": 0.6299352645874023, + "kl": 0.09684642031788826, + "learning_rate": 1e-06, + "loss": 0.0879, + "num_tokens": 273192.0, + "reward": 0.45770204067230225, + "reward_std": 0.4340135455131531, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 123.125, + "completions/mean_terminated_length": 140.71428571428572, + "completions/min_length": 0.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.1, + "format_failures": 0.0, + "grad_norm": 1.0716724395751953, + "kl": 0.08026151731610298, + "learning_rate": 1e-06, + "loss": 0.0274, + "num_tokens": 279688.0, + "reward": 0.5770493149757385, + "reward_std": 0.2756548523902893, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 241.0, + "completions/mean_terminated_length": 275.42857142857144, + "completions/min_length": 0.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.104, + "format_failures": 0.0, + "grad_norm": 0.46685901284217834, + "kl": 0.06300827860832214, + "learning_rate": 1e-06, + "loss": -0.0169, + "num_tokens": 288160.0, + "reward": 0.4475490152835846, + "reward_std": 0.30980169773101807, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 147.0, + "completions/max_terminated_length": 147.0, + "completions/mean_length": 79.75, + "completions/mean_terminated_length": 91.14285714285714, + "completions/min_length": 0.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.108, + "format_failures": 0.0, + "grad_norm": 1.3687446117401123, + "kl": 0.04298516921699047, + "learning_rate": 1e-06, + "loss": -0.0143, + "num_tokens": 293328.0, + "reward": 0.2083333432674408, + "reward_std": 0.39591163396835327, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 93.75, + "completions/mean_terminated_length": 107.14285714285714, + "completions/min_length": 0.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.112, + "format_failures": 0.0, + "grad_norm": 2.618457794189453, + "kl": 0.7109708972275257, + "learning_rate": 1e-06, + "loss": 0.008, + "num_tokens": 301896.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1037.0, + "completions/max_terminated_length": 1037.0, + "completions/mean_length": 347.0, + "completions/mean_terminated_length": 396.57142857142856, + "completions/min_length": 0.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.116, + "format_failures": 0.0, + "grad_norm": 0.8099527955055237, + "kl": 0.004772833781316876, + "learning_rate": 1e-06, + "loss": 0.1672, + "num_tokens": 323048.0, + "reward": 0.3559523820877075, + "reward_std": 0.38564079999923706, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 144.5, + "completions/mean_terminated_length": 165.14285714285714, + "completions/min_length": 0.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.12, + "format_failures": 0.0, + "grad_norm": 0.3579946756362915, + "kl": 0.02128867618739605, + "learning_rate": 1e-06, + "loss": -0.0092, + "num_tokens": 329880.0, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 211.375, + "completions/mean_terminated_length": 241.57142857142858, + "completions/min_length": 0.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.124, + "format_failures": 0.0, + "grad_norm": 1.7018321752548218, + "kl": 0.014225509017705917, + "learning_rate": 1e-06, + "loss": -0.7754, + "num_tokens": 352120.0, + "reward": 0.2201923131942749, + "reward_std": 0.4099963307380676, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1071.0, + "completions/max_terminated_length": 1071.0, + "completions/mean_length": 291.375, + "completions/mean_terminated_length": 333.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.128, + "format_failures": 0.0, + "grad_norm": 0.8998605608940125, + "kl": 0.0065889437682926655, + "learning_rate": 1e-06, + "loss": 0.4427, + "num_tokens": 373128.0, + "reward": 0.3992924690246582, + "reward_std": 0.34711551666259766, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 47.125, + "completions/mean_terminated_length": 53.857142857142854, + "completions/min_length": 0.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.132, + "format_failures": 0.0, + "grad_norm": 0.3835119307041168, + "kl": 0.1347430720925331, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 379176.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2049.0, + "completions/max_terminated_length": 2049.0, + "completions/mean_length": 523.25, + "completions/mean_terminated_length": 598.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.136, + "format_failures": 0.0, + "grad_norm": 0.5971964597702026, + "kl": 0.013881782768294215, + "learning_rate": 1e-06, + "loss": 0.3237, + "num_tokens": 400040.0, + "reward": 0.36666667461395264, + "reward_std": 0.4086368978023529, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 70.0, + "completions/mean_terminated_length": 80.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.14, + "format_failures": 0.0, + "grad_norm": 1.4775596857070923, + "kl": 0.1210218146443367, + "learning_rate": 1e-06, + "loss": 0.0368, + "num_tokens": 404552.0, + "reward": 0.4513888955116272, + "reward_std": 0.4428916275501251, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 112.75, + "completions/mean_terminated_length": 128.85714285714286, + "completions/min_length": 0.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.144, + "format_failures": 0.0, + "grad_norm": 0.09341330826282501, + "kl": 0.08837828040122986, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 411528.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 193.375, + "completions/mean_terminated_length": 221.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.148, + "format_failures": 0.0, + "grad_norm": 0.7524359822273254, + "kl": 0.04169362783432007, + "learning_rate": 1e-06, + "loss": 0.0556, + "num_tokens": 419160.0, + "reward": 0.5170454382896423, + "reward_std": 0.3414821922779083, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 53.375, + "completions/mean_terminated_length": 61.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.152, + "format_failures": 0.0, + "grad_norm": 0.3160454034805298, + "kl": 0.07513360120356083, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 423768.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 301.125, + "completions/mean_terminated_length": 344.14285714285717, + "completions/min_length": 0.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.156, + "format_failures": 0.0, + "grad_norm": 1.2346574068069458, + "kl": 0.27855822443962097, + "learning_rate": 1e-06, + "loss": -0.0911, + "num_tokens": 435552.0, + "reward": 0.5062500238418579, + "reward_std": 0.43001821637153625, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 983.0, + "completions/max_terminated_length": 983.0, + "completions/mean_length": 453.0, + "completions/mean_terminated_length": 517.7142857142857, + "completions/min_length": 0.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.16, + "format_failures": 0.0, + "grad_norm": 0.14271600544452667, + "kl": 0.011493591591715813, + "learning_rate": 1e-06, + "loss": -0.0636, + "num_tokens": 460960.0, + "reward": 0.5462301969528198, + "reward_std": 0.12065710872411728, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 106.0, + "completions/max_terminated_length": 106.0, + "completions/mean_length": 61.75, + "completions/mean_terminated_length": 70.57142857142857, + "completions/min_length": 0.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.164, + "format_failures": 0.0, + "grad_norm": 3.143950939178467, + "kl": 0.05912626534700394, + "learning_rate": 1e-06, + "loss": -0.0781, + "num_tokens": 469456.0, + "reward": 0.32499998807907104, + "reward_std": 0.46521884202957153, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 81.5, + "completions/mean_terminated_length": 93.14285714285714, + "completions/min_length": 0.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.168, + "format_failures": 0.0, + "grad_norm": 0.4773140251636505, + "kl": 0.2549777179956436, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 475288.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 111.375, + "completions/mean_terminated_length": 127.28571428571429, + "completions/min_length": 0.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.172, + "format_failures": 0.0, + "grad_norm": 3.081820487976074, + "kl": 0.05859908275306225, + "learning_rate": 1e-06, + "loss": -0.132, + "num_tokens": 482848.0, + "reward": 0.375, + "reward_std": 0.5175491571426392, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 45.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.176, + "format_failures": 1.0, + "grad_norm": 0.5130624175071716, + "kl": 0.034313835203647614, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 488336.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 145.0, + "completions/max_terminated_length": 145.0, + "completions/mean_length": 80.125, + "completions/mean_terminated_length": 91.57142857142857, + "completions/min_length": 0.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.18, + "format_failures": 0.0, + "grad_norm": 4.689250946044922, + "kl": 0.8184864521026611, + "learning_rate": 1e-06, + "loss": 0.0107, + "num_tokens": 497608.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 94.75, + "completions/mean_terminated_length": 108.28571428571429, + "completions/min_length": 0.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.184, + "format_failures": 0.0, + "grad_norm": 1.715865135192871, + "kl": 0.04798049572855234, + "learning_rate": 1e-06, + "loss": 0.0176, + "num_tokens": 506248.0, + "reward": 0.3333333432674408, + "reward_std": 0.35634833574295044, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 235.75, + "completions/mean_terminated_length": 269.42857142857144, + "completions/min_length": 0.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.188, + "format_failures": 0.0, + "grad_norm": 0.770552933216095, + "kl": 0.026089726015925407, + "learning_rate": 1e-06, + "loss": -0.1374, + "num_tokens": 526432.0, + "reward": 0.615674614906311, + "reward_std": 0.2741951644420624, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 131.875, + "completions/mean_terminated_length": 150.71428571428572, + "completions/min_length": 0.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.192, + "format_failures": 0.0, + "grad_norm": 0.5728135108947754, + "kl": 0.08094584196805954, + "learning_rate": 1e-06, + "loss": -0.0257, + "num_tokens": 534408.0, + "reward": 0.09375, + "reward_std": 0.2651650309562683, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 116.25, + "completions/mean_terminated_length": 132.85714285714286, + "completions/min_length": 0.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.196, + "format_failures": 1.0, + "grad_norm": 1.1056642532348633, + "kl": 0.5166730880737305, + "learning_rate": 1e-06, + "loss": 0.0039, + "num_tokens": 542224.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 102.375, + "completions/mean_terminated_length": 117.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.2, + "format_failures": 0.0, + "grad_norm": 2.430076837539673, + "kl": 0.04860229790210724, + "learning_rate": 1e-06, + "loss": 1.0757, + "num_tokens": 557760.0, + "reward": 0.7159091234207153, + "reward_std": 0.41142913699150085, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 48.25, + "completions/mean_terminated_length": 64.33333333333333, + "completions/min_length": 0.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.204, + "format_failures": 0.0, + "grad_norm": 62.65098190307617, + "kl": 8.405316352844238, + "learning_rate": 1e-06, + "loss": 0.0048, + "num_tokens": 565040.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2051.0, + "completions/max_terminated_length": 2051.0, + "completions/mean_length": 493.75, + "completions/mean_terminated_length": 564.2857142857143, + "completions/min_length": 0.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.208, + "format_failures": 0.0, + "grad_norm": 1.2885843515396118, + "kl": 0.019100312143564224, + "learning_rate": 1e-06, + "loss": -0.1233, + "num_tokens": 587416.0, + "reward": 0.22045454382896423, + "reward_std": 0.5661183595657349, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 195.375, + "completions/mean_terminated_length": 223.28571428571428, + "completions/min_length": 0.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.212, + "format_failures": 0.0, + "grad_norm": 1.2086297273635864, + "kl": 0.17173044383525848, + "learning_rate": 1e-06, + "loss": -0.0158, + "num_tokens": 598440.0, + "reward": 0.25, + "reward_std": 0.4629100561141968, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 309.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.216, + "format_failures": 0.0, + "grad_norm": 0.35032373666763306, + "kl": 0.040101515129208565, + "learning_rate": 1e-06, + "loss": 0.0421, + "num_tokens": 608128.0, + "reward": 0.5280122756958008, + "reward_std": 0.23830601572990417, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 57.5, + "completions/mean_terminated_length": 65.71428571428571, + "completions/min_length": 0.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.22, + "format_failures": 0.0, + "grad_norm": 1.1529607772827148, + "kl": 0.18720119446516037, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 614320.0, + "reward": 0.1875, + "reward_std": 0.1157275140285492, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 160.625, + "completions/mean_terminated_length": 183.57142857142858, + "completions/min_length": 0.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.224, + "format_failures": 0.0, + "grad_norm": 1.1264208555221558, + "kl": 0.08833763748407364, + "learning_rate": 1e-06, + "loss": 0.0454, + "num_tokens": 623424.0, + "reward": 0.2916666567325592, + "reward_std": 0.4520675837993622, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 184.625, + "completions/mean_terminated_length": 211.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.228, + "format_failures": 0.0, + "grad_norm": 0.03325602412223816, + "kl": 0.04312510974705219, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 631976.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 95.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 69.5, + "completions/mean_terminated_length": 79.42857142857143, + "completions/min_length": 0.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.232, + "format_failures": 0.0, + "grad_norm": 0.04106176272034645, + "kl": 0.026903850957751274, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 637776.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 273.5, + "completions/mean_terminated_length": 312.57142857142856, + "completions/min_length": 0.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.236, + "format_failures": 0.0, + "grad_norm": 0.11601117998361588, + "kl": 0.0859757624566555, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 650616.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 298.75, + "completions/mean_terminated_length": 341.42857142857144, + "completions/min_length": 0.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.24, + "format_failures": 0.0, + "grad_norm": 0.3067856729030609, + "kl": 0.1856345497071743, + "learning_rate": 1e-06, + "loss": 0.033, + "num_tokens": 659168.0, + "reward": 0.5159090757369995, + "reward_std": 0.1970113068819046, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 58.75, + "completions/mean_terminated_length": 67.14285714285714, + "completions/min_length": 0.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.244, + "format_failures": 0.0, + "grad_norm": 1.8961628675460815, + "kl": 0.0375029481947422, + "learning_rate": 1e-06, + "loss": -0.0261, + "num_tokens": 663984.0, + "reward": 0.8500000238418579, + "reward_std": 0.3505098223686218, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 251.625, + "completions/mean_terminated_length": 287.57142857142856, + "completions/min_length": 0.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.248, + "format_failures": 0.0, + "grad_norm": 0.2780621349811554, + "kl": 0.05490433797240257, + "learning_rate": 1e-06, + "loss": 0.0335, + "num_tokens": 672792.0, + "reward": 0.5874999761581421, + "reward_std": 0.16226325929164886, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 208.25, + "completions/mean_terminated_length": 238.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.252, + "format_failures": 0.0, + "grad_norm": 1.4749125242233276, + "kl": 0.11917952820658684, + "learning_rate": 1e-06, + "loss": 0.5038, + "num_tokens": 692808.0, + "reward": 0.5503472089767456, + "reward_std": 0.4739660620689392, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 121.625, + "completions/mean_terminated_length": 139.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.256, + "format_failures": 0.0, + "grad_norm": 0.9009966850280762, + "kl": 0.04829751141369343, + "learning_rate": 1e-06, + "loss": -0.0333, + "num_tokens": 713192.0, + "reward": 0.4819444417953491, + "reward_std": 0.13385315239429474, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 202.875, + "completions/mean_terminated_length": 231.85714285714286, + "completions/min_length": 0.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.26, + "format_failures": 0.0, + "grad_norm": 0.6346305012702942, + "kl": 0.08024599775671959, + "learning_rate": 1e-06, + "loss": 0.067, + "num_tokens": 720472.0, + "reward": 0.25189393758773804, + "reward_std": 0.2742690443992615, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 527.0, + "completions/max_terminated_length": 527.0, + "completions/mean_length": 361.875, + "completions/mean_terminated_length": 413.57142857142856, + "completions/min_length": 0.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.264, + "format_failures": 0.0, + "grad_norm": 0.3846381604671478, + "kl": 0.03228219784796238, + "learning_rate": 1e-06, + "loss": 0.0465, + "num_tokens": 732112.0, + "reward": 0.3159722089767456, + "reward_std": 0.2696826457977295, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 350.25, + "completions/mean_terminated_length": 400.2857142857143, + "completions/min_length": 0.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.268, + "format_failures": 0.0, + "grad_norm": 0.2731687128543854, + "kl": 0.02399719413369894, + "learning_rate": 1e-06, + "loss": 0.0932, + "num_tokens": 752768.0, + "reward": 0.48750001192092896, + "reward_std": 0.17878557741641998, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 57.5, + "completions/mean_terminated_length": 65.71428571428571, + "completions/min_length": 0.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.272, + "format_failures": 0.0, + "grad_norm": 0.5548056960105896, + "kl": 0.3331392854452133, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 758312.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 96.0, + "completions/mean_terminated_length": 109.71428571428571, + "completions/min_length": 0.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.276, + "format_failures": 0.0, + "grad_norm": 5.602732181549072, + "kl": 1.5559703707695007, + "learning_rate": 1e-06, + "loss": -0.0333, + "num_tokens": 765560.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 288.25, + "completions/mean_terminated_length": 329.42857142857144, + "completions/min_length": 0.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.28, + "format_failures": 0.0, + "grad_norm": 0.28595268726348877, + "kl": 0.05494564212858677, + "learning_rate": 1e-06, + "loss": 0.0934, + "num_tokens": 782656.0, + "reward": 0.2406907081604004, + "reward_std": 0.2288402020931244, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 306.375, + "completions/mean_terminated_length": 350.14285714285717, + "completions/min_length": 0.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.284, + "format_failures": 0.0, + "grad_norm": 0.4375990033149719, + "kl": 0.15084227919578552, + "learning_rate": 1e-06, + "loss": -0.0137, + "num_tokens": 792064.0, + "reward": 0.6625000238418579, + "reward_std": 0.31139087677001953, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 126.0, + "completions/max_terminated_length": 126.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 126.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.288, + "format_failures": 0.0, + "grad_norm": 4.322110652923584, + "kl": 0.025789054110646248, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 800160.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 276.75, + "completions/mean_terminated_length": 316.2857142857143, + "completions/min_length": 0.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.292, + "format_failures": 0.0, + "grad_norm": 0.6268705725669861, + "kl": 0.08498941920697689, + "learning_rate": 1e-06, + "loss": -0.0614, + "num_tokens": 810104.0, + "reward": 0.41130954027175903, + "reward_std": 0.3625659644603729, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 221.125, + "completions/mean_terminated_length": 252.71428571428572, + "completions/min_length": 0.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.296, + "format_failures": 0.0, + "grad_norm": 0.8162993788719177, + "kl": 0.0225818594917655, + "learning_rate": 1e-06, + "loss": 0.0523, + "num_tokens": 822920.0, + "reward": 0.7083333730697632, + "reward_std": 0.4520675837993622, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 249.625, + "completions/mean_terminated_length": 285.2857142857143, + "completions/min_length": 0.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.3, + "format_failures": 0.0, + "grad_norm": 0.3638235032558441, + "kl": 0.10483588464558125, + "learning_rate": 1e-06, + "loss": 0.093, + "num_tokens": 833608.0, + "reward": 0.2819444537162781, + "reward_std": 0.2347228229045868, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 92.75, + "completions/mean_terminated_length": 106.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.304, + "format_failures": 0.0, + "grad_norm": 0.005022191442549229, + "kl": 0.01964521873742342, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 855560.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1057.0, + "completions/max_terminated_length": 1057.0, + "completions/mean_length": 300.25, + "completions/mean_terminated_length": 343.14285714285717, + "completions/min_length": 0.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.308, + "format_failures": 0.0, + "grad_norm": 0.6531589031219482, + "kl": 0.1464357189834118, + "learning_rate": 1e-06, + "loss": -0.0231, + "num_tokens": 873712.0, + "reward": 0.24836310744285583, + "reward_std": 0.23662379384040833, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 309.85714285714283, + "completions/min_length": 0.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.312, + "format_failures": 0.0, + "grad_norm": 0.585955023765564, + "kl": 0.0404690857976675, + "learning_rate": 1e-06, + "loss": 0.0946, + "num_tokens": 882408.0, + "reward": 0.6073564291000366, + "reward_std": 0.39037758111953735, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 75.25, + "completions/mean_terminated_length": 150.5, + "completions/min_length": 0.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.316, + "format_failures": 0.0, + "grad_norm": 4.991185188293457, + "kl": 0.13191331177949905, + "learning_rate": 1e-06, + "loss": -0.1159, + "num_tokens": 891768.0, + "reward": 0.5, + "reward_std": 0.5345224738121033, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 244.0, + "completions/mean_terminated_length": 278.85714285714283, + "completions/min_length": 0.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.32, + "format_failures": 0.0, + "grad_norm": 1.556532621383667, + "kl": 0.30473417043685913, + "learning_rate": 1e-06, + "loss": -0.0131, + "num_tokens": 900480.0, + "reward": 0.375, + "reward_std": 0.5175491571426392, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 42.57142857142857, + "completions/min_length": 0.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.324, + "format_failures": 0.0, + "grad_norm": 6.271825790405273, + "kl": 1.4292120337486267, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 905384.0, + "reward": 0.09375, + "reward_std": 0.2651650309562683, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 106.375, + "completions/mean_terminated_length": 121.57142857142857, + "completions/min_length": 0.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.328, + "format_failures": 0.0, + "grad_norm": 1.0847584009170532, + "kl": 0.4334397315979004, + "learning_rate": 1e-06, + "loss": 0.0061, + "num_tokens": 912488.0, + "reward": 0.1875, + "reward_std": 0.2642374634742737, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 445.625, + "completions/mean_terminated_length": 509.2857142857143, + "completions/min_length": 0.0, + "completions/min_terminated_length": 482.0, + "epoch": 0.332, + "format_failures": 0.0, + "grad_norm": 0.3242776393890381, + "kl": 0.028012586757540703, + "learning_rate": 1e-06, + "loss": -0.0109, + "num_tokens": 927144.0, + "reward": 0.6354166865348816, + "reward_std": 0.41770702600479126, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 80.75, + "completions/mean_terminated_length": 92.28571428571429, + "completions/min_length": 0.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.336, + "format_failures": 0.0, + "grad_norm": 2.415727376937866, + "kl": 1.3026588559150696, + "learning_rate": 1e-06, + "loss": -0.0519, + "num_tokens": 933024.0, + "reward": 0.2708333432674408, + "reward_std": 0.39778655767440796, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 287.25, + "completions/mean_terminated_length": 328.2857142857143, + "completions/min_length": 0.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.34, + "format_failures": 0.0, + "grad_norm": 0.5015918612480164, + "kl": 0.07602308504283428, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 943576.0, + "reward": 0.35555556416511536, + "reward_std": 0.330837219953537, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 147.0, + "completions/max_terminated_length": 147.0, + "completions/mean_length": 92.875, + "completions/mean_terminated_length": 106.14285714285714, + "completions/min_length": 0.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.344, + "format_failures": 0.0, + "grad_norm": 3162.383056640625, + "kl": 592.9862050414085, + "learning_rate": 1e-06, + "loss": 4.4073, + "num_tokens": 950272.0, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 78.875, + "completions/mean_terminated_length": 157.75, + "completions/min_length": 0.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.348, + "format_failures": 0.0, + "grad_norm": 5.812924385070801, + "kl": 0.03395126201212406, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 956992.0, + "reward": 0.625, + "reward_std": 0.5175491571426392, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 85.5, + "completions/mean_terminated_length": 97.71428571428571, + "completions/min_length": 0.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.352, + "format_failures": 0.0, + "grad_norm": 1.021047830581665, + "kl": 0.18108929693698883, + "learning_rate": 1e-06, + "loss": 0.0333, + "num_tokens": 962232.0, + "reward": 0.5613095164299011, + "reward_std": 0.23917356133460999, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 119.625, + "completions/mean_terminated_length": 136.71428571428572, + "completions/min_length": 0.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.356, + "format_failures": 0.0, + "grad_norm": 1.7294161319732666, + "kl": 0.41833993047475815, + "learning_rate": 1e-06, + "loss": 0.1262, + "num_tokens": 975584.0, + "reward": 0.109375, + "reward_std": 0.14250017702579498, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 138.0, + "completions/mean_terminated_length": 157.71428571428572, + "completions/min_length": 0.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.36, + "format_failures": 0.0, + "grad_norm": 64.84632873535156, + "kl": 29.09031867980957, + "learning_rate": 1e-06, + "loss": 0.188, + "num_tokens": 981256.0, + "reward": 0.7403273582458496, + "reward_std": 0.17907913029193878, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 91.125, + "completions/mean_terminated_length": 121.5, + "completions/min_length": 0.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.364, + "format_failures": 0.0, + "grad_norm": 84280.7421875, + "kl": 6375.002594873309, + "learning_rate": 1e-06, + "loss": 96.8935, + "num_tokens": 992408.0, + "reward": 0.1666666716337204, + "reward_std": 0.35634833574295044, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 301.625, + "completions/mean_terminated_length": 344.7142857142857, + "completions/min_length": 0.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.368, + "format_failures": 0.0, + "grad_norm": 1.2784438133239746, + "kl": 0.7278856039047241, + "learning_rate": 1e-06, + "loss": 0.1895, + "num_tokens": 1010848.0, + "reward": 0.39940476417541504, + "reward_std": 0.3344242572784424, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 275.375, + "completions/mean_terminated_length": 314.7142857142857, + "completions/min_length": 0.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.372, + "format_failures": 0.0, + "grad_norm": 0.2864258289337158, + "kl": 0.08188853040337563, + "learning_rate": 1e-06, + "loss": 0.0434, + "num_tokens": 1020488.0, + "reward": 0.5244791507720947, + "reward_std": 0.16294103860855103, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 105.0, + "completions/max_terminated_length": 105.0, + "completions/mean_length": 58.25, + "completions/mean_terminated_length": 66.57142857142857, + "completions/min_length": 0.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.376, + "format_failures": 0.0, + "grad_norm": 22.611703872680664, + "kl": 1.769313856959343, + "learning_rate": 1e-06, + "loss": 0.0731, + "num_tokens": 1028496.0, + "reward": 0.5, + "reward_std": 0.37796446681022644, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 136.625, + "completions/mean_terminated_length": 156.14285714285714, + "completions/min_length": 0.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.38, + "format_failures": 0.0, + "grad_norm": 3.4661715030670166, + "kl": 0.3033728860318661, + "learning_rate": 1e-06, + "loss": 0.0753, + "num_tokens": 1038480.0, + "reward": 0.4479166567325592, + "reward_std": 0.41052013635635376, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 109.75, + "completions/mean_terminated_length": 125.42857142857143, + "completions/min_length": 0.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.384, + "format_failures": 0.0, + "grad_norm": 2.9459471702575684, + "kl": 0.8582945615053177, + "learning_rate": 1e-06, + "loss": -0.0371, + "num_tokens": 1048240.0, + "reward": 0.6180555820465088, + "reward_std": 0.42537203431129456, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 76.0, + "completions/max_terminated_length": 76.0, + "completions/mean_length": 64.0, + "completions/mean_terminated_length": 73.14285714285714, + "completions/min_length": 0.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.388, + "format_failures": 0.0, + "grad_norm": 0.43100497126579285, + "kl": 0.06553871184587479, + "learning_rate": 1e-06, + "loss": 0.0368, + "num_tokens": 1054728.0, + "reward": 0.9642857313156128, + "reward_std": 0.10101523995399475, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 40.25, + "completions/mean_terminated_length": 46.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.392, + "format_failures": 0.0, + "grad_norm": 6.787167072296143, + "kl": 1.8237296342849731, + "learning_rate": 1e-06, + "loss": 0.0307, + "num_tokens": 1060304.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 179.125, + "completions/mean_terminated_length": 204.71428571428572, + "completions/min_length": 0.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.396, + "format_failures": 0.0, + "grad_norm": 1.4021135568618774, + "kl": 0.06424028240144253, + "learning_rate": 1e-06, + "loss": -0.0136, + "num_tokens": 1082672.0, + "reward": 0.490579217672348, + "reward_std": 0.34001559019088745, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 52.5, + "completions/mean_terminated_length": 140.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.4, + "format_failures": 0.0, + "grad_norm": 4.928287982940674, + "kl": 0.6296049430966377, + "learning_rate": 1e-06, + "loss": -0.2632, + "num_tokens": 1090648.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 104.0, + "completions/max_terminated_length": 104.0, + "completions/mean_length": 73.125, + "completions/mean_terminated_length": 83.57142857142857, + "completions/min_length": 0.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.404, + "format_failures": 1.0, + "grad_norm": 1.0927364826202393, + "kl": 0.4457448348402977, + "learning_rate": 1e-06, + "loss": 0.0054, + "num_tokens": 1099240.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 100.625, + "completions/mean_terminated_length": 115.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.408, + "format_failures": 0.0, + "grad_norm": 0.3484640419483185, + "kl": 0.014615435153245926, + "learning_rate": 1e-06, + "loss": -0.0011, + "num_tokens": 1106512.0, + "reward": 0.8717262148857117, + "reward_std": 0.1315489560365677, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 120.0, + "completions/max_terminated_length": 120.0, + "completions/mean_length": 79.0, + "completions/mean_terminated_length": 90.28571428571429, + "completions/min_length": 0.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.412, + "format_failures": 0.0, + "grad_norm": 2.578859329223633, + "kl": 0.05575744202360511, + "learning_rate": 1e-06, + "loss": -0.1115, + "num_tokens": 1113224.0, + "reward": 0.53125, + "reward_std": 0.41052013635635376, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 224.125, + "completions/mean_terminated_length": 256.14285714285717, + "completions/min_length": 0.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.416, + "format_failures": 0.0, + "grad_norm": 0.1279614269733429, + "kl": 0.008564054034650326, + "learning_rate": 1e-06, + "loss": -0.0026, + "num_tokens": 1120832.0, + "reward": 0.6416666507720947, + "reward_std": 0.08864051848649979, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 120.75, + "completions/mean_terminated_length": 138.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.42, + "format_failures": 0.0, + "grad_norm": 2.8721704483032227, + "kl": 0.028846602886915207, + "learning_rate": 1e-06, + "loss": 0.1163, + "num_tokens": 1129032.0, + "reward": 0.5833333730697632, + "reward_std": 0.49601587653160095, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 92.0, + "completions/max_terminated_length": 92.0, + "completions/mean_length": 61.0, + "completions/mean_terminated_length": 69.71428571428571, + "completions/min_length": 0.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.424, + "format_failures": 0.0, + "grad_norm": 0.4012051820755005, + "kl": 0.13534526526927948, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 1137584.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 169.0, + "completions/mean_terminated_length": 193.14285714285714, + "completions/min_length": 0.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.428, + "format_failures": 0.0, + "grad_norm": 0.34922441840171814, + "kl": 0.014531925320625305, + "learning_rate": 1e-06, + "loss": 0.0412, + "num_tokens": 1144344.0, + "reward": 0.3263888955116272, + "reward_std": 0.19911068677902222, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 44.75, + "completions/mean_terminated_length": 51.142857142857146, + "completions/min_length": 0.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.432, + "format_failures": 0.0, + "grad_norm": 0.8536809682846069, + "kl": 0.01497908541932702, + "learning_rate": 1e-06, + "loss": 0.0128, + "num_tokens": 1148760.0, + "reward": 0.90625, + "reward_std": 0.2651650309562683, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 110.0, + "completions/max_terminated_length": 110.0, + "completions/mean_length": 85.375, + "completions/mean_terminated_length": 97.57142857142857, + "completions/min_length": 0.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.436, + "format_failures": 0.0, + "grad_norm": 3.196063995361328, + "kl": 0.09259714558720589, + "learning_rate": 1e-06, + "loss": -0.0254, + "num_tokens": 1154816.0, + "reward": 0.375, + "reward_std": 0.5175491571426392, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 93.125, + "completions/mean_terminated_length": 106.42857142857143, + "completions/min_length": 0.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.44, + "format_failures": 0.0, + "grad_norm": 2.7271082401275635, + "kl": 0.04449745221063495, + "learning_rate": 1e-06, + "loss": -0.0126, + "num_tokens": 1163208.0, + "reward": 0.4166666865348816, + "reward_std": 0.34503278136253357, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 147.0, + "completions/max_terminated_length": 147.0, + "completions/mean_length": 114.5, + "completions/mean_terminated_length": 130.85714285714286, + "completions/min_length": 0.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.444, + "format_failures": 0.0, + "grad_norm": 0.10579583793878555, + "kl": 0.055604600347578526, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 1174056.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 146.375, + "completions/mean_terminated_length": 167.28571428571428, + "completions/min_length": 0.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.448, + "format_failures": 0.0, + "grad_norm": 0.41507479548454285, + "kl": 0.019602006301283836, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 1181760.0, + "reward": 0.9750000238418579, + "reward_std": 0.0707106813788414, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 46.75, + "completions/mean_terminated_length": 53.42857142857143, + "completions/min_length": 0.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.452, + "format_failures": 0.0, + "grad_norm": 2.7538251876831055, + "kl": 0.05537968873977661, + "learning_rate": 1e-06, + "loss": -0.0324, + "num_tokens": 1187360.0, + "reward": 0.5416666865348816, + "reward_std": 0.5019802451133728, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 654.0, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 341.875, + "completions/mean_terminated_length": 390.7142857142857, + "completions/min_length": 0.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.456, + "format_failures": 0.0, + "grad_norm": 0.6517180800437927, + "kl": 0.01990941632539034, + "learning_rate": 1e-06, + "loss": -0.0628, + "num_tokens": 1200928.0, + "reward": 0.3812499940395355, + "reward_std": 0.4225243031978607, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 212.625, + "completions/mean_terminated_length": 283.5, + "completions/min_length": 0.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.46, + "format_failures": 0.0, + "grad_norm": 2.6183741092681885, + "kl": 0.3156433766707778, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 1209616.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 127.625, + "completions/mean_terminated_length": 145.85714285714286, + "completions/min_length": 0.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.464, + "format_failures": 0.0, + "grad_norm": 1.40922212600708, + "kl": 0.3603953216224909, + "learning_rate": 1e-06, + "loss": -0.1, + "num_tokens": 1216840.0, + "reward": 0.375, + "reward_std": 0.4520675837993622, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 172.875, + "completions/mean_terminated_length": 197.57142857142858, + "completions/min_length": 0.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.468, + "format_failures": 0.0, + "grad_norm": 0.5828225612640381, + "kl": 0.013718126341700554, + "learning_rate": 1e-06, + "loss": -0.0188, + "num_tokens": 1223464.0, + "reward": 0.3083333373069763, + "reward_std": 0.2980092763900757, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 113.0, + "completions/mean_terminated_length": 129.14285714285714, + "completions/min_length": 0.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.472, + "format_failures": 0.0, + "grad_norm": 1.907884120941162, + "kl": 0.16990539245307446, + "learning_rate": 1e-06, + "loss": 0.1637, + "num_tokens": 1231632.0, + "reward": 0.265625, + "reward_std": 0.45531338453292847, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 139.125, + "completions/mean_terminated_length": 159.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.476, + "format_failures": 0.0, + "grad_norm": 0.5671705007553101, + "kl": 0.0328083336353302, + "learning_rate": 1e-06, + "loss": 0.1641, + "num_tokens": 1242688.0, + "reward": 0.6208333373069763, + "reward_std": 0.3646862208843231, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 123.0, + "completions/max_terminated_length": 123.0, + "completions/mean_length": 74.375, + "completions/mean_terminated_length": 85.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.48, + "format_failures": 0.0, + "grad_norm": 6.129162788391113, + "kl": 2.631644606590271, + "learning_rate": 1e-06, + "loss": -0.0072, + "num_tokens": 1250712.0, + "reward": 0.0833333358168602, + "reward_std": 0.2357022762298584, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 33.25, + "completions/mean_terminated_length": 66.5, + "completions/min_length": 0.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.484, + "format_failures": 0.0, + "grad_norm": 2.2025108337402344, + "kl": 0.009498461615294218, + "learning_rate": 1e-06, + "loss": -0.1436, + "num_tokens": 1255560.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 367.875, + "completions/mean_terminated_length": 490.5, + "completions/min_length": 0.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.488, + "format_failures": 0.0, + "grad_norm": 0.6904863715171814, + "kl": 0.20124347042292356, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 1273976.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 230.25, + "completions/mean_terminated_length": 263.14285714285717, + "completions/min_length": 0.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.492, + "format_failures": 0.0, + "grad_norm": 0.813983142375946, + "kl": 0.09101713076233864, + "learning_rate": 1e-06, + "loss": 0.0777, + "num_tokens": 1282880.0, + "reward": 0.6432539820671082, + "reward_std": 0.3272421360015869, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 41.75, + "completions/mean_terminated_length": 47.714285714285715, + "completions/min_length": 0.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.496, + "format_failures": 0.0, + "grad_norm": 4.916449546813965, + "kl": 0.6664696265943348, + "learning_rate": 1e-06, + "loss": -0.0205, + "num_tokens": 1288032.0, + "reward": 0.8125, + "reward_std": 0.3720118999481201, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 111.0, + "completions/max_terminated_length": 111.0, + "completions/mean_length": 37.0, + "completions/mean_terminated_length": 98.66666666666667, + "completions/min_length": 0.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.5, + "format_failures": 0.0, + "grad_norm": 11.985437393188477, + "kl": 2.047822058200836, + "learning_rate": 1e-06, + "loss": -0.0949, + "num_tokens": 1294032.0, + "reward": 0.625, + "reward_std": 0.5175491571426392, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 139.5, + "completions/mean_terminated_length": 223.2, + "completions/min_length": 0.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.504, + "format_failures": 0.0, + "grad_norm": 0.06410921365022659, + "kl": 0.024711698293685913, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 1302656.0, + "reward": 0.0, + "reward_std": 0.0, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 50.25, + "completions/mean_terminated_length": 134.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.508, + "format_failures": 0.0, + "grad_norm": 5.750870704650879, + "kl": 0.32033737003803253, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 1311200.0, + "reward": 0.78125, + "reward_std": 0.33905068039894104, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 140.0, + "completions/mean_terminated_length": 160.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.512, + "format_failures": 0.0, + "grad_norm": 2.3594982624053955, + "kl": 0.27750419452786446, + "learning_rate": 1e-06, + "loss": 0.1238, + "num_tokens": 1319216.0, + "reward": 0.6676406860351562, + "reward_std": 0.22850170731544495, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 109.625, + "completions/mean_terminated_length": 125.28571428571429, + "completions/min_length": 0.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.516, + "format_failures": 0.0, + "grad_norm": 0.7231677174568176, + "kl": 0.06682828813791275, + "learning_rate": 1e-06, + "loss": 0.0391, + "num_tokens": 1325216.0, + "reward": 0.6453869342803955, + "reward_std": 0.17804734408855438, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 131.0, + "completions/max_terminated_length": 131.0, + "completions/mean_length": 87.25, + "completions/mean_terminated_length": 99.71428571428571, + "completions/min_length": 0.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.52, + "format_failures": 0.0, + "grad_norm": 5.297896385192871, + "kl": 0.6651033144444227, + "learning_rate": 1e-06, + "loss": 0.0091, + "num_tokens": 1330968.0, + "reward": 0.5601190328598022, + "reward_std": 0.13645371794700623, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 77.25, + "completions/mean_terminated_length": 309.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.524, + "format_failures": 0.0, + "grad_norm": 0.8595375418663025, + "kl": 0.06659615971148014, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 1339728.0, + "reward": 0.9583333730697632, + "reward_std": 0.11785111576318741, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 41.875, + "completions/mean_terminated_length": 167.5, + "completions/min_length": 0.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.528, + "format_failures": 0.0, + "grad_norm": 347.5839538574219, + "kl": 61.65154816582799, + "learning_rate": 1e-06, + "loss": 1.1776, + "num_tokens": 1346528.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 296.125, + "completions/mean_terminated_length": 338.42857142857144, + "completions/min_length": 0.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.532, + "format_failures": 0.0, + "grad_norm": 0.43614092469215393, + "kl": 0.10557529516518116, + "learning_rate": 1e-06, + "loss": -0.0366, + "num_tokens": 1355920.0, + "reward": 0.5107142925262451, + "reward_std": 0.21715763211250305, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 149.625, + "completions/mean_terminated_length": 171.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.536, + "format_failures": 0.0, + "grad_norm": 2.284273624420166, + "kl": 0.06671860627830029, + "learning_rate": 1e-06, + "loss": 0.0794, + "num_tokens": 1363104.0, + "reward": 0.663690447807312, + "reward_std": 0.2778385877609253, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 215.625, + "completions/mean_terminated_length": 246.42857142857142, + "completions/min_length": 0.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.54, + "format_failures": 0.0, + "grad_norm": 0.8071838021278381, + "kl": 0.046601174399256706, + "learning_rate": 1e-06, + "loss": -0.0342, + "num_tokens": 1370928.0, + "reward": 0.6100694537162781, + "reward_std": 0.3873949348926544, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 85.5, + "completions/mean_terminated_length": 228.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.544, + "format_failures": 0.0, + "grad_norm": 0.14378634095191956, + "kl": 0.04612975288182497, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 1380216.0, + "reward": 1.0, + "reward_std": 0.0, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 137.0, + "completions/mean_terminated_length": 156.57142857142858, + "completions/min_length": 0.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.548, + "format_failures": 0.0, + "grad_norm": 0.8862031698226929, + "kl": 0.07590018585324287, + "learning_rate": 1e-06, + "loss": -0.0377, + "num_tokens": 1386992.0, + "reward": 0.5406250357627869, + "reward_std": 0.23373425006866455, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 73.5, + "completions/mean_terminated_length": 84.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.552, + "format_failures": 0.0, + "grad_norm": 813.8618774414062, + "kl": 83.35732051730156, + "learning_rate": 1e-06, + "loss": 1.2934, + "num_tokens": 1396808.0, + "reward": 0.4833333492279053, + "reward_std": 0.4804098606109619, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 68.875, + "completions/mean_terminated_length": 137.75, + "completions/min_length": 0.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.556, + "format_failures": 0.0, + "grad_norm": 2.3204505443573, + "kl": 0.11221980676054955, + "learning_rate": 1e-06, + "loss": -0.2757, + "num_tokens": 1405448.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 20.625, + "completions/mean_terminated_length": 55.0, + "completions/min_length": 0.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.56, + "format_failures": 0.0, + "grad_norm": 0.12126144766807556, + "kl": 0.013866727240383625, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 1413312.0, + "reward": 1.0, + "reward_std": 0.0, + "step": 140 + } + ], + "logging_steps": 1, + "max_steps": 1000, + "num_input_tokens_seen": 1413312, + "num_train_epochs": 4, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}