diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14043 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.566953797963978, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1798.0, + "completions/max_terminated_length": 1798.0, + "completions/mean_length": 505.310302734375, + "completions/mean_terminated_length": 505.310302734375, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.0031323414252153485, + "frac_reward_zero_std": 0.7767857313156128, + "grad_norm": 0.20241215825080872, + "kl": 0.0, + "learning_rate": 0.0, + "loss": 0.0018, + "num_tokens": 501715.0, + "reward": 0.00580357201397419, + "reward_std": 0.011229777708649635, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.0580357126891613, + "rewards/format_reward/std": 0.23407234251499176, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1973.0, + "completions/mean_length": 513.6830444335938, + "completions/mean_terminated_length": 510.25054931640625, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.006264682850430697, + "frac_reward_zero_std": 0.8839285969734192, + "grad_norm": 0.1246599331498146, + "kl": 0.0, + "learning_rate": 3.3333333333333335e-07, + "loss": 0.0077, + "num_tokens": 1005093.0, + "reward": 0.0031250002793967724, + "reward_std": 0.005872634705156088, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.17418713867664337, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 530.6116333007812, + "completions/mean_terminated_length": 520.3820190429688, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.009397024275646046, + "frac_reward_zero_std": 0.8303571939468384, + "grad_norm": 0.15069690346717834, + "kl": 0.00031280517578125, + "learning_rate": 6.666666666666667e-07, + "loss": 0.0082, + "num_tokens": 1508271.0, + "reward": 0.005133929196745157, + "reward_std": 0.008758394047617912, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.0513392873108387, + "rewards/format_reward/std": 0.22093553841114044, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 511.8951110839844, + "completions/mean_terminated_length": 494.5575866699219, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.012529365700861394, + "frac_reward_zero_std": 0.8125000596046448, + "grad_norm": 0.1669161319732666, + "kl": 0.0003066062927246094, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.0122, + "num_tokens": 2004696.0, + "reward": 0.00513392873108387, + "reward_std": 0.0095131266862154, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.0513392873108387, + "rewards/format_reward/std": 0.22093553841114044, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1429.0, + "completions/mean_length": 507.8817138671875, + "completions/mean_terminated_length": 494.00677490234375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.015661707126076743, + "frac_reward_zero_std": 0.7500000596046448, + "grad_norm": 0.21302837133407593, + "kl": 0.00034236907958984375, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.0129, + "num_tokens": 2499987.0, + "reward": 0.00758928619325161, + "reward_std": 0.012776251882314682, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.0758928582072258, + "rewards/format_reward/std": 0.265122652053833, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1847.0, + "completions/mean_length": 475.997802734375, + "completions/mean_terminated_length": 472.4809875488281, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.018794048551292093, + "frac_reward_zero_std": 0.723214328289032, + "grad_norm": 0.21226118505001068, + "kl": 0.00040340423583984375, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.0226, + "num_tokens": 2971282.0, + "reward": 0.00758928619325161, + "reward_std": 0.014046475291252136, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.0758928582072258, + "rewards/format_reward/std": 0.2651226818561554, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1340.0, + "completions/mean_length": 449.78350830078125, + "completions/mean_terminated_length": 446.20806884765625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.02192638997650744, + "frac_reward_zero_std": 0.4464285969734192, + "grad_norm": 0.3362879157066345, + "kl": 0.0012226104736328125, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0194, + "num_tokens": 3431349.0, + "reward": 0.01897321455180645, + "reward_std": 0.028852637857198715, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.1897321492433548, + "rewards/format_reward/std": 0.39252743124961853, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1607.0, + "completions/mean_length": 463.33038330078125, + "completions/mean_terminated_length": 459.7852478027344, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.025058731401722788, + "frac_reward_zero_std": 0.2946428656578064, + "grad_norm": 0.383797287940979, + "kl": 0.002155303955078125, + "learning_rate": 2.3333333333333336e-06, + "loss": 0.0593, + "num_tokens": 3884157.0, + "reward": 0.02879464253783226, + "reward_std": 0.036925364285707474, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.2879464328289032, + "rewards/format_reward/std": 0.4533122181892395, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1550.0, + "completions/mean_length": 449.450927734375, + "completions/mean_terminated_length": 442.28253173828125, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.028191072826938137, + "frac_reward_zero_std": 0.267857164144516, + "grad_norm": 0.36076852679252625, + "kl": 0.0064544677734375, + "learning_rate": 2.666666666666667e-06, + "loss": 0.0404, + "num_tokens": 4356035.0, + "reward": 0.06718750298023224, + "reward_std": 0.03860996663570404, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.470055490732193, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1612.0, + "completions/mean_length": 434.0758972167969, + "completions/mean_terminated_length": 423.19549560546875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.031323414252153486, + "frac_reward_zero_std": 0.4285714626312256, + "grad_norm": 0.3158596158027649, + "kl": 0.0114593505859375, + "learning_rate": 3e-06, + "loss": 0.0567, + "num_tokens": 4819501.0, + "reward": 0.08214286714792252, + "reward_std": 0.029400181025266647, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.8214285969734192, + "rewards/format_reward/std": 0.3834212124347687, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1287.0, + "completions/mean_length": 417.1026916503906, + "completions/mean_terminated_length": 409.78924560546875, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.03445575567736883, + "frac_reward_zero_std": 0.6071428656578064, + "grad_norm": 0.27019640803337097, + "kl": 0.015106201171875, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0273, + "num_tokens": 5272579.0, + "reward": 0.0881696492433548, + "reward_std": 0.019988171756267548, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.8816964030265808, + "rewards/format_reward/std": 0.32332828640937805, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1796.0, + "completions/mean_length": 359.1540222167969, + "completions/mean_terminated_length": 355.3758544921875, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.037588097102584185, + "frac_reward_zero_std": 0.7678571939468384, + "grad_norm": 0.2617413103580475, + "kl": 0.02667236328125, + "learning_rate": 3.6666666666666666e-06, + "loss": 0.0058, + "num_tokens": 5687156.0, + "reward": 0.09352679550647736, + "reward_std": 0.011814332567155361, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 351.8437805175781, + "completions/mean_terminated_length": 344.2376708984375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.04072043852779953, + "frac_reward_zero_std": 0.9375000596046448, + "grad_norm": 0.11741850525140762, + "kl": 0.026947021484375, + "learning_rate": 4.000000000000001e-06, + "loss": -0.0042, + "num_tokens": 6096534.0, + "reward": 0.09843750298023224, + "reward_std": 0.0031250000465661287, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12415824085474014, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1702.0, + "completions/mean_length": 372.6383972167969, + "completions/mean_terminated_length": 365.1255798339844, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.04385277995301488, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.12467124313116074, + "kl": 0.02813720703125, + "learning_rate": 4.333333333333334e-06, + "loss": 0.0254, + "num_tokens": 6532928.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1241.0, + "completions/mean_length": 337.1942138671875, + "completions/mean_terminated_length": 333.36688232421875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.04698512137823023, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.09254973381757736, + "kl": 0.03271484375, + "learning_rate": 4.666666666666667e-06, + "loss": 0.0124, + "num_tokens": 6940707.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1838.0, + "completions/mean_length": 372.94866943359375, + "completions/mean_terminated_length": 361.65618896484375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.050117462803445575, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.15962667763233185, + "kl": 0.037109375, + "learning_rate": 5e-06, + "loss": 0.0134, + "num_tokens": 7368048.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1217.0, + "completions/max_terminated_length": 1217.0, + "completions/mean_length": 326.765625, + "completions/mean_terminated_length": 326.765625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.05324980422866092, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.13613449037075043, + "kl": 0.03857421875, + "learning_rate": 4.999952797253148e-06, + "loss": 0.0007, + "num_tokens": 7782387.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1201.0, + "completions/mean_length": 303.1294860839844, + "completions/mean_terminated_length": 299.2259521484375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.056382145653876274, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.07718341052532196, + "kl": 0.030975341796875, + "learning_rate": 4.9998111909931225e-06, + "loss": 0.0177, + "num_tokens": 8187829.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1585.0, + "completions/max_terminated_length": 1585.0, + "completions/mean_length": 314.4821472167969, + "completions/mean_terminated_length": 314.4821472167969, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.05951448707909162, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.045888468623161316, + "kl": 0.03021240234375, + "learning_rate": 4.999575187161439e-06, + "loss": 0.0001, + "num_tokens": 8590681.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1217.0, + "completions/max_terminated_length": 1217.0, + "completions/mean_length": 304.5089416503906, + "completions/mean_terminated_length": 304.5089416503906, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.06264682850430697, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.07027683407068253, + "kl": 0.031890869140625, + "learning_rate": 4.9992447956603455e-06, + "loss": 0.0011, + "num_tokens": 8989701.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1091.0, + "completions/max_terminated_length": 1091.0, + "completions/mean_length": 321.484375, + "completions/mean_terminated_length": 321.484375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.06577916992952232, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.18601660430431366, + "kl": 0.050933837890625, + "learning_rate": 4.998820030352409e-06, + "loss": -0.0015, + "num_tokens": 9393762.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1387.0, + "completions/max_terminated_length": 1387.0, + "completions/mean_length": 324.7008972167969, + "completions/mean_terminated_length": 324.7008972167969, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.06891151135473766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025518523529171944, + "kl": 0.0302734375, + "learning_rate": 4.998300909059929e-06, + "loss": 0.0003, + "num_tokens": 9795744.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1171.0, + "completions/mean_length": 343.93975830078125, + "completions/mean_terminated_length": 340.1275329589844, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.07204385277995301, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05894619971513748, + "kl": 0.027191162109375, + "learning_rate": 4.997687453564198e-06, + "loss": 0.0126, + "num_tokens": 10216469.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1197.0, + "completions/max_terminated_length": 1197.0, + "completions/mean_length": 360.7901916503906, + "completions/mean_terminated_length": 360.7901916503906, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.07517619420516837, + "frac_reward_zero_std": 0.910714328289032, + "grad_norm": 0.16142426431179047, + "kl": 0.037689208984375, + "learning_rate": 4.9969796896045775e-06, + "loss": -0.0011, + "num_tokens": 10652875.0, + "reward": 0.09776786714792252, + "reward_std": 0.004464285913854837, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9776785969734192, + "rewards/format_reward/std": 0.1478918492794037, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 914.0, + "completions/max_terminated_length": 914.0, + "completions/mean_length": 376.31475830078125, + "completions/mean_terminated_length": 376.31475830078125, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.07830853563038372, + "frac_reward_zero_std": 0.8839285969734192, + "grad_norm": 0.13303056359291077, + "kl": 0.025543212890625, + "learning_rate": 4.996177646877426e-06, + "loss": 0.0011, + "num_tokens": 11083768.0, + "reward": 0.0970982164144516, + "reward_std": 0.0058035715483129025, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9709821343421936, + "rewards/format_reward/std": 0.16804419457912445, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1904.0, + "completions/max_terminated_length": 1904.0, + "completions/mean_length": 405.3058166503906, + "completions/mean_terminated_length": 405.3058166503906, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.08144087705559906, + "frac_reward_zero_std": 0.9017857313156128, + "grad_norm": 0.12866255640983582, + "kl": 0.027313232421875, + "learning_rate": 4.995281359034851e-06, + "loss": 0.0056, + "num_tokens": 11541501.0, + "reward": 0.09754464775323868, + "reward_std": 0.004910714458674192, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9754464030265808, + "rewards/format_reward/std": 0.1549331247806549, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1885.0, + "completions/max_terminated_length": 1885.0, + "completions/mean_length": 422.83929443359375, + "completions/mean_terminated_length": 422.83929443359375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.08457321848081441, + "frac_reward_zero_std": 0.7678571939468384, + "grad_norm": 0.18961046636104584, + "kl": 0.02655029296875, + "learning_rate": 4.994290863683296e-06, + "loss": -0.0009, + "num_tokens": 11991825.0, + "reward": 0.09330358356237411, + "reward_std": 0.011883394792675972, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1210.0, + "completions/max_terminated_length": 1210.0, + "completions/mean_length": 429.5602722167969, + "completions/mean_terminated_length": 429.5602722167969, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.08770555990602975, + "frac_reward_zero_std": 0.8392857313156128, + "grad_norm": 0.1546013504266739, + "kl": 0.025421142578125, + "learning_rate": 4.99320620238196e-06, + "loss": 0.0138, + "num_tokens": 12445804.0, + "reward": 0.0959821492433548, + "reward_std": 0.008035714738070965, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9598214030265808, + "rewards/format_reward/std": 0.1965973675251007, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1481.0, + "completions/mean_length": 457.8482360839844, + "completions/mean_terminated_length": 450.7174987792969, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.0908379013312451, + "frac_reward_zero_std": 0.8303571939468384, + "grad_norm": 0.13993386924266815, + "kl": 0.025054931640625, + "learning_rate": 4.99202742064106e-06, + "loss": 0.0033, + "num_tokens": 12914920.0, + "reward": 0.09531250596046448, + "reward_std": 0.008620268665254116, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1923.0, + "completions/max_terminated_length": 1923.0, + "completions/mean_length": 470.3192138671875, + "completions/mean_terminated_length": 470.3192138671875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.09397024275646046, + "frac_reward_zero_std": 0.8750000596046448, + "grad_norm": 0.2710864841938019, + "kl": 0.070037841796875, + "learning_rate": 4.990754567919917e-06, + "loss": 0.0026, + "num_tokens": 13388991.0, + "reward": 0.09687500447034836, + "reward_std": 0.006250000558793545, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17418713867664337, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1418.0, + "completions/mean_length": 475.71429443359375, + "completions/mean_terminated_length": 472.1968688964844, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.0971025841816758, + "frac_reward_zero_std": 0.9375000596046448, + "grad_norm": 0.104698546230793, + "kl": 0.025726318359375, + "learning_rate": 4.989387697624881e-06, + "loss": 0.0107, + "num_tokens": 13853939.0, + "reward": 0.09843750298023224, + "reward_std": 0.0031250000465661287, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12415824085474014, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1609.0, + "completions/max_terminated_length": 1609.0, + "completions/mean_length": 451.71429443359375, + "completions/mean_terminated_length": 451.71429443359375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.10023492560689115, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.08060673624277115, + "kl": 0.024688720703125, + "learning_rate": 4.987926867107095e-06, + "loss": 0.0088, + "num_tokens": 14308803.0, + "reward": 0.09888393431901932, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1670.0, + "completions/max_terminated_length": 1670.0, + "completions/mean_length": 485.7544860839844, + "completions/mean_terminated_length": 485.7544860839844, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.1033672670321065, + "frac_reward_zero_std": 0.8839285969734192, + "grad_norm": 0.11720269918441772, + "kl": 0.025299072265625, + "learning_rate": 4.986372137660078e-06, + "loss": -0.0002, + "num_tokens": 14783641.0, + "reward": 0.09687500447034836, + "reward_std": 0.005872634705156088, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17418713867664337, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1344.0, + "completions/max_terminated_length": 1344.0, + "completions/mean_length": 473.2901916503906, + "completions/mean_terminated_length": 473.2901916503906, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.10649960845732184, + "frac_reward_zero_std": 0.8392857313156128, + "grad_norm": 0.13555070757865906, + "kl": 0.029815673828125, + "learning_rate": 4.984723574517165e-06, + "loss": 0.0023, + "num_tokens": 15258791.0, + "reward": 0.09575893729925156, + "reward_std": 0.00810477789491415, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174959301948547, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1508.0, + "completions/mean_length": 473.591552734375, + "completions/mean_terminated_length": 466.5314025878906, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.1096319498825372, + "frac_reward_zero_std": 0.8125000596046448, + "grad_norm": 0.15424402058124542, + "kl": 0.02642822265625, + "learning_rate": 4.9829812468487655e-06, + "loss": 0.0147, + "num_tokens": 15733920.0, + "reward": 0.09531251341104507, + "reward_std": 0.009375001303851604, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1717.0, + "completions/max_terminated_length": 1717.0, + "completions/mean_length": 467.79913330078125, + "completions/mean_terminated_length": 467.79913330078125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.11276429130775255, + "frac_reward_zero_std": 0.7946428656578064, + "grad_norm": 0.16484294831752777, + "kl": 0.025634765625, + "learning_rate": 4.981145227759457e-06, + "loss": 0.0193, + "num_tokens": 16205306.0, + "reward": 0.0948660746216774, + "reward_std": 0.010267858393490314, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1251.0, + "completions/max_terminated_length": 1251.0, + "completions/mean_length": 471.763427734375, + "completions/mean_terminated_length": 471.763427734375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.1158966327329679, + "frac_reward_zero_std": 0.8035714626312256, + "grad_norm": 0.14881743490695953, + "kl": 0.023468017578125, + "learning_rate": 4.979215594284924e-06, + "loss": -0.009, + "num_tokens": 16681524.0, + "reward": 0.0948660746216774, + "reward_std": 0.00989049207419157, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093555331230164, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1543.0, + "completions/max_terminated_length": 1543.0, + "completions/mean_length": 458.11163330078125, + "completions/mean_terminated_length": 458.11163330078125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.11902897415818324, + "frac_reward_zero_std": 0.8750000596046448, + "grad_norm": 0.13113969564437866, + "kl": 0.024383544921875, + "learning_rate": 4.977192427388722e-06, + "loss": 0.0048, + "num_tokens": 17151366.0, + "reward": 0.09687500447034836, + "reward_std": 0.006250000558793545, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17418713867664337, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1578.0, + "completions/max_terminated_length": 1578.0, + "completions/mean_length": 461.6875305175781, + "completions/mean_terminated_length": 461.6875305175781, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.12216131558339859, + "frac_reward_zero_std": 0.8750000596046448, + "grad_norm": 0.13471558690071106, + "kl": 0.03570556640625, + "learning_rate": 4.9750758119588824e-06, + "loss": 0.0094, + "num_tokens": 17614714.0, + "reward": 0.09687500447034836, + "reward_std": 0.0062500000931322575, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17418713867664337, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1632.0, + "completions/max_terminated_length": 1632.0, + "completions/mean_length": 462.3906555175781, + "completions/mean_terminated_length": 462.3906555175781, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.12529365700861395, + "frac_reward_zero_std": 0.9375000596046448, + "grad_norm": 0.0811685174703598, + "kl": 0.02178955078125, + "learning_rate": 4.972865836804349e-06, + "loss": 0.0005, + "num_tokens": 18105921.0, + "reward": 0.09843750298023224, + "reward_std": 0.0031250000465661287, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12415824085474014, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1528.0, + "completions/max_terminated_length": 1528.0, + "completions/mean_length": 417.5245666503906, + "completions/mean_terminated_length": 417.5245666503906, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.1284259984338293, + "frac_reward_zero_std": 0.8660714626312256, + "grad_norm": 0.12236718833446503, + "kl": 0.031585693359375, + "learning_rate": 4.970562594651254e-06, + "loss": -0.0011, + "num_tokens": 18550956.0, + "reward": 0.09665179997682571, + "reward_std": 0.0066964291036129, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9665178656578064, + "rewards/format_reward/std": 0.1800929754972458, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1897.0, + "completions/mean_length": 457.47100830078125, + "completions/mean_terminated_length": 453.9127502441406, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.13155833985904464, + "frac_reward_zero_std": 0.8928571939468384, + "grad_norm": 0.10374283790588379, + "kl": 0.02362060546875, + "learning_rate": 4.968166182139026e-06, + "loss": -0.0029, + "num_tokens": 19008331.0, + "reward": 0.0970982164144516, + "reward_std": 0.005426206160336733, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9709821343421936, + "rewards/format_reward/std": 0.16804419457912445, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1390.0, + "completions/max_terminated_length": 1390.0, + "completions/mean_length": 437.80804443359375, + "completions/mean_terminated_length": 437.80804443359375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.13469068128425998, + "frac_reward_zero_std": 0.848214328289032, + "grad_norm": 0.13753058016300201, + "kl": 0.027191162109375, + "learning_rate": 4.9656766998163306e-06, + "loss": 0.01, + "num_tokens": 19467685.0, + "reward": 0.09620536863803864, + "reward_std": 0.00758928619325161, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9620535969734192, + "rewards/format_reward/std": 0.19128035008907318, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 455.4888610839844, + "completions/mean_terminated_length": 448.3475646972656, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.13782302270947533, + "frac_reward_zero_std": 0.910714328289032, + "grad_norm": 0.12159372866153717, + "kl": 0.024169921875, + "learning_rate": 4.963094252136865e-06, + "loss": 0.0191, + "num_tokens": 19929628.0, + "reward": 0.09776786714792252, + "reward_std": 0.004464285913854837, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9776785969734192, + "rewards/format_reward/std": 0.1478918492794037, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1549.0, + "completions/max_terminated_length": 1549.0, + "completions/mean_length": 421.5201110839844, + "completions/mean_terminated_length": 421.5201110839844, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.14095536413469067, + "frac_reward_zero_std": 0.9017857313156128, + "grad_norm": 0.1216520220041275, + "kl": 0.02484130859375, + "learning_rate": 4.960418947454958e-06, + "loss": 0.0017, + "num_tokens": 20381173.0, + "reward": 0.09754464775323868, + "reward_std": 0.004910714458674192, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9754464030265808, + "rewards/format_reward/std": 0.1549331247806549, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1838.0, + "completions/mean_length": 452.63616943359375, + "completions/mean_terminated_length": 449.0671081542969, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.14408770555990602, + "frac_reward_zero_std": 0.9375000596046448, + "grad_norm": 0.07689127326011658, + "kl": 0.027130126953125, + "learning_rate": 4.957650898021038e-06, + "loss": 0.0083, + "num_tokens": 20847038.0, + "reward": 0.098214291036129, + "reward_std": 0.0031940629705786705, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9821428656578064, + "rewards/format_reward/std": 0.13258016109466553, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 446.3526916503906, + "completions/mean_terminated_length": 439.1704406738281, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.14722004698512137, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.07279340922832489, + "kl": 0.025726318359375, + "learning_rate": 4.954790219976915e-06, + "loss": 0.0238, + "num_tokens": 21317148.0, + "reward": 0.09888393431901932, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1977.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 445.72991943359375, + "completions/mean_terminated_length": 445.72991943359375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.15035238841033674, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.06722462177276611, + "kl": 0.02276611328125, + "learning_rate": 4.95183703335091e-06, + "loss": -0.001, + "num_tokens": 21781855.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349845170975, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1261.0, + "completions/mean_length": 437.5558166503906, + "completions/mean_terminated_length": 430.3341064453125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.1534847298355521, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.0969611406326294, + "kl": 0.022369384765625, + "learning_rate": 4.948791462052819e-06, + "loss": 0.0179, + "num_tokens": 22236944.0, + "reward": 0.09866072982549667, + "reward_std": 0.0023012058809399605, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9866071343421936, + "rewards/format_reward/std": 0.11507844179868698, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1589.0, + "completions/mean_length": 445.7098388671875, + "completions/mean_terminated_length": 442.1252746582031, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.15661707126076743, + "frac_reward_zero_std": 0.9285714626312256, + "grad_norm": 0.10892393440008163, + "kl": 0.028045654296875, + "learning_rate": 4.945653633868716e-06, + "loss": 0.0165, + "num_tokens": 22691338.0, + "reward": 0.098214291036129, + "reward_std": 0.0035714288242161274, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9821428656578064, + "rewards/format_reward/std": 0.13258016109466553, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1601.0, + "completions/max_terminated_length": 1601.0, + "completions/mean_length": 454.4419860839844, + "completions/mean_terminated_length": 454.4419860839844, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.15974941268598278, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.07626765221357346, + "kl": 0.02392578125, + "learning_rate": 4.942423680455584e-06, + "loss": 0.002, + "num_tokens": 23150556.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1433.0, + "completions/max_terminated_length": 1433.0, + "completions/mean_length": 421.6004638671875, + "completions/mean_terminated_length": 421.6004638671875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.16288175411119812, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.0611153282225132, + "kl": 0.02496337890625, + "learning_rate": 4.939101737335802e-06, + "loss": 0.0028, + "num_tokens": 23604309.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1713.0, + "completions/max_terminated_length": 1713.0, + "completions/mean_length": 457.9285888671875, + "completions/mean_terminated_length": 457.9285888671875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.16601409553641347, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.0695890486240387, + "kl": 0.023895263671875, + "learning_rate": 4.935687943891447e-06, + "loss": 0.0034, + "num_tokens": 24078185.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1748.0, + "completions/mean_length": 442.8794860839844, + "completions/mean_terminated_length": 439.2886047363281, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.16914643696162882, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.07334944605827332, + "kl": 0.024688720703125, + "learning_rate": 4.932182443358458e-06, + "loss": 0.007, + "num_tokens": 24539203.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1782.0, + "completions/max_terminated_length": 1782.0, + "completions/mean_length": 435.8839416503906, + "completions/mean_terminated_length": 435.8839416503906, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.17227877838684416, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.030935026705265045, + "kl": 0.02337646484375, + "learning_rate": 4.928585382820616e-06, + "loss": 0.0012, + "num_tokens": 24991283.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1716.0, + "completions/mean_length": 446.72100830078125, + "completions/mean_terminated_length": 443.1387023925781, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.1754111198120595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11464553326368332, + "kl": 0.0272216796875, + "learning_rate": 4.924896913203376e-06, + "loss": 0.0003, + "num_tokens": 25445638.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1587.0, + "completions/max_terminated_length": 1587.0, + "completions/mean_length": 435.81475830078125, + "completions/mean_terminated_length": 435.81475830078125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.17854346123727485, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00878103543072939, + "kl": 0.023681640625, + "learning_rate": 4.921117189267535e-06, + "loss": 0.0002, + "num_tokens": 25898279.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1819.0, + "completions/max_terminated_length": 1819.0, + "completions/mean_length": 429.1785888671875, + "completions/mean_terminated_length": 429.1785888671875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.1816758026624902, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.057063594460487366, + "kl": 0.025604248046875, + "learning_rate": 4.917246369602742e-06, + "loss": 0.0036, + "num_tokens": 26352967.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1359.0, + "completions/max_terminated_length": 1359.0, + "completions/mean_length": 466.357177734375, + "completions/mean_terminated_length": 466.357177734375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.18480814408770557, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04852420091629028, + "kl": 0.025787353515625, + "learning_rate": 4.9132846166208355e-06, + "loss": 0.0022, + "num_tokens": 26835947.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1690.0, + "completions/max_terminated_length": 1690.0, + "completions/mean_length": 424.3906555175781, + "completions/mean_terminated_length": 424.3906555175781, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.18794048551292092, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05861803889274597, + "kl": 0.027557373046875, + "learning_rate": 4.9092320965490365e-06, + "loss": 0.0049, + "num_tokens": 27288778.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1544.0, + "completions/max_terminated_length": 1544.0, + "completions/mean_length": 444.6339416503906, + "completions/mean_terminated_length": 444.6339416503906, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.19107282693813626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007782110944390297, + "kl": 0.024993896484375, + "learning_rate": 4.905088979422971e-06, + "loss": 0.0003, + "num_tokens": 27765426.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1613.0, + "completions/mean_length": 418.3571472167969, + "completions/mean_terminated_length": 414.71142578125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.1942051683633516, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.08726464956998825, + "kl": 0.023223876953125, + "learning_rate": 4.900855439079536e-06, + "loss": 0.0069, + "num_tokens": 28204946.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1796.0, + "completions/max_terminated_length": 1796.0, + "completions/mean_length": 433.5669860839844, + "completions/mean_terminated_length": 433.5669860839844, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.19733750978856696, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007881193421781063, + "kl": 0.02392578125, + "learning_rate": 4.8965316531496055e-06, + "loss": 0.0002, + "num_tokens": 28669688.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1462.0, + "completions/mean_length": 425.0870666503906, + "completions/mean_terminated_length": 421.4563903808594, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.2004698512137823, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.08527743816375732, + "kl": 0.021881103515625, + "learning_rate": 4.892117803050578e-06, + "loss": 0.0101, + "num_tokens": 29110943.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1234.0, + "completions/mean_length": 449.9776916503906, + "completions/mean_terminated_length": 442.8116760253906, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.20360219263899765, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.08571747690439224, + "kl": 0.028167724609375, + "learning_rate": 4.887614073978761e-06, + "loss": 0.0202, + "num_tokens": 29577025.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1531.0, + "completions/mean_length": 416.7790222167969, + "completions/mean_terminated_length": 413.1297607421875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.206734534064213, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.10931398719549179, + "kl": 0.0360107421875, + "learning_rate": 4.883020654901609e-06, + "loss": 0.0155, + "num_tokens": 30013478.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1469.0, + "completions/max_terminated_length": 1469.0, + "completions/mean_length": 397.09600830078125, + "completions/mean_terminated_length": 397.09600830078125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.20986687548942834, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.048762910068035126, + "kl": 0.023193359375, + "learning_rate": 4.878337738549785e-06, + "loss": 0.0026, + "num_tokens": 30456397.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1678.0, + "completions/mean_length": 424.8326110839844, + "completions/mean_terminated_length": 417.5538330078125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.21299921691464369, + "frac_reward_zero_std": 0.9464285969734192, + "grad_norm": 0.10778295248746872, + "kl": 0.0274658203125, + "learning_rate": 4.873565521409082e-06, + "loss": 0.0321, + "num_tokens": 30896994.0, + "reward": 0.09866072982549667, + "reward_std": 0.0026785717345774174, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9866071343421936, + "rewards/format_reward/std": 0.11507843434810638, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 2034.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 421.6473388671875, + "completions/mean_terminated_length": 421.6473388671875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.21613155833985903, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.04150565713644028, + "kl": 0.02215576171875, + "learning_rate": 4.868704203712173e-06, + "loss": 0.0038, + "num_tokens": 31344472.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1906.0, + "completions/max_terminated_length": 1906.0, + "completions/mean_length": 423.76116943359375, + "completions/mean_terminated_length": 423.76116943359375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.2192638997650744, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05390825495123863, + "kl": 0.0225830078125, + "learning_rate": 4.86375398943021e-06, + "loss": 0.0055, + "num_tokens": 31819665.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1426.0, + "completions/max_terminated_length": 1426.0, + "completions/mean_length": 409.8281555175781, + "completions/mean_terminated_length": 409.8281555175781, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.22239624119028975, + "frac_reward_zero_std": 0.9464285969734192, + "grad_norm": 0.10090767592191696, + "kl": 0.03179931640625, + "learning_rate": 4.858715086264274e-06, + "loss": 0.0091, + "num_tokens": 32259640.0, + "reward": 0.09866072237491608, + "reward_std": 0.0026785717345774174, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9866071343421936, + "rewards/format_reward/std": 0.11507844179868698, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 394.17413330078125, + "completions/mean_terminated_length": 394.17413330078125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.2255285826155051, + "frac_reward_zero_std": 0.910714328289032, + "grad_norm": 0.11244799196720123, + "kl": 0.032470703125, + "learning_rate": 4.853587705636646e-06, + "loss": 0.0034, + "num_tokens": 32687698.0, + "reward": 0.09776786714792252, + "reward_std": 0.004464285913854837, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9776785969734192, + "rewards/format_reward/std": 0.1478918492794037, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1513.0, + "completions/max_terminated_length": 1513.0, + "completions/mean_length": 411.8973388671875, + "completions/mean_terminated_length": 411.8973388671875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.22866092404072044, + "frac_reward_zero_std": 0.9375000596046448, + "grad_norm": 0.09290987253189087, + "kl": 0.023712158203125, + "learning_rate": 4.84837206268195e-06, + "loss": 0.0034, + "num_tokens": 33134052.0, + "reward": 0.09843750298023224, + "reward_std": 0.0031250002793967724, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12415824085474014, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1373.0, + "completions/max_terminated_length": 1373.0, + "completions/mean_length": 417.2879638671875, + "completions/mean_terminated_length": 417.2879638671875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.2317932654659358, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.07182217389345169, + "kl": 0.02294921875, + "learning_rate": 4.8430683762381195e-06, + "loss": -0.0011, + "num_tokens": 33596689.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1758.0, + "completions/max_terminated_length": 1758.0, + "completions/mean_length": 418.7232360839844, + "completions/mean_terminated_length": 418.7232360839844, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.23492560689115113, + "frac_reward_zero_std": 0.9285714626312256, + "grad_norm": 0.10142670571804047, + "kl": 0.028717041015625, + "learning_rate": 4.837676868837213e-06, + "loss": 0.003, + "num_tokens": 34062025.0, + "reward": 0.098214291036129, + "reward_std": 0.0035714288242161274, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9821428656578064, + "rewards/format_reward/std": 0.13258016109466553, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1655.0, + "completions/mean_length": 413.9776916503906, + "completions/mean_terminated_length": 410.3221435546875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.23805794831636648, + "frac_reward_zero_std": 0.8928571939468384, + "grad_norm": 0.12744517624378204, + "kl": 0.027923583984375, + "learning_rate": 4.832197766696085e-06, + "loss": 0.0077, + "num_tokens": 34512179.0, + "reward": 0.0970982164144516, + "reward_std": 0.005426206160336733, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9709821343421936, + "rewards/format_reward/std": 0.16804419457912445, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1801.0, + "completions/mean_length": 410.0937805175781, + "completions/mean_terminated_length": 406.4295349121094, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.24119028974158183, + "frac_reward_zero_std": 0.9375000596046448, + "grad_norm": 0.11007879674434662, + "kl": 0.02178955078125, + "learning_rate": 4.826631299706887e-06, + "loss": 0.02, + "num_tokens": 34958717.0, + "reward": 0.09843751043081284, + "reward_std": 0.0031250002793967724, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12415824085474014, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1224.0, + "completions/max_terminated_length": 1224.0, + "completions/mean_length": 386.7879638671875, + "completions/mean_terminated_length": 386.7879638671875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.24432263116679717, + "frac_reward_zero_std": 0.9464285969734192, + "grad_norm": 0.09045854210853577, + "kl": 0.024200439453125, + "learning_rate": 4.820977701427424e-06, + "loss": 0.0054, + "num_tokens": 35386546.0, + "reward": 0.09866072237491608, + "reward_std": 0.0026785717345774174, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9866071343421936, + "rewards/format_reward/std": 0.11507844179868698, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1132.0, + "completions/max_terminated_length": 1132.0, + "completions/mean_length": 384.1250305175781, + "completions/mean_terminated_length": 384.1250305175781, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.24745497259201252, + "frac_reward_zero_std": 0.9375000596046448, + "grad_norm": 0.10079538077116013, + "kl": 0.027069091796875, + "learning_rate": 4.81523720907136e-06, + "loss": 0.0052, + "num_tokens": 35812890.0, + "reward": 0.09843750298023224, + "reward_std": 0.0031250002793967724, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12415824085474014, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1410.0, + "completions/max_terminated_length": 1410.0, + "completions/mean_length": 386.4107360839844, + "completions/mean_terminated_length": 386.4107360839844, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.2505873140172279, + "frac_reward_zero_std": 0.9375000596046448, + "grad_norm": 0.10219928622245789, + "kl": 0.026947021484375, + "learning_rate": 4.809410063498254e-06, + "loss": 0.0047, + "num_tokens": 36242746.0, + "reward": 0.09843750298023224, + "reward_std": 0.0031250002793967724, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12415824085474014, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1404.0, + "completions/max_terminated_length": 1404.0, + "completions/mean_length": 392.2879638671875, + "completions/mean_terminated_length": 392.2879638671875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.25371965544244324, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.07012035697698593, + "kl": 0.02423095703125, + "learning_rate": 4.8034965092034656e-06, + "loss": 0.0061, + "num_tokens": 36673967.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 2025.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 387.23663330078125, + "completions/mean_terminated_length": 387.23663330078125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.2568519968676586, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.04148247092962265, + "kl": 0.021942138671875, + "learning_rate": 4.797496794307889e-06, + "loss": -0.0007, + "num_tokens": 37103861.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1460.0, + "completions/max_terminated_length": 1460.0, + "completions/mean_length": 388.71429443359375, + "completions/mean_terminated_length": 388.71429443359375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.25998433829287393, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.07328309863805771, + "kl": 0.025054931640625, + "learning_rate": 4.791411170547545e-06, + "loss": -0.0, + "num_tokens": 37525381.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1918.0, + "completions/max_terminated_length": 1918.0, + "completions/mean_length": 382.81475830078125, + "completions/mean_terminated_length": 382.81475830078125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.2631166797180893, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.06375343352556229, + "kl": 0.022857666015625, + "learning_rate": 4.785239893263017e-06, + "loss": 0.0022, + "num_tokens": 37937018.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1347.0, + "completions/mean_length": 398.4508972167969, + "completions/mean_terminated_length": 394.7606201171875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.2662490211433046, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05883678048849106, + "kl": 0.022064208984375, + "learning_rate": 4.778983221388742e-06, + "loss": 0.005, + "num_tokens": 38373236.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1200.0, + "completions/max_terminated_length": 1200.0, + "completions/mean_length": 391.2723388671875, + "completions/mean_terminated_length": 391.2723388671875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.26938136256851997, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.062257394194602966, + "kl": 0.023406982421875, + "learning_rate": 4.77264141744214e-06, + "loss": 0.0035, + "num_tokens": 38805886.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1422.0, + "completions/max_terminated_length": 1422.0, + "completions/mean_length": 404.9285888671875, + "completions/mean_terminated_length": 404.9285888671875, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.2725137039937353, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.07183815538883209, + "kl": 0.024871826171875, + "learning_rate": 4.766214747512603e-06, + "loss": -0.0015, + "num_tokens": 39235654.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1248.0, + "completions/max_terminated_length": 1248.0, + "completions/mean_length": 410.9040222167969, + "completions/mean_terminated_length": 410.9040222167969, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.27564604541895066, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.052811577916145325, + "kl": 0.02093505859375, + "learning_rate": 4.759703481250331e-06, + "loss": 0.001, + "num_tokens": 39677175.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1363.0, + "completions/max_terminated_length": 1363.0, + "completions/mean_length": 436.5558166503906, + "completions/mean_terminated_length": 436.5558166503906, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.278778386844166, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.05771630257368088, + "kl": 0.024169921875, + "learning_rate": 4.753107891855015e-06, + "loss": -0.0017, + "num_tokens": 40135628.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1888.0, + "completions/max_terminated_length": 1888.0, + "completions/mean_length": 416.2857360839844, + "completions/mean_terminated_length": 416.2857360839844, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.28191072826938135, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.034645553678274155, + "kl": 0.023162841796875, + "learning_rate": 4.746428256064375e-06, + "loss": 0.0019, + "num_tokens": 40572900.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1670.0, + "completions/mean_length": 445.5937805175781, + "completions/mean_terminated_length": 442.0089416503906, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.2850430696945967, + "frac_reward_zero_std": 0.9375000596046448, + "grad_norm": 0.1101684495806694, + "kl": 0.03033447265625, + "learning_rate": 4.7396648541425534e-06, + "loss": 0.0065, + "num_tokens": 41037066.0, + "reward": 0.09843751043081284, + "reward_std": 0.0031250000465661287, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12415824085474014, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1621.0, + "completions/mean_length": 421.9308166503906, + "completions/mean_terminated_length": 418.2930603027344, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.28817541111981204, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.07634352892637253, + "kl": 0.02423095703125, + "learning_rate": 4.732817969868348e-06, + "loss": 0.015, + "num_tokens": 41497779.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1673.0, + "completions/max_terminated_length": 1673.0, + "completions/mean_length": 463.3973388671875, + "completions/mean_terminated_length": 463.3973388671875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.2913077525450274, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.07480979710817337, + "kl": 0.024383544921875, + "learning_rate": 4.7258878905233095e-06, + "loss": -0.0007, + "num_tokens": 41984321.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1998.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 443.8660888671875, + "completions/mean_terminated_length": 443.8660888671875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.29444009397024273, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.02839045412838459, + "kl": 0.02435302734375, + "learning_rate": 4.718874906879688e-06, + "loss": 0.0004, + "num_tokens": 42458645.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1458.0, + "completions/max_terminated_length": 1458.0, + "completions/mean_length": 437.51116943359375, + "completions/mean_terminated_length": 437.51116943359375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.2975724353954581, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.03395234793424606, + "kl": 0.02294921875, + "learning_rate": 4.711779313188231e-06, + "loss": -0.0009, + "num_tokens": 42926142.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1207.0, + "completions/max_terminated_length": 1207.0, + "completions/mean_length": 429.41741943359375, + "completions/mean_terminated_length": 429.41741943359375, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.3007047768206735, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.03919963166117668, + "kl": 0.022430419921875, + "learning_rate": 4.70460140716584e-06, + "loss": -0.0033, + "num_tokens": 43380873.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1825.0, + "completions/max_terminated_length": 1825.0, + "completions/mean_length": 423.22991943359375, + "completions/mean_terminated_length": 423.22991943359375, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.3038371182458888, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005938644986599684, + "kl": 0.021697998046875, + "learning_rate": 4.697341489983076e-06, + "loss": 0.0002, + "num_tokens": 43822468.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1374.0, + "completions/max_terminated_length": 1374.0, + "completions/mean_length": 413.8794860839844, + "completions/mean_terminated_length": 413.8794860839844, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.3069694596711042, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04433107748627663, + "kl": 0.025177001953125, + "learning_rate": 4.6899998662515215e-06, + "loss": 0.0009, + "num_tokens": 44279762.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1356.0, + "completions/max_terminated_length": 1356.0, + "completions/mean_length": 414.7232360839844, + "completions/mean_terminated_length": 414.7232360839844, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.3101018010963195, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.057663049548864365, + "kl": 0.020751953125, + "learning_rate": 4.682576844011007e-06, + "loss": 0.0016, + "num_tokens": 44715214.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1489.0, + "completions/max_terminated_length": 1489.0, + "completions/mean_length": 433.2254638671875, + "completions/mean_terminated_length": 433.2254638671875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.31323414252153486, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04842175915837288, + "kl": 0.020751953125, + "learning_rate": 4.675072734716678e-06, + "loss": -0.0001, + "num_tokens": 45166723.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1379.0, + "completions/max_terminated_length": 1379.0, + "completions/mean_length": 434.3571472167969, + "completions/mean_terminated_length": 434.3571472167969, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.3163664839467502, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.021488236263394356, + "kl": 0.0244140625, + "learning_rate": 4.667487853225931e-06, + "loss": -0.0005, + "num_tokens": 45640575.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1351.0, + "completions/max_terminated_length": 1351.0, + "completions/mean_length": 398.4687805175781, + "completions/mean_terminated_length": 398.4687805175781, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.31949882537196556, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05682607740163803, + "kl": 0.021942138671875, + "learning_rate": 4.659822517785203e-06, + "loss": -0.0031, + "num_tokens": 46075837.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1411.0, + "completions/max_terminated_length": 1411.0, + "completions/mean_length": 403.2901916503906, + "completions/mean_terminated_length": 403.2901916503906, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.3226311667971809, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04837144538760185, + "kl": 0.023529052734375, + "learning_rate": 4.6520770500166165e-06, + "loss": -0.0011, + "num_tokens": 46517495.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1265.0, + "completions/max_terminated_length": 1265.0, + "completions/mean_length": 406.4285888671875, + "completions/mean_terminated_length": 406.4285888671875, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.32576350822239625, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.0597638301551342, + "kl": 0.02203369140625, + "learning_rate": 4.644251774904487e-06, + "loss": 0.002, + "num_tokens": 46950835.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1361.0, + "completions/max_terminated_length": 1361.0, + "completions/mean_length": 420.3638610839844, + "completions/mean_terminated_length": 420.3638610839844, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.3288958496476116, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00494805071502924, + "kl": 0.020751953125, + "learning_rate": 4.636347020781684e-06, + "loss": 0.0002, + "num_tokens": 47397086.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1567.0, + "completions/max_terminated_length": 1567.0, + "completions/mean_length": 413.9910888671875, + "completions/mean_terminated_length": 413.9910888671875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.33202819107282694, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.0446472130715847, + "kl": 0.024139404296875, + "learning_rate": 4.6283631193158605e-06, + "loss": 0.0007, + "num_tokens": 47866570.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1410.0, + "completions/mean_length": 414.76788330078125, + "completions/mean_terminated_length": 407.4439697265625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.3351605324980423, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.10806752741336823, + "kl": 0.019500732421875, + "learning_rate": 4.620300405495532e-06, + "loss": 0.0157, + "num_tokens": 48311370.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1638.0, + "completions/max_terminated_length": 1638.0, + "completions/mean_length": 414.6183166503906, + "completions/mean_terminated_length": 414.6183166503906, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.33829287392325763, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05645902454853058, + "kl": 0.0223388671875, + "learning_rate": 4.612159217616022e-06, + "loss": 0.0027, + "num_tokens": 48761515.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1111.0, + "completions/max_terminated_length": 1111.0, + "completions/mean_length": 394.3415222167969, + "completions/mean_terminated_length": 394.3415222167969, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.341425215348473, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.058627162128686905, + "kl": 0.01983642578125, + "learning_rate": 4.603939897265268e-06, + "loss": 0.0038, + "num_tokens": 49201832.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1435.0, + "completions/max_terminated_length": 1435.0, + "completions/mean_length": 395.7388610839844, + "completions/mean_terminated_length": 395.7388610839844, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.3445575567736883, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.030734708532691002, + "kl": 0.02105712890625, + "learning_rate": 4.595642789309492e-06, + "loss": 0.0012, + "num_tokens": 49626891.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1675.0, + "completions/max_terminated_length": 1675.0, + "completions/mean_length": 400.6294860839844, + "completions/mean_terminated_length": 400.6294860839844, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.34768989819890367, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.07059233635663986, + "kl": 0.0218505859375, + "learning_rate": 4.587268241878724e-06, + "loss": 0.0051, + "num_tokens": 50058041.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1704.0, + "completions/max_terminated_length": 1704.0, + "completions/mean_length": 413.64288330078125, + "completions/mean_terminated_length": 413.64288330078125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.350822239624119, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.027100209146738052, + "kl": 0.02264404296875, + "learning_rate": 4.578816606352205e-06, + "loss": -0.0016, + "num_tokens": 50524089.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1593.0, + "completions/max_terminated_length": 1593.0, + "completions/mean_length": 394.61163330078125, + "completions/mean_terminated_length": 394.61163330078125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.35395458104933436, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.042038217186927795, + "kl": 0.02313232421875, + "learning_rate": 4.570288237343632e-06, + "loss": 0.0006, + "num_tokens": 50961063.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1470.0, + "completions/mean_length": 408.81475830078125, + "completions/mean_terminated_length": 405.14764404296875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.3570869224745497, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.07356320321559906, + "kl": 0.021484375, + "learning_rate": 4.561683492686289e-06, + "loss": 0.009, + "num_tokens": 51404344.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1686.0, + "completions/mean_length": 410.5758972167969, + "completions/mean_terminated_length": 406.9127502441406, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.36021926389976505, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.08780305832624435, + "kl": 0.020294189453125, + "learning_rate": 4.5530027334180285e-06, + "loss": 0.0105, + "num_tokens": 51847726.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1985.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 397.01116943359375, + "completions/mean_terminated_length": 397.01116943359375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.3633516053249804, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.05564754456281662, + "kl": 0.0213623046875, + "learning_rate": 4.544246323766122e-06, + "loss": -0.0018, + "num_tokens": 52279895.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1586.0, + "completions/max_terminated_length": 1586.0, + "completions/mean_length": 388.9620666503906, + "completions/mean_terminated_length": 388.9620666503906, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.36648394675019574, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.06145547702908516, + "kl": 0.021392822265625, + "learning_rate": 4.535414631131983e-06, + "loss": 0.0012, + "num_tokens": 52716798.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1819.0, + "completions/max_terminated_length": 1819.0, + "completions/mean_length": 418.3214416503906, + "completions/mean_terminated_length": 418.3214416503906, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.36961628817541115, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005788884125649929, + "kl": 0.021026611328125, + "learning_rate": 4.526508026075746e-06, + "loss": 0.0002, + "num_tokens": 53165782.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1465.0, + "completions/mean_length": 410.7500305175781, + "completions/mean_terminated_length": 407.0872497558594, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.3727486296006265, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.07519423216581345, + "kl": 0.024017333984375, + "learning_rate": 4.517526882300721e-06, + "loss": 0.0092, + "num_tokens": 53605838.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1508.0, + "completions/max_terminated_length": 1508.0, + "completions/mean_length": 383.76788330078125, + "completions/mean_terminated_length": 383.76788330078125, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.37588097102584184, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.06233908608555794, + "kl": 0.02349853515625, + "learning_rate": 4.508471576637713e-06, + "loss": -0.0015, + "num_tokens": 54033266.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1982.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 435.67413330078125, + "completions/mean_terminated_length": 435.67413330078125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.3790133124510572, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.033383797854185104, + "kl": 0.020538330078125, + "learning_rate": 4.499342489029211e-06, + "loss": 0.0017, + "num_tokens": 54496496.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1542.0, + "completions/max_terminated_length": 1542.0, + "completions/mean_length": 393.4107360839844, + "completions/mean_terminated_length": 393.4107360839844, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.38214565387627253, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00913175381720066, + "kl": 0.0216064453125, + "learning_rate": 4.490140002513449e-06, + "loss": 0.0002, + "num_tokens": 54941616.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1327.0, + "completions/max_terminated_length": 1327.0, + "completions/mean_length": 402.6071472167969, + "completions/mean_terminated_length": 402.6071472167969, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.3852779953014879, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.06300325691699982, + "kl": 0.02117919921875, + "learning_rate": 4.48086450320833e-06, + "loss": -0.0011, + "num_tokens": 55384880.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1766.0, + "completions/max_terminated_length": 1766.0, + "completions/mean_length": 419.95538330078125, + "completions/mean_terminated_length": 419.95538330078125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.3884103367267032, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.05057992786169052, + "kl": 0.02044677734375, + "learning_rate": 4.4715163802952266e-06, + "loss": -0.0017, + "num_tokens": 55829348.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1274.0, + "completions/max_terminated_length": 1274.0, + "completions/mean_length": 404.63616943359375, + "completions/mean_terminated_length": 404.63616943359375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.39154267815191857, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.037633538246154785, + "kl": 0.02178955078125, + "learning_rate": 4.462096026002655e-06, + "loss": 0.0016, + "num_tokens": 56270933.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1551.0, + "completions/max_terminated_length": 1551.0, + "completions/mean_length": 419.2321472167969, + "completions/mean_terminated_length": 419.2321472167969, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.3946750195771339, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.03587530925869942, + "kl": 0.01971435546875, + "learning_rate": 4.4526038355898144e-06, + "loss": 0.0017, + "num_tokens": 56717029.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1327.0, + "completions/max_terminated_length": 1327.0, + "completions/mean_length": 414.0870666503906, + "completions/mean_terminated_length": 414.0870666503906, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.39780736100234926, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.038493674248456955, + "kl": 0.02093505859375, + "learning_rate": 4.4430402073300035e-06, + "loss": -0.0009, + "num_tokens": 57170764.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1342.0, + "completions/max_terminated_length": 1342.0, + "completions/mean_length": 416.1227722167969, + "completions/mean_terminated_length": 416.1227722167969, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.4009397024275646, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05014871433377266, + "kl": 0.02056884765625, + "learning_rate": 4.433405542493909e-06, + "loss": 0.0, + "num_tokens": 57619591.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1436.0, + "completions/max_terminated_length": 1436.0, + "completions/mean_length": 429.74554443359375, + "completions/mean_terminated_length": 429.74554443359375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.40407204385277995, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.06428268551826477, + "kl": 0.022705078125, + "learning_rate": 4.4237002453327734e-06, + "loss": 0.0012, + "num_tokens": 58082885.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1455.0, + "completions/max_terminated_length": 1455.0, + "completions/mean_length": 418.7165222167969, + "completions/mean_terminated_length": 418.7165222167969, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.4072043852779953, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.06798820197582245, + "kl": 0.02044677734375, + "learning_rate": 4.4139247230614245e-06, + "loss": 0.0079, + "num_tokens": 58533710.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1198.0, + "completions/max_terminated_length": 1198.0, + "completions/mean_length": 400.1875305175781, + "completions/mean_terminated_length": 400.1875305175781, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.41033672670321064, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004895299207419157, + "kl": 0.019805908203125, + "learning_rate": 4.404079385841201e-06, + "loss": 0.0002, + "num_tokens": 58969546.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1498.0, + "completions/max_terminated_length": 1498.0, + "completions/mean_length": 432.14288330078125, + "completions/mean_terminated_length": 432.14288330078125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.413469068128426, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.04892277717590332, + "kl": 0.019287109375, + "learning_rate": 4.394164646762734e-06, + "loss": -0.0015, + "num_tokens": 59433330.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1279.0, + "completions/max_terminated_length": 1279.0, + "completions/mean_length": 436.12725830078125, + "completions/mean_terminated_length": 436.12725830078125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.41660140955364133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004631616175174713, + "kl": 0.02020263671875, + "learning_rate": 4.384180921828618e-06, + "loss": 0.0002, + "num_tokens": 59910347.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1304.0, + "completions/max_terminated_length": 1304.0, + "completions/mean_length": 402.9107360839844, + "completions/mean_terminated_length": 402.9107360839844, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.4197337509788567, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.033406227827072144, + "kl": 0.021697998046875, + "learning_rate": 4.374128629935955e-06, + "loss": -0.0037, + "num_tokens": 60354371.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1990.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 439.8370666503906, + "completions/mean_terminated_length": 439.8370666503906, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.422866092404072, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.03381761536002159, + "kl": 0.022247314453125, + "learning_rate": 4.364008192858781e-06, + "loss": 0.0011, + "num_tokens": 60822058.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1820.0, + "completions/max_terminated_length": 1820.0, + "completions/mean_length": 422.09600830078125, + "completions/mean_terminated_length": 422.09600830078125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.42599843382928737, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.0586375817656517, + "kl": 0.020355224609375, + "learning_rate": 4.353820035230366e-06, + "loss": 0.0007, + "num_tokens": 61267833.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1452.0, + "completions/max_terminated_length": 1452.0, + "completions/mean_length": 430.9598388671875, + "completions/mean_terminated_length": 430.9598388671875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.4291307752545027, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.024929601699113846, + "kl": 0.021820068359375, + "learning_rate": 4.3435645845254e-06, + "loss": 0.0024, + "num_tokens": 61724359.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1288.0, + "completions/max_terminated_length": 1288.0, + "completions/mean_length": 444.79241943359375, + "completions/mean_terminated_length": 444.79241943359375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.43226311667971806, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.03543654829263687, + "kl": 0.022552490234375, + "learning_rate": 4.333242271042054e-06, + "loss": 0.0005, + "num_tokens": 62201802.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1507.0, + "completions/max_terminated_length": 1507.0, + "completions/mean_length": 449.638427734375, + "completions/mean_terminated_length": 449.638427734375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.43539545810493346, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004247685428708792, + "kl": 0.0167236328125, + "learning_rate": 4.32285352788393e-06, + "loss": 0.0002, + "num_tokens": 62676456.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1653.0, + "completions/max_terminated_length": 1653.0, + "completions/mean_length": 444.70538330078125, + "completions/mean_terminated_length": 444.70538330078125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.4385277995301488, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.03802114352583885, + "kl": 0.019500732421875, + "learning_rate": 4.312398790941882e-06, + "loss": 0.0003, + "num_tokens": 63147220.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1106.0, + "completions/max_terminated_length": 1106.0, + "completions/mean_length": 424.18975830078125, + "completions/mean_terminated_length": 424.18975830078125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.44166014095536416, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04238564521074295, + "kl": 0.018829345703125, + "learning_rate": 4.301878498875735e-06, + "loss": 0.0013, + "num_tokens": 63590481.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1127.0, + "completions/max_terminated_length": 1127.0, + "completions/mean_length": 435.8594055175781, + "completions/mean_terminated_length": 435.8594055175781, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.4447924823805795, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.039626821875572205, + "kl": 0.019744873046875, + "learning_rate": 4.291293093095873e-06, + "loss": -0.0006, + "num_tokens": 64036058.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1468.0, + "completions/mean_length": 463.24554443359375, + "completions/mean_terminated_length": 459.7002258300781, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.44792482380579485, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.07558800280094147, + "kl": 0.0198974609375, + "learning_rate": 4.280643017744723e-06, + "loss": 0.0063, + "num_tokens": 64518516.0, + "reward": 0.09888393431901932, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1719.0, + "completions/max_terminated_length": 1719.0, + "completions/mean_length": 432.5826110839844, + "completions/mean_terminated_length": 432.5826110839844, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.4510571652310102, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.02952486276626587, + "kl": 0.01800537109375, + "learning_rate": 4.269928719678117e-06, + "loss": -0.0002, + "num_tokens": 64977465.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1456.0, + "completions/mean_length": 444.7901916503906, + "completions/mean_terminated_length": 441.2035827636719, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.45418950665622554, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.029885146766901016, + "kl": 0.018646240234375, + "learning_rate": 4.2591506484465426e-06, + "loss": 0.009, + "num_tokens": 65450351.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1955.0, + "completions/mean_length": 405.8281555175781, + "completions/mean_terminated_length": 402.15435791015625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.4573218480814409, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.06477701663970947, + "kl": 0.016693115234375, + "learning_rate": 4.248309256276283e-06, + "loss": 0.0112, + "num_tokens": 65880754.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1250.0, + "completions/max_terminated_length": 1250.0, + "completions/mean_length": 424.8660888671875, + "completions/mean_terminated_length": 424.8660888671875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.46045418950665623, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.06679224967956543, + "kl": 0.017852783203125, + "learning_rate": 4.23740499805044e-06, + "loss": -0.0018, + "num_tokens": 66332970.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1699.0, + "completions/max_terminated_length": 1699.0, + "completions/mean_length": 442.0357360839844, + "completions/mean_terminated_length": 442.0357360839844, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.4635865309318716, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.046017590910196304, + "kl": 0.02056884765625, + "learning_rate": 4.22643833128985e-06, + "loss": 0.0005, + "num_tokens": 66801122.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1761.0, + "completions/mean_length": 424.2008972167969, + "completions/mean_terminated_length": 420.5682373046875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.4667188723570869, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.060904357582330704, + "kl": 0.01885986328125, + "learning_rate": 4.215409716133885e-06, + "loss": 0.0105, + "num_tokens": 67249236.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1336.0, + "completions/max_terminated_length": 1336.0, + "completions/mean_length": 429.33038330078125, + "completions/mean_terminated_length": 429.33038330078125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.46985121378230227, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.026197869330644608, + "kl": 0.017242431640625, + "learning_rate": 4.204319615321151e-06, + "loss": -0.0012, + "num_tokens": 67707544.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1384.0, + "completions/max_terminated_length": 1384.0, + "completions/mean_length": 444.529052734375, + "completions/mean_terminated_length": 444.529052734375, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.4729835552075176, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.05660957843065262, + "kl": 0.01904296875, + "learning_rate": 4.193168494170065e-06, + "loss": -0.0, + "num_tokens": 68170761.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1827.0, + "completions/max_terminated_length": 1827.0, + "completions/mean_length": 434.9352722167969, + "completions/mean_terminated_length": 434.9352722167969, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.47611589663273296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005580439697951078, + "kl": 0.0181884765625, + "learning_rate": 4.181956820559339e-06, + "loss": 0.0002, + "num_tokens": 68652256.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1544.0, + "completions/mean_length": 426.40850830078125, + "completions/mean_terminated_length": 419.13677978515625, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.4792482380579483, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.07825148850679398, + "kl": 0.019317626953125, + "learning_rate": 4.170685064908342e-06, + "loss": 0.0164, + "num_tokens": 69091639.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1863.0, + "completions/max_terminated_length": 1863.0, + "completions/mean_length": 444.81475830078125, + "completions/mean_terminated_length": 444.81475830078125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.48238057948316365, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.035275887697935104, + "kl": 0.018341064453125, + "learning_rate": 4.159353700157365e-06, + "loss": -0.0041, + "num_tokens": 69557636.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1492.0, + "completions/max_terminated_length": 1492.0, + "completions/mean_length": 467.5714416503906, + "completions/mean_terminated_length": 467.5714416503906, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.485512920908379, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.029416698962450027, + "kl": 0.0168914794921875, + "learning_rate": 4.14796320174778e-06, + "loss": -0.0034, + "num_tokens": 70039472.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1553.0, + "completions/max_terminated_length": 1553.0, + "completions/mean_length": 471.575927734375, + "completions/mean_terminated_length": 471.575927734375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.48864526233359434, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00416595907881856, + "kl": 0.016510009765625, + "learning_rate": 4.136514047602087e-06, + "loss": 0.0002, + "num_tokens": 70506482.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1955.0, + "completions/max_terminated_length": 1955.0, + "completions/mean_length": 451.16741943359375, + "completions/mean_terminated_length": 451.16741943359375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.4917776037588097, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.0342145673930645, + "kl": 0.019134521484375, + "learning_rate": 4.1250067181038635e-06, + "loss": 0.002, + "num_tokens": 70977613.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1630.0, + "completions/max_terminated_length": 1630.0, + "completions/mean_length": 479.9933166503906, + "completions/mean_terminated_length": 479.9933166503906, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.49490994518402504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005227183923125267, + "kl": 0.017730712890625, + "learning_rate": 4.113441696077608e-06, + "loss": 0.0002, + "num_tokens": 71459726.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1695.0, + "completions/mean_length": 495.5469055175781, + "completions/mean_terminated_length": 492.0738220214844, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.4980422866092404, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.046903930604457855, + "kl": 0.01763916015625, + "learning_rate": 4.101819466768484e-06, + "loss": 0.002, + "num_tokens": 71937311.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 467.83038330078125, + "completions/mean_terminated_length": 464.2953186035156, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.5011746280344558, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.05908460170030594, + "kl": 0.02105712890625, + "learning_rate": 4.0901405178219535e-06, + "loss": 0.0088, + "num_tokens": 72401303.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1577.0, + "completions/mean_length": 488.5625305175781, + "completions/mean_terminated_length": 485.0738220214844, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.5043069694596711, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.045315101742744446, + "kl": 0.0169677734375, + "learning_rate": 4.078405339263326e-06, + "loss": 0.0007, + "num_tokens": 72871379.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1948.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 493.0669860839844, + "completions/mean_terminated_length": 493.0669860839844, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.5074393108848865, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05223213881254196, + "kl": 0.016937255859375, + "learning_rate": 4.06661442347719e-06, + "loss": 0.0027, + "num_tokens": 73349369.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 2040.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 474.2701110839844, + "completions/mean_terminated_length": 474.2701110839844, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.5105716523101018, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005963857285678387, + "kl": 0.01861572265625, + "learning_rate": 4.054768265186758e-06, + "loss": 0.0002, + "num_tokens": 73814866.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1645.0, + "completions/mean_length": 485.99554443359375, + "completions/mean_terminated_length": 478.9910583496094, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.5137039937353172, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.044692981988191605, + "kl": 0.0167236328125, + "learning_rate": 4.0428673614331036e-06, + "loss": 0.0179, + "num_tokens": 74289444.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1815.0, + "completions/mean_length": 479.2723388671875, + "completions/mean_terminated_length": 475.76287841796875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.5168363351605325, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.06909096240997314, + "kl": 0.020660400390625, + "learning_rate": 4.030912211554316e-06, + "loss": -0.0049, + "num_tokens": 74791110.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1407.0, + "completions/mean_length": 489.1942138671875, + "completions/mean_terminated_length": 485.7069396972656, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.5199686765857479, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.057528018951416016, + "kl": 0.01971435546875, + "learning_rate": 4.018903317164539e-06, + "loss": 0.0066, + "num_tokens": 75277889.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1653.0, + "completions/max_terminated_length": 1653.0, + "completions/mean_length": 475.7388610839844, + "completions/mean_terminated_length": 475.7388610839844, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.5231010180109632, + "frac_reward_zero_std": 0.9464285969734192, + "grad_norm": 0.1031484305858612, + "kl": 0.027099609375, + "learning_rate": 4.006841182132932e-06, + "loss": 0.0005, + "num_tokens": 75765876.0, + "reward": 0.09843751043081284, + "reward_std": 0.002747634192928672, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12415824085474014, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1217.0, + "completions/max_terminated_length": 1217.0, + "completions/mean_length": 435.9531555175781, + "completions/mean_terminated_length": 435.9531555175781, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.5262333594361785, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006024480331689119, + "kl": 0.019927978515625, + "learning_rate": 3.9947263125625195e-06, + "loss": 0.0002, + "num_tokens": 76214491.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1735.0, + "completions/max_terminated_length": 1735.0, + "completions/mean_length": 452.1808166503906, + "completions/mean_terminated_length": 452.1808166503906, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.5293657008613939, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.07645807415246964, + "kl": 0.020294189453125, + "learning_rate": 3.982559216768967e-06, + "loss": 0.0017, + "num_tokens": 76682528.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1298.0, + "completions/mean_length": 469.8683166503906, + "completions/mean_terminated_length": 466.3377990722656, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.5324980422866092, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04011530801653862, + "kl": 0.01934814453125, + "learning_rate": 3.970340405259245e-06, + "loss": 0.0122, + "num_tokens": 77164905.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1763.0, + "completions/mean_length": 463.2410888671875, + "completions/mean_terminated_length": 459.69573974609375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.5356303837118246, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.06689172238111496, + "kl": 0.018585205078125, + "learning_rate": 3.958070390710214e-06, + "loss": 0.0091, + "num_tokens": 77632085.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1407.0, + "completions/max_terminated_length": 1407.0, + "completions/mean_length": 453.47100830078125, + "completions/mean_terminated_length": 453.47100830078125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.5387627251370399, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.09764935076236725, + "kl": 0.023681640625, + "learning_rate": 3.945749687947109e-06, + "loss": 0.0, + "num_tokens": 78102628.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1647.0, + "completions/max_terminated_length": 1647.0, + "completions/mean_length": 463.6919860839844, + "completions/mean_terminated_length": 463.6919860839844, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.5418950665622553, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.05786895006895065, + "kl": 0.020263671875, + "learning_rate": 3.933378813921942e-06, + "loss": 0.0, + "num_tokens": 78588054.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1865.0, + "completions/mean_length": 454.9263610839844, + "completions/mean_terminated_length": 447.78253173828125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.5450274079874706, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.05149205029010773, + "kl": 0.0198974609375, + "learning_rate": 3.920958287691811e-06, + "loss": -0.0009, + "num_tokens": 79065681.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1642.0, + "completions/max_terminated_length": 1642.0, + "completions/mean_length": 433.6651916503906, + "completions/mean_terminated_length": 433.6651916503906, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.548159749412686, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.06286544352769852, + "kl": 0.018463134765625, + "learning_rate": 3.908488630397121e-06, + "loss": 0.0025, + "num_tokens": 79513395.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1327.0, + "completions/max_terminated_length": 1327.0, + "completions/mean_length": 422.64288330078125, + "completions/mean_terminated_length": 422.64288330078125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.5512920908379013, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.055205706506967545, + "kl": 0.019012451171875, + "learning_rate": 3.8959703652397175e-06, + "loss": -0.0003, + "num_tokens": 79954775.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1709.0, + "completions/max_terminated_length": 1709.0, + "completions/mean_length": 430.5714416503906, + "completions/mean_terminated_length": 430.5714416503906, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.5544244322631167, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05063927546143532, + "kl": 0.018096923828125, + "learning_rate": 3.883404017460935e-06, + "loss": 0.0009, + "num_tokens": 80416767.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1894.0, + "completions/mean_length": 464.22991943359375, + "completions/mean_terminated_length": 457.1278381347656, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.557556773688332, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.0442616306245327, + "kl": 0.018524169921875, + "learning_rate": 3.870790114319559e-06, + "loss": 0.0038, + "num_tokens": 80874922.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1573.0, + "completions/mean_length": 428.3169860839844, + "completions/mean_terminated_length": 424.6935119628906, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.5606891151135474, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04946316406130791, + "kl": 0.018310546875, + "learning_rate": 3.858129185069701e-06, + "loss": 0.002, + "num_tokens": 81331216.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 2023.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 436.4196472167969, + "completions/mean_terminated_length": 436.4196472167969, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.5638214565387627, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.02826928347349167, + "kl": 0.01788330078125, + "learning_rate": 3.845421760938597e-06, + "loss": -0.0, + "num_tokens": 81782092.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1385.0, + "completions/mean_length": 416.3370666503906, + "completions/mean_terminated_length": 412.6867980957031, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.566953797963978, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.060109034180641174, + "kl": 0.02069091796875, + "learning_rate": 3.832668375104312e-06, + "loss": -0.0046, + "num_tokens": 82213079.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1884.0, + "completions/mean_length": 411.5513610839844, + "completions/mean_terminated_length": 407.890380859375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.5700861393891934, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.061223503202199936, + "kl": 0.019805908203125, + "learning_rate": 3.8198695626733725e-06, + "loss": 0.0014, + "num_tokens": 82657562.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1681.0, + "completions/max_terminated_length": 1681.0, + "completions/mean_length": 419.23663330078125, + "completions/mean_terminated_length": 419.23663330078125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.5732184808144087, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.024953968822956085, + "kl": 0.019195556640625, + "learning_rate": 3.8070258606583156e-06, + "loss": -0.0001, + "num_tokens": 83102644.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1580.0, + "completions/mean_length": 457.5067138671875, + "completions/mean_terminated_length": 446.7842712402344, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.5763508222396241, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.07536093890666962, + "kl": 0.019256591796875, + "learning_rate": 3.7941378079551544e-06, + "loss": 0.0189, + "num_tokens": 83586407.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1946.0, + "completions/mean_length": 436.5758972167969, + "completions/mean_terminated_length": 432.9709167480469, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.5794831636648394, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.06124957650899887, + "kl": 0.018585205078125, + "learning_rate": 3.7812059453207677e-06, + "loss": 0.008, + "num_tokens": 84043805.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1730.0, + "completions/mean_length": 432.4219055175781, + "completions/mean_terminated_length": 425.1771545410156, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.5826155050900548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0055785714648664, + "kl": 0.019317626953125, + "learning_rate": 3.768230815350213e-06, + "loss": 0.0002, + "num_tokens": 84493534.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1562.0, + "completions/mean_length": 465.8348388671875, + "completions/mean_terminated_length": 451.5810852050781, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.5857478465152701, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04979455843567848, + "kl": 0.020751953125, + "learning_rate": 3.7552129624539557e-06, + "loss": 0.0026, + "num_tokens": 84971672.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1816.0, + "completions/max_terminated_length": 1816.0, + "completions/mean_length": 450.6473388671875, + "completions/mean_terminated_length": 450.6473388671875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.5888801879404855, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.08149141073226929, + "kl": 0.02978515625, + "learning_rate": 3.7421529328350316e-06, + "loss": 0.0034, + "num_tokens": 85424050.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1740.0, + "completions/mean_length": 457.09600830078125, + "completions/mean_terminated_length": 453.53692626953125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.5920125293657008, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.0363803468644619, + "kl": 0.01861572265625, + "learning_rate": 3.7290512744661274e-06, + "loss": 0.001, + "num_tokens": 85881473.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1649.0, + "completions/max_terminated_length": 1649.0, + "completions/mean_length": 444.8638610839844, + "completions/mean_terminated_length": 444.8638610839844, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.5951448707909162, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.0563562847673893, + "kl": 0.019805908203125, + "learning_rate": 3.715908537066589e-06, + "loss": 0.0003, + "num_tokens": 86336596.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1747.0, + "completions/mean_length": 459.7879638671875, + "completions/mean_terminated_length": 449.0809020996094, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.5982772122161315, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.07267095893621445, + "kl": 0.01983642578125, + "learning_rate": 3.7027252720793538e-06, + "loss": 0.0092, + "num_tokens": 86794781.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1922.0, + "completions/max_terminated_length": 1922.0, + "completions/mean_length": 432.91741943359375, + "completions/mean_terminated_length": 432.91741943359375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.601409553641347, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00758605869486928, + "kl": 0.019561767578125, + "learning_rate": 3.689502032647817e-06, + "loss": 0.0002, + "num_tokens": 87246740.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1746.0, + "completions/mean_length": 455.49554443359375, + "completions/mean_terminated_length": 444.7595520019531, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.6045418950665623, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.06063640117645264, + "kl": 0.02606201171875, + "learning_rate": 3.6762393735926245e-06, + "loss": -0.0047, + "num_tokens": 87717102.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1284.0, + "completions/mean_length": 431.96429443359375, + "completions/mean_terminated_length": 428.3489990234375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.6076742364917777, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.07532497495412827, + "kl": 0.019561767578125, + "learning_rate": 3.6629378513883852e-06, + "loss": 0.012, + "num_tokens": 88162174.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349845170975, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1535.0, + "completions/mean_length": 475.0312805175781, + "completions/mean_terminated_length": 467.97760009765625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.610806577916993, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.023675817996263504, + "kl": 0.0189208984375, + "learning_rate": 3.6495980241403307e-06, + "loss": -0.0001, + "num_tokens": 88636128.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1371.0, + "completions/mean_length": 461.94866943359375, + "completions/mean_terminated_length": 454.8363342285156, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.6139389193422083, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.07393117249011993, + "kl": 0.01824951171875, + "learning_rate": 3.636220451560896e-06, + "loss": 0.0161, + "num_tokens": 89108929.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349845170975, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1992.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 455.7232360839844, + "completions/mean_terminated_length": 455.7232360839844, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.6170712607674237, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.035437002778053284, + "kl": 0.01934814453125, + "learning_rate": 3.622805694946235e-06, + "loss": -0.0001, + "num_tokens": 89565757.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1896.0, + "completions/max_terminated_length": 1896.0, + "completions/mean_length": 458.08929443359375, + "completions/mean_terminated_length": 458.08929443359375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.620203602192639, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04889894649386406, + "kl": 0.017913818359375, + "learning_rate": 3.609354317152667e-06, + "loss": 0.0013, + "num_tokens": 90026021.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1415.0, + "completions/mean_length": 457.75225830078125, + "completions/mean_terminated_length": 454.19464111328125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.6233359436178544, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010155444033443928, + "kl": 0.020660400390625, + "learning_rate": 3.595866882573063e-06, + "loss": 0.0002, + "num_tokens": 90484742.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1497.0, + "completions/max_terminated_length": 1497.0, + "completions/mean_length": 455.13616943359375, + "completions/mean_terminated_length": 455.13616943359375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.6264682850430697, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.042124487459659576, + "kl": 0.019500732421875, + "learning_rate": 3.5823439571131675e-06, + "loss": 0.001, + "num_tokens": 90942483.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1739.0, + "completions/mean_length": 461.93975830078125, + "completions/mean_terminated_length": 458.3915100097656, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.6296006264682851, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.05955704674124718, + "kl": 0.02032470703125, + "learning_rate": 3.5687861081678477e-06, + "loss": 0.0103, + "num_tokens": 91410772.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1552.0, + "completions/mean_length": 462.01788330078125, + "completions/mean_terminated_length": 458.46978759765625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.6327329678935004, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.08808083087205887, + "kl": 0.020111083984375, + "learning_rate": 3.555193904597291e-06, + "loss": 0.0174, + "num_tokens": 91892716.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 2044.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 435.52679443359375, + "completions/mean_terminated_length": 435.52679443359375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.6358653093187158, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.03748089447617531, + "kl": 0.019012451171875, + "learning_rate": 3.541567916703138e-06, + "loss": 0.0037, + "num_tokens": 92336104.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1499.0, + "completions/max_terminated_length": 1499.0, + "completions/mean_length": 454.2388610839844, + "completions/mean_terminated_length": 454.2388610839844, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.6389976507439311, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04406944289803505, + "kl": 0.023773193359375, + "learning_rate": 3.5279087162045517e-06, + "loss": 0.0003, + "num_tokens": 92794711.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1629.0, + "completions/max_terminated_length": 1629.0, + "completions/mean_length": 427.97991943359375, + "completions/mean_terminated_length": 427.97991943359375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.6421299921691465, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.06538143008947372, + "kl": 0.020416259765625, + "learning_rate": 3.5142168762142265e-06, + "loss": 0.0022, + "num_tokens": 93234970.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1121.0, + "completions/max_terminated_length": 1121.0, + "completions/mean_length": 419.4062805175781, + "completions/mean_terminated_length": 419.4062805175781, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.6452623335943618, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.06724836677312851, + "kl": 0.02490234375, + "learning_rate": 3.500492971214347e-06, + "loss": -0.0024, + "num_tokens": 93680184.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1867.0, + "completions/mean_length": 461.294677734375, + "completions/mean_terminated_length": 457.7449645996094, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.6483946750195771, + "frac_reward_zero_std": 0.9285714626312256, + "grad_norm": 0.1256112903356552, + "kl": 0.0238037109375, + "learning_rate": 3.48673757703248e-06, + "loss": 0.0233, + "num_tokens": 94152748.0, + "reward": 0.098214291036129, + "reward_std": 0.0035714288242161274, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9821428656578064, + "rewards/format_reward/std": 0.13258016109466553, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1938.0, + "completions/mean_length": 458.2433166503906, + "completions/mean_terminated_length": 454.6867980957031, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.6515270164447925, + "frac_reward_zero_std": 0.9285714626312256, + "grad_norm": 0.11373941600322723, + "kl": 0.020263671875, + "learning_rate": 3.472951270817418e-06, + "loss": 0.0258, + "num_tokens": 94625329.0, + "reward": 0.098214291036129, + "reward_std": 0.0035714288242161274, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9821428656578064, + "rewards/format_reward/std": 0.13258016109466553, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1575.0, + "completions/mean_length": 444.982177734375, + "completions/mean_terminated_length": 441.3959655761719, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.6546593578700078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007019388489425182, + "kl": 0.020660400390625, + "learning_rate": 3.4591346310149578e-06, + "loss": 0.0002, + "num_tokens": 95101317.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1100.0, + "completions/max_terminated_length": 1100.0, + "completions/mean_length": 428.8817138671875, + "completions/mean_terminated_length": 428.8817138671875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.6577916992952232, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0058451443910598755, + "kl": 0.0191650390625, + "learning_rate": 3.445288237343632e-06, + "loss": 0.0002, + "num_tokens": 95547108.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1397.0, + "completions/mean_length": 403.9776916503906, + "completions/mean_terminated_length": 400.2997741699219, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.6609240407204385, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.0821026861667633, + "kl": 0.027679443359375, + "learning_rate": 3.4314126707703895e-06, + "loss": 0.0001, + "num_tokens": 95973198.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1607.0, + "completions/mean_length": 429.95538330078125, + "completions/mean_terminated_length": 422.6995849609375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.6640563821456539, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.08282407373189926, + "kl": 0.022308349609375, + "learning_rate": 3.4175085134862128e-06, + "loss": 0.0092, + "num_tokens": 96425170.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1345.0, + "completions/max_terminated_length": 1345.0, + "completions/mean_length": 441.01116943359375, + "completions/mean_terminated_length": 441.01116943359375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.6671887235708692, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.06287416070699692, + "kl": 0.020721435546875, + "learning_rate": 3.4035763488816953e-06, + "loss": -0.0024, + "num_tokens": 96882499.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1645.0, + "completions/max_terminated_length": 1645.0, + "completions/mean_length": 431.3594055175781, + "completions/mean_terminated_length": 431.3594055175781, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.6703210649960846, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.053418248891830444, + "kl": 0.023468017578125, + "learning_rate": 3.3896167615225594e-06, + "loss": 0.0036, + "num_tokens": 97337316.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1743.0, + "completions/mean_length": 411.4442138671875, + "completions/mean_terminated_length": 407.7829895019531, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.6734534064212999, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.08320935815572739, + "kl": 0.023681640625, + "learning_rate": 3.375630337125133e-06, + "loss": 0.0083, + "num_tokens": 97786963.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1463.0, + "completions/mean_length": 445.0067138671875, + "completions/mean_terminated_length": 441.42059326171875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.6765857478465153, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.06361282616853714, + "kl": 0.02569580078125, + "learning_rate": 3.361617662531772e-06, + "loss": 0.0015, + "num_tokens": 98246590.0, + "reward": 0.09910715371370316, + "reward_std": 0.0014083485584706068, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1777.0, + "completions/mean_length": 427.5848388671875, + "completions/mean_terminated_length": 423.9597473144531, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.6797180892717306, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.08153685182332993, + "kl": 0.0228271484375, + "learning_rate": 3.347579325686237e-06, + "loss": 0.0086, + "num_tokens": 98703436.0, + "reward": 0.09866072982549667, + "reward_std": 0.0023012058809399605, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9866071343421936, + "rewards/format_reward/std": 0.11507843434810638, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1743.0, + "completions/max_terminated_length": 1743.0, + "completions/mean_length": 441.7723388671875, + "completions/mean_terminated_length": 441.7723388671875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.682850430696946, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.06435585021972656, + "kl": 0.0235595703125, + "learning_rate": 3.333515915609027e-06, + "loss": 0.0011, + "num_tokens": 99160286.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1511.0, + "completions/mean_length": 448.700927734375, + "completions/mean_terminated_length": 441.5291748046875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.6859827721221613, + "frac_reward_zero_std": 0.910714328289032, + "grad_norm": 0.11906581372022629, + "kl": 0.023712158203125, + "learning_rate": 3.3194280223726616e-06, + "loss": 0.0233, + "num_tokens": 99614584.0, + "reward": 0.09776786714792252, + "reward_std": 0.004464285913854837, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9776785969734192, + "rewards/format_reward/std": 0.1478918492794037, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1427.0, + "completions/max_terminated_length": 1427.0, + "completions/mean_length": 416.8102722167969, + "completions/mean_terminated_length": 416.8102722167969, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.6891151135473766, + "frac_reward_zero_std": 0.9196429252624512, + "grad_norm": 0.11102369427680969, + "kl": 0.0250244140625, + "learning_rate": 3.305316237076927e-06, + "loss": 0.0118, + "num_tokens": 100060907.0, + "reward": 0.09776786714792252, + "reward_std": 0.004086920525878668, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9776785969734192, + "rewards/format_reward/std": 0.1478918492794037, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1860.0, + "completions/max_terminated_length": 1860.0, + "completions/mean_length": 406.8750305175781, + "completions/mean_terminated_length": 406.8750305175781, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.692247454972592, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04550117254257202, + "kl": 0.021881103515625, + "learning_rate": 3.291181151824071e-06, + "loss": 0.0005, + "num_tokens": 100497375.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1296.0, + "completions/mean_length": 423.2544860839844, + "completions/mean_terminated_length": 419.61968994140625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.6953797963978073, + "frac_reward_zero_std": 0.9375000596046448, + "grad_norm": 0.09862664341926575, + "kl": 0.024871826171875, + "learning_rate": 3.27702335969396e-06, + "loss": 0.0111, + "num_tokens": 100950377.0, + "reward": 0.098214291036129, + "reward_std": 0.0031940629705786705, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9821428656578064, + "rewards/format_reward/std": 0.13258016109466553, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1809.0, + "completions/mean_length": 440.33929443359375, + "completions/mean_terminated_length": 436.74273681640625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.6985121378230227, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.07386577129364014, + "kl": 0.022613525390625, + "learning_rate": 3.2628434547191985e-06, + "loss": 0.0053, + "num_tokens": 101405929.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1356.0, + "completions/max_terminated_length": 1356.0, + "completions/mean_length": 406.5758972167969, + "completions/mean_terminated_length": 406.5758972167969, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.701644479248238, + "frac_reward_zero_std": 0.9375000596046448, + "grad_norm": 0.09703066200017929, + "kl": 0.024566650390625, + "learning_rate": 3.2486420318601973e-06, + "loss": 0.0054, + "num_tokens": 101856723.0, + "reward": 0.098214291036129, + "reward_std": 0.0031940629705786705, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9821428656578064, + "rewards/format_reward/std": 0.13258016109466553, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1206.0, + "completions/max_terminated_length": 1206.0, + "completions/mean_length": 427.4754638671875, + "completions/mean_terminated_length": 427.4754638671875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.7047768206734534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006238581147044897, + "kl": 0.020965576171875, + "learning_rate": 3.2344196869802187e-06, + "loss": 0.0002, + "num_tokens": 102329812.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1432.0, + "completions/max_terminated_length": 1432.0, + "completions/mean_length": 443.91741943359375, + "completions/mean_terminated_length": 443.91741943359375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.7079091620986687, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.029934454709291458, + "kl": 0.021331787109375, + "learning_rate": 3.2201770168203694e-06, + "loss": 0.0002, + "num_tokens": 102791863.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1830.0, + "completions/max_terminated_length": 1830.0, + "completions/mean_length": 430.18304443359375, + "completions/mean_terminated_length": 430.18304443359375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.7110415035238841, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.03165459632873535, + "kl": 0.021728515625, + "learning_rate": 3.205914618974563e-06, + "loss": 0.001, + "num_tokens": 103242145.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1403.0, + "completions/max_terminated_length": 1403.0, + "completions/mean_length": 423.1875305175781, + "completions/mean_terminated_length": 423.1875305175781, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.7141738449490994, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.08442176133394241, + "kl": 0.02716064453125, + "learning_rate": 3.1916330918644496e-06, + "loss": 0.0014, + "num_tokens": 103697889.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1519.0, + "completions/max_terminated_length": 1519.0, + "completions/mean_length": 418.0000305175781, + "completions/mean_terminated_length": 418.0000305175781, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.7173061863743148, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05640263110399246, + "kl": 0.022369384765625, + "learning_rate": 3.177333034714303e-06, + "loss": -0.003, + "num_tokens": 104151233.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1411.0, + "completions/max_terminated_length": 1411.0, + "completions/mean_length": 443.0089416503906, + "completions/mean_terminated_length": 443.0089416503906, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.7204385277995301, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.04739142209291458, + "kl": 0.02618408203125, + "learning_rate": 3.1630150475258813e-06, + "loss": -0.0002, + "num_tokens": 104621633.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1348.0, + "completions/mean_length": 446.7187805175781, + "completions/mean_terminated_length": 443.136474609375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.7235708692247454, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.07269170880317688, + "kl": 0.027984619140625, + "learning_rate": 3.148679731053252e-06, + "loss": 0.0067, + "num_tokens": 105104147.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1542.0, + "completions/max_terminated_length": 1542.0, + "completions/mean_length": 431.7232360839844, + "completions/mean_terminated_length": 431.7232360839844, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.7267032106499608, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.034618914127349854, + "kl": 0.027130126953125, + "learning_rate": 3.1343276867775805e-06, + "loss": 0.0011, + "num_tokens": 105558851.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1484.0, + "completions/max_terminated_length": 1484.0, + "completions/mean_length": 445.5848388671875, + "completions/mean_terminated_length": 445.5848388671875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.7298355520751761, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.06168297678232193, + "kl": 0.023773193359375, + "learning_rate": 3.1199595168819043e-06, + "loss": 0.0091, + "num_tokens": 106029569.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1583.0, + "completions/max_terminated_length": 1583.0, + "completions/mean_length": 443.67413330078125, + "completions/mean_terminated_length": 443.67413330078125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.7329678935003915, + "frac_reward_zero_std": 0.9464285969734192, + "grad_norm": 0.08309945464134216, + "kl": 0.02178955078125, + "learning_rate": 3.105575824225852e-06, + "loss": 0.0087, + "num_tokens": 106481083.0, + "reward": 0.09866072237491608, + "reward_std": 0.0026785717345774174, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9866071343421936, + "rewards/format_reward/std": 0.11507843434810638, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1423.0, + "completions/max_terminated_length": 1423.0, + "completions/mean_length": 447.7969055175781, + "completions/mean_terminated_length": 447.7969055175781, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.7361002349256069, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.1024441048502922, + "kl": 0.026641845703125, + "learning_rate": 3.091177212320363e-06, + "loss": 0.0123, + "num_tokens": 106938308.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1659.0, + "completions/max_terminated_length": 1659.0, + "completions/mean_length": 462.3906555175781, + "completions/mean_terminated_length": 462.3906555175781, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.7392325763508223, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.04785303771495819, + "kl": 0.023162841796875, + "learning_rate": 3.0767642853023538e-06, + "loss": 0.004, + "num_tokens": 107404983.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1454.0, + "completions/max_terminated_length": 1454.0, + "completions/mean_length": 439.9219055175781, + "completions/mean_terminated_length": 439.9219055175781, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.7423649177760376, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.0405932292342186, + "kl": 0.025421142578125, + "learning_rate": 3.062337647909376e-06, + "loss": 0.0004, + "num_tokens": 107865456.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1397.0, + "completions/max_terminated_length": 1397.0, + "completions/mean_length": 455.6339416503906, + "completions/mean_terminated_length": 455.6339416503906, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.745497259201253, + "frac_reward_zero_std": 0.9464285969734192, + "grad_norm": 0.08389384299516678, + "kl": 0.0250244140625, + "learning_rate": 3.04789790545424e-06, + "loss": 0.0098, + "num_tokens": 108329392.0, + "reward": 0.09866072237491608, + "reward_std": 0.0026785717345774174, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9866071343421936, + "rewards/format_reward/std": 0.11507843434810638, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1771.0, + "completions/max_terminated_length": 1771.0, + "completions/mean_length": 448.8035888671875, + "completions/mean_terminated_length": 448.8035888671875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.7486296006264683, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05844506993889809, + "kl": 0.023773193359375, + "learning_rate": 3.033445663799621e-06, + "loss": 0.0079, + "num_tokens": 108793836.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1762.0, + "completions/mean_length": 460.6875305175781, + "completions/mean_terminated_length": 457.136474609375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.7517619420516837, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04849734157323837, + "kl": 0.021270751953125, + "learning_rate": 3.018981529332633e-06, + "loss": 0.0044, + "num_tokens": 109260756.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1438.0, + "completions/max_terminated_length": 1438.0, + "completions/mean_length": 444.5937805175781, + "completions/mean_terminated_length": 444.5937805175781, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.754894283476899, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.03499262034893036, + "kl": 0.022125244140625, + "learning_rate": 3.00450610893939e-06, + "loss": 0.0038, + "num_tokens": 109738318.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1485.0, + "completions/mean_length": 430.6250305175781, + "completions/mean_terminated_length": 427.0067138671875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.7580266249021144, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.06651164591312408, + "kl": 0.021942138671875, + "learning_rate": 2.9900200099795396e-06, + "loss": 0.0078, + "num_tokens": 110192098.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1463.0, + "completions/max_terminated_length": 1463.0, + "completions/mean_length": 421.08929443359375, + "completions/mean_terminated_length": 421.08929443359375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.7611589663273297, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05733862519264221, + "kl": 0.0235595703125, + "learning_rate": 2.9755238402607826e-06, + "loss": 0.0059, + "num_tokens": 110641770.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1468.0, + "completions/max_terminated_length": 1468.0, + "completions/mean_length": 410.3169860839844, + "completions/mean_terminated_length": 410.3169860839844, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.7642913077525451, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04911844804883003, + "kl": 0.0260009765625, + "learning_rate": 2.961018208013367e-06, + "loss": 0.0023, + "num_tokens": 111079484.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1232.0, + "completions/mean_length": 421.38616943359375, + "completions/mean_terminated_length": 417.7471923828125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.7674236491777604, + "frac_reward_zero_std": 0.9375000596046448, + "grad_norm": 0.09095726907253265, + "kl": 0.025390625, + "learning_rate": 2.9465037218645694e-06, + "loss": 0.0145, + "num_tokens": 111520617.0, + "reward": 0.09843750298023224, + "reward_std": 0.0031250000465661287, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12415824085474014, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1541.0, + "completions/mean_length": 425.9598388671875, + "completions/mean_terminated_length": 422.3310852050781, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.7705559906029757, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.06249842047691345, + "kl": 0.0228271484375, + "learning_rate": 2.9319809908131604e-06, + "loss": 0.0022, + "num_tokens": 111987563.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1387.0, + "completions/mean_length": 432.3370666503906, + "completions/mean_terminated_length": 421.4449462890625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.7736883320281911, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.050829462707042694, + "kl": 0.022216796875, + "learning_rate": 2.917450624203847e-06, + "loss": 0.0183, + "num_tokens": 112439314.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1364.0, + "completions/max_terminated_length": 1364.0, + "completions/mean_length": 399.1942138671875, + "completions/mean_terminated_length": 399.1942138671875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.7768206734534064, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.07356563955545425, + "kl": 0.020843505859375, + "learning_rate": 2.9029132317017118e-06, + "loss": 0.0064, + "num_tokens": 112884445.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1795.0, + "completions/max_terminated_length": 1795.0, + "completions/mean_length": 384.3683166503906, + "completions/mean_terminated_length": 384.3683166503906, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.7799530148786218, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04501449689269066, + "kl": 0.022918701171875, + "learning_rate": 2.888369423266629e-06, + "loss": -0.001, + "num_tokens": 113309646.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1603.0, + "completions/mean_length": 399.5781555175781, + "completions/mean_terminated_length": 395.890380859375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.7830853563038371, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.05968547239899635, + "kl": 0.022918701171875, + "learning_rate": 2.8738198091276712e-06, + "loss": 0.0121, + "num_tokens": 113762245.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1657.0, + "completions/max_terminated_length": 1657.0, + "completions/mean_length": 405.6451110839844, + "completions/mean_terminated_length": 405.6451110839844, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.7862176977290525, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006970382761210203, + "kl": 0.02166748046875, + "learning_rate": 2.859264999757509e-06, + "loss": 0.0002, + "num_tokens": 114217766.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1130.0, + "completions/mean_length": 410.8750305175781, + "completions/mean_terminated_length": 407.2125244140625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.7893500391542678, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.05900103971362114, + "kl": 0.023590087890625, + "learning_rate": 2.8447056058467928e-06, + "loss": 0.0112, + "num_tokens": 114663646.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1366.0, + "completions/mean_length": 374.7567138671875, + "completions/mean_terminated_length": 371.013427734375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.7924823805794832, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.039162665605545044, + "kl": 0.0228271484375, + "learning_rate": 2.830142238278531e-06, + "loss": 0.0013, + "num_tokens": 115087633.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1354.0, + "completions/max_terminated_length": 1354.0, + "completions/mean_length": 400.8281555175781, + "completions/mean_terminated_length": 400.8281555175781, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.7956147220046985, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006653350777924061, + "kl": 0.02276611328125, + "learning_rate": 2.81557550810246e-06, + "loss": 0.0002, + "num_tokens": 115553992.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1918.0, + "completions/mean_length": 377.7790222167969, + "completions/mean_terminated_length": 374.0425109863281, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.7987470634299139, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.053671929985284805, + "kl": 0.024017333984375, + "learning_rate": 2.8010060265094026e-06, + "loss": 0.0011, + "num_tokens": 115979905.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1358.0, + "completions/max_terminated_length": 1358.0, + "completions/mean_length": 394.5848388671875, + "completions/mean_terminated_length": 394.5848388671875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.8018794048551292, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006365491542965174, + "kl": 0.022216796875, + "learning_rate": 2.786434404805629e-06, + "loss": 0.0002, + "num_tokens": 116434471.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1748.0, + "completions/max_terminated_length": 1748.0, + "completions/mean_length": 390.3995666503906, + "completions/mean_terminated_length": 390.3995666503906, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.8050117462803446, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.03964247927069664, + "kl": 0.02349853515625, + "learning_rate": 2.771861254387199e-06, + "loss": 0.0036, + "num_tokens": 116883786.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1819.0, + "completions/max_terminated_length": 1819.0, + "completions/mean_length": 434.7857360839844, + "completions/mean_terminated_length": 434.7857360839844, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.8081440877055599, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0074180918745696545, + "kl": 0.020904541015625, + "learning_rate": 2.7572871867143204e-06, + "loss": 0.0002, + "num_tokens": 117352422.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1991.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 418.0915222167969, + "completions/mean_terminated_length": 418.0915222167969, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.8112764291307752, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.057186927646398544, + "kl": 0.023193359375, + "learning_rate": 2.742712813285681e-06, + "loss": 0.0048, + "num_tokens": 117792655.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1661.0, + "completions/mean_length": 446.1942138671875, + "completions/mean_terminated_length": 442.6107482910156, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.8144087705559906, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.04969083145260811, + "kl": 0.021240234375, + "learning_rate": 2.7281387456128017e-06, + "loss": 0.0079, + "num_tokens": 118252002.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1877.0, + "completions/mean_length": 444.966552734375, + "completions/mean_terminated_length": 437.7780456542969, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.8175411119812059, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05854301527142525, + "kl": 0.02044677734375, + "learning_rate": 2.7135655951943716e-06, + "loss": 0.0094, + "num_tokens": 118720659.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1932.0, + "completions/max_terminated_length": 1932.0, + "completions/mean_length": 460.7344055175781, + "completions/mean_terminated_length": 460.7344055175781, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.8206734534064213, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.03274833410978317, + "kl": 0.02215576171875, + "learning_rate": 2.698993973490598e-06, + "loss": -0.0004, + "num_tokens": 119192240.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1804.0, + "completions/mean_length": 468.075927734375, + "completions/mean_terminated_length": 464.5413818359375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.8238057948316366, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.08115614950656891, + "kl": 0.025299072265625, + "learning_rate": 2.6844244918975416e-06, + "loss": 0.0117, + "num_tokens": 119687098.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1677.0, + "completions/max_terminated_length": 1677.0, + "completions/mean_length": 436.9888610839844, + "completions/mean_terminated_length": 436.9888610839844, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.826938136256852, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.07361668348312378, + "kl": 0.023895263671875, + "learning_rate": 2.66985776172147e-06, + "loss": 0.0126, + "num_tokens": 120148905.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1869.0, + "completions/max_terminated_length": 1869.0, + "completions/mean_length": 420.0870666503906, + "completions/mean_terminated_length": 420.0870666503906, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.8300704776820673, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.06226085126399994, + "kl": 0.021820068359375, + "learning_rate": 2.6552943941532088e-06, + "loss": 0.0061, + "num_tokens": 120595856.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1416.0, + "completions/max_terminated_length": 1416.0, + "completions/mean_length": 454.18975830078125, + "completions/mean_terminated_length": 454.18975830078125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.8332028191072827, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.0545564629137516, + "kl": 0.020721435546875, + "learning_rate": 2.6407350002424927e-06, + "loss": 0.0032, + "num_tokens": 121059001.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1982.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 431.40850830078125, + "completions/mean_terminated_length": 431.40850830078125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.836335160532498, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006274199113249779, + "kl": 0.022247314453125, + "learning_rate": 2.626180190872329e-06, + "loss": 0.0002, + "num_tokens": 121503384.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 2046.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 455.52679443359375, + "completions/mean_terminated_length": 455.52679443359375, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.8394675019577134, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.06729487329721451, + "kl": 0.02276611328125, + "learning_rate": 2.611630576733372e-06, + "loss": 0.0129, + "num_tokens": 121984632.0, + "reward": 0.0993303582072258, + "reward_std": 0.0009619199554435909, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1789.0, + "completions/mean_length": 416.9732360839844, + "completions/mean_terminated_length": 413.32440185546875, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.8425998433829287, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.02787000499665737, + "kl": 0.021240234375, + "learning_rate": 2.5970867682982885e-06, + "loss": 0.0107, + "num_tokens": 122415752.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1442.0, + "completions/mean_length": 436.3571472167969, + "completions/mean_terminated_length": 429.13006591796875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.845732184808144, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.08698167651891708, + "kl": 0.0216064453125, + "learning_rate": 2.582549375796154e-06, + "loss": 0.021, + "num_tokens": 122861704.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1617.0, + "completions/max_terminated_length": 1617.0, + "completions/mean_length": 427.7388610839844, + "completions/mean_terminated_length": 427.7388610839844, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.8488645262333594, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.07174947112798691, + "kl": 0.0216064453125, + "learning_rate": 2.568019009186841e-06, + "loss": 0.0053, + "num_tokens": 123313555.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1501.0, + "completions/mean_length": 412.3415222167969, + "completions/mean_terminated_length": 408.6823425292969, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.8519968676585747, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.06292321532964706, + "kl": 0.02337646484375, + "learning_rate": 2.5534962781354317e-06, + "loss": 0.0099, + "num_tokens": 123754768.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1286.0, + "completions/max_terminated_length": 1286.0, + "completions/mean_length": 420.5067138671875, + "completions/mean_terminated_length": 420.5067138671875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.8551292090837901, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05442672222852707, + "kl": 0.022003173828125, + "learning_rate": 2.538981791986634e-06, + "loss": 0.0006, + "num_tokens": 124204003.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1477.0, + "completions/mean_length": 441.4442138671875, + "completions/mean_terminated_length": 437.8501281738281, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.8582615505090054, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.040866632014513016, + "kl": 0.0224609375, + "learning_rate": 2.524476159739218e-06, + "loss": 0.0093, + "num_tokens": 124669806.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1620.0, + "completions/max_terminated_length": 1620.0, + "completions/mean_length": 425.5201110839844, + "completions/mean_terminated_length": 425.5201110839844, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.8613938919342208, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.03935139626264572, + "kl": 0.021148681640625, + "learning_rate": 2.5099799900204607e-06, + "loss": 0.0009, + "num_tokens": 125133043.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1446.0, + "completions/mean_length": 418.55804443359375, + "completions/mean_terminated_length": 411.2511291503906, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.8645262333594361, + "frac_reward_zero_std": 0.9464285969734192, + "grad_norm": 0.09718144685029984, + "kl": 0.025390625, + "learning_rate": 2.4954938910606108e-06, + "loss": 0.0171, + "num_tokens": 125590697.0, + "reward": 0.09866072982549667, + "reward_std": 0.0026785717345774174, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9866071343421936, + "rewards/format_reward/std": 0.11507844179868698, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1471.0, + "completions/max_terminated_length": 1471.0, + "completions/mean_length": 405.6250305175781, + "completions/mean_terminated_length": 405.6250305175781, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.8676585747846516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007152861915528774, + "kl": 0.023834228515625, + "learning_rate": 2.481018470667368e-06, + "loss": 0.0002, + "num_tokens": 126031093.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1540.0, + "completions/mean_length": 420.04913330078125, + "completions/mean_terminated_length": 416.40716552734375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.8707909162098669, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.035602033138275146, + "kl": 0.025604248046875, + "learning_rate": 2.4665543362003802e-06, + "loss": 0.0013, + "num_tokens": 126486139.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1841.0, + "completions/max_terminated_length": 1841.0, + "completions/mean_length": 454.685302734375, + "completions/mean_terminated_length": 454.685302734375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.8739232576350823, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.060265325009822845, + "kl": 0.023223876953125, + "learning_rate": 2.4521020945457615e-06, + "loss": 0.003, + "num_tokens": 126961978.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1679.0, + "completions/max_terminated_length": 1679.0, + "completions/mean_length": 438.7901916503906, + "completions/mean_terminated_length": 438.7901916503906, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.8770555990602976, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.0392940528690815, + "kl": 0.028411865234375, + "learning_rate": 2.4376623520906255e-06, + "loss": -0.0003, + "num_tokens": 127428708.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1992.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 429.21429443359375, + "completions/mean_terminated_length": 429.21429443359375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.880187940485513, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.059850458055734634, + "kl": 0.02679443359375, + "learning_rate": 2.4232357146976478e-06, + "loss": 0.0111, + "num_tokens": 127876580.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1323.0, + "completions/mean_length": 447.3660888671875, + "completions/mean_terminated_length": 443.7852478027344, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.8833202819107283, + "frac_reward_zero_std": 0.9464285969734192, + "grad_norm": 0.08826036006212234, + "kl": 0.027618408203125, + "learning_rate": 2.408822787679637e-06, + "loss": 0.0175, + "num_tokens": 128355888.0, + "reward": 0.09866072237491608, + "reward_std": 0.0026785717345774174, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9866071343421936, + "rewards/format_reward/std": 0.11507844179868698, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1817.0, + "completions/mean_length": 441.1495666503906, + "completions/mean_terminated_length": 437.5548095703125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.8864526233359437, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.0543365515768528, + "kl": 0.022796630859375, + "learning_rate": 2.3944241757741475e-06, + "loss": 0.0084, + "num_tokens": 128807095.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 468.55804443359375, + "completions/mean_terminated_length": 461.4753723144531, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.889584964761159, + "frac_reward_zero_std": 0.9375000596046448, + "grad_norm": 0.10394247621297836, + "kl": 0.022979736328125, + "learning_rate": 2.380040483118097e-06, + "loss": 0.0244, + "num_tokens": 129285857.0, + "reward": 0.09843751043081284, + "reward_std": 0.0031250000465661287, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12415824085474014, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1341.0, + "completions/max_terminated_length": 1341.0, + "completions/mean_length": 438.06475830078125, + "completions/mean_terminated_length": 438.06475830078125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.8927173061863743, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006505799014121294, + "kl": 0.02239990234375, + "learning_rate": 2.365672313222419e-06, + "loss": 0.0002, + "num_tokens": 129736330.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1988.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 454.4129638671875, + "completions/mean_terminated_length": 454.4129638671875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.8958496476115897, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04053695872426033, + "kl": 0.02239990234375, + "learning_rate": 2.351320268946749e-06, + "loss": 0.0034, + "num_tokens": 130217807.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1711.0, + "completions/max_terminated_length": 1711.0, + "completions/mean_length": 481.5312805175781, + "completions/mean_terminated_length": 481.5312805175781, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.898981989036805, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.053991250693798065, + "kl": 0.02142333984375, + "learning_rate": 2.336984952474119e-06, + "loss": 0.0022, + "num_tokens": 130701957.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1277.0, + "completions/max_terminated_length": 1277.0, + "completions/mean_length": 427.02679443359375, + "completions/mean_terminated_length": 427.02679443359375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.9021143304620204, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.07717164605855942, + "kl": 0.023101806640625, + "learning_rate": 2.322666965285697e-06, + "loss": 0.0067, + "num_tokens": 131145185.0, + "reward": 0.09888393431901932, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1917.0, + "completions/mean_length": 414.7321472167969, + "completions/mean_terminated_length": 411.07830810546875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.9052466718872357, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.0475928969681263, + "kl": 0.021453857421875, + "learning_rate": 2.3083669081355507e-06, + "loss": 0.0082, + "num_tokens": 131581817.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1880.0, + "completions/max_terminated_length": 1880.0, + "completions/mean_length": 433.8683166503906, + "completions/mean_terminated_length": 433.8683166503906, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.9083790133124511, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006341110914945602, + "kl": 0.023681640625, + "learning_rate": 2.2940853810254377e-06, + "loss": 0.0002, + "num_tokens": 132037002.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 2003.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 429.7745666503906, + "completions/mean_terminated_length": 429.7745666503906, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.9115113547376664, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.09808649867773056, + "kl": 0.02294921875, + "learning_rate": 2.2798229831796313e-06, + "loss": 0.0102, + "num_tokens": 132479889.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1437.0, + "completions/mean_length": 449.7723388671875, + "completions/mean_terminated_length": 446.1968688964844, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.9146436961628818, + "frac_reward_zero_std": 0.9464285969734192, + "grad_norm": 0.08777365833520889, + "kl": 0.0245361328125, + "learning_rate": 2.2655803130197816e-06, + "loss": 0.0154, + "num_tokens": 132947191.0, + "reward": 0.09866072237491608, + "reward_std": 0.0026785717345774174, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9866071343421936, + "rewards/format_reward/std": 0.11507844179868698, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1950.0, + "completions/max_terminated_length": 1950.0, + "completions/mean_length": 456.40179443359375, + "completions/mean_terminated_length": 456.40179443359375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.9177760375880971, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.07971566915512085, + "kl": 0.024169921875, + "learning_rate": 2.2513579681398034e-06, + "loss": 0.0088, + "num_tokens": 133426487.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1486.0, + "completions/max_terminated_length": 1486.0, + "completions/mean_length": 427.9776916503906, + "completions/mean_terminated_length": 427.9776916503906, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.9209083790133125, + "frac_reward_zero_std": 0.9375000596046448, + "grad_norm": 0.0898425355553627, + "kl": 0.027099609375, + "learning_rate": 2.237156545280803e-06, + "loss": 0.004, + "num_tokens": 133863841.0, + "reward": 0.09843750298023224, + "reward_std": 0.0031250000465661287, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12415824085474014, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1593.0, + "completions/max_terminated_length": 1593.0, + "completions/mean_length": 436.4531555175781, + "completions/mean_terminated_length": 436.4531555175781, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.9240407204385278, + "frac_reward_zero_std": 0.9285714626312256, + "grad_norm": 0.10014256089925766, + "kl": 0.02484130859375, + "learning_rate": 2.2229766403060403e-06, + "loss": 0.0178, + "num_tokens": 134307944.0, + "reward": 0.098214291036129, + "reward_std": 0.0035714288242161274, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9821428656578064, + "rewards/format_reward/std": 0.13258016109466553, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1407.0, + "completions/mean_length": 430.29241943359375, + "completions/mean_terminated_length": 426.6733703613281, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.9271730618637432, + "frac_reward_zero_std": 0.9375000596046448, + "grad_norm": 0.09880220144987106, + "kl": 0.023956298828125, + "learning_rate": 2.2088188481759305e-06, + "loss": 0.0194, + "num_tokens": 134770243.0, + "reward": 0.09843750298023224, + "reward_std": 0.0031250000465661287, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12415824085474014, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1572.0, + "completions/mean_length": 445.97991943359375, + "completions/mean_terminated_length": 435.1797790527344, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.9303054032889585, + "frac_reward_zero_std": 0.9464285969734192, + "grad_norm": 0.09142524749040604, + "kl": 0.024139404296875, + "learning_rate": 2.194683762923073e-06, + "loss": 0.0301, + "num_tokens": 135237458.0, + "reward": 0.09866072237491608, + "reward_std": 0.0026785717345774174, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9866071343421936, + "rewards/format_reward/std": 0.11507844179868698, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1950.0, + "completions/max_terminated_length": 1950.0, + "completions/mean_length": 438.9598388671875, + "completions/mean_terminated_length": 438.9598388671875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.9334377447141738, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.05904732644557953, + "kl": 0.028656005859375, + "learning_rate": 2.1805719776273387e-06, + "loss": 0.0047, + "num_tokens": 135703024.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1970.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 416.66741943359375, + "completions/mean_terminated_length": 416.66741943359375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.9365700861393892, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004997703712433577, + "kl": 0.022674560546875, + "learning_rate": 2.166484084390974e-06, + "loss": 0.0002, + "num_tokens": 136138995.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1692.0, + "completions/mean_length": 446.935302734375, + "completions/mean_terminated_length": 443.35345458984375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.9397024275646045, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.08101443946361542, + "kl": 0.02349853515625, + "learning_rate": 2.1524206743137636e-06, + "loss": 0.0105, + "num_tokens": 136609062.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1519.0, + "completions/mean_length": 438.5937805175781, + "completions/mean_terminated_length": 434.9932861328125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.9428347689898199, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.054054487496614456, + "kl": 0.0244140625, + "learning_rate": 2.1383823374682287e-06, + "loss": 0.0002, + "num_tokens": 137083200.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1293.0, + "completions/max_terminated_length": 1293.0, + "completions/mean_length": 412.9375305175781, + "completions/mean_terminated_length": 412.9375305175781, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.9459671104150352, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.07072660326957703, + "kl": 0.022674560546875, + "learning_rate": 2.124369662874868e-06, + "loss": 0.004, + "num_tokens": 137511704.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1691.0, + "completions/max_terminated_length": 1691.0, + "completions/mean_length": 434.41741943359375, + "completions/mean_terminated_length": 434.41741943359375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.9490994518402506, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.05086236819624901, + "kl": 0.023956298828125, + "learning_rate": 2.110383238477441e-06, + "loss": -0.0005, + "num_tokens": 137970979.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1339.0, + "completions/max_terminated_length": 1339.0, + "completions/mean_length": 406.79913330078125, + "completions/mean_terminated_length": 406.79913330078125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.9522317932654659, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.054840486496686935, + "kl": 0.024688720703125, + "learning_rate": 2.096423651118305e-06, + "loss": 0.001, + "num_tokens": 138406361.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 439.8370666503906, + "completions/mean_terminated_length": 436.2393798828125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.9553641346906813, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.08356355130672455, + "kl": 0.0250244140625, + "learning_rate": 2.082491486513788e-06, + "loss": 0.0114, + "num_tokens": 138879288.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1633.0, + "completions/mean_length": 442.4151916503906, + "completions/mean_terminated_length": 438.8232727050781, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.9584964761158966, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.07848871499300003, + "kl": 0.02447509765625, + "learning_rate": 2.0685873292296116e-06, + "loss": 0.0122, + "num_tokens": 139350126.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1565.0, + "completions/max_terminated_length": 1565.0, + "completions/mean_length": 444.450927734375, + "completions/mean_terminated_length": 444.450927734375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.961628817541112, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.0551765076816082, + "kl": 0.023590087890625, + "learning_rate": 2.054711762656369e-06, + "loss": 0.0009, + "num_tokens": 139818284.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1314.0, + "completions/max_terminated_length": 1314.0, + "completions/mean_length": 441.3526916503906, + "completions/mean_terminated_length": 441.3526916503906, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.9647611589663273, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.0681406557559967, + "kl": 0.0230712890625, + "learning_rate": 2.040865368985044e-06, + "loss": 0.001, + "num_tokens": 140287890.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1588.0, + "completions/mean_length": 440.9442138671875, + "completions/mean_terminated_length": 437.3489990234375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.9678935003915426, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.06129396706819534, + "kl": 0.026763916015625, + "learning_rate": 2.027048729182583e-06, + "loss": 0.0028, + "num_tokens": 140744581.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1582.0, + "completions/max_terminated_length": 1582.0, + "completions/mean_length": 425.4933166503906, + "completions/mean_terminated_length": 425.4933166503906, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.971025841816758, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.02586253546178341, + "kl": 0.024810791015625, + "learning_rate": 2.0132624229675205e-06, + "loss": -0.0001, + "num_tokens": 141191846.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1322.0, + "completions/max_terminated_length": 1322.0, + "completions/mean_length": 433.65850830078125, + "completions/mean_terminated_length": 433.65850830078125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.9741581832419733, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.07191614806652069, + "kl": 0.0242919921875, + "learning_rate": 1.9995070287856546e-06, + "loss": 0.0049, + "num_tokens": 141650249.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1687.0, + "completions/max_terminated_length": 1687.0, + "completions/mean_length": 465.0401916503906, + "completions/mean_terminated_length": 465.0401916503906, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.9772905246671887, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.08211765438318253, + "kl": 0.0255126953125, + "learning_rate": 1.985783123785774e-06, + "loss": 0.0052, + "num_tokens": 142127455.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1627.0, + "completions/mean_length": 441.1138610839844, + "completions/mean_terminated_length": 437.5190124511719, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.980422866092404, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.0549662746489048, + "kl": 0.02618408203125, + "learning_rate": 1.9720912837954486e-06, + "loss": 0.0109, + "num_tokens": 142582178.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1870.0, + "completions/mean_length": 467.0826110839844, + "completions/mean_terminated_length": 463.5458679199219, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.9835552075176194, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.07906267046928406, + "kl": 0.026336669921875, + "learning_rate": 1.958432083296862e-06, + "loss": 0.0044, + "num_tokens": 143052795.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1699.0, + "completions/max_terminated_length": 1699.0, + "completions/mean_length": 449.98663330078125, + "completions/mean_terminated_length": 449.98663330078125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.9866875489428347, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.05579476058483124, + "kl": 0.025848388671875, + "learning_rate": 1.9448060954027093e-06, + "loss": -0.003, + "num_tokens": 143522337.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1506.0, + "completions/max_terminated_length": 1506.0, + "completions/mean_length": 465.5669860839844, + "completions/mean_terminated_length": 465.5669860839844, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.9898198903680501, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005142271984368563, + "kl": 0.02239990234375, + "learning_rate": 1.931213891832153e-06, + "loss": 0.0002, + "num_tokens": 143999915.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1704.0, + "completions/max_terminated_length": 1704.0, + "completions/mean_length": 459.7031555175781, + "completions/mean_terminated_length": 459.7031555175781, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.9929522317932654, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.06087490916252136, + "kl": 0.02593994140625, + "learning_rate": 1.9176560428868336e-06, + "loss": 0.0006, + "num_tokens": 144468590.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1330.0, + "completions/max_terminated_length": 1330.0, + "completions/mean_length": 437.06475830078125, + "completions/mean_terminated_length": 437.06475830078125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.9960845732184808, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.0474519357085228, + "kl": 0.024566650390625, + "learning_rate": 1.9041331174269373e-06, + "loss": 0.0044, + "num_tokens": 144914623.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1923.0, + "completions/max_terminated_length": 1923.0, + "completions/mean_length": 477.39288330078125, + "completions/mean_terminated_length": 477.39288330078125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.9992169146436961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0056766727939248085, + "kl": 0.024200439453125, + "learning_rate": 1.8906456828473341e-06, + "loss": 0.0002, + "num_tokens": 145400879.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1724.0, + "completions/max_terminated_length": 1724.0, + "completions/mean_length": 478.8504638671875, + "completions/mean_terminated_length": 478.8504638671875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 1.0031323414252153, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.028180057182908058, + "kl": 0.023773193359375, + "learning_rate": 1.8771943050537656e-06, + "loss": -0.0003, + "num_tokens": 145886148.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1496.0, + "completions/max_terminated_length": 1496.0, + "completions/mean_length": 454.93975830078125, + "completions/mean_terminated_length": 454.93975830078125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 1.0062646828504307, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.06182783097028732, + "kl": 0.027130126953125, + "learning_rate": 1.8637795484391046e-06, + "loss": 0.0029, + "num_tokens": 146346189.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1451.0, + "completions/max_terminated_length": 1451.0, + "completions/mean_length": 472.74554443359375, + "completions/mean_terminated_length": 472.74554443359375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 1.009397024275646, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.018497483804821968, + "kl": 0.023468017578125, + "learning_rate": 1.8504019758596698e-06, + "loss": 0.0009, + "num_tokens": 146826227.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1413.0, + "completions/mean_length": 461.1094055175781, + "completions/mean_terminated_length": 457.5592956542969, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 1.0125293657008614, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.07279489189386368, + "kl": 0.024017333984375, + "learning_rate": 1.8370621486116163e-06, + "loss": 0.0012, + "num_tokens": 147295216.0, + "reward": 0.09888394176959991, + "reward_std": 0.0018547771032899618, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1111.0, + "completions/max_terminated_length": 1111.0, + "completions/mean_length": 449.21429443359375, + "completions/mean_terminated_length": 449.21429443359375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 1.0156617071260767, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005811603274196386, + "kl": 0.025146484375, + "learning_rate": 1.823760626407377e-06, + "loss": 0.0003, + "num_tokens": 147761364.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 2043.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 493.794677734375, + "completions/mean_terminated_length": 493.794677734375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 1.018794048551292, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.052810002118349075, + "kl": 0.024658203125, + "learning_rate": 1.8104979673521838e-06, + "loss": 0.0035, + "num_tokens": 148256384.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1498.0, + "completions/max_terminated_length": 1498.0, + "completions/mean_length": 466.6785888671875, + "completions/mean_terminated_length": 466.6785888671875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 1.0219263899765074, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04804644361138344, + "kl": 0.025238037109375, + "learning_rate": 1.7972747279206482e-06, + "loss": -0.0009, + "num_tokens": 148722092.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1574.0, + "completions/max_terminated_length": 1574.0, + "completions/mean_length": 464.9375305175781, + "completions/mean_terminated_length": 464.9375305175781, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 1.0250587314017228, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.046198148280382156, + "kl": 0.02734375, + "learning_rate": 1.7840914629334122e-06, + "loss": -0.0009, + "num_tokens": 149193064.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1471.0, + "completions/max_terminated_length": 1471.0, + "completions/mean_length": 454.98663330078125, + "completions/mean_terminated_length": 454.98663330078125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 1.0281910728269381, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04537181556224823, + "kl": 0.026519775390625, + "learning_rate": 1.7709487255338731e-06, + "loss": 0.0012, + "num_tokens": 149648370.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1720.0, + "completions/mean_length": 486.5937805175781, + "completions/mean_terminated_length": 479.5919494628906, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 1.0313234142521535, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.0876544788479805, + "kl": 0.025848388671875, + "learning_rate": 1.7578470671649684e-06, + "loss": 0.0178, + "num_tokens": 150135012.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1492.0, + "completions/mean_length": 466.79241943359375, + "completions/mean_terminated_length": 463.2550354003906, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 1.0344557556773688, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.05037108063697815, + "kl": 0.024932861328125, + "learning_rate": 1.744787037546045e-06, + "loss": 0.0076, + "num_tokens": 150607767.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1755.0, + "completions/mean_length": 477.8973388671875, + "completions/mean_terminated_length": 470.8565368652344, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 1.0375880971025842, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.06445058435201645, + "kl": 0.02587890625, + "learning_rate": 1.731769184649788e-06, + "loss": 0.0095, + "num_tokens": 151066009.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1552.0, + "completions/mean_length": 479.6183166503906, + "completions/mean_terminated_length": 469.0449523925781, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 1.0407204385277995, + "frac_reward_zero_std": 0.910714328289032, + "grad_norm": 0.10929381102323532, + "kl": 0.0255126953125, + "learning_rate": 1.7187940546792325e-06, + "loss": 0.0168, + "num_tokens": 151543282.0, + "reward": 0.09776786714792252, + "reward_std": 0.004464285913854837, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9776785969734192, + "rewards/format_reward/std": 0.14789186418056488, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 2031.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 465.6094055175781, + "completions/mean_terminated_length": 465.6094055175781, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 1.0438527799530148, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.028067538514733315, + "kl": 0.02508544921875, + "learning_rate": 1.7058621920448465e-06, + "loss": 0.0008, + "num_tokens": 152011971.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1315.0, + "completions/max_terminated_length": 1315.0, + "completions/mean_length": 475.0826110839844, + "completions/mean_terminated_length": 475.0826110839844, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 1.0469851213782302, + "frac_reward_zero_std": 0.9464285969734192, + "grad_norm": 0.0809931755065918, + "kl": 0.026580810546875, + "learning_rate": 1.6929741393416855e-06, + "loss": 0.0057, + "num_tokens": 152500008.0, + "reward": 0.09866072237491608, + "reward_std": 0.0026785717345774174, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9866071343421936, + "rewards/format_reward/std": 0.11507844179868698, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1651.0, + "completions/mean_length": 491.529052734375, + "completions/mean_terminated_length": 488.0469970703125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 1.0501174628034455, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.07615669071674347, + "kl": 0.028961181640625, + "learning_rate": 1.6801304373266286e-06, + "loss": 0.0163, + "num_tokens": 152988905.0, + "reward": 0.09910715371370316, + "reward_std": 0.0014083485584706068, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1863.0, + "completions/max_terminated_length": 1863.0, + "completions/mean_length": 472.919677734375, + "completions/mean_terminated_length": 472.919677734375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 1.0532498042286609, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.03995736315846443, + "kl": 0.0263671875, + "learning_rate": 1.667331624895689e-06, + "loss": -0.0038, + "num_tokens": 153466141.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1494.0, + "completions/max_terminated_length": 1494.0, + "completions/mean_length": 464.7901916503906, + "completions/mean_terminated_length": 464.7901916503906, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 1.0563821456538762, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.0571284182369709, + "kl": 0.02545166015625, + "learning_rate": 1.6545782390614037e-06, + "loss": 0.0025, + "num_tokens": 153941395.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1389.0, + "completions/mean_length": 458.72100830078125, + "completions/mean_terminated_length": 444.4031677246094, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 1.0595144870790916, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.09079112857580185, + "kl": 0.025604248046875, + "learning_rate": 1.6418708149302992e-06, + "loss": 0.022, + "num_tokens": 154405866.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1527.0, + "completions/mean_length": 457.6785888671875, + "completions/mean_terminated_length": 450.547119140625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 1.062646828504307, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005148791242390871, + "kl": 0.02783203125, + "learning_rate": 1.6292098856804423e-06, + "loss": 0.0003, + "num_tokens": 154886586.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1478.0, + "completions/max_terminated_length": 1478.0, + "completions/mean_length": 441.5223388671875, + "completions/mean_terminated_length": 441.5223388671875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 1.0657791699295223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004288794938474894, + "kl": 0.024383544921875, + "learning_rate": 1.6165959825390661e-06, + "loss": 0.0002, + "num_tokens": 155352324.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1577.0, + "completions/mean_length": 467.6094055175781, + "completions/mean_terminated_length": 460.5224304199219, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 1.0689115113547376, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.06171921268105507, + "kl": 0.02642822265625, + "learning_rate": 1.604029634760284e-06, + "loss": 0.0118, + "num_tokens": 155834145.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1244.0, + "completions/mean_length": 423.8058166503906, + "completions/mean_terminated_length": 420.1722717285156, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 1.072043852779953, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.06821896135807037, + "kl": 0.026397705078125, + "learning_rate": 1.59151136960288e-06, + "loss": 0.0135, + "num_tokens": 156277854.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1656.0, + "completions/max_terminated_length": 1656.0, + "completions/mean_length": 449.79241943359375, + "completions/mean_terminated_length": 449.79241943359375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 1.0751761942051683, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.046692315489053726, + "kl": 0.028411865234375, + "learning_rate": 1.5790417123081903e-06, + "loss": 0.0024, + "num_tokens": 156742385.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1462.0, + "completions/mean_length": 448.9754638671875, + "completions/mean_terminated_length": 445.3982238769531, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 1.0783085356303836, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.03497226536273956, + "kl": 0.027587890625, + "learning_rate": 1.5666211860780583e-06, + "loss": 0.001, + "num_tokens": 157204198.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1315.0, + "completions/mean_length": 450.0156555175781, + "completions/mean_terminated_length": 446.4407043457031, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 1.081440877055599, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005242807324975729, + "kl": 0.02734375, + "learning_rate": 1.5542503120528918e-06, + "loss": 0.0003, + "num_tokens": 157668169.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1658.0, + "completions/mean_length": 435.9508972167969, + "completions/mean_terminated_length": 428.72198486328125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 1.0845732184808143, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.03854451701045036, + "kl": 0.029205322265625, + "learning_rate": 1.5419296092897866e-06, + "loss": 0.0024, + "num_tokens": 158131263.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1854.0, + "completions/mean_length": 476.7656555175781, + "completions/mean_terminated_length": 473.25054931640625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 1.0877055599060297, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.06825908273458481, + "kl": 0.028594970703125, + "learning_rate": 1.529659594740755e-06, + "loss": 0.0181, + "num_tokens": 158613198.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1350.0, + "completions/mean_length": 430.59600830078125, + "completions/mean_terminated_length": 426.9776306152344, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 1.090837901331245, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.05947600677609444, + "kl": 0.030364990234375, + "learning_rate": 1.5174407832310338e-06, + "loss": 0.0149, + "num_tokens": 159062289.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1426.0, + "completions/max_terminated_length": 1426.0, + "completions/mean_length": 461.700927734375, + "completions/mean_terminated_length": 461.700927734375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 1.0939702427564604, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.08351102471351624, + "kl": 0.02777099609375, + "learning_rate": 1.5052736874374815e-06, + "loss": 0.0031, + "num_tokens": 159538107.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 456.2901916503906, + "completions/mean_terminated_length": 452.72930908203125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 1.0971025841816757, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05253274738788605, + "kl": 0.026214599609375, + "learning_rate": 1.4931588178670695e-06, + "loss": 0.0035, + "num_tokens": 160003109.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1379.0, + "completions/max_terminated_length": 1379.0, + "completions/mean_length": 470.1183166503906, + "completions/mean_terminated_length": 470.1183166503906, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 1.100234925606891, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04111175611615181, + "kl": 0.0279541015625, + "learning_rate": 1.4810966828354605e-06, + "loss": 0.0028, + "num_tokens": 160471706.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1699.0, + "completions/max_terminated_length": 1699.0, + "completions/mean_length": 443.47100830078125, + "completions/mean_terminated_length": 443.47100830078125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 1.1033672670321064, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05252375081181526, + "kl": 0.027618408203125, + "learning_rate": 1.469087788445684e-06, + "loss": 0.0028, + "num_tokens": 160923673.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1330.0, + "completions/max_terminated_length": 1330.0, + "completions/mean_length": 464.0982360839844, + "completions/mean_terminated_length": 464.0982360839844, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 1.1064996084573218, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04662067070603371, + "kl": 0.025726318359375, + "learning_rate": 1.4571326385668965e-06, + "loss": -0.0032, + "num_tokens": 161387025.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1831.0, + "completions/max_terminated_length": 1831.0, + "completions/mean_length": 481.5781555175781, + "completions/mean_terminated_length": 481.5781555175781, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 1.109631949882537, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.07175103574991226, + "kl": 0.0281982421875, + "learning_rate": 1.4452317348132434e-06, + "loss": 0.0061, + "num_tokens": 161870000.0, + "reward": 0.09888393431901932, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1919.0, + "completions/mean_length": 436.7076110839844, + "completions/mean_terminated_length": 429.4820861816406, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 1.1127642913077525, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.056466199457645416, + "kl": 0.027191162109375, + "learning_rate": 1.4333855765228104e-06, + "loss": 0.0098, + "num_tokens": 162308401.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1374.0, + "completions/mean_length": 460.200927734375, + "completions/mean_terminated_length": 456.6487731933594, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 1.1158966327329678, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.058717068284749985, + "kl": 0.028594970703125, + "learning_rate": 1.421594660736675e-06, + "loss": 0.0094, + "num_tokens": 162773583.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1352.0, + "completions/mean_length": 451.3794860839844, + "completions/mean_terminated_length": 447.8076171875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 1.1190289741581831, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04924802482128143, + "kl": 0.02874755859375, + "learning_rate": 1.4098594821780476e-06, + "loss": 0.0025, + "num_tokens": 163238261.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1812.0, + "completions/mean_length": 498.7232360839844, + "completions/mean_terminated_length": 491.77581787109375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 1.1221613155833985, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.052160490304231644, + "kl": 0.02813720703125, + "learning_rate": 1.3981805332315174e-06, + "loss": -0.004, + "num_tokens": 163728813.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1736.0, + "completions/max_terminated_length": 1736.0, + "completions/mean_length": 473.4308166503906, + "completions/mean_terminated_length": 473.4308166503906, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 1.1252936570086138, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.042579285800457, + "kl": 0.026641845703125, + "learning_rate": 1.3865583039223929e-06, + "loss": 0.0009, + "num_tokens": 164212862.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1474.0, + "completions/mean_length": 480.388427734375, + "completions/mean_terminated_length": 469.8202209472656, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 1.1284259984338294, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.07509318739175797, + "kl": 0.03173828125, + "learning_rate": 1.374993281896137e-06, + "loss": 0.0112, + "num_tokens": 164687748.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1348.0, + "completions/mean_length": 452.9442138671875, + "completions/mean_terminated_length": 449.3758544921875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 1.1315583398590445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00428752601146698, + "kl": 0.0277099609375, + "learning_rate": 1.3634859523979134e-06, + "loss": 0.0003, + "num_tokens": 165148559.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1200.0, + "completions/max_terminated_length": 1200.0, + "completions/mean_length": 455.5937805175781, + "completions/mean_terminated_length": 455.5937805175781, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 1.13469068128426, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04725253954529762, + "kl": 0.03082275390625, + "learning_rate": 1.3520367982522208e-06, + "loss": 0.001, + "num_tokens": 165599529.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 468.0625305175781, + "completions/mean_terminated_length": 464.5279541015625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 1.1378230227094752, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05850166082382202, + "kl": 0.0302734375, + "learning_rate": 1.3406462998426358e-06, + "loss": -0.0001, + "num_tokens": 166070637.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1520.0, + "completions/mean_length": 505.2656555175781, + "completions/mean_terminated_length": 501.8143310546875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 1.1409553641346908, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.0635019913315773, + "kl": 0.02899169921875, + "learning_rate": 1.3293149350916595e-06, + "loss": 0.0079, + "num_tokens": 166573200.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1975.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 462.82366943359375, + "completions/mean_terminated_length": 462.82366943359375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 1.144087705559906, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.033267270773649216, + "kl": 0.028076171875, + "learning_rate": 1.3180431794406623e-06, + "loss": -0.0041, + "num_tokens": 167031629.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1649.0, + "completions/mean_length": 468.98663330078125, + "completions/mean_terminated_length": 461.9058532714844, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 1.1472200469851215, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.061029836535453796, + "kl": 0.03173828125, + "learning_rate": 1.3068315058299358e-06, + "loss": 0.0046, + "num_tokens": 167492363.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1577.0, + "completions/mean_length": 489.1808166503906, + "completions/mean_terminated_length": 485.6935119628906, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 1.1503523884103368, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.052797582000494, + "kl": 0.030487060546875, + "learning_rate": 1.2956803846788503e-06, + "loss": -0.0002, + "num_tokens": 167967980.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1426.0, + "completions/mean_length": 486.08929443359375, + "completions/mean_terminated_length": 482.5950927734375, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 1.1534847298355522, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.0657566487789154, + "kl": 0.02642822265625, + "learning_rate": 1.284590283866116e-06, + "loss": 0.0079, + "num_tokens": 168431144.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1521.0, + "completions/mean_length": 460.2589416503906, + "completions/mean_terminated_length": 453.1390380859375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 1.1566170712607675, + "frac_reward_zero_std": 0.9464285969734192, + "grad_norm": 0.08155252784490585, + "kl": 0.031463623046875, + "learning_rate": 1.2735616687101518e-06, + "loss": 0.017, + "num_tokens": 168882988.0, + "reward": 0.09866072237491608, + "reward_std": 0.0026785717345774174, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9866071343421936, + "rewards/format_reward/std": 0.11507843434810638, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1665.0, + "completions/mean_length": 495.30804443359375, + "completions/mean_terminated_length": 484.8404541015625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 1.1597494126859829, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.06710833311080933, + "kl": 0.0286865234375, + "learning_rate": 1.2625950019495614e-06, + "loss": 0.0085, + "num_tokens": 169349070.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1542.0, + "completions/mean_length": 498.904052734375, + "completions/mean_terminated_length": 491.9574279785156, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 1.1628817541111982, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05639323592185974, + "kl": 0.024932861328125, + "learning_rate": 1.251690743723718e-06, + "loss": 0.0063, + "num_tokens": 169829167.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1703.0, + "completions/mean_length": 476.8482360839844, + "completions/mean_terminated_length": 473.3333435058594, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 1.1660140955364136, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0039844331331551075, + "kl": 0.02655029296875, + "learning_rate": 1.2408493515534581e-06, + "loss": 0.0003, + "num_tokens": 170293723.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1738.0, + "completions/mean_length": 487.9531555175781, + "completions/mean_terminated_length": 480.9574279785156, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 1.169146436961629, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.048629023134708405, + "kl": 0.027557373046875, + "learning_rate": 1.2300712803218834e-06, + "loss": 0.0019, + "num_tokens": 170770834.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1837.0, + "completions/mean_length": 503.1651916503906, + "completions/mean_terminated_length": 499.70916748046875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 1.1722787783868442, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.06796332448720932, + "kl": 0.02691650390625, + "learning_rate": 1.2193569822552772e-06, + "loss": 0.0009, + "num_tokens": 171263788.0, + "reward": 0.09888393431901932, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1590.0, + "completions/mean_length": 488.1629638671875, + "completions/mean_terminated_length": 484.6733703613281, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 1.1754111198120596, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.052055083215236664, + "kl": 0.02685546875, + "learning_rate": 1.2087069069041268e-06, + "loss": 0.006, + "num_tokens": 171747501.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1393.0, + "completions/max_terminated_length": 1393.0, + "completions/mean_length": 475.4219055175781, + "completions/mean_terminated_length": 475.4219055175781, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 1.178543461237275, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.06868158280849457, + "kl": 0.027099609375, + "learning_rate": 1.1981215011242654e-06, + "loss": 0.0041, + "num_tokens": 172228566.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1737.0, + "completions/mean_length": 479.54913330078125, + "completions/mean_terminated_length": 476.040283203125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 1.1816758026624903, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.06923764199018478, + "kl": 0.027374267578125, + "learning_rate": 1.1876012090581184e-06, + "loss": 0.0091, + "num_tokens": 172709344.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1606.0, + "completions/max_terminated_length": 1606.0, + "completions/mean_length": 484.732177734375, + "completions/mean_terminated_length": 484.732177734375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 1.1848081440877056, + "frac_reward_zero_std": 0.9464285969734192, + "grad_norm": 0.07628154754638672, + "kl": 0.0279541015625, + "learning_rate": 1.177146472116071e-06, + "loss": 0.0029, + "num_tokens": 173206488.0, + "reward": 0.09866072237491608, + "reward_std": 0.0026785717345774174, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9866071343421936, + "rewards/format_reward/std": 0.11507844179868698, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1671.0, + "completions/max_terminated_length": 1671.0, + "completions/mean_length": 514.6920166015625, + "completions/mean_terminated_length": 514.6920166015625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 1.187940485512921, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.06043136119842529, + "kl": 0.03057861328125, + "learning_rate": 1.1667577289579462e-06, + "loss": 0.004, + "num_tokens": 173705406.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1333.0, + "completions/max_terminated_length": 1333.0, + "completions/mean_length": 463.5714416503906, + "completions/mean_terminated_length": 463.5714416503906, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 1.1910728269381363, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.028529752045869827, + "kl": 0.024932861328125, + "learning_rate": 1.1564354154746007e-06, + "loss": -0.0006, + "num_tokens": 174162742.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1422.0, + "completions/mean_length": 469.99554443359375, + "completions/mean_terminated_length": 462.9193115234375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 1.1942051683633517, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.06935816258192062, + "kl": 0.027130126953125, + "learning_rate": 1.146179964769635e-06, + "loss": 0.0092, + "num_tokens": 174634960.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1863.0, + "completions/mean_length": 505.029052734375, + "completions/mean_terminated_length": 501.5771789550781, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 1.197337509788567, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.06551840901374817, + "kl": 0.025482177734375, + "learning_rate": 1.1359918071412195e-06, + "loss": 0.011, + "num_tokens": 175125097.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1439.0, + "completions/mean_length": 477.8169860839844, + "completions/mean_terminated_length": 470.77581787109375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 1.2004698512137824, + "frac_reward_zero_std": 0.9285714626312256, + "grad_norm": 0.0989466980099678, + "kl": 0.026947021484375, + "learning_rate": 1.1258713700640456e-06, + "loss": 0.0068, + "num_tokens": 175597535.0, + "reward": 0.098214291036129, + "reward_std": 0.0035714288242161274, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9821428656578064, + "rewards/format_reward/std": 0.13258016109466553, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1944.0, + "completions/max_terminated_length": 1944.0, + "completions/mean_length": 487.77679443359375, + "completions/mean_terminated_length": 487.77679443359375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 1.2036021926389977, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.06642964482307434, + "kl": 0.031097412109375, + "learning_rate": 1.115819078171383e-06, + "loss": 0.0078, + "num_tokens": 176081243.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1785.0, + "completions/max_terminated_length": 1785.0, + "completions/mean_length": 503.9442138671875, + "completions/mean_terminated_length": 503.9442138671875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 1.206734534064213, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.036115702241659164, + "kl": 0.02703857421875, + "learning_rate": 1.1058353532372667e-06, + "loss": -0.0038, + "num_tokens": 176576438.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1755.0, + "completions/max_terminated_length": 1755.0, + "completions/mean_length": 483.8638610839844, + "completions/mean_terminated_length": 483.8638610839844, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 1.2098668754894284, + "frac_reward_zero_std": 0.9375000596046448, + "grad_norm": 0.078130804002285, + "kl": 0.028900146484375, + "learning_rate": 1.0959206141587998e-06, + "loss": 0.0033, + "num_tokens": 177075037.0, + "reward": 0.09843750298023224, + "reward_std": 0.0031250000465661287, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12415824085474014, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1571.0, + "completions/mean_length": 489.41741943359375, + "completions/mean_terminated_length": 482.42828369140625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 1.2129992169146437, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.09146633744239807, + "kl": 0.027496337890625, + "learning_rate": 1.0860752769385766e-06, + "loss": 0.0189, + "num_tokens": 177549900.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1805.0, + "completions/mean_length": 505.6451110839844, + "completions/mean_terminated_length": 498.7287292480469, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 1.216131558339859, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.06432457268238068, + "kl": 0.024932861328125, + "learning_rate": 1.0762997546672279e-06, + "loss": 0.0141, + "num_tokens": 178036621.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1462.0, + "completions/max_terminated_length": 1462.0, + "completions/mean_length": 483.2567138671875, + "completions/mean_terminated_length": 483.2567138671875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 1.2192638997650744, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.0236533060669899, + "kl": 0.023773193359375, + "learning_rate": 1.0665944575060914e-06, + "loss": 0.0, + "num_tokens": 178509784.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1793.0, + "completions/max_terminated_length": 1793.0, + "completions/mean_length": 511.7701110839844, + "completions/mean_terminated_length": 511.7701110839844, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 1.2223962411902898, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04216481000185013, + "kl": 0.02362060546875, + "learning_rate": 1.056959792669997e-06, + "loss": -0.0013, + "num_tokens": 178999069.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1431.0, + "completions/mean_length": 483.1406555175781, + "completions/mean_terminated_length": 476.12335205078125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 1.2255285826155051, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.06707365810871124, + "kl": 0.02569580078125, + "learning_rate": 1.0473961644101856e-06, + "loss": 0.0175, + "num_tokens": 179478988.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1817.0, + "completions/max_terminated_length": 1817.0, + "completions/mean_length": 520.6785888671875, + "completions/mean_terminated_length": 520.6785888671875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 1.2286609240407205, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.06412012130022049, + "kl": 0.024017333984375, + "learning_rate": 1.037903973997345e-06, + "loss": 0.0042, + "num_tokens": 180001864.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1590.0, + "completions/mean_length": 506.12725830078125, + "completions/mean_terminated_length": 502.6778564453125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 1.2317932654659358, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04480452090501785, + "kl": 0.023223876953125, + "learning_rate": 1.0284836197047737e-06, + "loss": 0.0043, + "num_tokens": 180493109.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1615.0, + "completions/mean_length": 485.12054443359375, + "completions/mean_terminated_length": 481.6241760253906, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 1.2349256068911512, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.07531718909740448, + "kl": 0.024139404296875, + "learning_rate": 1.0191354967916712e-06, + "loss": 0.0065, + "num_tokens": 180985655.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1869.0, + "completions/max_terminated_length": 1869.0, + "completions/mean_length": 488.92413330078125, + "completions/mean_terminated_length": 488.92413330078125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 1.2380579483163665, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.0507173016667366, + "kl": 0.0238037109375, + "learning_rate": 1.0098599974865515e-06, + "loss": 0.0008, + "num_tokens": 181472869.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1597.0, + "completions/mean_length": 495.982177734375, + "completions/mean_terminated_length": 489.0224609375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 1.2411902897415819, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.08001532405614853, + "kl": 0.025360107421875, + "learning_rate": 1.0006575109707898e-06, + "loss": 0.0118, + "num_tokens": 181964033.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 471.4151916503906, + "completions/mean_terminated_length": 467.8881530761719, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 1.2443226311667972, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.047472745180130005, + "kl": 0.0235595703125, + "learning_rate": 9.915284233622877e-07, + "loss": -0.0005, + "num_tokens": 182426307.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1486.0, + "completions/mean_length": 489.7031555175781, + "completions/mean_terminated_length": 486.2170104980469, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 1.2474549725920125, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.042243439704179764, + "kl": 0.021759033203125, + "learning_rate": 9.824731176992796e-07, + "loss": 0.0092, + "num_tokens": 182915230.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1379.0, + "completions/max_terminated_length": 1379.0, + "completions/mean_length": 482.66741943359375, + "completions/mean_terminated_length": 482.66741943359375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 1.250587314017228, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.0642925575375557, + "kl": 0.02362060546875, + "learning_rate": 9.734919739242543e-07, + "loss": -0.0004, + "num_tokens": 183404789.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1234.0, + "completions/mean_length": 463.4375305175781, + "completions/mean_terminated_length": 459.8926086425781, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 1.2537196554424432, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.023158075287938118, + "kl": 0.022979736328125, + "learning_rate": 9.645853688680177e-07, + "loss": -0.0003, + "num_tokens": 183878805.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1462.0, + "completions/max_terminated_length": 1462.0, + "completions/mean_length": 460.029052734375, + "completions/mean_terminated_length": 460.029052734375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 1.2568519968676586, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.026797110214829445, + "kl": 0.02398681640625, + "learning_rate": 9.557536762338786e-07, + "loss": -0.001, + "num_tokens": 184336766.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1513.0, + "completions/mean_length": 484.8594055175781, + "completions/mean_terminated_length": 481.3624267578125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 1.259984338292874, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.0658196285367012, + "kl": 0.022552490234375, + "learning_rate": 9.46997266581973e-07, + "loss": 0.0072, + "num_tokens": 184816999.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1706.0, + "completions/mean_length": 474.77679443359375, + "completions/mean_terminated_length": 471.25726318359375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 1.2631166797180893, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.05241454392671585, + "kl": 0.0277099609375, + "learning_rate": 9.383165073137115e-07, + "loss": -0.0004, + "num_tokens": 185284403.0, + "reward": 0.09910715371370316, + "reward_std": 0.0014083485584706068, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1993.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 483.22991943359375, + "completions/mean_terminated_length": 483.22991943359375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 1.2662490211433046, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.06283499300479889, + "kl": 0.024993896484375, + "learning_rate": 9.297117626563687e-07, + "loss": 0.0031, + "num_tokens": 185759526.0, + "reward": 0.09888393431901932, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1893.0, + "completions/mean_length": 497.4375305175781, + "completions/mean_terminated_length": 490.4843444824219, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 1.26938136256852, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.06819828599691391, + "kl": 0.025360107421875, + "learning_rate": 9.211833936477957e-07, + "loss": 0.0038, + "num_tokens": 186241982.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1955.0, + "completions/max_terminated_length": 1955.0, + "completions/mean_length": 526.4933471679688, + "completions/mean_terminated_length": 526.4933471679688, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 1.2725137039937353, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.06821630150079727, + "kl": 0.025146484375, + "learning_rate": 9.127317581212753e-07, + "loss": 0.0, + "num_tokens": 186747479.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1756.0, + "completions/mean_length": 475.7276916503906, + "completions/mean_terminated_length": 468.6771545410156, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 1.2756460454189507, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04914433881640434, + "kl": 0.0233154296875, + "learning_rate": 9.043572106905084e-07, + "loss": -0.0012, + "num_tokens": 187211321.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1809.0, + "completions/max_terminated_length": 1809.0, + "completions/mean_length": 501.263427734375, + "completions/mean_terminated_length": 501.263427734375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 1.278778386844166, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.08543560653924942, + "kl": 0.025360107421875, + "learning_rate": 8.960601027347321e-07, + "loss": 0.012, + "num_tokens": 187701775.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1803.0, + "completions/mean_length": 530.122802734375, + "completions/mean_terminated_length": 523.316162109375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 1.2819107282693813, + "frac_reward_zero_std": 0.9464285969734192, + "grad_norm": 0.0857296958565712, + "kl": 0.025390625, + "learning_rate": 8.878407823839788e-07, + "loss": 0.0216, + "num_tokens": 188222914.0, + "reward": 0.09866071492433548, + "reward_std": 0.0026785717345774174, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9866071343421936, + "rewards/format_reward/std": 0.11507844179868698, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1523.0, + "completions/mean_length": 515.9620971679688, + "completions/mean_terminated_length": 512.53466796875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 1.2850430696945967, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.022322332486510277, + "kl": 0.02471923828125, + "learning_rate": 8.796995945044689e-07, + "loss": 0.0005, + "num_tokens": 188724341.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1939.0, + "completions/mean_length": 495.3437805175781, + "completions/mean_terminated_length": 491.8702392578125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 1.288175411119812, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.05220493674278259, + "kl": 0.024261474609375, + "learning_rate": 8.716368806841405e-07, + "loss": 0.0092, + "num_tokens": 189211275.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1508.0, + "completions/mean_length": 498.90850830078125, + "completions/mean_terminated_length": 495.4429626464844, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 1.2913077525450274, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.07294944673776627, + "kl": 0.02459716796875, + "learning_rate": 8.636529792183171e-07, + "loss": 0.0063, + "num_tokens": 189690770.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1497.0, + "completions/max_terminated_length": 1497.0, + "completions/mean_length": 492.65179443359375, + "completions/mean_terminated_length": 492.65179443359375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 1.2944400939702427, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.060865618288517, + "kl": 0.0234375, + "learning_rate": 8.557482250955144e-07, + "loss": 0.0067, + "num_tokens": 190159654.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1629.0, + "completions/mean_length": 503.94866943359375, + "completions/mean_terminated_length": 497.0246887207031, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 1.297572435395458, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.07160454243421555, + "kl": 0.023223876953125, + "learning_rate": 8.479229499833844e-07, + "loss": 0.0151, + "num_tokens": 190664131.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1765.0, + "completions/mean_length": 494.00225830078125, + "completions/mean_terminated_length": 480.00225830078125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 1.3007047768206734, + "frac_reward_zero_std": 0.9464285969734192, + "grad_norm": 0.09838045388460159, + "kl": 0.024200439453125, + "learning_rate": 8.401774822147976e-07, + "loss": 0.0326, + "num_tokens": 191144424.0, + "reward": 0.09866072982549667, + "reward_std": 0.0026785717345774174, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9866071343421936, + "rewards/format_reward/std": 0.11507843434810638, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1730.0, + "completions/mean_length": 507.0156555175781, + "completions/mean_terminated_length": 503.5682373046875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 1.3038371182458888, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004182870965451002, + "kl": 0.023223876953125, + "learning_rate": 8.325121467740695e-07, + "loss": 0.0002, + "num_tokens": 191641327.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1446.0, + "completions/max_terminated_length": 1446.0, + "completions/mean_length": 480.54241943359375, + "completions/mean_terminated_length": 480.54241943359375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 1.3069694596711041, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05771760269999504, + "kl": 0.0250244140625, + "learning_rate": 8.249272652833226e-07, + "loss": 0.0042, + "num_tokens": 192121602.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 488.6562805175781, + "completions/mean_terminated_length": 485.16778564453125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 1.3101018010963195, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.026134848594665527, + "kl": 0.02435302734375, + "learning_rate": 8.174231559889931e-07, + "loss": -0.0003, + "num_tokens": 192597040.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1530.0, + "completions/mean_length": 478.9442138671875, + "completions/mean_terminated_length": 464.8085632324219, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 1.3132341425215348, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.07261735200881958, + "kl": 0.0244140625, + "learning_rate": 8.100001337484787e-07, + "loss": 0.0173, + "num_tokens": 193064699.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1548.0, + "completions/mean_length": 471.2879638671875, + "completions/mean_terminated_length": 467.7606201171875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 1.3163664839467502, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.07018846273422241, + "kl": 0.0264892578125, + "learning_rate": 8.026585100169251e-07, + "loss": 0.0109, + "num_tokens": 193537504.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1706.0, + "completions/mean_length": 488.1317138671875, + "completions/mean_terminated_length": 481.1368103027344, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 1.3194988253719655, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.0712767019867897, + "kl": 0.025177001953125, + "learning_rate": 7.953985928341601e-07, + "loss": 0.0104, + "num_tokens": 194016815.0, + "reward": 0.09910715371370316, + "reward_std": 0.0014083485584706068, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349845170975, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1746.0, + "completions/mean_length": 493.654052734375, + "completions/mean_terminated_length": 486.6838684082031, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 1.3226311667971808, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.02791789546608925, + "kl": 0.025390625, + "learning_rate": 7.882206868117693e-07, + "loss": -0.0005, + "num_tokens": 194509924.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1597.0, + "completions/mean_length": 500.1495666503906, + "completions/mean_terminated_length": 475.58050537109375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 1.3257635082223962, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.07584591954946518, + "kl": 0.026092529296875, + "learning_rate": 7.81125093120313e-07, + "loss": 0.0135, + "num_tokens": 194990939.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1429.0, + "completions/mean_length": 477.8683166503906, + "completions/mean_terminated_length": 474.355712890625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 1.3288958496476115, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.08384353667497635, + "kl": 0.025360107421875, + "learning_rate": 7.741121094766916e-07, + "loss": 0.0109, + "num_tokens": 195457244.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1652.0, + "completions/mean_length": 496.9375305175781, + "completions/mean_terminated_length": 493.4675598144531, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 1.3320281910728269, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.0455026738345623, + "kl": 0.0233154296875, + "learning_rate": 7.671820301316532e-07, + "loss": 0.0005, + "num_tokens": 195934532.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1649.0, + "completions/mean_length": 469.7232360839844, + "completions/mean_terminated_length": 466.1923828125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 1.3351605324980422, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.06310570985078812, + "kl": 0.026702880859375, + "learning_rate": 7.603351458574474e-07, + "loss": 0.0007, + "num_tokens": 196411972.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1801.0, + "completions/max_terminated_length": 1801.0, + "completions/mean_length": 483.1942138671875, + "completions/mean_terminated_length": 483.1942138671875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 1.3382928739232576, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.08518809825181961, + "kl": 0.033111572265625, + "learning_rate": 7.535717439356255e-07, + "loss": 0.0026, + "num_tokens": 196891755.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1730.0, + "completions/max_terminated_length": 1730.0, + "completions/mean_length": 491.53350830078125, + "completions/mean_terminated_length": 491.53350830078125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 1.341425215348473, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.033343762159347534, + "kl": 0.025146484375, + "learning_rate": 7.46892108144986e-07, + "loss": 0.0, + "num_tokens": 197359458.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1625.0, + "completions/max_terminated_length": 1625.0, + "completions/mean_length": 490.13616943359375, + "completions/mean_terminated_length": 490.13616943359375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 1.3445575567736883, + "frac_reward_zero_std": 0.9464285969734192, + "grad_norm": 0.07570716738700867, + "kl": 0.02911376953125, + "learning_rate": 7.402965187496697e-07, + "loss": 0.0043, + "num_tokens": 197849503.0, + "reward": 0.09866072237491608, + "reward_std": 0.0026785717345774174, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9866071343421936, + "rewards/format_reward/std": 0.11507844179868698, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1609.0, + "completions/mean_length": 483.950927734375, + "completions/mean_terminated_length": 473.40673828125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 1.3476898981989036, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.06877574324607849, + "kl": 0.026519775390625, + "learning_rate": 7.337852524873974e-07, + "loss": 0.014, + "num_tokens": 198336077.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1749.0, + "completions/mean_length": 511.1651916503906, + "completions/mean_terminated_length": 500.80450439453125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 1.350822239624119, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.06570061296224594, + "kl": 0.0260009765625, + "learning_rate": 7.273585825578608e-07, + "loss": 0.0159, + "num_tokens": 198836399.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1456.0, + "completions/max_terminated_length": 1456.0, + "completions/mean_length": 486.58038330078125, + "completions/mean_terminated_length": 486.58038330078125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 1.3539545810493343, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.031636979430913925, + "kl": 0.024078369140625, + "learning_rate": 7.21016778611259e-07, + "loss": 0.0007, + "num_tokens": 199308379.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1819.0, + "completions/mean_length": 456.1964416503906, + "completions/mean_terminated_length": 452.6353454589844, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 1.3570869224745497, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.04416336119174957, + "kl": 0.0255126953125, + "learning_rate": 7.147601067369835e-07, + "loss": 0.0038, + "num_tokens": 199755371.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1626.0, + "completions/max_terminated_length": 1626.0, + "completions/mean_length": 488.7276916503906, + "completions/mean_terminated_length": 488.7276916503906, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 1.360219263899765, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.05251280963420868, + "kl": 0.025390625, + "learning_rate": 7.085888294524561e-07, + "loss": 0.0024, + "num_tokens": 200240805.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1938.0, + "completions/mean_length": 506.63616943359375, + "completions/mean_terminated_length": 503.18792724609375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 1.3633516053249803, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.06147502362728119, + "kl": 0.02386474609375, + "learning_rate": 7.025032056921117e-07, + "loss": 0.0044, + "num_tokens": 200751278.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1494.0, + "completions/max_terminated_length": 1494.0, + "completions/mean_length": 461.4263610839844, + "completions/mean_terminated_length": 461.4263610839844, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 1.3664839467501957, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.06340942531824112, + "kl": 0.02862548828125, + "learning_rate": 6.965034907965349e-07, + "loss": 0.0027, + "num_tokens": 201228333.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1740.0, + "completions/max_terminated_length": 1740.0, + "completions/mean_length": 487.89288330078125, + "completions/mean_terminated_length": 487.89288330078125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 1.3696162881754113, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.06350374966859818, + "kl": 0.02587890625, + "learning_rate": 6.905899365017462e-07, + "loss": 0.0048, + "num_tokens": 201722321.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1618.0, + "completions/mean_length": 478.1651916503906, + "completions/mean_terminated_length": 474.65325927734375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 1.3727486296006264, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.05255701392889023, + "kl": 0.0262451171875, + "learning_rate": 6.847627909286409e-07, + "loss": 0.0078, + "num_tokens": 202188531.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1899.0, + "completions/max_terminated_length": 1899.0, + "completions/mean_length": 484.47991943359375, + "completions/mean_terminated_length": 484.47991943359375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 1.375880971025842, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04030289873480797, + "kl": 0.03009033203125, + "learning_rate": 6.790222985725761e-07, + "loss": 0.0015, + "num_tokens": 202674694.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1273.0, + "completions/max_terminated_length": 1273.0, + "completions/mean_length": 477.7433166503906, + "completions/mean_terminated_length": 477.7433166503906, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 1.379013312451057, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05206632614135742, + "kl": 0.024444580078125, + "learning_rate": 6.733687002931141e-07, + "loss": 0.0004, + "num_tokens": 203151055.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1891.0, + "completions/mean_length": 472.5245666503906, + "completions/mean_terminated_length": 465.45965576171875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 1.3821456538762726, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.06680779159069061, + "kl": 0.024383544921875, + "learning_rate": 6.678022333039158e-07, + "loss": 0.0147, + "num_tokens": 203625066.0, + "reward": 0.09910715371370316, + "reward_std": 0.0014083485584706068, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1173.0, + "completions/mean_length": 471.9687805175781, + "completions/mean_terminated_length": 464.9013671875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 1.3852779953014878, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.06744161993265152, + "kl": 0.02581787109375, + "learning_rate": 6.623231311627876e-07, + "loss": 0.0102, + "num_tokens": 204099328.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1756.0, + "completions/mean_length": 482.0357360839844, + "completions/mean_terminated_length": 478.5324401855469, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 1.3884103367267033, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.03292333334684372, + "kl": 0.025054931640625, + "learning_rate": 6.569316237618811e-07, + "loss": 0.0111, + "num_tokens": 204580032.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1585.0, + "completions/max_terminated_length": 1585.0, + "completions/mean_length": 473.0714416503906, + "completions/mean_terminated_length": 473.0714416503906, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 1.3915426781519185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003990632481873035, + "kl": 0.025115966796875, + "learning_rate": 6.516279373180499e-07, + "loss": 0.0003, + "num_tokens": 205056364.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1830.0, + "completions/mean_length": 461.15850830078125, + "completions/mean_terminated_length": 457.6084899902344, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 1.394675019577134, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.06944836676120758, + "kl": 0.02813720703125, + "learning_rate": 6.464122943633543e-07, + "loss": 0.0043, + "num_tokens": 205519511.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1736.0, + "completions/mean_length": 473.9888610839844, + "completions/mean_terminated_length": 466.9305114746094, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 1.3978073610023491, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.06057046353816986, + "kl": 0.027374267578125, + "learning_rate": 6.412849137357271e-07, + "loss": 0.0112, + "num_tokens": 205990182.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1715.0, + "completions/max_terminated_length": 1715.0, + "completions/mean_length": 482.5223388671875, + "completions/mean_terminated_length": 482.5223388671875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 1.4009397024275647, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.02567136101424694, + "kl": 0.024810791015625, + "learning_rate": 6.3624601056979e-07, + "loss": -0.0025, + "num_tokens": 206460972.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1937.0, + "completions/mean_length": 477.6763610839844, + "completions/mean_terminated_length": 470.6345520019531, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 1.4040720438527798, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04776311665773392, + "kl": 0.0274658203125, + "learning_rate": 6.312957962878278e-07, + "loss": 0.0164, + "num_tokens": 206926083.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1220.0, + "completions/max_terminated_length": 1220.0, + "completions/mean_length": 458.3526916503906, + "completions/mean_terminated_length": 458.3526916503906, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 1.4072043852779954, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.0575631819665432, + "kl": 0.024932861328125, + "learning_rate": 6.264344785909181e-07, + "loss": 0.0033, + "num_tokens": 207380765.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1436.0, + "completions/mean_length": 496.8504638671875, + "completions/mean_terminated_length": 493.38031005859375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 1.4103367267032105, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.03922717645764351, + "kl": 0.02581787109375, + "learning_rate": 6.216622614502149e-07, + "loss": -0.0002, + "num_tokens": 207872270.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1875.0, + "completions/max_terminated_length": 1875.0, + "completions/mean_length": 468.21429443359375, + "completions/mean_terminated_length": 468.21429443359375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 1.413469068128426, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.024948997423052788, + "kl": 0.02484130859375, + "learning_rate": 6.169793450983916e-07, + "loss": 0.0003, + "num_tokens": 208347890.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1728.0, + "completions/max_terminated_length": 1728.0, + "completions/mean_length": 478.4442138671875, + "completions/mean_terminated_length": 478.4442138671875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 1.4166014095536412, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.05131931230425835, + "kl": 0.027587890625, + "learning_rate": 6.123859260212393e-07, + "loss": 0.0057, + "num_tokens": 208835433.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1463.0, + "completions/mean_length": 470.8951110839844, + "completions/mean_terminated_length": 463.8229064941406, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 1.4197337509788568, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.05947602540254593, + "kl": 0.031524658203125, + "learning_rate": 6.07882196949423e-07, + "loss": -0.0008, + "num_tokens": 209321922.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1731.0, + "completions/max_terminated_length": 1731.0, + "completions/mean_length": 482.9732360839844, + "completions/mean_terminated_length": 482.9732360839844, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 1.422866092404072, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.06987164169549942, + "kl": 0.02783203125, + "learning_rate": 6.034683468503948e-07, + "loss": 0.0089, + "num_tokens": 209791234.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1447.0, + "completions/mean_length": 470.7969055175781, + "completions/mean_terminated_length": 467.2684631347656, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 1.4259984338292875, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04625406488776207, + "kl": 0.02459716796875, + "learning_rate": 5.991445609204641e-07, + "loss": 0.0063, + "num_tokens": 210277471.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1426.0, + "completions/mean_length": 490.8326110839844, + "completions/mean_terminated_length": 487.3489990234375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 1.4291307752545026, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.0674547329545021, + "kl": 0.025299072265625, + "learning_rate": 5.949110205770292e-07, + "loss": 0.0011, + "num_tokens": 210758448.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1361.0, + "completions/mean_length": 489.04241943359375, + "completions/mean_terminated_length": 485.5548095703125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 1.4322631166797182, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.0695299431681633, + "kl": 0.02850341796875, + "learning_rate": 5.90767903450964e-07, + "loss": 0.0103, + "num_tokens": 211248507.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1651.0, + "completions/mean_length": 461.9576110839844, + "completions/mean_terminated_length": 458.4093933105469, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 1.4353954581049335, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.06716445088386536, + "kl": 0.0242919921875, + "learning_rate": 5.867153833791652e-07, + "loss": 0.0091, + "num_tokens": 211734692.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1846.0, + "completions/max_terminated_length": 1846.0, + "completions/mean_length": 495.0714416503906, + "completions/mean_terminated_length": 495.0714416503906, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 1.4385277995301489, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.026018859818577766, + "kl": 0.023956298828125, + "learning_rate": 5.827536303972587e-07, + "loss": 0.0002, + "num_tokens": 212220668.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1368.0, + "completions/max_terminated_length": 1368.0, + "completions/mean_length": 458.6607360839844, + "completions/mean_terminated_length": 458.6607360839844, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 1.4416601409553642, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005018766038119793, + "kl": 0.024810791015625, + "learning_rate": 5.78882810732465e-07, + "loss": 0.0002, + "num_tokens": 212680360.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1576.0, + "completions/max_terminated_length": 1576.0, + "completions/mean_length": 466.060302734375, + "completions/mean_terminated_length": 466.060302734375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 1.4447924823805796, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0037062016781419516, + "kl": 0.023284912109375, + "learning_rate": 5.75103086796625e-07, + "loss": 0.0002, + "num_tokens": 213148359.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1786.0, + "completions/mean_length": 476.9576110839844, + "completions/mean_terminated_length": 473.4429626464844, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 1.447924823805795, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.06331861764192581, + "kl": 0.0240478515625, + "learning_rate": 5.714146171793846e-07, + "loss": 0.0068, + "num_tokens": 213618540.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1947.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 493.4844055175781, + "completions/mean_terminated_length": 493.4844055175781, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 1.4510571652310102, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.06573637574911118, + "kl": 0.02557373046875, + "learning_rate": 5.678175566415422e-07, + "loss": -0.0058, + "num_tokens": 214105357.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1582.0, + "completions/max_terminated_length": 1582.0, + "completions/mean_length": 485.0625305175781, + "completions/mean_terminated_length": 485.0625305175781, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 1.4541895066562256, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.0482289083302021, + "kl": 0.02227783203125, + "learning_rate": 5.643120561085528e-07, + "loss": 0.0014, + "num_tokens": 214597601.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1312.0, + "completions/max_terminated_length": 1312.0, + "completions/mean_length": 491.9219055175781, + "completions/mean_terminated_length": 491.9219055175781, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 1.457321848081441, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.056341905146837234, + "kl": 0.023773193359375, + "learning_rate": 5.608982626641991e-07, + "loss": -0.0007, + "num_tokens": 215091246.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1699.0, + "completions/mean_length": 482.04913330078125, + "completions/mean_terminated_length": 478.5458679199219, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 1.4604541895066563, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.07036087661981583, + "kl": 0.0262451171875, + "learning_rate": 5.575763195444166e-07, + "loss": 0.0102, + "num_tokens": 215567200.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1483.0, + "completions/max_terminated_length": 1483.0, + "completions/mean_length": 486.29241943359375, + "completions/mean_terminated_length": 486.29241943359375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 1.4635865309318716, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.06618266552686691, + "kl": 0.022735595703125, + "learning_rate": 5.543463661312847e-07, + "loss": 0.0073, + "num_tokens": 216049055.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1991.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 472.544677734375, + "completions/mean_terminated_length": 472.544677734375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 1.466718872357087, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.07233790308237076, + "kl": 0.023529052734375, + "learning_rate": 5.512085379471808e-07, + "loss": 0.0092, + "num_tokens": 216512551.0, + "reward": 0.09888394176959991, + "reward_std": 0.0018547771032899618, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1370.0, + "completions/max_terminated_length": 1370.0, + "completions/mean_length": 454.51788330078125, + "completions/mean_terminated_length": 454.51788330078125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 1.4698512137823023, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.07708113640546799, + "kl": 0.027862548828125, + "learning_rate": 5.481629666490903e-07, + "loss": 0.0052, + "num_tokens": 216960739.0, + "reward": 0.09888394176959991, + "reward_std": 0.0018547771032899618, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1147.0, + "completions/max_terminated_length": 1147.0, + "completions/mean_length": 478.0982360839844, + "completions/mean_terminated_length": 478.0982360839844, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 1.4729835552075177, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.058717548847198486, + "kl": 0.0230712890625, + "learning_rate": 5.452097800230853e-07, + "loss": 0.0031, + "num_tokens": 217448251.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1134.0, + "completions/max_terminated_length": 1134.0, + "completions/mean_length": 452.10491943359375, + "completions/mean_terminated_length": 452.10491943359375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 1.476115896632733, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.037330519407987595, + "kl": 0.0238037109375, + "learning_rate": 5.423491019789623e-07, + "loss": -0.0009, + "num_tokens": 217909894.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1714.0, + "completions/mean_length": 470.7589416503906, + "completions/mean_terminated_length": 463.6861267089844, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 1.4792482380579484, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.0496637262403965, + "kl": 0.02435302734375, + "learning_rate": 5.395810525450425e-07, + "loss": 0.0042, + "num_tokens": 218383290.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1579.0, + "completions/max_terminated_length": 1579.0, + "completions/mean_length": 467.20538330078125, + "completions/mean_terminated_length": 467.20538330078125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 1.4823805794831637, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.02580227144062519, + "kl": 0.024078369140625, + "learning_rate": 5.369057478631359e-07, + "loss": 0.0002, + "num_tokens": 218843014.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1588.0, + "completions/max_terminated_length": 1588.0, + "completions/mean_length": 472.33038330078125, + "completions/mean_terminated_length": 472.33038330078125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 1.485512920908379, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05219980701804161, + "kl": 0.025054931640625, + "learning_rate": 5.343233001836694e-07, + "loss": 0.0017, + "num_tokens": 219308998.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1497.0, + "completions/max_terminated_length": 1497.0, + "completions/mean_length": 449.6719055175781, + "completions/mean_terminated_length": 449.6719055175781, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 1.4886452623335944, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.0831604078412056, + "kl": 0.02471923828125, + "learning_rate": 5.318338178609754e-07, + "loss": 0.0078, + "num_tokens": 219770947.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1604.0, + "completions/max_terminated_length": 1604.0, + "completions/mean_length": 473.2656555175781, + "completions/mean_terminated_length": 473.2656555175781, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 1.4917776037588097, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.029114052653312683, + "kl": 0.024505615234375, + "learning_rate": 5.294374053487459e-07, + "loss": 0.0009, + "num_tokens": 220240870.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1456.0, + "completions/mean_length": 471.7276916503906, + "completions/mean_terminated_length": 468.20135498046875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 1.494909945184025, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05871419981122017, + "kl": 0.02642822265625, + "learning_rate": 5.271341631956511e-07, + "loss": 0.0022, + "num_tokens": 220700876.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1806.0, + "completions/mean_length": 483.5535888671875, + "completions/mean_terminated_length": 480.0536804199219, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 1.4980422866092404, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.041242510080337524, + "kl": 0.023712158203125, + "learning_rate": 5.249241880411181e-07, + "loss": 0.0046, + "num_tokens": 221173236.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1886.0, + "completions/max_terminated_length": 1886.0, + "completions/mean_length": 470.4062805175781, + "completions/mean_terminated_length": 470.4062805175781, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 1.5011746280344558, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.06449981778860092, + "kl": 0.02703857421875, + "learning_rate": 5.228075726112785e-07, + "loss": 0.0039, + "num_tokens": 221651418.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1518.0, + "completions/mean_length": 486.38616943359375, + "completions/mean_terminated_length": 482.8926086425781, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 1.5043069694596711, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.08103009313344955, + "kl": 0.025360107421875, + "learning_rate": 5.207844057150768e-07, + "loss": 0.0097, + "num_tokens": 222142515.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1153.0, + "completions/max_terminated_length": 1153.0, + "completions/mean_length": 478.21429443359375, + "completions/mean_terminated_length": 478.21429443359375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 1.5074393108848865, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.046365488320589066, + "kl": 0.02337646484375, + "learning_rate": 5.188547722405437e-07, + "loss": 0.0008, + "num_tokens": 222622099.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1171.0, + "completions/max_terminated_length": 1171.0, + "completions/mean_length": 449.22100830078125, + "completions/mean_terminated_length": 449.22100830078125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 1.5105716523101018, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.035620175302028656, + "kl": 0.02386474609375, + "learning_rate": 5.170187531512351e-07, + "loss": 0.0001, + "num_tokens": 223083242.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1915.0, + "completions/max_terminated_length": 1915.0, + "completions/mean_length": 486.93975830078125, + "completions/mean_terminated_length": 486.93975830078125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 1.5137039937353172, + "frac_reward_zero_std": 0.9553571939468384, + "grad_norm": 0.07992950081825256, + "kl": 0.02490234375, + "learning_rate": 5.152764254828348e-07, + "loss": 0.0078, + "num_tokens": 223569875.0, + "reward": 0.09888394176959991, + "reward_std": 0.0022321429569274187, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1455.0, + "completions/mean_length": 475.8482360839844, + "completions/mean_terminated_length": 472.3310852050781, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 1.5168363351605325, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.09052633494138718, + "kl": 0.024261474609375, + "learning_rate": 5.136278623399225e-07, + "loss": 0.0115, + "num_tokens": 224043279.0, + "reward": 0.09888393431901932, + "reward_std": 0.0018547771032899618, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9888392686843872, + "rewards/format_reward/std": 0.10517053306102753, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1678.0, + "completions/max_terminated_length": 1678.0, + "completions/mean_length": 484.4263610839844, + "completions/mean_terminated_length": 484.4263610839844, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 1.5199686765857479, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.05671367421746254, + "kl": 0.02569580078125, + "learning_rate": 5.120731328929058e-07, + "loss": 0.007, + "num_tokens": 224519662.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1739.0, + "completions/mean_length": 470.57366943359375, + "completions/mean_terminated_length": 467.04473876953125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 1.5231010180109632, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.0533718578517437, + "kl": 0.02435302734375, + "learning_rate": 5.106123023751187e-07, + "loss": 0.0026, + "num_tokens": 224990235.0, + "reward": 0.09955357760190964, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1612.0, + "completions/max_terminated_length": 1612.0, + "completions/mean_length": 487.29241943359375, + "completions/mean_terminated_length": 487.29241943359375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 1.5262333594361785, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.05758502334356308, + "kl": 0.02410888671875, + "learning_rate": 5.092454320800833e-07, + "loss": 0.0077, + "num_tokens": 225477934.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1311.0, + "completions/max_terminated_length": 1311.0, + "completions/mean_length": 480.05804443359375, + "completions/mean_terminated_length": 480.05804443359375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 1.529365700861394, + "frac_reward_zero_std": 0.9821429252624512, + "grad_norm": 0.04527491703629494, + "kl": 0.02484130859375, + "learning_rate": 5.079725793589405e-07, + "loss": -0.0018, + "num_tokens": 225950836.0, + "reward": 0.09955357015132904, + "reward_std": 0.0008928572060540318, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9955357313156128, + "rewards/format_reward/std": 0.06674052774906158, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1675.0, + "completions/max_terminated_length": 1675.0, + "completions/mean_length": 470.69866943359375, + "completions/mean_terminated_length": 470.69866943359375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 1.5324980422866092, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.06746812909841537, + "kl": 0.024200439453125, + "learning_rate": 5.067937976180407e-07, + "loss": 0.0044, + "num_tokens": 226414141.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1295.0, + "completions/mean_length": 454.9442138671875, + "completions/mean_terminated_length": 451.38031005859375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 1.5356303837118246, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.0758332759141922, + "kl": 0.02435302734375, + "learning_rate": 5.057091363167046e-07, + "loss": 0.0095, + "num_tokens": 226881188.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1997.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 468.544677734375, + "completions/mean_terminated_length": 468.544677734375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 1.53876272513704, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.034185778349637985, + "kl": 0.024810791015625, + "learning_rate": 5.047186409651489e-07, + "loss": 0.0011, + "num_tokens": 227343752.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1299.0, + "completions/mean_length": 464.2879638671875, + "completions/mean_terminated_length": 460.7449645996094, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 1.5418950665622553, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.052202969789505005, + "kl": 0.024658203125, + "learning_rate": 5.038223531225742e-07, + "loss": 0.0093, + "num_tokens": 227810269.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1853.0, + "completions/max_terminated_length": 1853.0, + "completions/mean_length": 475.107177734375, + "completions/mean_terminated_length": 475.107177734375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 1.5450274079874706, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.05039405822753906, + "kl": 0.02630615234375, + "learning_rate": 5.030203103954232e-07, + "loss": 0.0046, + "num_tokens": 228290465.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843171834946, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1567.0, + "completions/max_terminated_length": 1567.0, + "completions/mean_length": 484.6607360839844, + "completions/mean_terminated_length": 484.6607360839844, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 1.548159749412686, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004147546365857124, + "kl": 0.023651123046875, + "learning_rate": 5.023125464358026e-07, + "loss": 0.0002, + "num_tokens": 228789737.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1409.0, + "completions/max_terminated_length": 1409.0, + "completions/mean_length": 487.6451110839844, + "completions/mean_terminated_length": 487.6451110839844, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 1.5512920908379013, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.05506591126322746, + "kl": 0.02447509765625, + "learning_rate": 5.016990909400709e-07, + "loss": -0.0005, + "num_tokens": 229284114.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349845170975, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1831.0, + "completions/max_terminated_length": 1831.0, + "completions/mean_length": 470.622802734375, + "completions/mean_terminated_length": 470.622802734375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 1.5544244322631167, + "frac_reward_zero_std": 0.9642857313156128, + "grad_norm": 0.06729212403297424, + "kl": 0.023956298828125, + "learning_rate": 5.011799696475915e-07, + "loss": 0.003, + "num_tokens": 229754005.0, + "reward": 0.09910715371370316, + "reward_std": 0.0017857144121080637, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9910714030265808, + "rewards/format_reward/std": 0.09417349100112915, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1789.0, + "completions/max_terminated_length": 1789.0, + "completions/mean_length": 477.950927734375, + "completions/mean_terminated_length": 477.950927734375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 1.557556773688332, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.051064975559711456, + "kl": 0.0235595703125, + "learning_rate": 5.007552043396547e-07, + "loss": 0.0011, + "num_tokens": 230228635.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1859.0, + "completions/mean_length": 478.8437805175781, + "completions/mean_terminated_length": 468.2651672363281, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 1.5606891151135474, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.06102602183818817, + "kl": 0.02349853515625, + "learning_rate": 5.004248128385618e-07, + "loss": 0.0222, + "num_tokens": 230696589.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 1462.0, + "completions/max_terminated_length": 1462.0, + "completions/mean_length": 488.8281555175781, + "completions/mean_terminated_length": 488.8281555175781, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 1.5638214565387627, + "frac_reward_zero_std": 0.973214328289032, + "grad_norm": 0.05913792923092842, + "kl": 0.02447509765625, + "learning_rate": 5.001888090068784e-07, + "loss": 0.006, + "num_tokens": 231177316.0, + "reward": 0.0993303582072258, + "reward_std": 0.0013392858672887087, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9933035969734192, + "rewards/format_reward/std": 0.08164843916893005, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 2035.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 472.98663330078125, + "completions/mean_terminated_length": 472.98663330078125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 1.566953797963978, + "frac_reward_zero_std": 0.9910714626312256, + "grad_norm": 0.03334412723779678, + "kl": 0.02264404296875, + "learning_rate": 5.000472027468528e-07, + "loss": 0.0002, + "num_tokens": 231647270.0, + "reward": 0.09977678954601288, + "reward_std": 0.0004464286030270159, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9977678656578064, + "rewards/format_reward/std": 0.047245558351278305, + "step": 500 + }, + { + "epoch": 1.566953797963978, + "step": 500, + "total_flos": 0.0, + "train_loss": 0.005327709036401529, + "train_runtime": 36562.0256, + "train_samples_per_second": 6.127, + "train_steps_per_second": 0.014 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 231647270, + "num_train_epochs": 2, + "save_steps": 25, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}