{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.566953797963978, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 535.109375, "completions/mean_terminated_length": 514.5724487304688, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.0031323414252153485, "frac_reward_zero_std": 0.8214285969734192, "grad_norm": 0.1588766723871231, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0193, "num_tokens": 515065.0, "reward": 0.00491071492433548, "reward_std": 0.009066697210073471, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.0491071417927742, "rewards/format_reward/std": 0.2163332849740982, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1904.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 511.2812805175781, "completions/mean_terminated_length": 511.2812805175781, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.006264682850430697, "frac_reward_zero_std": 0.910714328289032, "grad_norm": 0.11207292228937149, "kl": 0.0, "learning_rate": 3.3333333333333335e-07, "loss": 0.0045, "num_tokens": 1017367.0, "reward": 0.002455357229337096, "reward_std": 0.0045333486050367355, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.0245535708963871, "rewards/format_reward/std": 0.1549331247806549, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1713.0, "completions/mean_length": 501.138427734375, "completions/mean_terminated_length": 497.6778564453125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.009397024275646046, "frac_reward_zero_std": 0.848214328289032, "grad_norm": 0.14721733331680298, "kl": 0.00030612945556640625, "learning_rate": 6.666666666666667e-07, "loss": 0.005, "num_tokens": 1507341.0, "reward": 0.004464285913854837, "reward_std": 0.007796474266797304, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.0446428582072258, "rewards/format_reward/std": 0.2067493349313736, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1704.0, "completions/mean_length": 511.1808166503906, "completions/mean_terminated_length": 497.3356018066406, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.012529365700861394, "frac_reward_zero_std": 0.8750000596046448, "grad_norm": 0.130089670419693, "kl": 0.0003070831298828125, "learning_rate": 1.0000000000000002e-06, "loss": 0.0006, "num_tokens": 2003446.0, "reward": 0.0035714288242161274, "reward_std": 0.006388125941157341, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.0357142873108387, "rewards/format_reward/std": 0.18578433990478516, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 513.8460083007812, "completions/mean_terminated_length": 503.5033874511719, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.015661707126076743, "frac_reward_zero_std": 0.848214328289032, "grad_norm": 0.1528593897819519, "kl": 0.0003299713134765625, "learning_rate": 1.3333333333333334e-06, "loss": -0.0062, "num_tokens": 2501409.0, "reward": 0.004017857369035482, "reward_std": 0.007658348884433508, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.0401785708963871, "rewards/format_reward/std": 0.1965973675251007, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1664.0, "completions/mean_length": 487.3638610839844, "completions/mean_terminated_length": 480.3655090332031, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.018794048551292093, "frac_reward_zero_std": 0.7053571939468384, "grad_norm": 0.22701597213745117, "kl": 0.0003814697265625, "learning_rate": 1.6666666666666667e-06, "loss": 0.0048, "num_tokens": 2977796.0, "reward": 0.007366071455180645, "reward_std": 0.01473214291036129, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.0736607164144516, "rewards/format_reward/std": 0.2615099549293518, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1896.0, "completions/mean_length": 471.4687805175781, "completions/mean_terminated_length": 464.3991394042969, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.02192638997650744, "frac_reward_zero_std": 0.5535714626312256, "grad_norm": 0.2991149127483368, "kl": 0.0009431838989257812, "learning_rate": 2.0000000000000003e-06, "loss": 0.0301, "num_tokens": 3447578.0, "reward": 0.01584821566939354, "reward_std": 0.023357370868325233, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.1584821492433548, "rewards/format_reward/std": 0.36560073494911194, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 2041.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 465.4062805175781, "completions/mean_terminated_length": 465.4062805175781, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.025058731401722788, "frac_reward_zero_std": 0.2767857313156128, "grad_norm": 0.40428227186203003, "kl": 0.0018405914306640625, "learning_rate": 2.3333333333333336e-06, "loss": 0.0334, "num_tokens": 3901316.0, "reward": 0.02745535969734192, "reward_std": 0.037818219512701035, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.2745535671710968, "rewards/format_reward/std": 0.4467879831790924, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1880.0, "completions/max_terminated_length": 1880.0, "completions/mean_length": 430.1227722167969, "completions/mean_terminated_length": 430.1227722167969, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.028191072826938137, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.4027329385280609, "kl": 0.00598907470703125, "learning_rate": 2.666666666666667e-06, "loss": 0.0265, "num_tokens": 4364535.0, "reward": 0.05669642984867096, "reward_std": 0.044110190123319626, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.5669642686843872, "rewards/format_reward/std": 0.4960494339466095, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1538.0, "completions/max_terminated_length": 1538.0, "completions/mean_length": 424.27679443359375, "completions/mean_terminated_length": 424.27679443359375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.031323414252153486, "frac_reward_zero_std": 0.4107142984867096, "grad_norm": 0.30183181166648865, "kl": 0.0088348388671875, "learning_rate": 3e-06, "loss": 0.0037, "num_tokens": 4823611.0, "reward": 0.0803571492433548, "reward_std": 0.030293039977550507, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.8035714030265808, "rewards/format_reward/std": 0.39774051308631897, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1890.0, "completions/max_terminated_length": 1890.0, "completions/mean_length": 416.3035888671875, "completions/mean_terminated_length": 416.3035888671875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.03445575567736883, "frac_reward_zero_std": 0.6696428656578064, "grad_norm": 0.23141951858997345, "kl": 0.0111083984375, "learning_rate": 3.3333333333333333e-06, "loss": 0.0071, "num_tokens": 5276331.0, "reward": 0.08973214775323868, "reward_std": 0.01700129732489586, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 363.7388610839844, "completions/mean_terminated_length": 356.1861267089844, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.037588097102584185, "frac_reward_zero_std": 0.8125000596046448, "grad_norm": 0.2404984086751938, "kl": 0.022918701171875, "learning_rate": 3.6666666666666666e-06, "loss": 0.0147, "num_tokens": 5692962.0, "reward": 0.094866082072258, "reward_std": 0.009513124823570251, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1223.0, "completions/max_terminated_length": 1223.0, "completions/mean_length": 333.5669860839844, "completions/mean_terminated_length": 333.5669860839844, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.04072043852779953, "frac_reward_zero_std": 0.8928571939468384, "grad_norm": 0.16233885288238525, "kl": 0.0269775390625, "learning_rate": 4.000000000000001e-06, "loss": 0.0068, "num_tokens": 6094152.0, "reward": 0.0970982238650322, "reward_std": 0.005426206160336733, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9709821343421936, "rewards/format_reward/std": 0.16804419457912445, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 377.9285888671875, "completions/mean_terminated_length": 374.1923828125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.04385277995301488, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.060587041079998016, "kl": 0.02960205078125, "learning_rate": 4.333333333333334e-06, "loss": 0.0093, "num_tokens": 6532916.0, "reward": 0.09955357760190964, "reward_std": 0.000515491352416575, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1431.0, "completions/mean_length": 355.1317138671875, "completions/mean_terminated_length": 347.5403747558594, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.04698512137823023, "frac_reward_zero_std": 0.9375000596046448, "grad_norm": 0.10373149067163467, "kl": 0.029449462890625, "learning_rate": 4.666666666666667e-06, "loss": 0.0023, "num_tokens": 6948731.0, "reward": 0.09843750298023224, "reward_std": 0.0031250002793967724, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12415824085474014, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 378.08929443359375, "completions/mean_terminated_length": 359.2415466308594, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.050117462803445575, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.04813814163208008, "kl": 0.03216552734375, "learning_rate": 5e-06, "loss": -0.0023, "num_tokens": 7378375.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 391.8348388671875, "completions/mean_terminated_length": 376.9144287109375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.05324980422866092, "frac_reward_zero_std": 1.0, "grad_norm": 0.05238564684987068, "kl": 0.0379638671875, "learning_rate": 4.999952797253148e-06, "loss": 0.0004, "num_tokens": 7821865.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1600.0, "completions/mean_length": 390.02679443359375, "completions/mean_terminated_length": 367.5203857421875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.056382145653876274, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.06826696544885635, "kl": 0.03680419921875, "learning_rate": 4.9998111909931225e-06, "loss": 0.0027, "num_tokens": 8266237.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1395.0, "completions/mean_length": 309.99554443359375, "completions/mean_terminated_length": 306.1073913574219, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.05951448707909162, "frac_reward_zero_std": 1.0, "grad_norm": 0.06451667100191116, "kl": 0.04046630859375, "learning_rate": 4.999575187161439e-06, "loss": 0.0004, "num_tokens": 8667079.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1212.0, "completions/max_terminated_length": 1212.0, "completions/mean_length": 319.84375, "completions/mean_terminated_length": 319.84375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.06264682850430697, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.07596829533576965, "kl": 0.03192138671875, "learning_rate": 4.9992447956603455e-06, "loss": 0.0022, "num_tokens": 9072969.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 327.99554443359375, "completions/mean_terminated_length": 324.14764404296875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.06577916992952232, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.09359221905469894, "kl": 0.029449462890625, "learning_rate": 4.998820030352409e-06, "loss": 0.0049, "num_tokens": 9479947.0, "reward": 0.09866072237491608, "reward_std": 0.0023012058809399605, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507843434810638, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1078.0, "completions/max_terminated_length": 1078.0, "completions/mean_length": 348.7321472167969, "completions/mean_terminated_length": 348.7321472167969, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.06891151135473766, "frac_reward_zero_std": 0.9375000596046448, "grad_norm": 0.10616543143987656, "kl": 0.025604248046875, "learning_rate": 4.998300909059929e-06, "loss": -0.0001, "num_tokens": 9892695.0, "reward": 0.09843751043081284, "reward_std": 0.0031250002793967724, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12415824085474014, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1821.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 380.21429443359375, "completions/mean_terminated_length": 380.21429443359375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.07204385277995301, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.06697159260511398, "kl": 0.024871826171875, "learning_rate": 4.997687453564198e-06, "loss": 0.0025, "num_tokens": 10329671.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 405.0312805175781, "completions/mean_terminated_length": 401.355712890625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.07517619420516837, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.04193543270230293, "kl": 0.0230712890625, "learning_rate": 4.9969796896045775e-06, "loss": 0.0011, "num_tokens": 10785897.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 425.32366943359375, "completions/mean_terminated_length": 421.6935119628906, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.07830853563038372, "frac_reward_zero_std": 0.9196429252624512, "grad_norm": 0.16237600147724152, "kl": 0.03460693359375, "learning_rate": 4.996177646877426e-06, "loss": 0.0082, "num_tokens": 11238746.0, "reward": 0.09799107909202576, "reward_std": 0.004017857369035482, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9799107313156128, "rewards/format_reward/std": 0.14046260714530945, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1139.0, "completions/mean_length": 410.6495666503906, "completions/mean_terminated_length": 406.986572265625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.08144087705559906, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.09943713247776031, "kl": 0.027923583984375, "learning_rate": 4.995281359034851e-06, "loss": 0.0115, "num_tokens": 11698873.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1960.0, "completions/max_terminated_length": 1960.0, "completions/mean_length": 434.4419860839844, "completions/mean_terminated_length": 434.4419860839844, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.08457321848081441, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.03789101541042328, "kl": 0.023956298828125, "learning_rate": 4.994290863683296e-06, "loss": -0.0042, "num_tokens": 12154395.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1789.0, "completions/mean_length": 440.96429443359375, "completions/mean_terminated_length": 437.369140625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.08770555990602975, "frac_reward_zero_std": 0.9375000596046448, "grad_norm": 0.09652919322252274, "kl": 0.021881103515625, "learning_rate": 4.99320620238196e-06, "loss": 0.0139, "num_tokens": 12613483.0, "reward": 0.098214291036129, "reward_std": 0.0031940629705786705, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.13258016109466553, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1649.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 435.0826110839844, "completions/mean_terminated_length": 435.0826110839844, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.0908379013312451, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.08473248034715652, "kl": 0.02325439453125, "learning_rate": 4.99202742064106e-06, "loss": 0.0007, "num_tokens": 13072400.0, "reward": 0.09866072237491608, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507843434810638, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1505.0, "completions/max_terminated_length": 1505.0, "completions/mean_length": 456.5714416503906, "completions/mean_terminated_length": 456.5714416503906, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.09397024275646046, "frac_reward_zero_std": 0.8839285969734192, "grad_norm": 0.1065942719578743, "kl": 0.02191162109375, "learning_rate": 4.990754567919917e-06, "loss": -0.0032, "num_tokens": 13540312.0, "reward": 0.09687500447034836, "reward_std": 0.005872634239494801, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17418713867664337, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1282.0, "completions/max_terminated_length": 1282.0, "completions/mean_length": 447.4687805175781, "completions/mean_terminated_length": 447.4687805175781, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0971025841816758, "frac_reward_zero_std": 0.8660714626312256, "grad_norm": 0.13856230676174164, "kl": 0.02203369140625, "learning_rate": 4.989387697624881e-06, "loss": 0.0031, "num_tokens": 13992606.0, "reward": 0.09620536118745804, "reward_std": 0.006834554020315409, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9620535969734192, "rewards/format_reward/std": 0.19128035008907318, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1329.0, "completions/max_terminated_length": 1329.0, "completions/mean_length": 425.32366943359375, "completions/mean_terminated_length": 425.32366943359375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.10023492560689115, "frac_reward_zero_std": 0.8839285969734192, "grad_norm": 0.13931778073310852, "kl": 0.02252197265625, "learning_rate": 4.987926867107095e-06, "loss": 0.0025, "num_tokens": 14435647.0, "reward": 0.09687500447034836, "reward_std": 0.005872634705156088, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17418713867664337, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1827.0, "completions/max_terminated_length": 1827.0, "completions/mean_length": 442.65850830078125, "completions/mean_terminated_length": 442.65850830078125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.1033672670321065, "frac_reward_zero_std": 0.9375000596046448, "grad_norm": 0.09321776777505875, "kl": 0.01947021484375, "learning_rate": 4.986372137660078e-06, "loss": 0.0037, "num_tokens": 14891178.0, "reward": 0.09843751043081284, "reward_std": 0.0031250002793967724, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12415824085474014, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1356.0, "completions/max_terminated_length": 1356.0, "completions/mean_length": 430.6317138671875, "completions/mean_terminated_length": 430.6317138671875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.10649960845732184, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.051053401082754135, "kl": 0.019927978515625, "learning_rate": 4.984723574517165e-06, "loss": 0.0016, "num_tokens": 15347217.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1230.0, "completions/max_terminated_length": 1230.0, "completions/mean_length": 404.8125305175781, "completions/mean_terminated_length": 404.8125305175781, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.1096319498825372, "frac_reward_zero_std": 0.9375000596046448, "grad_norm": 0.11693686246871948, "kl": 0.020751953125, "learning_rate": 4.9829812468487655e-06, "loss": 0.0029, "num_tokens": 15791533.0, "reward": 0.09843751043081284, "reward_std": 0.0031250002793967724, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12415824085474014, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1726.0, "completions/max_terminated_length": 1726.0, "completions/mean_length": 406.04241943359375, "completions/mean_terminated_length": 406.04241943359375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.11276429130775255, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.07735206186771393, "kl": 0.01849365234375, "learning_rate": 4.981145227759457e-06, "loss": 0.0085, "num_tokens": 16235252.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 916.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 376.4107360839844, "completions/mean_terminated_length": 376.4107360839844, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.1158966327329679, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.11777744442224503, "kl": 0.03057861328125, "learning_rate": 4.979215594284924e-06, "loss": 0.0019, "num_tokens": 16668752.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1550.0, "completions/max_terminated_length": 1550.0, "completions/mean_length": 373.4910888671875, "completions/mean_terminated_length": 373.4910888671875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.11902897415818324, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.05367812514305115, "kl": 0.020263671875, "learning_rate": 4.977192427388722e-06, "loss": 0.0039, "num_tokens": 17100684.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1414.0, "completions/mean_length": 367.5401916503906, "completions/mean_terminated_length": 363.78076171875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.12216131558339859, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.06863244622945786, "kl": 0.020965576171875, "learning_rate": 4.9750758119588824e-06, "loss": 0.0149, "num_tokens": 17521854.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1082.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 345.2969055175781, "completions/mean_terminated_length": 345.2969055175781, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.12529365700861395, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.08065449446439743, "kl": 0.027313232421875, "learning_rate": 4.972865836804349e-06, "loss": 0.0035, "num_tokens": 17960603.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1700.0, "completions/mean_length": 342.7500305175781, "completions/mean_terminated_length": 338.93511962890625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.1284259984338293, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.07921568304300308, "kl": 0.030120849609375, "learning_rate": 4.970562594651254e-06, "loss": 0.014, "num_tokens": 18372139.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 352.82366943359375, "completions/mean_terminated_length": 349.03131103515625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.13155833985904464, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.10244850814342499, "kl": 0.02459716796875, "learning_rate": 4.968166182139026e-06, "loss": 0.0164, "num_tokens": 18782632.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 378.3571472167969, "completions/mean_terminated_length": 374.6219177246094, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.13469068128425998, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.07367311418056488, "kl": 0.02655029296875, "learning_rate": 4.9656766998163306e-06, "loss": 0.0034, "num_tokens": 19215352.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 387.7276916503906, "completions/mean_terminated_length": 384.013427734375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.13782302270947533, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.061480794101953506, "kl": 0.029449462890625, "learning_rate": 4.963094252136865e-06, "loss": 0.0038, "num_tokens": 19646938.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1553.0, "completions/mean_length": 403.6339416503906, "completions/mean_terminated_length": 399.95526123046875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.14095536413469067, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.09340798854827881, "kl": 0.030120849609375, "learning_rate": 4.960418947454958e-06, "loss": 0.0266, "num_tokens": 20090470.0, "reward": 0.09866072982549667, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507843434810638, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1770.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 416.0446472167969, "completions/mean_terminated_length": 416.0446472167969, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.14408770555990602, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.054629698395729065, "kl": 0.0316162109375, "learning_rate": 4.957650898021038e-06, "loss": 0.002, "num_tokens": 20539942.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 416.3348388671875, "completions/mean_terminated_length": 412.6845703125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.14722004698512137, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.05650290846824646, "kl": 0.030487060546875, "learning_rate": 4.954790219976915e-06, "loss": 0.0012, "num_tokens": 20996604.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 434.2254638671875, "completions/mean_terminated_length": 423.3460693359375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.15035238841033674, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.06960015743970871, "kl": 0.025848388671875, "learning_rate": 4.95183703335091e-06, "loss": 0.0056, "num_tokens": 21456157.0, "reward": 0.09910715371370316, "reward_std": 0.0014083485584706068, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1497.0, "completions/mean_length": 414.5089416503906, "completions/mean_terminated_length": 410.8545837402344, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.1534847298355521, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03207956254482269, "kl": 0.02642822265625, "learning_rate": 4.948791462052819e-06, "loss": -0.0007, "num_tokens": 21900921.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1192.0, "completions/mean_length": 421.95538330078125, "completions/mean_terminated_length": 418.31768798828125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.15661707126076743, "frac_reward_zero_std": 0.9375000596046448, "grad_norm": 0.10218677669763565, "kl": 0.024322509765625, "learning_rate": 4.945653633868716e-06, "loss": 0.0095, "num_tokens": 22344673.0, "reward": 0.09843750298023224, "reward_std": 0.0031250002793967724, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12415824085474014, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1796.0, "completions/mean_length": 424.70538330078125, "completions/mean_terminated_length": 421.0738220214844, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.15974941268598278, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.08220592886209488, "kl": 0.023101806640625, "learning_rate": 4.942423680455584e-06, "loss": 0.0144, "num_tokens": 22790569.0, "reward": 0.09843751043081284, "reward_std": 0.002747634658589959, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12415824085474014, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1835.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 398.90179443359375, "completions/mean_terminated_length": 398.90179443359375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.16288175411119812, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.05870000645518303, "kl": 0.026336669921875, "learning_rate": 4.939101737335802e-06, "loss": -0.0019, "num_tokens": 23234153.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 2020.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 433.2946472167969, "completions/mean_terminated_length": 433.2946472167969, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.16601409553641347, "frac_reward_zero_std": 1.0, "grad_norm": 0.007355809677392244, "kl": 0.019683837890625, "learning_rate": 4.935687943891447e-06, "loss": 0.0002, "num_tokens": 23696993.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1589.0, "completions/mean_length": 424.2745666503906, "completions/mean_terminated_length": 420.6420593261719, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.16914643696162882, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.09630747884511948, "kl": 0.020782470703125, "learning_rate": 4.932182443358458e-06, "loss": 0.018, "num_tokens": 24149676.0, "reward": 0.09866072982549667, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507843434810638, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1167.0, "completions/mean_length": 408.0067138671875, "completions/mean_terminated_length": 404.3377990722656, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.17227877838684416, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.042548321187496185, "kl": 0.0206298828125, "learning_rate": 4.928585382820616e-06, "loss": 0.0065, "num_tokens": 24589267.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1481.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 414.7879638671875, "completions/mean_terminated_length": 414.7879638671875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.1754111198120595, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03165193274617195, "kl": 0.019256591796875, "learning_rate": 4.924896913203376e-06, "loss": -0.0002, "num_tokens": 25029316.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1298.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 426.3125305175781, "completions/mean_terminated_length": 426.3125305175781, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.17854346123727485, "frac_reward_zero_std": 1.0, "grad_norm": 0.006991757545620203, "kl": 0.01995849609375, "learning_rate": 4.921117189267535e-06, "loss": 0.0002, "num_tokens": 25477700.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 445.30804443359375, "completions/mean_terminated_length": 441.72259521484375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.1816758026624902, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.07717428356409073, "kl": 0.02203369140625, "learning_rate": 4.917246369602742e-06, "loss": 0.0102, "num_tokens": 25939614.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1584.0, "completions/mean_length": 474.216552734375, "completions/mean_terminated_length": 470.69573974609375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.18480814408770557, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.08366987109184265, "kl": 0.032501220703125, "learning_rate": 4.9132846166208355e-06, "loss": 0.009, "num_tokens": 26426115.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 452.6495666503906, "completions/mean_terminated_length": 441.8943786621094, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.18794048551292092, "frac_reward_zero_std": 0.9375000596046448, "grad_norm": 0.5095189213752747, "kl": 0.129974365234375, "learning_rate": 4.9092320965490365e-06, "loss": 0.0245, "num_tokens": 26891606.0, "reward": 0.098214291036129, "reward_std": 0.0031940629705786705, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.13258016109466553, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1818.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 458.08038330078125, "completions/mean_terminated_length": 458.08038330078125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.19107282693813626, "frac_reward_zero_std": 0.9285714626312256, "grad_norm": 0.09414026886224747, "kl": 0.0223388671875, "learning_rate": 4.905088979422971e-06, "loss": 0.0051, "num_tokens": 27374278.0, "reward": 0.098214291036129, "reward_std": 0.0035714288242161274, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.13258016109466553, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 438.8683166503906, "completions/mean_terminated_length": 435.2684631347656, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.1942051683633516, "frac_reward_zero_std": 0.5089285969734192, "grad_norm": 0.256947785615921, "kl": 0.025604248046875, "learning_rate": 4.900855439079536e-06, "loss": 0.0347, "num_tokens": 27822987.0, "reward": 0.08638394623994827, "reward_std": 0.02482982538640499, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.8638392686843872, "rewards/format_reward/std": 0.34334254264831543, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 420.0602722167969, "completions/mean_terminated_length": 416.4183349609375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.19733750978856696, "frac_reward_zero_std": 0.3839285969734192, "grad_norm": 0.31054815649986267, "kl": 0.030364990234375, "learning_rate": 4.8965316531496055e-06, "loss": 0.0106, "num_tokens": 28281678.0, "reward": 0.07723214477300644, "reward_std": 0.03239201754331589, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.7723214030265808, "rewards/format_reward/std": 0.41980281472206116, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 460.95538330078125, "completions/mean_terminated_length": 450.2561950683594, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.2004698512137823, "frac_reward_zero_std": 0.3214285969734192, "grad_norm": 0.31729573011398315, "kl": 0.029296875, "learning_rate": 4.892117803050578e-06, "loss": 0.0504, "num_tokens": 28739002.0, "reward": 0.07834821939468384, "reward_std": 0.035240765661001205, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.7834821343421936, "rewards/format_reward/std": 0.41233164072036743, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1740.0, "completions/max_terminated_length": 1740.0, "completions/mean_length": 453.8638610839844, "completions/mean_terminated_length": 453.8638610839844, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.20360219263899765, "frac_reward_zero_std": 0.3839285969734192, "grad_norm": 0.28879427909851074, "kl": 0.03399658203125, "learning_rate": 4.887614073978761e-06, "loss": -0.0113, "num_tokens": 29206825.0, "reward": 0.0792410746216774, "reward_std": 0.03218482807278633, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.7924107313156128, "rewards/format_reward/std": 0.4060344398021698, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 423.07366943359375, "completions/mean_terminated_length": 419.4384765625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.206734534064213, "frac_reward_zero_std": 0.5446428656578064, "grad_norm": 0.25178584456443787, "kl": 0.0355224609375, "learning_rate": 4.883020654901609e-06, "loss": 0.0192, "num_tokens": 29646098.0, "reward": 0.08549107611179352, "reward_std": 0.023734737187623978, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.8549107313156128, "rewards/format_reward/std": 0.3525845408439636, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1721.0, "completions/mean_length": 450.1094055175781, "completions/mean_terminated_length": 435.7139892578125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.20986687548942834, "frac_reward_zero_std": 0.660714328289032, "grad_norm": 0.21421125531196594, "kl": 0.03936767578125, "learning_rate": 4.878337738549785e-06, "loss": 0.012, "num_tokens": 30112767.0, "reward": 0.08995536714792252, "reward_std": 0.01744772680103779, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 469.14288330078125, "completions/mean_terminated_length": 454.9189147949219, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.21299921691464369, "frac_reward_zero_std": 0.7321428656578064, "grad_norm": 0.20110578835010529, "kl": 0.048583984375, "learning_rate": 4.873565521409082e-06, "loss": 0.0035, "num_tokens": 30573215.0, "reward": 0.09218750894069672, "reward_std": 0.013738172128796577, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 500.779052734375, "completions/mean_terminated_length": 479.7760314941406, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.21613155833985903, "frac_reward_zero_std": 0.7321428656578064, "grad_norm": 0.18321290612220764, "kl": 0.04046630859375, "learning_rate": 4.868704203712173e-06, "loss": 0.0113, "num_tokens": 31056144.0, "reward": 0.09308035671710968, "reward_std": 0.01346192043274641, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 527.3482666015625, "completions/mean_terminated_length": 499.6999816894531, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.2192638997650744, "frac_reward_zero_std": 0.8125000596046448, "grad_norm": 0.15901301801204681, "kl": 0.0457763671875, "learning_rate": 4.86375398943021e-06, "loss": 0.0263, "num_tokens": 31577744.0, "reward": 0.0948660746216774, "reward_std": 0.0095131266862154, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 553.71875, "completions/mean_terminated_length": 509.06207275390625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.22239624119028975, "frac_reward_zero_std": 0.8660714626312256, "grad_norm": 0.14502239227294922, "kl": 0.043701171875, "learning_rate": 4.858715086264274e-06, "loss": 0.0185, "num_tokens": 32082182.0, "reward": 0.09665178507566452, "reward_std": 0.0066964291036129, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9665178656578064, "rewards/format_reward/std": 0.1800929754972458, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 592.53125, "completions/mean_terminated_length": 535.1229248046875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.2255285826155051, "frac_reward_zero_std": 0.9017857313156128, "grad_norm": 0.10964933037757874, "kl": 0.04742431640625, "learning_rate": 4.853587705636646e-06, "loss": 0.0052, "num_tokens": 32599104.0, "reward": 0.09754464775323868, "reward_std": 0.004910714458674192, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9754464030265808, "rewards/format_reward/std": 0.1549331247806549, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 609.21875, "completions/mean_terminated_length": 527.7783203125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.22866092404072044, "frac_reward_zero_std": 0.9375000596046448, "grad_norm": 0.0965803936123848, "kl": 0.04388427734375, "learning_rate": 4.84837206268195e-06, "loss": 0.0215, "num_tokens": 33133858.0, "reward": 0.09843750298023224, "reward_std": 0.0031250002793967724, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12415824085474014, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 622.0580444335938, "completions/mean_terminated_length": 530.6080932617188, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.2317932654659358, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.07050427049398422, "kl": 0.0433349609375, "learning_rate": 4.8430683762381195e-06, "loss": -0.0036, "num_tokens": 33688232.0, "reward": 0.09866072237491608, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507844179868698, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 615.4085083007812, "completions/mean_terminated_length": 516.25537109375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.23492560689115113, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.07452395558357239, "kl": 0.0457763671875, "learning_rate": 4.837676868837213e-06, "loss": -0.0054, "num_tokens": 34241683.0, "reward": 0.09866072982549667, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507844179868698, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 617.0111694335938, "completions/mean_terminated_length": 543.1103515625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.23805794831636648, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.059764716774225235, "kl": 0.04339599609375, "learning_rate": 4.832197766696085e-06, "loss": 0.0046, "num_tokens": 34782796.0, "reward": 0.09866072982549667, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507843434810638, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 577.6763916015625, "completions/mean_terminated_length": 483.38006591796875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.24119028974158183, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.08536234498023987, "kl": 0.0479736328125, "learning_rate": 4.826631299706887e-06, "loss": 0.0008, "num_tokens": 35304411.0, "reward": 0.09843751043081284, "reward_std": 0.002747634192928672, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12415824085474014, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 533.3348388671875, "completions/mean_terminated_length": 458.84307861328125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.24432263116679717, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.06963478773832321, "kl": 0.0467529296875, "learning_rate": 4.820977701427424e-06, "loss": 0.0048, "num_tokens": 35797893.0, "reward": 0.09866072237491608, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507843434810638, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 538.5067138671875, "completions/mean_terminated_length": 456.81646728515625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.24745497259201252, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.07347165048122406, "kl": 0.04718017578125, "learning_rate": 4.81523720907136e-06, "loss": -0.0014, "num_tokens": 36293400.0, "reward": 0.09843751043081284, "reward_std": 0.002747634192928672, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12415824085474014, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 488.8750305175781, "completions/mean_terminated_length": 412.19671630859375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.2505873140172279, "frac_reward_zero_std": 0.9285714626312256, "grad_norm": 0.10902077704668045, "kl": 0.04638671875, "learning_rate": 4.809410063498254e-06, "loss": 0.018, "num_tokens": 36769160.0, "reward": 0.09799107909202576, "reward_std": 0.003640491748228669, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9799107313156128, "rewards/format_reward/std": 0.14046260714530945, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 527.825927734375, "completions/mean_terminated_length": 456.7897033691406, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.25371965544244324, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.08092744648456573, "kl": 0.04595947265625, "learning_rate": 4.8034965092034656e-06, "loss": 0.0137, "num_tokens": 37261102.0, "reward": 0.09843750298023224, "reward_std": 0.002747634192928672, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12415824085474014, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1914.0, "completions/mean_length": 485.2589416503906, "completions/mean_terminated_length": 404.5539855957031, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.2568519968676586, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.049136415123939514, "kl": 0.0416259765625, "learning_rate": 4.797496794307889e-06, "loss": -0.0031, "num_tokens": 37734910.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 436.78350830078125, "completions/mean_terminated_length": 388.6321716308594, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.25998433829287393, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.05257245525717735, "kl": 0.04052734375, "learning_rate": 4.791411170547545e-06, "loss": -0.0024, "num_tokens": 38177965.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 431.2701110839844, "completions/mean_terminated_length": 394.3584289550781, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.2631166797180893, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.08425591886043549, "kl": 0.039306640625, "learning_rate": 4.785239893263017e-06, "loss": 0.0031, "num_tokens": 38611310.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 422.83929443359375, "completions/mean_terminated_length": 400.7782897949219, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.2662490211433046, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04909777268767357, "kl": 0.037841796875, "learning_rate": 4.778983221388742e-06, "loss": -0.0009, "num_tokens": 39058454.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1495.0, "completions/mean_length": 387.0201110839844, "completions/mean_terminated_length": 379.57177734375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.26938136256851997, "frac_reward_zero_std": 0.9375000596046448, "grad_norm": 0.09303503483533859, "kl": 0.03814697265625, "learning_rate": 4.77264141744214e-06, "loss": 0.0082, "num_tokens": 39489199.0, "reward": 0.09843751043081284, "reward_std": 0.0031250002793967724, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12415824085474014, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1740.0, "completions/mean_length": 418.76788330078125, "completions/mean_terminated_length": 411.4619140625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.2725137039937353, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.037742938846349716, "kl": 0.03558349609375, "learning_rate": 4.766214747512603e-06, "loss": 0.0017, "num_tokens": 39925167.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 428.60491943359375, "completions/mean_terminated_length": 410.32733154296875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.27564604541895066, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.14446213841438293, "kl": 0.0531005859375, "learning_rate": 4.759703481250331e-06, "loss": 0.0174, "num_tokens": 40374618.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1816.0, "completions/mean_length": 437.6719055175781, "completions/mean_terminated_length": 430.45068359375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.278778386844166, "frac_reward_zero_std": 0.9375000596046448, "grad_norm": 0.08747456967830658, "kl": 0.04180908203125, "learning_rate": 4.753107891855015e-06, "loss": 0.0076, "num_tokens": 40833571.0, "reward": 0.09843751043081284, "reward_std": 0.0031250002793967724, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12415824085474014, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1740.0, "completions/mean_length": 410.22991943359375, "completions/mean_terminated_length": 387.99774169921875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.28191072826938135, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.06740201264619827, "kl": 0.03466796875, "learning_rate": 4.746428256064375e-06, "loss": 0.0044, "num_tokens": 41268130.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1406.0, "completions/mean_length": 429.4910888671875, "completions/mean_terminated_length": 422.23321533203125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.2850430696945967, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.0947580561041832, "kl": 0.03240966796875, "learning_rate": 4.7396648541425534e-06, "loss": 0.0182, "num_tokens": 41725082.0, "reward": 0.09866072237491608, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507844179868698, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1307.0, "completions/mean_length": 425.33038330078125, "completions/mean_terminated_length": 410.71173095703125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.28817541111981204, "frac_reward_zero_std": 0.8571429252624512, "grad_norm": 0.147536039352417, "kl": 0.033935546875, "learning_rate": 4.732817969868348e-06, "loss": 0.0177, "num_tokens": 42187318.0, "reward": 0.09642858803272247, "reward_std": 0.007142857648432255, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9642857313156128, "rewards/format_reward/std": 0.18578432500362396, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1975.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 438.6808166503906, "completions/mean_terminated_length": 438.6808166503906, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.2913077525450274, "frac_reward_zero_std": 0.9285714626312256, "grad_norm": 0.10546483099460602, "kl": 0.034637451171875, "learning_rate": 4.7258878905233095e-06, "loss": 0.0057, "num_tokens": 42662787.0, "reward": 0.098214291036129, "reward_std": 0.0035714288242161274, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.13258016109466553, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 434.0133972167969, "completions/mean_terminated_length": 426.77581787109375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.29444009397024273, "frac_reward_zero_std": 0.8839285969734192, "grad_norm": 0.12148799002170563, "kl": 0.0355224609375, "learning_rate": 4.718874906879688e-06, "loss": 0.0103, "num_tokens": 43132697.0, "reward": 0.0970982164144516, "reward_std": 0.0058035715483129025, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9709821343421936, "rewards/format_reward/std": 0.16804419457912445, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1431.0, "completions/mean_length": 422.6696472167969, "completions/mean_terminated_length": 419.0335693359375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.2975724353954581, "frac_reward_zero_std": 0.8660714626312256, "grad_norm": 0.11725667119026184, "kl": 0.03143310546875, "learning_rate": 4.711779313188231e-06, "loss": 0.0097, "num_tokens": 43593545.0, "reward": 0.09665179997682571, "reward_std": 0.0066964291036129, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9665178656578064, "rewards/format_reward/std": 0.1800929754972458, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1603.0, "completions/max_terminated_length": 1603.0, "completions/mean_length": 410.6785888671875, "completions/mean_terminated_length": 410.6785888671875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.3007047768206735, "frac_reward_zero_std": 0.8839285969734192, "grad_norm": 0.12994614243507385, "kl": 0.032928466796875, "learning_rate": 4.70460140716584e-06, "loss": 0.0248, "num_tokens": 44039881.0, "reward": 0.09687500447034836, "reward_std": 0.005872634239494801, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17418713867664337, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1854.0, "completions/mean_length": 402.0937805175781, "completions/mean_terminated_length": 398.41162109375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.3038371182458888, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.07438544929027557, "kl": 0.0323486328125, "learning_rate": 4.697341489983076e-06, "loss": -0.0003, "num_tokens": 44472007.0, "reward": 0.09866072237491608, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507844179868698, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 421.0089416503906, "completions/mean_terminated_length": 413.7130126953125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.3069694596711042, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.08899196237325668, "kl": 0.038116455078125, "learning_rate": 4.6899998662515215e-06, "loss": 0.008, "num_tokens": 44932495.0, "reward": 0.09866072982549667, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507844179868698, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1678.0, "completions/mean_length": 427.8415222167969, "completions/mean_terminated_length": 424.2170104980469, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.3101018010963195, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.07636722177267075, "kl": 0.03070068359375, "learning_rate": 4.682576844011007e-06, "loss": 0.0018, "num_tokens": 45373824.0, "reward": 0.09866072237491608, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507844179868698, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1251.0, "completions/mean_length": 425.8058166503906, "completions/mean_terminated_length": 422.1767272949219, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.31323414252153486, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.08891923725605011, "kl": 0.0328369140625, "learning_rate": 4.675072734716678e-06, "loss": 0.0144, "num_tokens": 45822009.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1380.0, "completions/max_terminated_length": 1380.0, "completions/mean_length": 432.95538330078125, "completions/mean_terminated_length": 432.95538330078125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.3163664839467502, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04809121415019035, "kl": 0.032012939453125, "learning_rate": 4.667487853225931e-06, "loss": 0.0019, "num_tokens": 46295233.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1750.0, "completions/mean_length": 433.34600830078125, "completions/mean_terminated_length": 429.7337951660156, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.31949882537196556, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.091945119202137, "kl": 0.0325927734375, "learning_rate": 4.659822517785203e-06, "loss": 0.011, "num_tokens": 46746120.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1730.0, "completions/max_terminated_length": 1730.0, "completions/mean_length": 427.24554443359375, "completions/mean_terminated_length": 427.24554443359375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.3226311667971809, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04783743992447853, "kl": 0.0289306640625, "learning_rate": 4.6520770500166165e-06, "loss": 0.0018, "num_tokens": 47198510.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1704.0, "completions/max_terminated_length": 1704.0, "completions/mean_length": 439.9754638671875, "completions/mean_terminated_length": 439.9754638671875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.32576350822239625, "frac_reward_zero_std": 1.0, "grad_norm": 0.007613445166498423, "kl": 0.028656005859375, "learning_rate": 4.644251774904487e-06, "loss": 0.0003, "num_tokens": 47646879.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1369.0, "completions/max_terminated_length": 1369.0, "completions/mean_length": 441.325927734375, "completions/mean_terminated_length": 441.325927734375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.3288958496476116, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04654070734977722, "kl": 0.0301513671875, "learning_rate": 4.636347020781684e-06, "loss": -0.0004, "num_tokens": 48102521.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1406.0, "completions/max_terminated_length": 1406.0, "completions/mean_length": 457.0692138671875, "completions/mean_terminated_length": 457.0692138671875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.33202819107282694, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04551827535033226, "kl": 0.027801513671875, "learning_rate": 4.6283631193158605e-06, "loss": -0.0022, "num_tokens": 48591304.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1369.0, "completions/max_terminated_length": 1369.0, "completions/mean_length": 432.7812805175781, "completions/mean_terminated_length": 432.7812805175781, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.3351605324980423, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.035452328622341156, "kl": 0.028594970703125, "learning_rate": 4.620300405495532e-06, "loss": -0.0008, "num_tokens": 49044174.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1243.0, "completions/mean_length": 448.122802734375, "completions/mean_terminated_length": 444.54364013671875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.33829287392325763, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.05171548202633858, "kl": 0.02899169921875, "learning_rate": 4.612159217616022e-06, "loss": 0.0059, "num_tokens": 49509329.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1522.0, "completions/max_terminated_length": 1522.0, "completions/mean_length": 446.44866943359375, "completions/mean_terminated_length": 446.44866943359375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.341425215348473, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.06818084418773651, "kl": 0.0286865234375, "learning_rate": 4.603939897265268e-06, "loss": 0.0019, "num_tokens": 49972990.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1265.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 437.4352722167969, "completions/mean_terminated_length": 437.4352722167969, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.3445575567736883, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.07957317680120468, "kl": 0.0308837890625, "learning_rate": 4.595642789309492e-06, "loss": 0.0062, "num_tokens": 50416729.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1204.0, "completions/mean_length": 439.9107360839844, "completions/mean_terminated_length": 436.3132019042969, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.34768989819890367, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.05212220922112465, "kl": 0.02703857421875, "learning_rate": 4.587268241878724e-06, "loss": 0.0, "num_tokens": 50865477.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1671.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 450.185302734375, "completions/mean_terminated_length": 450.185302734375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.350822239624119, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.03825162351131439, "kl": 0.02960205078125, "learning_rate": 4.578816606352205e-06, "loss": 0.0014, "num_tokens": 51347896.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1758.0, "completions/max_terminated_length": 1758.0, "completions/mean_length": 438.25225830078125, "completions/mean_terminated_length": 438.25225830078125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.35395458104933436, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.06213603913784027, "kl": 0.031951904296875, "learning_rate": 4.570288237343632e-06, "loss": 0.0025, "num_tokens": 51804421.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 2029.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 444.404052734375, "completions/mean_terminated_length": 444.404052734375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.3570869224745497, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.04999566823244095, "kl": 0.029998779296875, "learning_rate": 4.561683492686289e-06, "loss": 0.0017, "num_tokens": 52263646.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1734.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 437.24554443359375, "completions/mean_terminated_length": 437.24554443359375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.36021926389976505, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.0414663627743721, "kl": 0.0269775390625, "learning_rate": 4.5530027334180285e-06, "loss": 0.0023, "num_tokens": 52718976.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1169.0, "completions/max_terminated_length": 1169.0, "completions/mean_length": 430.0602722167969, "completions/mean_terminated_length": 430.0602722167969, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.3633516053249804, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04705972969532013, "kl": 0.024871826171875, "learning_rate": 4.544246323766122e-06, "loss": 0.0013, "num_tokens": 53165951.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1753.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 432.5669860839844, "completions/mean_terminated_length": 432.5669860839844, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.36648394675019574, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.05293172597885132, "kl": 0.02838134765625, "learning_rate": 4.535414631131983e-06, "loss": 0.0008, "num_tokens": 53622389.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 434.3415222167969, "completions/mean_terminated_length": 434.3415222167969, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.36961628817541115, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.08219528198242188, "kl": 0.028472900390625, "learning_rate": 4.526508026075746e-06, "loss": 0.0026, "num_tokens": 54078550.0, "reward": 0.09866072982549667, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507843434810638, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1713.0, "completions/mean_length": 416.7701110839844, "completions/mean_terminated_length": 413.1208190917969, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.3727486296006265, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.06043415144085884, "kl": 0.028839111328125, "learning_rate": 4.517526882300721e-06, "loss": 0.0091, "num_tokens": 54521303.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1145.0, "completions/max_terminated_length": 1145.0, "completions/mean_length": 379.9531555175781, "completions/mean_terminated_length": 379.9531555175781, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.37588097102584184, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04358622804284096, "kl": 0.027984619140625, "learning_rate": 4.508471576637713e-06, "loss": -0.0009, "num_tokens": 54947022.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1528.0, "completions/max_terminated_length": 1528.0, "completions/mean_length": 433.34600830078125, "completions/mean_terminated_length": 433.34600830078125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.3790133124510572, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.06653858721256256, "kl": 0.026611328125, "learning_rate": 4.499342489029211e-06, "loss": 0.005, "num_tokens": 55409209.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 422.63616943359375, "completions/mean_terminated_length": 419.0, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.38214565387627253, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.06137501075863838, "kl": 0.02532958984375, "learning_rate": 4.490140002513449e-06, "loss": 0.0076, "num_tokens": 55867422.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1368.0, "completions/max_terminated_length": 1368.0, "completions/mean_length": 410.9776916503906, "completions/mean_terminated_length": 410.9776916503906, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.3852779953014879, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.051805704832077026, "kl": 0.028076171875, "learning_rate": 4.48086450320833e-06, "loss": 0.0016, "num_tokens": 56314436.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1160.0, "completions/max_terminated_length": 1160.0, "completions/mean_length": 427.3192138671875, "completions/mean_terminated_length": 427.3192138671875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.3884103367267032, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.05959983170032501, "kl": 0.027557373046875, "learning_rate": 4.4715163802952266e-06, "loss": 0.0028, "num_tokens": 56762203.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1387.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 411.9576110839844, "completions/mean_terminated_length": 411.9576110839844, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.39154267815191857, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.02710678055882454, "kl": 0.027252197265625, "learning_rate": 4.462096026002655e-06, "loss": -0.0017, "num_tokens": 57207068.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1339.0, "completions/max_terminated_length": 1339.0, "completions/mean_length": 421.6183166503906, "completions/mean_terminated_length": 421.6183166503906, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.3946750195771339, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04759184271097183, "kl": 0.024932861328125, "learning_rate": 4.4526038355898144e-06, "loss": 0.003, "num_tokens": 57654233.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1886.0, "completions/max_terminated_length": 1886.0, "completions/mean_length": 442.810302734375, "completions/mean_terminated_length": 442.810302734375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.39780736100234926, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.06409808248281479, "kl": 0.02532958984375, "learning_rate": 4.4430402073300035e-06, "loss": -0.0045, "num_tokens": 58120836.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1437.0, "completions/max_terminated_length": 1437.0, "completions/mean_length": 428.2745666503906, "completions/mean_terminated_length": 428.2745666503906, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.4009397024275646, "frac_reward_zero_std": 1.0, "grad_norm": 0.0058276294730603695, "kl": 0.02386474609375, "learning_rate": 4.433405542493909e-06, "loss": 0.0002, "num_tokens": 58575107.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1486.0, "completions/max_terminated_length": 1486.0, "completions/mean_length": 463.732177734375, "completions/mean_terminated_length": 463.732177734375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.40407204385277995, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.05623449757695198, "kl": 0.024688720703125, "learning_rate": 4.4237002453327734e-06, "loss": -0.0039, "num_tokens": 59053627.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1965.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 461.8504638671875, "completions/mean_terminated_length": 461.8504638671875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.4072043852779953, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.04040304571390152, "kl": 0.0238037109375, "learning_rate": 4.4139247230614245e-06, "loss": 0.0031, "num_tokens": 59523776.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1491.0, "completions/max_terminated_length": 1491.0, "completions/mean_length": 442.716552734375, "completions/mean_terminated_length": 442.716552734375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.41033672670321064, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.06449800729751587, "kl": 0.025360107421875, "learning_rate": 4.404079385841201e-06, "loss": 0.0003, "num_tokens": 59978665.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1505.0, "completions/max_terminated_length": 1505.0, "completions/mean_length": 465.3504638671875, "completions/mean_terminated_length": 465.3504638671875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.413469068128426, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.050808168947696686, "kl": 0.025177001953125, "learning_rate": 4.394164646762734e-06, "loss": 0.0025, "num_tokens": 60457326.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 474.33038330078125, "completions/mean_terminated_length": 467.2735595703125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.41660140955364133, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.06710772216320038, "kl": 0.02642822265625, "learning_rate": 4.384180921828618e-06, "loss": 0.002, "num_tokens": 60951458.0, "reward": 0.09866072237491608, "reward_std": 0.0023012058809399605, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507844179868698, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1629.0, "completions/max_terminated_length": 1629.0, "completions/mean_length": 466.2410888671875, "completions/mean_terminated_length": 466.2410888671875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.4197337509788567, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.03740407153964043, "kl": 0.02484130859375, "learning_rate": 4.374128629935955e-06, "loss": 0.0008, "num_tokens": 61423854.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1666.0, "completions/mean_length": 474.22991943359375, "completions/mean_terminated_length": 470.70916748046875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.422866092404072, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.07887433469295502, "kl": 0.023895263671875, "learning_rate": 4.364008192858781e-06, "loss": 0.009, "num_tokens": 61906949.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 2016.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 450.1138610839844, "completions/mean_terminated_length": 450.1138610839844, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.42599843382928737, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.06140615791082382, "kl": 0.0235595703125, "learning_rate": 4.353820035230366e-06, "loss": 0.0034, "num_tokens": 62365276.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1850.0, "completions/mean_length": 446.0469055175781, "completions/mean_terminated_length": 442.4631042480469, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.4291307752545027, "frac_reward_zero_std": 0.9375000596046448, "grad_norm": 0.08514049649238586, "kl": 0.026275634765625, "learning_rate": 4.3435645845254e-06, "loss": 0.0131, "num_tokens": 62828561.0, "reward": 0.09843751043081284, "reward_std": 0.0031250002793967724, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12415824085474014, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1534.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 449.7344055175781, "completions/mean_terminated_length": 449.7344055175781, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.43226311667971806, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 1.6080666780471802, "kl": 0.12799072265625, "learning_rate": 4.333242271042054e-06, "loss": 0.0034, "num_tokens": 63308218.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1999.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 465.435302734375, "completions/mean_terminated_length": 465.435302734375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.43539545810493346, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04606800898909569, "kl": 0.022979736328125, "learning_rate": 4.32285352788393e-06, "loss": 0.0051, "num_tokens": 63789949.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1821.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 440.47100830078125, "completions/mean_terminated_length": 440.47100830078125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4385277995301488, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.046981774270534515, "kl": 0.02618408203125, "learning_rate": 4.312398790941882e-06, "loss": 0.0021, "num_tokens": 64258816.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1377.0, "completions/max_terminated_length": 1377.0, "completions/mean_length": 443.43304443359375, "completions/mean_terminated_length": 443.43304443359375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.44166014095536416, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.041135162115097046, "kl": 0.026123046875, "learning_rate": 4.301878498875735e-06, "loss": -0.0015, "num_tokens": 64710698.0, "reward": 0.0993303582072258, "reward_std": 0.0009619199554435909, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1461.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 423.1629638671875, "completions/mean_terminated_length": 423.1629638671875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.4447924823805795, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.05955037474632263, "kl": 0.024749755859375, "learning_rate": 4.291293093095873e-06, "loss": 0.0029, "num_tokens": 65150587.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1464.0, "completions/mean_length": 453.9151916503906, "completions/mean_terminated_length": 450.3489990234375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.44792482380579485, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.08163747191429138, "kl": 0.02642822265625, "learning_rate": 4.280643017744723e-06, "loss": 0.0081, "num_tokens": 65628865.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1306.0, "completions/mean_length": 431.9151916503906, "completions/mean_terminated_length": 428.2997741699219, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.4510571652310102, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.03620860353112221, "kl": 0.026275634765625, "learning_rate": 4.269928719678117e-06, "loss": -0.0003, "num_tokens": 66087515.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1821.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 440.9308166503906, "completions/mean_terminated_length": 440.9308166503906, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.45418950665622554, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.07583116739988327, "kl": 0.028839111328125, "learning_rate": 4.2591506484465426e-06, "loss": 0.0018, "num_tokens": 66558672.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1245.0, "completions/max_terminated_length": 1245.0, "completions/mean_length": 406.45538330078125, "completions/mean_terminated_length": 406.45538330078125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.4573218480814409, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.08960352838039398, "kl": 0.02459716796875, "learning_rate": 4.248309256276283e-06, "loss": 0.0087, "num_tokens": 66989356.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1286.0, "completions/max_terminated_length": 1286.0, "completions/mean_length": 420.0133972167969, "completions/mean_terminated_length": 420.0133972167969, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.46045418950665623, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03320499137043953, "kl": 0.0252685546875, "learning_rate": 4.23740499805044e-06, "loss": 0.0019, "num_tokens": 67439398.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1941.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 429.0290222167969, "completions/mean_terminated_length": 429.0290222167969, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.4635865309318716, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.06326489895582199, "kl": 0.029327392578125, "learning_rate": 4.22643833128985e-06, "loss": 0.0028, "num_tokens": 67901723.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1681.0, "completions/mean_length": 420.0156555175781, "completions/mean_terminated_length": 416.37359619140625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.4667188723570869, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.07819637656211853, "kl": 0.03009033203125, "learning_rate": 4.215409716133885e-06, "loss": 0.0053, "num_tokens": 68347962.0, "reward": 0.09866072237491608, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507843434810638, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1533.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 432.1406555175781, "completions/mean_terminated_length": 432.1406555175781, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.46985121378230227, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.058754753321409225, "kl": 0.02691650390625, "learning_rate": 4.204319615321151e-06, "loss": 0.0079, "num_tokens": 68807529.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1684.0, "completions/max_terminated_length": 1684.0, "completions/mean_length": 422.7008972167969, "completions/mean_terminated_length": 422.7008972167969, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.4729835552075176, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.0693555697798729, "kl": 0.02801513671875, "learning_rate": 4.193168494170065e-06, "loss": 0.0055, "num_tokens": 69260967.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1696.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 428.70538330078125, "completions/mean_terminated_length": 428.70538330078125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.47611589663273296, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.07846702635288239, "kl": 0.03033447265625, "learning_rate": 4.181956820559339e-06, "loss": 0.0056, "num_tokens": 69739671.0, "reward": 0.09866072237491608, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507844179868698, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1314.0, "completions/mean_length": 392.3660888671875, "completions/mean_terminated_length": 388.6622009277344, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.4792482380579483, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.05286122485995293, "kl": 0.02813720703125, "learning_rate": 4.170685064908342e-06, "loss": 0.0008, "num_tokens": 70163803.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1541.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 396.9977722167969, "completions/mean_terminated_length": 396.9977722167969, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.48238057948316365, "frac_reward_zero_std": 0.9375000596046448, "grad_norm": 0.08835630863904953, "kl": 0.0330810546875, "learning_rate": 4.159353700157365e-06, "loss": 0.0052, "num_tokens": 70608378.0, "reward": 0.09843750298023224, "reward_std": 0.0031250000465661287, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12415824085474014, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1526.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 409.5067138671875, "completions/mean_terminated_length": 409.5067138671875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.485512920908379, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.06833317875862122, "kl": 0.028900146484375, "learning_rate": 4.14796320174778e-06, "loss": 0.0012, "num_tokens": 71064201.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1048.0, "completions/mean_length": 401.08929443359375, "completions/mean_terminated_length": 397.4049377441406, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.48864526233359434, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04613521695137024, "kl": 0.029388427734375, "learning_rate": 4.136514047602087e-06, "loss": 0.0026, "num_tokens": 71499633.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1238.0, "completions/mean_length": 391.68975830078125, "completions/mean_terminated_length": 387.9843444824219, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.4917776037588097, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.08334057778120041, "kl": 0.03076171875, "learning_rate": 4.1250067181038635e-06, "loss": 0.0057, "num_tokens": 71944118.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1262.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 399.7232360839844, "completions/mean_terminated_length": 399.7232360839844, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.49490994518402504, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.06817489117383957, "kl": 0.031982421875, "learning_rate": 4.113441696077608e-06, "loss": 0.0044, "num_tokens": 72390270.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1123.0, "completions/mean_length": 410.0513610839844, "completions/mean_terminated_length": 406.38702392578125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.4980422866092404, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.05156361684203148, "kl": 0.027587890625, "learning_rate": 4.101819466768484e-06, "loss": 0.0023, "num_tokens": 72829553.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1271.0, "completions/mean_length": 388.15850830078125, "completions/mean_terminated_length": 384.4451904296875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.5011746280344558, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.0680699497461319, "kl": 0.030548095703125, "learning_rate": 4.0901405178219535e-06, "loss": 0.0019, "num_tokens": 73257852.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1162.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 398.1004638671875, "completions/mean_terminated_length": 398.1004638671875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.5043069694596711, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.07783219963312149, "kl": 0.027496337890625, "learning_rate": 4.078405339263326e-06, "loss": 0.0074, "num_tokens": 73687401.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1382.0, "completions/max_terminated_length": 1382.0, "completions/mean_length": 413.21429443359375, "completions/mean_terminated_length": 413.21429443359375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.5074393108848865, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.034954771399497986, "kl": 0.026092529296875, "learning_rate": 4.06661442347719e-06, "loss": 0.0025, "num_tokens": 74129617.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1733.0, "completions/max_terminated_length": 1733.0, "completions/mean_length": 384.8370666503906, "completions/mean_terminated_length": 384.8370666503906, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.5105716523101018, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.07982336729764938, "kl": 0.03155517578125, "learning_rate": 4.054768265186758e-06, "loss": 0.003, "num_tokens": 74555048.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1887.0, "completions/mean_length": 414.37054443359375, "completions/mean_terminated_length": 407.04486083984375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.5137039937353172, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.07002979516983032, "kl": 0.027862548828125, "learning_rate": 4.0428673614331036e-06, "loss": 0.0118, "num_tokens": 74997538.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1398.0, "completions/max_terminated_length": 1398.0, "completions/mean_length": 437.7254638671875, "completions/mean_terminated_length": 437.7254638671875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.5168363351605325, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.031722791492938995, "kl": 0.029205322265625, "learning_rate": 4.030912211554316e-06, "loss": 0.0003, "num_tokens": 75480591.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1082.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 410.9419860839844, "completions/mean_terminated_length": 410.9419860839844, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.5199686765857479, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.033658482134342194, "kl": 0.02789306640625, "learning_rate": 4.018903317164539e-06, "loss": 0.0009, "num_tokens": 75932313.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 402.3437805175781, "completions/mean_terminated_length": 398.6622009277344, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.5231010180109632, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.07195191085338593, "kl": 0.031768798828125, "learning_rate": 4.006841182132932e-06, "loss": 0.0125, "num_tokens": 76387419.0, "reward": 0.09866072237491608, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507843434810638, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 395.8594055175781, "completions/mean_terminated_length": 395.8594055175781, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.5262333594361785, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03147158771753311, "kl": 0.027191162109375, "learning_rate": 3.9947263125625195e-06, "loss": -0.0001, "num_tokens": 76818072.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1106.0, "completions/mean_length": 417.9308166503906, "completions/mean_terminated_length": 414.28411865234375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.5293657008613939, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03752738982439041, "kl": 0.028289794921875, "learning_rate": 3.982559216768967e-06, "loss": 0.0069, "num_tokens": 77270765.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1469.0, "completions/max_terminated_length": 1469.0, "completions/mean_length": 439.0870666503906, "completions/mean_terminated_length": 439.0870666503906, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.5324980422866092, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.05622073635458946, "kl": 0.0284423828125, "learning_rate": 3.970340405259245e-06, "loss": 0.0046, "num_tokens": 77739352.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1295.0, "completions/max_terminated_length": 1295.0, "completions/mean_length": 415.6227722167969, "completions/mean_terminated_length": 415.6227722167969, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.5356303837118246, "frac_reward_zero_std": 1.0, "grad_norm": 0.006053614895790815, "kl": 0.026611328125, "learning_rate": 3.958070390710214e-06, "loss": 0.0003, "num_tokens": 78185199.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 455.700927734375, "completions/mean_terminated_length": 452.1387023925781, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.5387627251370399, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.2228330373764038, "kl": 0.033935546875, "learning_rate": 3.945749687947109e-06, "loss": 0.0069, "num_tokens": 78656741.0, "reward": 0.09888393431901932, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1380.0, "completions/max_terminated_length": 1380.0, "completions/mean_length": 422.0714416503906, "completions/mean_terminated_length": 422.0714416503906, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.5418950665622553, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04723968729376793, "kl": 0.02838134765625, "learning_rate": 3.933378813921942e-06, "loss": -0.0041, "num_tokens": 79123521.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1663.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 425.5469055175781, "completions/mean_terminated_length": 425.5469055175781, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.5450274079874706, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.05306388810276985, "kl": 0.028350830078125, "learning_rate": 3.920958287691811e-06, "loss": 0.0022, "num_tokens": 79587986.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1260.0, "completions/max_terminated_length": 1260.0, "completions/mean_length": 421.0401916503906, "completions/mean_terminated_length": 421.0401916503906, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.548159749412686, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03145791217684746, "kl": 0.025970458984375, "learning_rate": 3.908488630397121e-06, "loss": -0.0004, "num_tokens": 80030044.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1505.0, "completions/mean_length": 424.36163330078125, "completions/mean_terminated_length": 420.72930908203125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.5512920908379013, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.05254034325480461, "kl": 0.02630615234375, "learning_rate": 3.8959703652397175e-06, "loss": 0.0048, "num_tokens": 80472194.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1623.0, "completions/max_terminated_length": 1623.0, "completions/mean_length": 439.7812805175781, "completions/mean_terminated_length": 439.7812805175781, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.5544244322631167, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.02612248808145523, "kl": 0.025604248046875, "learning_rate": 3.883404017460935e-06, "loss": -0.0006, "num_tokens": 80938312.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1685.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 427.43975830078125, "completions/mean_terminated_length": 427.43975830078125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.557556773688332, "frac_reward_zero_std": 1.0, "grad_norm": 0.00535553926602006, "kl": 0.0240478515625, "learning_rate": 3.870790114319559e-06, "loss": 0.0002, "num_tokens": 81379985.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1525.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 425.9308166503906, "completions/mean_terminated_length": 425.9308166503906, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.5606891151135474, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.05865020677447319, "kl": 0.02984619140625, "learning_rate": 3.858129185069701e-06, "loss": -0.0064, "num_tokens": 81835210.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1606.0, "completions/mean_length": 415.44866943359375, "completions/mean_terminated_length": 411.7964172363281, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.5638214565387627, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.0726887434720993, "kl": 0.0255126953125, "learning_rate": 3.845421760938597e-06, "loss": 0.0104, "num_tokens": 82276691.0, "reward": 0.09888393431901932, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1901.0, "completions/mean_length": 428.86163330078125, "completions/mean_terminated_length": 425.2393798828125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.566953797963978, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.04142824187874794, "kl": 0.02618408203125, "learning_rate": 3.832668375104312e-06, "loss": 0.0018, "num_tokens": 82713289.0, "reward": 0.09910715371370316, "reward_std": 0.0014083485584706068, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1772.0, "completions/max_terminated_length": 1772.0, "completions/mean_length": 411.9776916503906, "completions/mean_terminated_length": 411.9776916503906, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.5700861393891934, "frac_reward_zero_std": 1.0, "grad_norm": 0.013058731332421303, "kl": 0.027130126953125, "learning_rate": 3.8198695626733725e-06, "loss": 0.0003, "num_tokens": 83157963.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1241.0, "completions/max_terminated_length": 1241.0, "completions/mean_length": 410.68975830078125, "completions/mean_terminated_length": 410.68975830078125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.5732184808144087, "frac_reward_zero_std": 1.0, "grad_norm": 0.008703905157744884, "kl": 0.0289306640625, "learning_rate": 3.8070258606583156e-06, "loss": 0.0003, "num_tokens": 83599216.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1438.0, "completions/max_terminated_length": 1438.0, "completions/mean_length": 423.7254638671875, "completions/mean_terminated_length": 423.7254638671875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.5763508222396241, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.048433806747198105, "kl": 0.026947021484375, "learning_rate": 3.7941378079551544e-06, "loss": -0.0023, "num_tokens": 84067845.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1955.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 426.42413330078125, "completions/mean_terminated_length": 426.42413330078125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.5794831636648394, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04670101776719093, "kl": 0.030303955078125, "learning_rate": 3.7812059453207677e-06, "loss": 0.004, "num_tokens": 84520695.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1362.0, "completions/max_terminated_length": 1362.0, "completions/mean_length": 391.3326110839844, "completions/mean_terminated_length": 391.3326110839844, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.5826155050900548, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.06392010301351547, "kl": 0.02703857421875, "learning_rate": 3.768230815350213e-06, "loss": 0.0022, "num_tokens": 84952016.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1719.0, "completions/max_terminated_length": 1719.0, "completions/mean_length": 427.62054443359375, "completions/mean_terminated_length": 427.62054443359375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.5857478465152701, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.07798371464014053, "kl": 0.029327392578125, "learning_rate": 3.7552129624539557e-06, "loss": 0.0059, "num_tokens": 85413034.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1207.0, "completions/max_terminated_length": 1207.0, "completions/mean_length": 401.25225830078125, "completions/mean_terminated_length": 401.25225830078125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.5888801879404855, "frac_reward_zero_std": 1.0, "grad_norm": 0.010199005715548992, "kl": 0.02606201171875, "learning_rate": 3.7421529328350316e-06, "loss": 0.0003, "num_tokens": 85843283.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1931.0, "completions/mean_length": 412.1227722167969, "completions/mean_terminated_length": 408.4631042480469, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.5920125293657008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0072288550436496735, "kl": 0.025146484375, "learning_rate": 3.7290512744661274e-06, "loss": 0.0003, "num_tokens": 86280558.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1243.0, "completions/max_terminated_length": 1243.0, "completions/mean_length": 412.1562805175781, "completions/mean_terminated_length": 412.1562805175781, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.5951448707909162, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04434200003743172, "kl": 0.0252685546875, "learning_rate": 3.715908537066589e-06, "loss": -0.0006, "num_tokens": 86721028.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1506.0, "completions/max_terminated_length": 1506.0, "completions/mean_length": 403.5848388671875, "completions/mean_terminated_length": 403.5848388671875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.5982772122161315, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03215348348021507, "kl": 0.024322509765625, "learning_rate": 3.7027252720793538e-06, "loss": -0.0002, "num_tokens": 87154034.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1461.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 421.5937805175781, "completions/mean_terminated_length": 421.5937805175781, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.601409553641347, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.06849931925535202, "kl": 0.028411865234375, "learning_rate": 3.689502032647817e-06, "loss": 0.0003, "num_tokens": 87600920.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349845170975, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1395.0, "completions/mean_length": 421.6004638671875, "completions/mean_terminated_length": 417.96197509765625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.6045418950665623, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.07146414369344711, "kl": 0.027191162109375, "learning_rate": 3.6762393735926245e-06, "loss": 0.0112, "num_tokens": 88056097.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1431.0, "completions/max_terminated_length": 1431.0, "completions/mean_length": 419.83038330078125, "completions/mean_terminated_length": 419.83038330078125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.6076742364917777, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04334600642323494, "kl": 0.029022216796875, "learning_rate": 3.6629378513883852e-06, "loss": -0.003, "num_tokens": 88495733.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1599.0, "completions/max_terminated_length": 1599.0, "completions/mean_length": 437.6964416503906, "completions/mean_terminated_length": 437.6964416503906, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.610806577916993, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.037501778453588486, "kl": 0.026092529296875, "learning_rate": 3.6495980241403307e-06, "loss": 0.0021, "num_tokens": 88952961.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1961.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 424.0469055175781, "completions/mean_terminated_length": 424.0469055175781, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.6139389193422083, "frac_reward_zero_std": 1.0, "grad_norm": 0.005850474815815687, "kl": 0.023956298828125, "learning_rate": 3.636220451560896e-06, "loss": 0.0002, "num_tokens": 89408782.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1606.0, "completions/max_terminated_length": 1606.0, "completions/mean_length": 419.1094055175781, "completions/mean_terminated_length": 419.1094055175781, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.6170712607674237, "frac_reward_zero_std": 1.0, "grad_norm": 0.009443786926567554, "kl": 0.02734375, "learning_rate": 3.622805694946235e-06, "loss": 0.0003, "num_tokens": 89849207.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1574.0, "completions/max_terminated_length": 1574.0, "completions/mean_length": 449.14288330078125, "completions/mean_terminated_length": 449.14288330078125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.620203602192639, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.06979766488075256, "kl": 0.023956298828125, "learning_rate": 3.609354317152667e-06, "loss": 0.0022, "num_tokens": 90305463.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1362.0, "completions/max_terminated_length": 1362.0, "completions/mean_length": 457.1562805175781, "completions/mean_terminated_length": 457.1562805175781, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.6233359436178544, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.05540035665035248, "kl": 0.02606201171875, "learning_rate": 3.595866882573063e-06, "loss": 0.0022, "num_tokens": 90763917.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1366.0, "completions/max_terminated_length": 1366.0, "completions/mean_length": 447.70538330078125, "completions/mean_terminated_length": 447.70538330078125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.6264682850430697, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.054835036396980286, "kl": 0.033905029296875, "learning_rate": 3.5823439571131675e-06, "loss": -0.0021, "num_tokens": 91218329.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1631.0, "completions/max_terminated_length": 1631.0, "completions/mean_length": 471.7232360839844, "completions/mean_terminated_length": 471.7232360839844, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.6296006264682851, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.08853886276483536, "kl": 0.026031494140625, "learning_rate": 3.5687861081678477e-06, "loss": 0.0029, "num_tokens": 91691001.0, "reward": 0.09866072982549667, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507843434810638, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1605.0, "completions/max_terminated_length": 1605.0, "completions/mean_length": 456.8326110839844, "completions/mean_terminated_length": 456.8326110839844, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.6327329678935004, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.03734137862920761, "kl": 0.0242919921875, "learning_rate": 3.555193904597291e-06, "loss": -0.0007, "num_tokens": 92170622.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1505.0, "completions/max_terminated_length": 1505.0, "completions/mean_length": 451.1719055175781, "completions/mean_terminated_length": 451.1719055175781, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.6358653093187158, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.060290198773145676, "kl": 0.024993896484375, "learning_rate": 3.541567916703138e-06, "loss": 0.0045, "num_tokens": 92621019.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1748.0, "completions/mean_length": 468.75225830078125, "completions/mean_terminated_length": 465.21923828125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.6389976507439311, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.0836215689778328, "kl": 0.02545166015625, "learning_rate": 3.5279087162045517e-06, "loss": 0.0141, "num_tokens": 93086128.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1454.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 424.06475830078125, "completions/mean_terminated_length": 424.06475830078125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.6421299921691465, "frac_reward_zero_std": 1.0, "grad_norm": 0.006045397836714983, "kl": 0.02410888671875, "learning_rate": 3.5142168762142265e-06, "loss": 0.0002, "num_tokens": 93524633.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1873.0, "completions/max_terminated_length": 1873.0, "completions/mean_length": 444.83038330078125, "completions/mean_terminated_length": 444.83038330078125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.6452623335943618, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03578446805477142, "kl": 0.0225830078125, "learning_rate": 3.500492971214347e-06, "loss": 0.0026, "num_tokens": 93981237.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1782.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 461.1808166503906, "completions/mean_terminated_length": 461.1808166503906, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.6483946750195771, "frac_reward_zero_std": 1.0, "grad_norm": 0.005883317906409502, "kl": 0.023193359375, "learning_rate": 3.48673757703248e-06, "loss": 0.0002, "num_tokens": 94453750.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 451.9442138671875, "completions/mean_terminated_length": 444.7870178222656, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.6515270164447925, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.07975054532289505, "kl": 0.02203369140625, "learning_rate": 3.472951270817418e-06, "loss": 0.018, "num_tokens": 94923509.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1690.0, "completions/max_terminated_length": 1690.0, "completions/mean_length": 468.6026916503906, "completions/mean_terminated_length": 468.6026916503906, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.6546593578700078, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.07465320825576782, "kl": 0.02520751953125, "learning_rate": 3.4591346310149578e-06, "loss": 0.0033, "num_tokens": 95410079.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1406.0, "completions/max_terminated_length": 1406.0, "completions/mean_length": 430.3125305175781, "completions/mean_terminated_length": 430.3125305175781, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.6577916992952232, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.0428648479282856, "kl": 0.028839111328125, "learning_rate": 3.445288237343632e-06, "loss": 0.0002, "num_tokens": 95856511.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1146.0, "completions/max_terminated_length": 1146.0, "completions/mean_length": 402.9888610839844, "completions/mean_terminated_length": 402.9888610839844, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.6609240407204385, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.050433553755283356, "kl": 0.025421142578125, "learning_rate": 3.4314126707703895e-06, "loss": -0.0019, "num_tokens": 96282158.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1644.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 449.9732360839844, "completions/mean_terminated_length": 449.9732360839844, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.6640563821456539, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.028973883017897606, "kl": 0.022003173828125, "learning_rate": 3.4175085134862128e-06, "loss": -0.0011, "num_tokens": 96743098.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1540.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 430.71429443359375, "completions/mean_terminated_length": 430.71429443359375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.6671887235708692, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.06305299699306488, "kl": 0.025177001953125, "learning_rate": 3.4035763488816953e-06, "loss": 0.0054, "num_tokens": 97195814.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1465.0, "completions/max_terminated_length": 1465.0, "completions/mean_length": 420.8571472167969, "completions/mean_terminated_length": 420.8571472167969, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.6703210649960846, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.07460971176624298, "kl": 0.022369384765625, "learning_rate": 3.3896167615225594e-06, "loss": 0.0083, "num_tokens": 97645926.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1875.0, "completions/max_terminated_length": 1875.0, "completions/mean_length": 408.43975830078125, "completions/mean_terminated_length": 408.43975830078125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.6734534064212999, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.06192918121814728, "kl": 0.026336669921875, "learning_rate": 3.375630337125133e-06, "loss": 0.0063, "num_tokens": 98094227.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1838.0, "completions/max_terminated_length": 1838.0, "completions/mean_length": 448.2232360839844, "completions/mean_terminated_length": 448.2232360839844, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.6765857478465153, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.03638318181037903, "kl": 0.0238037109375, "learning_rate": 3.361617662531772e-06, "loss": -0.0003, "num_tokens": 98555295.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1491.0, "completions/max_terminated_length": 1491.0, "completions/mean_length": 419.6406555175781, "completions/mean_terminated_length": 419.6406555175781, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.6797180892717306, "frac_reward_zero_std": 1.0, "grad_norm": 0.004299124702811241, "kl": 0.02276611328125, "learning_rate": 3.347579325686237e-06, "loss": 0.0002, "num_tokens": 99008582.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1727.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 428.3906555175781, "completions/mean_terminated_length": 428.3906555175781, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.682850430696946, "frac_reward_zero_std": 1.0, "grad_norm": 0.004403684753924608, "kl": 0.021728515625, "learning_rate": 3.333515915609027e-06, "loss": 0.0002, "num_tokens": 99459437.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1782.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 425.74554443359375, "completions/mean_terminated_length": 425.74554443359375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.6859827721221613, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.07151926308870316, "kl": 0.023651123046875, "learning_rate": 3.3194280223726616e-06, "loss": 0.0119, "num_tokens": 99903451.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1197.0, "completions/max_terminated_length": 1197.0, "completions/mean_length": 408.3125305175781, "completions/mean_terminated_length": 408.3125305175781, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.6891151135473766, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.0670456811785698, "kl": 0.025054931640625, "learning_rate": 3.305316237076927e-06, "loss": 0.0022, "num_tokens": 100345967.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1395.0, "completions/mean_length": 415.0401916503906, "completions/mean_terminated_length": 411.38702392578125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.692247454972592, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.05287168174982071, "kl": 0.0216064453125, "learning_rate": 3.291181151824071e-06, "loss": 0.0079, "num_tokens": 100786093.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1866.0, "completions/mean_length": 411.79241943359375, "completions/mean_terminated_length": 408.1319885253906, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.6953797963978073, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.09867533296346664, "kl": 0.02484130859375, "learning_rate": 3.27702335969396e-06, "loss": 0.0094, "num_tokens": 101233960.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1751.0, "completions/max_terminated_length": 1751.0, "completions/mean_length": 404.15179443359375, "completions/mean_terminated_length": 404.15179443359375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.6985121378230227, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.06554672122001648, "kl": 0.024200439453125, "learning_rate": 3.2628434547191985e-06, "loss": 0.0021, "num_tokens": 101673300.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1122.0, "completions/max_terminated_length": 1122.0, "completions/mean_length": 398.83038330078125, "completions/mean_terminated_length": 398.83038330078125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.701644479248238, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.045040424913167953, "kl": 0.021514892578125, "learning_rate": 3.2486420318601973e-06, "loss": 0.0011, "num_tokens": 102120624.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1600.0, "completions/max_terminated_length": 1600.0, "completions/mean_length": 417.1138610839844, "completions/mean_terminated_length": 417.1138610839844, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.7047768206734534, "frac_reward_zero_std": 1.0, "grad_norm": 0.004586922004818916, "kl": 0.020965576171875, "learning_rate": 3.2344196869802187e-06, "loss": 0.0002, "num_tokens": 102589071.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1583.0, "completions/max_terminated_length": 1583.0, "completions/mean_length": 415.9263610839844, "completions/mean_terminated_length": 415.9263610839844, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.7079091620986687, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.08239328861236572, "kl": 0.022186279296875, "learning_rate": 3.2201770168203694e-06, "loss": 0.0082, "num_tokens": 103038582.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1436.0, "completions/max_terminated_length": 1436.0, "completions/mean_length": 400.03350830078125, "completions/mean_terminated_length": 400.03350830078125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.7110415035238841, "frac_reward_zero_std": 1.0, "grad_norm": 0.005274351220577955, "kl": 0.0224609375, "learning_rate": 3.205914618974563e-06, "loss": 0.0002, "num_tokens": 103475357.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1398.0, "completions/max_terminated_length": 1398.0, "completions/mean_length": 416.64288330078125, "completions/mean_terminated_length": 416.64288330078125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.7141738449490994, "frac_reward_zero_std": 1.0, "grad_norm": 0.006492116022855043, "kl": 0.02392578125, "learning_rate": 3.1916330918644496e-06, "loss": 0.0002, "num_tokens": 103928169.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1731.0, "completions/mean_length": 407.7633972167969, "completions/mean_terminated_length": 404.0939636230469, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.7173061863743148, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.037993624806404114, "kl": 0.024871826171875, "learning_rate": 3.177333034714303e-06, "loss": -0.0025, "num_tokens": 104376927.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1674.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 433.4687805175781, "completions/mean_terminated_length": 433.4687805175781, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.7204385277995301, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.050914015620946884, "kl": 0.025421142578125, "learning_rate": 3.1630150475258813e-06, "loss": -0.0002, "num_tokens": 104843053.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1650.0, "completions/mean_length": 427.25225830078125, "completions/mean_terminated_length": 423.62640380859375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.7235708692247454, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.0790075957775116, "kl": 0.0272216796875, "learning_rate": 3.148679731053252e-06, "loss": 0.0096, "num_tokens": 105316846.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1490.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 421.03350830078125, "completions/mean_terminated_length": 421.03350830078125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.7267032106499608, "frac_reward_zero_std": 1.0, "grad_norm": 0.015399309806525707, "kl": 0.025360107421875, "learning_rate": 3.1343276867775805e-06, "loss": 0.0003, "num_tokens": 105766761.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1587.0, "completions/max_terminated_length": 1587.0, "completions/mean_length": 427.84600830078125, "completions/mean_terminated_length": 427.84600830078125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.7298355520751761, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.027338499203324318, "kl": 0.022552490234375, "learning_rate": 3.1199595168819043e-06, "loss": -0.0008, "num_tokens": 106229532.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1272.0, "completions/max_terminated_length": 1272.0, "completions/mean_length": 411.40179443359375, "completions/mean_terminated_length": 411.40179443359375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.7329678935003915, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.022664135321974754, "kl": 0.021270751953125, "learning_rate": 3.105575824225852e-06, "loss": -0.0017, "num_tokens": 106666588.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1292.0, "completions/max_terminated_length": 1292.0, "completions/mean_length": 436.04913330078125, "completions/mean_terminated_length": 436.04913330078125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.7361002349256069, "frac_reward_zero_std": 1.0, "grad_norm": 0.003992246463894844, "kl": 0.020172119140625, "learning_rate": 3.091177212320363e-06, "loss": 0.0002, "num_tokens": 107118550.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1334.0, "completions/mean_length": 432.2098388671875, "completions/mean_terminated_length": 428.5950927734375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.7392325763508223, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.05538037046790123, "kl": 0.020538330078125, "learning_rate": 3.0767642853023538e-06, "loss": 0.008, "num_tokens": 107571704.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 432.0915222167969, "completions/mean_terminated_length": 432.0915222167969, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.7423649177760376, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.06568653881549835, "kl": 0.021240234375, "learning_rate": 3.062337647909376e-06, "loss": 0.0047, "num_tokens": 108028669.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 431.6317138671875, "completions/mean_terminated_length": 428.0156555175781, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.745497259201253, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.05682549253106117, "kl": 0.023162841796875, "learning_rate": 3.04789790545424e-06, "loss": 0.0089, "num_tokens": 108481852.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1501.0, "completions/max_terminated_length": 1501.0, "completions/mean_length": 456.10491943359375, "completions/mean_terminated_length": 456.10491943359375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.7486296006264683, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.03749333694577217, "kl": 0.022064208984375, "learning_rate": 3.033445663799621e-06, "loss": -0.0012, "num_tokens": 108949567.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1611.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 470.2567138671875, "completions/mean_terminated_length": 470.2567138671875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.7517619420516837, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.023022564128041267, "kl": 0.0218505859375, "learning_rate": 3.018981529332633e-06, "loss": -0.0002, "num_tokens": 109420774.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1854.0, "completions/max_terminated_length": 1854.0, "completions/mean_length": 444.0625305175781, "completions/mean_terminated_length": 444.0625305175781, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.754894283476899, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.06515809893608093, "kl": 0.023834228515625, "learning_rate": 3.00450610893939e-06, "loss": 0.001, "num_tokens": 109898098.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1372.0, "completions/max_terminated_length": 1372.0, "completions/mean_length": 459.1808166503906, "completions/mean_terminated_length": 459.1808166503906, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.7580266249021144, "frac_reward_zero_std": 1.0, "grad_norm": 0.004318064544349909, "kl": 0.021331787109375, "learning_rate": 2.9900200099795396e-06, "loss": 0.0002, "num_tokens": 110364671.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1580.0, "completions/max_terminated_length": 1580.0, "completions/mean_length": 470.8482360839844, "completions/mean_terminated_length": 470.8482360839844, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.7611589663273297, "frac_reward_zero_std": 1.0, "grad_norm": 0.004060268867760897, "kl": 0.0208740234375, "learning_rate": 2.9755238402607826e-06, "loss": 0.0002, "num_tokens": 110836635.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 479.5870666503906, "completions/mean_terminated_length": 476.07830810546875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.7642913077525451, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.055694330483675, "kl": 0.021820068359375, "learning_rate": 2.961018208013367e-06, "loss": 0.0122, "num_tokens": 111305382.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1639.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 442.50225830078125, "completions/mean_terminated_length": 442.50225830078125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.7674236491777604, "frac_reward_zero_std": 1.0, "grad_norm": 0.005155134480446577, "kl": 0.022064208984375, "learning_rate": 2.9465037218645694e-06, "loss": 0.0002, "num_tokens": 111755975.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1468.0, "completions/max_terminated_length": 1468.0, "completions/mean_length": 461.3951110839844, "completions/mean_terminated_length": 461.3951110839844, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.7705559906029757, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.054661769419908524, "kl": 0.02130126953125, "learning_rate": 2.9319809908131604e-06, "loss": -0.0026, "num_tokens": 112238796.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 444.044677734375, "completions/mean_terminated_length": 440.4563903808594, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.7736883320281911, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.06267818063497543, "kl": 0.02117919921875, "learning_rate": 2.917450624203847e-06, "loss": 0.0098, "num_tokens": 112695792.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1305.0, "completions/max_terminated_length": 1305.0, "completions/mean_length": 460.9375305175781, "completions/mean_terminated_length": 460.9375305175781, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.7768206734534064, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04118034616112709, "kl": 0.0234375, "learning_rate": 2.9029132317017118e-06, "loss": -0.0012, "num_tokens": 113168584.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1398.0, "completions/max_terminated_length": 1398.0, "completions/mean_length": 439.3995666503906, "completions/mean_terminated_length": 439.3995666503906, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.7799530148786218, "frac_reward_zero_std": 1.0, "grad_norm": 0.01076538022607565, "kl": 0.0235595703125, "learning_rate": 2.888369423266629e-06, "loss": 0.0002, "num_tokens": 113618439.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1412.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 454.90179443359375, "completions/mean_terminated_length": 454.90179443359375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.7830853563038371, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03368180990219116, "kl": 0.020263671875, "learning_rate": 2.8738198091276712e-06, "loss": 0.0001, "num_tokens": 114095823.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1917.0, "completions/max_terminated_length": 1917.0, "completions/mean_length": 454.9620666503906, "completions/mean_terminated_length": 454.9620666503906, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.7862176977290525, "frac_reward_zero_std": 1.0, "grad_norm": 0.011131834238767624, "kl": 0.024017333984375, "learning_rate": 2.859264999757509e-06, "loss": 0.0002, "num_tokens": 114573438.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1344.0, "completions/max_terminated_length": 1344.0, "completions/mean_length": 453.2544860839844, "completions/mean_terminated_length": 453.2544860839844, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.7893500391542678, "frac_reward_zero_std": 1.0, "grad_norm": 0.012259057722985744, "kl": 0.02325439453125, "learning_rate": 2.8447056058467928e-06, "loss": 0.0002, "num_tokens": 115038304.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1143.0, "completions/max_terminated_length": 1143.0, "completions/mean_length": 422.1808166503906, "completions/mean_terminated_length": 422.1808166503906, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.7924823805794832, "frac_reward_zero_std": 1.0, "grad_norm": 0.006518123671412468, "kl": 0.022308349609375, "learning_rate": 2.830142238278531e-06, "loss": 0.0002, "num_tokens": 115483537.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1311.0, "completions/max_terminated_length": 1311.0, "completions/mean_length": 451.49554443359375, "completions/mean_terminated_length": 451.49554443359375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.7956147220046985, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.029494930058717728, "kl": 0.023406982421875, "learning_rate": 2.81557550810246e-06, "loss": -0.0022, "num_tokens": 115972595.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1257.0, "completions/max_terminated_length": 1257.0, "completions/mean_length": 442.0000305175781, "completions/mean_terminated_length": 442.0000305175781, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.7987470634299139, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.021625924855470657, "kl": 0.0211181640625, "learning_rate": 2.8010060265094026e-06, "loss": -0.0005, "num_tokens": 116427279.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1585.0, "completions/max_terminated_length": 1585.0, "completions/mean_length": 441.7031555175781, "completions/mean_terminated_length": 441.7031555175781, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.8018794048551292, "frac_reward_zero_std": 1.0, "grad_norm": 0.0039117648266255856, "kl": 0.020721435546875, "learning_rate": 2.786434404805629e-06, "loss": 0.0002, "num_tokens": 116902954.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1677.0, "completions/max_terminated_length": 1677.0, "completions/mean_length": 438.53350830078125, "completions/mean_terminated_length": 438.53350830078125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.8050117462803446, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.05075149983167648, "kl": 0.024200439453125, "learning_rate": 2.771861254387199e-06, "loss": 0.0078, "num_tokens": 117373833.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 2047.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 440.685302734375, "completions/mean_terminated_length": 440.685302734375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.8081440877055599, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.025670140981674194, "kl": 0.020965576171875, "learning_rate": 2.7572871867143204e-06, "loss": -0.0011, "num_tokens": 117845112.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1897.0, "completions/max_terminated_length": 1897.0, "completions/mean_length": 436.3683166503906, "completions/mean_terminated_length": 436.3683166503906, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.8112764291307752, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.05100719630718231, "kl": 0.0218505859375, "learning_rate": 2.742712813285681e-06, "loss": 0.0046, "num_tokens": 118293533.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1788.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 436.12054443359375, "completions/mean_terminated_length": 436.12054443359375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.8144087705559906, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.04223935678601265, "kl": 0.022125244140625, "learning_rate": 2.7281387456128017e-06, "loss": -0.0009, "num_tokens": 118748367.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1458.0, "completions/mean_length": 448.5625305175781, "completions/mean_terminated_length": 444.9843444824219, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.8175411119812059, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.05608447268605232, "kl": 0.0224609375, "learning_rate": 2.7135655951943716e-06, "loss": 0.0061, "num_tokens": 119218635.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1208.0, "completions/max_terminated_length": 1208.0, "completions/mean_length": 444.46429443359375, "completions/mean_terminated_length": 444.46429443359375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.8206734534064213, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.050068244338035583, "kl": 0.026519775390625, "learning_rate": 2.698993973490598e-06, "loss": -0.0008, "num_tokens": 119682927.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1304.0, "completions/max_terminated_length": 1304.0, "completions/mean_length": 453.607177734375, "completions/mean_terminated_length": 453.607177734375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.8238057948316366, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.06239871680736542, "kl": 0.022918701171875, "learning_rate": 2.6844244918975416e-06, "loss": -0.0038, "num_tokens": 120171303.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1736.0, "completions/max_terminated_length": 1736.0, "completions/mean_length": 447.0714416503906, "completions/mean_terminated_length": 447.0714416503906, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.826938136256852, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.08211582154035568, "kl": 0.021148681640625, "learning_rate": 2.66985776172147e-06, "loss": 0.0019, "num_tokens": 120637627.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1287.0, "completions/max_terminated_length": 1287.0, "completions/mean_length": 422.5915222167969, "completions/mean_terminated_length": 422.5915222167969, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.8300704776820673, "frac_reward_zero_std": 1.0, "grad_norm": 0.005049035418778658, "kl": 0.022735595703125, "learning_rate": 2.6552943941532088e-06, "loss": 0.0002, "num_tokens": 121085700.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1556.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 445.11163330078125, "completions/mean_terminated_length": 445.11163330078125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.8332028191072827, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03739725425839424, "kl": 0.023040771484375, "learning_rate": 2.6407350002424927e-06, "loss": -0.0006, "num_tokens": 121544778.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1696.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 424.2544860839844, "completions/mean_terminated_length": 424.2544860839844, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.836335160532498, "frac_reward_zero_std": 1.0, "grad_norm": 0.006837096996605396, "kl": 0.02349853515625, "learning_rate": 2.626180190872329e-06, "loss": 0.0002, "num_tokens": 121985956.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1467.0, "completions/max_terminated_length": 1467.0, "completions/mean_length": 457.2901916503906, "completions/mean_terminated_length": 457.2901916503906, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.8394675019577134, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.055564843118190765, "kl": 0.023406982421875, "learning_rate": 2.611630576733372e-06, "loss": 0.0008, "num_tokens": 122467994.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1798.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 426.72100830078125, "completions/mean_terminated_length": 426.72100830078125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.8425998433829287, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.026000672951340675, "kl": 0.024139404296875, "learning_rate": 2.5970867682982885e-06, "loss": 0.0013, "num_tokens": 122903481.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 432.7589416503906, "completions/mean_terminated_length": 432.7589416503906, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.845732184808144, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.038823939859867096, "kl": 0.0206298828125, "learning_rate": 2.582549375796154e-06, "loss": 0.0014, "num_tokens": 123347821.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1545.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 432.35491943359375, "completions/mean_terminated_length": 432.35491943359375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.8488645262333594, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.0538831390440464, "kl": 0.023834228515625, "learning_rate": 2.568019009186841e-06, "loss": -0.0025, "num_tokens": 123801740.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1698.0, "completions/mean_length": 425.3013610839844, "completions/mean_terminated_length": 421.671142578125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.8519968676585747, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03680819645524025, "kl": 0.0225830078125, "learning_rate": 2.5534962781354317e-06, "loss": 0.0078, "num_tokens": 124248759.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1772.0, "completions/max_terminated_length": 1772.0, "completions/mean_length": 432.2276916503906, "completions/mean_terminated_length": 432.2276916503906, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.8551292090837901, "frac_reward_zero_std": 1.0, "grad_norm": 0.005161454901099205, "kl": 0.0208740234375, "learning_rate": 2.538981791986634e-06, "loss": 0.0002, "num_tokens": 124703245.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 451.0848388671875, "completions/mean_terminated_length": 447.5122985839844, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.8582615505090054, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.0766170546412468, "kl": 0.02410888671875, "learning_rate": 2.524476159739218e-06, "loss": 0.0159, "num_tokens": 125173367.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1677.0, "completions/max_terminated_length": 1677.0, "completions/mean_length": 444.2812805175781, "completions/mean_terminated_length": 444.2812805175781, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.8613938919342208, "frac_reward_zero_std": 1.0, "grad_norm": 0.004667214117944241, "kl": 0.021484375, "learning_rate": 2.5099799900204607e-06, "loss": 0.0002, "num_tokens": 125645009.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1487.0, "completions/max_terminated_length": 1487.0, "completions/mean_length": 436.4285888671875, "completions/mean_terminated_length": 436.4285888671875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.8645262333594361, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.07615981251001358, "kl": 0.024261474609375, "learning_rate": 2.4954938910606108e-06, "loss": 0.0022, "num_tokens": 126110669.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1477.0, "completions/mean_length": 421.5870666503906, "completions/mean_terminated_length": 417.94854736328125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.8676585747846516, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.062257397919893265, "kl": 0.026611328125, "learning_rate": 2.481018470667368e-06, "loss": 0.0119, "num_tokens": 126558216.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1417.0, "completions/max_terminated_length": 1417.0, "completions/mean_length": 434.1473388671875, "completions/mean_terminated_length": 434.1473388671875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.8707909162098669, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.0650908425450325, "kl": 0.025665283203125, "learning_rate": 2.4665543362003802e-06, "loss": 0.0025, "num_tokens": 127019578.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1675.0, "completions/max_terminated_length": 1675.0, "completions/mean_length": 442.43304443359375, "completions/mean_terminated_length": 442.43304443359375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.8739232576350823, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.042217496782541275, "kl": 0.024017333984375, "learning_rate": 2.4521020945457615e-06, "loss": -0.0011, "num_tokens": 127489928.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1408.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 451.17413330078125, "completions/mean_terminated_length": 451.17413330078125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.8770555990602976, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03330700844526291, "kl": 0.022552490234375, "learning_rate": 2.4376623520906255e-06, "loss": 0.0015, "num_tokens": 127962206.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1107.0, "completions/max_terminated_length": 1107.0, "completions/mean_length": 409.8370666503906, "completions/mean_terminated_length": 409.8370666503906, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.880187940485513, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.07713975012302399, "kl": 0.02685546875, "learning_rate": 2.4232357146976478e-06, "loss": -0.0, "num_tokens": 128401397.0, "reward": 0.09888393431901932, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1276.0, "completions/max_terminated_length": 1276.0, "completions/mean_length": 427.3013610839844, "completions/mean_terminated_length": 427.3013610839844, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.8833202819107283, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.05073979124426842, "kl": 0.023956298828125, "learning_rate": 2.408822787679637e-06, "loss": 0.0021, "num_tokens": 128871716.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1203.0, "completions/max_terminated_length": 1203.0, "completions/mean_length": 433.9888610839844, "completions/mean_terminated_length": 433.9888610839844, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.8864526233359437, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.048994965851306915, "kl": 0.024017333984375, "learning_rate": 2.3944241757741475e-06, "loss": -0.001, "num_tokens": 129319715.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1377.0, "completions/max_terminated_length": 1377.0, "completions/mean_length": 432.7232360839844, "completions/mean_terminated_length": 432.7232360839844, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.889584964761159, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.06556610018014908, "kl": 0.02392578125, "learning_rate": 2.380040483118097e-06, "loss": -0.0019, "num_tokens": 129782423.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1520.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 421.96429443359375, "completions/mean_terminated_length": 421.96429443359375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.8927173061863743, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.029706869274377823, "kl": 0.0244140625, "learning_rate": 2.365672313222419e-06, "loss": 0.0001, "num_tokens": 130225683.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1228.0, "completions/max_terminated_length": 1228.0, "completions/mean_length": 442.3594055175781, "completions/mean_terminated_length": 442.3594055175781, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.8958496476115897, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.07522924989461899, "kl": 0.024139404296875, "learning_rate": 2.351320268946749e-06, "loss": 0.003, "num_tokens": 130701760.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1446.0, "completions/max_terminated_length": 1446.0, "completions/mean_length": 438.5848388671875, "completions/mean_terminated_length": 438.5848388671875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.898981989036805, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03690180554986, "kl": 0.0233154296875, "learning_rate": 2.336984952474119e-06, "loss": -0.0007, "num_tokens": 131166670.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1339.0, "completions/max_terminated_length": 1339.0, "completions/mean_length": 418.6250305175781, "completions/mean_terminated_length": 418.6250305175781, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.9021143304620204, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.07948677241802216, "kl": 0.025482177734375, "learning_rate": 2.322666965285697e-06, "loss": 0.0012, "num_tokens": 131606134.0, "reward": 0.09888393431901932, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1083.0, "completions/max_terminated_length": 1083.0, "completions/mean_length": 394.1183166503906, "completions/mean_terminated_length": 394.1183166503906, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.9052466718872357, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.03446091711521149, "kl": 0.0233154296875, "learning_rate": 2.3083669081355507e-06, "loss": -0.0028, "num_tokens": 132033531.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1249.0, "completions/max_terminated_length": 1249.0, "completions/mean_length": 415.6540222167969, "completions/mean_terminated_length": 415.6540222167969, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.9083790133124511, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.0464656837284565, "kl": 0.024658203125, "learning_rate": 2.2940853810254377e-06, "loss": 0.0002, "num_tokens": 132480556.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1699.0, "completions/max_terminated_length": 1699.0, "completions/mean_length": 410.41741943359375, "completions/mean_terminated_length": 410.41741943359375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.9115113547376664, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.036193717271089554, "kl": 0.026458740234375, "learning_rate": 2.2798229831796313e-06, "loss": -0.0003, "num_tokens": 132914771.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1708.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 419.5000305175781, "completions/mean_terminated_length": 419.5000305175781, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.9146436961628818, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03984314948320389, "kl": 0.024017333984375, "learning_rate": 2.2655803130197816e-06, "loss": 0.0011, "num_tokens": 133368511.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 427.5915222167969, "completions/mean_terminated_length": 423.9664306640625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.9177760375880971, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.06957186758518219, "kl": 0.026092529296875, "learning_rate": 2.2513579681398034e-06, "loss": 0.0087, "num_tokens": 133834900.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1679.0, "completions/max_terminated_length": 1679.0, "completions/mean_length": 410.27679443359375, "completions/mean_terminated_length": 410.27679443359375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.9209083790133125, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.032954346388578415, "kl": 0.024444580078125, "learning_rate": 2.237156545280803e-06, "loss": -0.0013, "num_tokens": 134264324.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 405.62725830078125, "completions/mean_terminated_length": 405.62725830078125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.9240407204385278, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.07128036767244339, "kl": 0.02777099609375, "learning_rate": 2.2229766403060403e-06, "loss": 0.0042, "num_tokens": 134694617.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349845170975, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1202.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 402.9687805175781, "completions/mean_terminated_length": 402.9687805175781, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.9271730618637432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0047294325195252895, "kl": 0.02593994140625, "learning_rate": 2.2088188481759305e-06, "loss": 0.0003, "num_tokens": 135144675.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1533.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 422.00225830078125, "completions/mean_terminated_length": 422.00225830078125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.9303054032889585, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03172848001122475, "kl": 0.024688720703125, "learning_rate": 2.194683762923073e-06, "loss": -0.0004, "num_tokens": 135601148.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 416.3995666503906, "completions/mean_terminated_length": 416.3995666503906, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.9334377447141738, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04834676906466484, "kl": 0.025390625, "learning_rate": 2.1805719776273387e-06, "loss": -0.0005, "num_tokens": 136056607.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1250.0, "completions/max_terminated_length": 1250.0, "completions/mean_length": 415.1919860839844, "completions/mean_terminated_length": 415.1919860839844, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.9365700861393892, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.05837247520685196, "kl": 0.037933349609375, "learning_rate": 2.166484084390974e-06, "loss": -0.0018, "num_tokens": 136491917.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1271.0, "completions/mean_length": 413.8839416503906, "completions/mean_terminated_length": 410.2281799316406, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.9397024275646045, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.058703795075416565, "kl": 0.0247802734375, "learning_rate": 2.1524206743137636e-06, "loss": 0.0136, "num_tokens": 136947177.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1147.0, "completions/max_terminated_length": 1147.0, "completions/mean_length": 428.7232360839844, "completions/mean_terminated_length": 428.7232360839844, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.9428347689898199, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04723570495843887, "kl": 0.02593994140625, "learning_rate": 2.1383823374682287e-06, "loss": -0.0002, "num_tokens": 137416893.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1383.0, "completions/max_terminated_length": 1383.0, "completions/mean_length": 407.2701110839844, "completions/mean_terminated_length": 407.2701110839844, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.9459671104150352, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.05798759311437607, "kl": 0.025238037109375, "learning_rate": 2.124369662874868e-06, "loss": 0.0016, "num_tokens": 137842858.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1468.0, "completions/max_terminated_length": 1468.0, "completions/mean_length": 414.7901916503906, "completions/mean_terminated_length": 414.7901916503906, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.9490994518402506, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.12113261222839355, "kl": 0.04620361328125, "learning_rate": 2.110383238477441e-06, "loss": -0.0001, "num_tokens": 138293340.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1200.0, "completions/max_terminated_length": 1200.0, "completions/mean_length": 383.18975830078125, "completions/mean_terminated_length": 383.18975830078125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.9522317932654659, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03409220650792122, "kl": 0.025787353515625, "learning_rate": 2.096423651118305e-06, "loss": -0.0011, "num_tokens": 138718145.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1552.0, "completions/max_terminated_length": 1552.0, "completions/mean_length": 433.33929443359375, "completions/mean_terminated_length": 433.33929443359375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.9553641346906813, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.05700921267271042, "kl": 0.028106689453125, "learning_rate": 2.082491486513788e-06, "loss": 0.0022, "num_tokens": 139188161.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1557.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 432.4933166503906, "completions/mean_terminated_length": 432.4933166503906, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.9584964761158966, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.061191376298666, "kl": 0.0267333984375, "learning_rate": 2.0685873292296116e-06, "loss": 0.003, "num_tokens": 139654554.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1468.0, "completions/mean_length": 449.5848388671875, "completions/mean_terminated_length": 446.0089416503906, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.961628817541112, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.07805483788251877, "kl": 0.0283203125, "learning_rate": 2.054711762656369e-06, "loss": 0.0257, "num_tokens": 140125012.0, "reward": 0.09866072982549667, "reward_std": 0.0023012058809399605, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507843434810638, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1182.0, "completions/max_terminated_length": 1182.0, "completions/mean_length": 412.0625305175781, "completions/mean_terminated_length": 412.0625305175781, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.9647611589663273, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.08274301141500473, "kl": 0.026214599609375, "learning_rate": 2.040865368985044e-06, "loss": 0.0055, "num_tokens": 140581496.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1389.0, "completions/max_terminated_length": 1389.0, "completions/mean_length": 420.5937805175781, "completions/mean_terminated_length": 420.5937805175781, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.9678935003915426, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.06637118011713028, "kl": 0.02679443359375, "learning_rate": 2.027048729182583e-06, "loss": 0.0052, "num_tokens": 141029070.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1541.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 416.6607360839844, "completions/mean_terminated_length": 416.6607360839844, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.971025841816758, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.07922264188528061, "kl": 0.027069091796875, "learning_rate": 2.0132624229675205e-06, "loss": 0.0059, "num_tokens": 141472378.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1488.0, "completions/max_terminated_length": 1488.0, "completions/mean_length": 434.3281555175781, "completions/mean_terminated_length": 434.3281555175781, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.9741581832419733, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.0869169533252716, "kl": 0.02813720703125, "learning_rate": 1.9995070287856546e-06, "loss": 0.0099, "num_tokens": 141931081.0, "reward": 0.09866072982549667, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507844179868698, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1595.0, "completions/max_terminated_length": 1595.0, "completions/mean_length": 442.7857360839844, "completions/mean_terminated_length": 442.7857360839844, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.9772905246671887, "frac_reward_zero_std": 0.9375000596046448, "grad_norm": 0.09933750331401825, "kl": 0.02801513671875, "learning_rate": 1.985783123785774e-06, "loss": 0.0093, "num_tokens": 142398317.0, "reward": 0.09843750298023224, "reward_std": 0.0031250000465661287, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12415824085474014, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1253.0, "completions/mean_length": 433.9754638671875, "completions/mean_terminated_length": 430.3646545410156, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.980422866092404, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.08437841385602951, "kl": 0.02716064453125, "learning_rate": 1.9720912837954486e-06, "loss": 0.0068, "num_tokens": 142849842.0, "reward": 0.09866072237491608, "reward_std": 0.0023012058809399605, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507843434810638, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 436.1852722167969, "completions/mean_terminated_length": 432.57940673828125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.9835552075176194, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.050111617892980576, "kl": 0.027069091796875, "learning_rate": 1.958432083296862e-06, "loss": 0.009, "num_tokens": 143306617.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 425.03350830078125, "completions/mean_terminated_length": 425.03350830078125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.9866875489428347, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.07233233749866486, "kl": 0.028900146484375, "learning_rate": 1.9448060954027093e-06, "loss": 0.0062, "num_tokens": 143764980.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1688.0, "completions/mean_length": 453.5982360839844, "completions/mean_terminated_length": 446.4484558105469, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.9898198903680501, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.08113447576761246, "kl": 0.0269775390625, "learning_rate": 1.931213891832153e-06, "loss": 0.0123, "num_tokens": 144237196.0, "reward": 0.09888393431901932, "reward_std": 0.0018547771032899618, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1560.0, "completions/max_terminated_length": 1560.0, "completions/mean_length": 446.3370666503906, "completions/mean_terminated_length": 446.3370666503906, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.9929522317932654, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.07229574024677277, "kl": 0.027374267578125, "learning_rate": 1.9176560428868336e-06, "loss": 0.0078, "num_tokens": 144699883.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1607.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 422.4732360839844, "completions/mean_terminated_length": 422.4732360839844, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.9960845732184808, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.040984079241752625, "kl": 0.027252197265625, "learning_rate": 1.9041331174269373e-06, "loss": 0.0013, "num_tokens": 145139379.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1549.0, "completions/mean_length": 438.77679443359375, "completions/mean_terminated_length": 435.1767272949219, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.9992169146436961, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.08571083098649979, "kl": 0.0263671875, "learning_rate": 1.8906456828473341e-06, "loss": 0.01, "num_tokens": 145608335.0, "reward": 0.09866072982549667, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507843434810638, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1560.0, "completions/mean_length": 436.6294860839844, "completions/mean_terminated_length": 433.02459716796875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 1.0031323414252153, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.0498218797147274, "kl": 0.025390625, "learning_rate": 1.8771943050537656e-06, "loss": 0.0127, "num_tokens": 146074689.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1750.0, "completions/mean_length": 415.6495666503906, "completions/mean_terminated_length": 411.9977722167969, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 1.0062646828504307, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.045207131654024124, "kl": 0.027313232421875, "learning_rate": 1.8637795484391046e-06, "loss": 0.0008, "num_tokens": 146517128.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1923.0, "completions/max_terminated_length": 1923.0, "completions/mean_length": 420.46429443359375, "completions/mean_terminated_length": 420.46429443359375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 1.009397024275646, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.02512957900762558, "kl": 0.02685546875, "learning_rate": 1.8504019758596698e-06, "loss": -0.0014, "num_tokens": 146973744.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1293.0, "completions/max_terminated_length": 1293.0, "completions/mean_length": 421.9464416503906, "completions/mean_terminated_length": 421.9464416503906, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 1.0125293657008614, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.07044515758752823, "kl": 0.028045654296875, "learning_rate": 1.8370621486116163e-06, "loss": 0.0029, "num_tokens": 147425188.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164842426776886, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1626.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 413.28350830078125, "completions/mean_terminated_length": 413.28350830078125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 1.0156617071260767, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.05170854181051254, "kl": 0.027191162109375, "learning_rate": 1.823760626407377e-06, "loss": 0.0083, "num_tokens": 147875239.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 432.5067138671875, "completions/mean_terminated_length": 428.8926086425781, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 1.018794048551292, "frac_reward_zero_std": 0.9196429252624512, "grad_norm": 0.09464503079652786, "kl": 0.02911376953125, "learning_rate": 1.8104979673521838e-06, "loss": 0.0098, "num_tokens": 148342802.0, "reward": 0.09799107909202576, "reward_std": 0.004017857369035482, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9799107313156128, "rewards/format_reward/std": 0.14046260714530945, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1964.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 421.1317138671875, "completions/mean_terminated_length": 421.1317138671875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 1.0219263899765074, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.0723169818520546, "kl": 0.02642822265625, "learning_rate": 1.7972747279206482e-06, "loss": 0.0106, "num_tokens": 148788105.0, "reward": 0.09910715371370316, "reward_std": 0.0014083485584706068, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1208.0, "completions/mean_length": 409.1250305175781, "completions/mean_terminated_length": 405.4586181640625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.0250587314017228, "frac_reward_zero_std": 1.0, "grad_norm": 0.005071789491921663, "kl": 0.02880859375, "learning_rate": 1.7840914629334122e-06, "loss": 0.0003, "num_tokens": 149234073.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1978.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 413.5714416503906, "completions/mean_terminated_length": 413.5714416503906, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.0281910728269381, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.035749662667512894, "kl": 0.0279541015625, "learning_rate": 1.7709487255338731e-06, "loss": -0.0025, "num_tokens": 149670825.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1481.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 440.1294860839844, "completions/mean_terminated_length": 440.1294860839844, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 1.0313234142521535, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03024335578083992, "kl": 0.0286865234375, "learning_rate": 1.7578470671649684e-06, "loss": 0.0022, "num_tokens": 150136651.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1476.0, "completions/mean_length": 420.8660888671875, "completions/mean_terminated_length": 417.2259521484375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 1.0344557556773688, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.0715809315443039, "kl": 0.025848388671875, "learning_rate": 1.744787037546045e-06, "loss": 0.0164, "num_tokens": 150588831.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1892.0, "completions/max_terminated_length": 1892.0, "completions/mean_length": 423.7477722167969, "completions/mean_terminated_length": 423.7477722167969, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 1.0375880971025842, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.05135364085435867, "kl": 0.027191162109375, "learning_rate": 1.731769184649788e-06, "loss": 0.0009, "num_tokens": 151022814.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1940.0, "completions/max_terminated_length": 1940.0, "completions/mean_length": 421.3973388671875, "completions/mean_terminated_length": 421.3973388671875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 1.0407204385277995, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.06290540844202042, "kl": 0.0274658203125, "learning_rate": 1.7187940546792325e-06, "loss": 0.0013, "num_tokens": 151474004.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 2047.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 426.9129638671875, "completions/mean_terminated_length": 426.9129638671875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 1.0438527799530148, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.06438305228948593, "kl": 0.028228759765625, "learning_rate": 1.7058621920448465e-06, "loss": 0.002, "num_tokens": 151925357.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1273.0, "completions/max_terminated_length": 1273.0, "completions/mean_length": 426.44866943359375, "completions/mean_terminated_length": 426.44866943359375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 1.0469851213782302, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.053351372480392456, "kl": 0.029266357421875, "learning_rate": 1.6929741393416855e-06, "loss": -0.0016, "num_tokens": 152391606.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1488.0, "completions/mean_length": 426.8192138671875, "completions/mean_terminated_length": 423.1923828125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 1.0501174628034455, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.07440299540758133, "kl": 0.031707763671875, "learning_rate": 1.6801304373266286e-06, "loss": 0.0188, "num_tokens": 152851513.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1289.0, "completions/max_terminated_length": 1289.0, "completions/mean_length": 425.1629638671875, "completions/mean_terminated_length": 425.1629638671875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.0532498042286609, "frac_reward_zero_std": 1.0, "grad_norm": 0.004723084159195423, "kl": 0.02764892578125, "learning_rate": 1.667331624895689e-06, "loss": 0.0003, "num_tokens": 153307354.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1469.0, "completions/max_terminated_length": 1469.0, "completions/mean_length": 424.4464416503906, "completions/mean_terminated_length": 424.4464416503906, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 1.0563821456538762, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03821306303143501, "kl": 0.027069091796875, "learning_rate": 1.6545782390614037e-06, "loss": 0.0012, "num_tokens": 153764534.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1050.0, "completions/max_terminated_length": 1050.0, "completions/mean_length": 409.9687805175781, "completions/mean_terminated_length": 409.9687805175781, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 1.0595144870790916, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.035542406141757965, "kl": 0.027313232421875, "learning_rate": 1.6418708149302992e-06, "loss": 0.0014, "num_tokens": 154207164.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1759.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 424.83929443359375, "completions/mean_terminated_length": 424.83929443359375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.062646828504307, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03146401047706604, "kl": 0.027618408203125, "learning_rate": 1.6292098856804423e-06, "loss": 0.0006, "num_tokens": 154673172.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1617.0, "completions/max_terminated_length": 1617.0, "completions/mean_length": 420.7232360839844, "completions/mean_terminated_length": 420.7232360839844, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 1.0657791699295223, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.0520419180393219, "kl": 0.026947021484375, "learning_rate": 1.6165959825390661e-06, "loss": -0.0007, "num_tokens": 155129592.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1644.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 433.9308166503906, "completions/mean_terminated_length": 433.9308166503906, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 1.0689115113547376, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.042372722178697586, "kl": 0.027984619140625, "learning_rate": 1.604029634760284e-06, "loss": -0.0021, "num_tokens": 155596325.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1861.0, "completions/max_terminated_length": 1861.0, "completions/mean_length": 405.37725830078125, "completions/mean_terminated_length": 405.37725830078125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 1.072043852779953, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.06196702644228935, "kl": 0.02813720703125, "learning_rate": 1.59151136960288e-06, "loss": 0.0051, "num_tokens": 156031778.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1326.0, "completions/max_terminated_length": 1326.0, "completions/mean_length": 422.0848388671875, "completions/mean_terminated_length": 422.0848388671875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 1.0751761942051683, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.025487007573246956, "kl": 0.02679443359375, "learning_rate": 1.5790417123081903e-06, "loss": -0.001, "num_tokens": 156483896.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1400.0, "completions/max_terminated_length": 1400.0, "completions/mean_length": 416.3437805175781, "completions/mean_terminated_length": 416.3437805175781, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.0783085356303836, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.03959706798195839, "kl": 0.028564453125, "learning_rate": 1.5666211860780583e-06, "loss": -0.0034, "num_tokens": 156931090.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1347.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 429.47991943359375, "completions/mean_terminated_length": 429.47991943359375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 1.081440877055599, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03484117239713669, "kl": 0.025848388671875, "learning_rate": 1.5542503120528918e-06, "loss": -0.0022, "num_tokens": 157385861.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1855.0, "completions/max_terminated_length": 1855.0, "completions/mean_length": 398.62054443359375, "completions/mean_terminated_length": 398.62054443359375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 1.0845732184808143, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.04725842550396919, "kl": 0.02960205078125, "learning_rate": 1.5419296092897866e-06, "loss": -0.0003, "num_tokens": 157832231.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1316.0, "completions/max_terminated_length": 1316.0, "completions/mean_length": 417.6160888671875, "completions/mean_terminated_length": 417.6160888671875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.0877055599060297, "frac_reward_zero_std": 1.0, "grad_norm": 0.004887523129582405, "kl": 0.028900146484375, "learning_rate": 1.529659594740755e-06, "loss": 0.0003, "num_tokens": 158287667.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1494.0, "completions/mean_length": 407.52679443359375, "completions/mean_terminated_length": 403.8568115234375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 1.090837901331245, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.06294778734445572, "kl": 0.028411865234375, "learning_rate": 1.5174407832310338e-06, "loss": 0.0016, "num_tokens": 158726423.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1729.0, "completions/mean_length": 425.2790222167969, "completions/mean_terminated_length": 418.00225830078125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 1.0939702427564604, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.04328838735818863, "kl": 0.026947021484375, "learning_rate": 1.5052736874374815e-06, "loss": 0.015, "num_tokens": 159185924.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1269.0, "completions/mean_length": 426.8437805175781, "completions/mean_terminated_length": 423.2170104980469, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 1.0971025841816757, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.0718429684638977, "kl": 0.028594970703125, "learning_rate": 1.4931588178670695e-06, "loss": 0.0093, "num_tokens": 159637734.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1321.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 440.17413330078125, "completions/mean_terminated_length": 440.17413330078125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 1.100234925606891, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.07508985698223114, "kl": 0.02667236328125, "learning_rate": 1.4810966828354605e-06, "loss": 0.002, "num_tokens": 160092916.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1236.0, "completions/max_terminated_length": 1236.0, "completions/mean_length": 404.8348388671875, "completions/mean_terminated_length": 404.8348388671875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 1.1033672670321064, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.02465839497745037, "kl": 0.028900146484375, "learning_rate": 1.469087788445684e-06, "loss": -0.0018, "num_tokens": 160527574.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1834.0, "completions/mean_length": 425.0446472167969, "completions/mean_terminated_length": 421.41387939453125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 1.1064996084573218, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.07332020252943039, "kl": 0.029541015625, "learning_rate": 1.4571326385668965e-06, "loss": 0.0141, "num_tokens": 160973430.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1628.0, "completions/max_terminated_length": 1628.0, "completions/mean_length": 438.3370666503906, "completions/mean_terminated_length": 438.3370666503906, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 1.109631949882537, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.029598809778690338, "kl": 0.027587890625, "learning_rate": 1.4452317348132434e-06, "loss": -0.001, "num_tokens": 161437033.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1478.0, "completions/mean_length": 411.05804443359375, "completions/mean_terminated_length": 407.3959655761719, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 1.1127642913077525, "frac_reward_zero_std": 0.9375000596046448, "grad_norm": 0.097141794860363, "kl": 0.030609130859375, "learning_rate": 1.4333855765228104e-06, "loss": 0.012, "num_tokens": 161863943.0, "reward": 0.09843750298023224, "reward_std": 0.0031250002793967724, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12415824085474014, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1251.0, "completions/max_terminated_length": 1251.0, "completions/mean_length": 422.7901916503906, "completions/mean_terminated_length": 422.7901916503906, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 1.1158966327329678, "frac_reward_zero_std": 1.0, "grad_norm": 0.00387443695217371, "kl": 0.02728271484375, "learning_rate": 1.421594660736675e-06, "loss": 0.0003, "num_tokens": 162312365.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1172.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 427.9219055175781, "completions/mean_terminated_length": 427.9219055175781, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 1.1190289741581831, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.031242579221725464, "kl": 0.0267333984375, "learning_rate": 1.4098594821780476e-06, "loss": 0.0013, "num_tokens": 162766534.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1348.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 456.86163330078125, "completions/mean_terminated_length": 456.86163330078125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 1.1221613155833985, "frac_reward_zero_std": 1.0, "grad_norm": 0.004097921308130026, "kl": 0.0264892578125, "learning_rate": 1.3981805332315174e-06, "loss": 0.0003, "num_tokens": 163238332.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1640.0, "completions/max_terminated_length": 1640.0, "completions/mean_length": 443.044677734375, "completions/mean_terminated_length": 443.044677734375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 1.1252936570086138, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036922527942806482, "kl": 0.025543212890625, "learning_rate": 1.3865583039223929e-06, "loss": 0.0003, "num_tokens": 163708768.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 412.7031555175781, "completions/mean_terminated_length": 412.7031555175781, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 1.1284259984338294, "frac_reward_zero_std": 1.0, "grad_norm": 0.007208974100649357, "kl": 0.02764892578125, "learning_rate": 1.374993281896137e-06, "loss": 0.0003, "num_tokens": 164153331.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1446.0, "completions/max_terminated_length": 1446.0, "completions/mean_length": 414.61163330078125, "completions/mean_terminated_length": 414.61163330078125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 1.1315583398590445, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03279886394739151, "kl": 0.0272216796875, "learning_rate": 1.3634859523979134e-06, "loss": 0.0017, "num_tokens": 164596969.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 424.2723388671875, "completions/mean_terminated_length": 420.63983154296875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 1.13469068128426, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.10188231617212296, "kl": 0.030029296875, "learning_rate": 1.3520367982522208e-06, "loss": 0.0138, "num_tokens": 165033907.0, "reward": 0.09866072982549667, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507844179868698, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 449.26788330078125, "completions/mean_terminated_length": 449.26788330078125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 1.1378230227094752, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.037508007138967514, "kl": 0.0252685546875, "learning_rate": 1.3406462998426358e-06, "loss": -0.0023, "num_tokens": 165496595.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1718.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 452.4844055175781, "completions/mean_terminated_length": 452.4844055175781, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 1.1409553641346908, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036941261496394873, "kl": 0.026397705078125, "learning_rate": 1.3293149350916595e-06, "loss": 0.0003, "num_tokens": 165975512.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 2003.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 440.8526916503906, "completions/mean_terminated_length": 440.8526916503906, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 1.144087705559906, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.025066563859581947, "kl": 0.026214599609375, "learning_rate": 1.3180431794406623e-06, "loss": -0.0008, "num_tokens": 166424098.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 2016.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 431.0401916503906, "completions/mean_terminated_length": 431.0401916503906, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 1.1472200469851215, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.054079219698905945, "kl": 0.026702880859375, "learning_rate": 1.3068315058299358e-06, "loss": 0.0039, "num_tokens": 166867832.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1648.0, "completions/max_terminated_length": 1648.0, "completions/mean_length": 427.3482360839844, "completions/mean_terminated_length": 427.3482360839844, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 1.1503523884103368, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04300840571522713, "kl": 0.02777099609375, "learning_rate": 1.2956803846788503e-06, "loss": 0.0021, "num_tokens": 167315748.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 2033.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 439.27679443359375, "completions/mean_terminated_length": 439.27679443359375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 1.1534847298355522, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.06679526716470718, "kl": 0.02691650390625, "learning_rate": 1.284590283866116e-06, "loss": 0.0001, "num_tokens": 167757940.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1482.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 420.6094055175781, "completions/mean_terminated_length": 420.6094055175781, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.1566170712607675, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.032696861773729324, "kl": 0.025177001953125, "learning_rate": 1.2735616687101518e-06, "loss": -0.0002, "num_tokens": 168192021.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1376.0, "completions/max_terminated_length": 1376.0, "completions/mean_length": 427.78350830078125, "completions/mean_terminated_length": 427.78350830078125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 1.1597494126859829, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.053933847695589066, "kl": 0.02899169921875, "learning_rate": 1.2625950019495614e-06, "loss": 0.0001, "num_tokens": 168627852.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1817.0, "completions/mean_length": 446.4576110839844, "completions/mean_terminated_length": 442.8747253417969, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 1.1628817541111982, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03861292079091072, "kl": 0.02447509765625, "learning_rate": 1.251690743723718e-06, "loss": 0.0086, "num_tokens": 169084453.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1442.0, "completions/max_terminated_length": 1442.0, "completions/mean_length": 423.93975830078125, "completions/mean_terminated_length": 423.93975830078125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 1.1660140955364136, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.02309408411383629, "kl": 0.026702880859375, "learning_rate": 1.2408493515534581e-06, "loss": -0.0015, "num_tokens": 169525306.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1565.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 438.3727722167969, "completions/mean_terminated_length": 438.3727722167969, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 1.169146436961629, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.06198767200112343, "kl": 0.03106689453125, "learning_rate": 1.2300712803218834e-06, "loss": 0.0021, "num_tokens": 169980205.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1918.0, "completions/mean_length": 463.4576110839844, "completions/mean_terminated_length": 459.9127502441406, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 1.1722787783868442, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.06443637609481812, "kl": 0.024139404296875, "learning_rate": 1.2193569822552772e-06, "loss": 0.0079, "num_tokens": 170455370.0, "reward": 0.09910715371370316, "reward_std": 0.0014083485584706068, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1464.0, "completions/max_terminated_length": 1464.0, "completions/mean_length": 453.9464416503906, "completions/mean_terminated_length": 453.9464416503906, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 1.1754111198120596, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.05658130720257759, "kl": 0.02520751953125, "learning_rate": 1.2087069069041268e-06, "loss": -0.0, "num_tokens": 170923754.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1371.0, "completions/mean_length": 453.1719055175781, "completions/mean_terminated_length": 449.6040344238281, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 1.178543461237275, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 4239.47509765625, "kl": 179.2628173828125, "learning_rate": 1.1981215011242654e-06, "loss": 1.7966, "num_tokens": 171394851.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1494.0, "completions/max_terminated_length": 1494.0, "completions/mean_length": 445.9531555175781, "completions/mean_terminated_length": 445.9531555175781, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 1.1816758026624903, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.028231700882315636, "kl": 0.025299072265625, "learning_rate": 1.1876012090581184e-06, "loss": 0.0016, "num_tokens": 171860578.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1415.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 448.7812805175781, "completions/mean_terminated_length": 448.7812805175781, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 1.1848081440877056, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03244471922516823, "kl": 0.0255126953125, "learning_rate": 1.177146472116071e-06, "loss": -0.0007, "num_tokens": 172341616.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1863.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 460.9285888671875, "completions/mean_terminated_length": 460.9285888671875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 1.187940485512921, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.013095604255795479, "kl": 0.0247802734375, "learning_rate": 1.1667577289579462e-06, "loss": -0.0006, "num_tokens": 172816448.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 439.4754638671875, "completions/mean_terminated_length": 435.876953125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.1910728269381363, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.071053147315979, "kl": 0.02716064453125, "learning_rate": 1.1564354154746007e-06, "loss": 0.0142, "num_tokens": 173262989.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1388.0, "completions/max_terminated_length": 1388.0, "completions/mean_length": 436.42413330078125, "completions/mean_terminated_length": 436.42413330078125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 1.1942051683633517, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.05090510845184326, "kl": 0.024932861328125, "learning_rate": 1.146179964769635e-06, "loss": -0.0003, "num_tokens": 173720167.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1161.0, "completions/mean_length": 457.08929443359375, "completions/mean_terminated_length": 453.53021240234375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 1.197337509788567, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.05607256665825844, "kl": 0.02587890625, "learning_rate": 1.1359918071412195e-06, "loss": 0.0083, "num_tokens": 174188827.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1645.0, "completions/max_terminated_length": 1645.0, "completions/mean_length": 448.40179443359375, "completions/mean_terminated_length": 448.40179443359375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.2004698512137824, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.12926536798477173, "kl": 0.02996826171875, "learning_rate": 1.1258713700640456e-06, "loss": 0.005, "num_tokens": 174648087.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1344.0, "completions/max_terminated_length": 1344.0, "completions/mean_length": 441.0245666503906, "completions/mean_terminated_length": 441.0245666503906, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 1.2036021926389977, "frac_reward_zero_std": 1.0, "grad_norm": 0.004159407690167427, "kl": 0.02587890625, "learning_rate": 1.115819078171383e-06, "loss": 0.0003, "num_tokens": 175110850.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1411.0, "completions/max_terminated_length": 1411.0, "completions/mean_length": 458.0870666503906, "completions/mean_terminated_length": 458.0870666503906, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 1.206734534064213, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.07305943965911865, "kl": 0.03033447265625, "learning_rate": 1.1058353532372667e-06, "loss": 0.0066, "num_tokens": 175585501.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1883.0, "completions/max_terminated_length": 1883.0, "completions/mean_length": 455.029052734375, "completions/mean_terminated_length": 455.029052734375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 1.2098668754894284, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.07473640888929367, "kl": 0.033233642578125, "learning_rate": 1.0959206141587998e-06, "loss": 0.0059, "num_tokens": 176071182.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1758.0, "completions/max_terminated_length": 1758.0, "completions/mean_length": 449.7812805175781, "completions/mean_terminated_length": 449.7812805175781, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 1.2129992169146437, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032927945721894503, "kl": 0.02386474609375, "learning_rate": 1.0860752769385766e-06, "loss": 0.0002, "num_tokens": 176528288.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1751.0, "completions/max_terminated_length": 1751.0, "completions/mean_length": 458.6026916503906, "completions/mean_terminated_length": 458.6026916503906, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 1.216131558339859, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.028019176796078682, "kl": 0.025482177734375, "learning_rate": 1.0762997546672279e-06, "loss": 0.0, "num_tokens": 176993934.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1710.0, "completions/max_terminated_length": 1710.0, "completions/mean_length": 461.263427734375, "completions/mean_terminated_length": 461.263427734375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 1.2192638997650744, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.0768606886267662, "kl": 0.028900146484375, "learning_rate": 1.0665944575060914e-06, "loss": 0.0083, "num_tokens": 177457244.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1977.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 476.2388610839844, "completions/mean_terminated_length": 476.2388610839844, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 1.2223962411902898, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.06251931935548782, "kl": 0.025543212890625, "learning_rate": 1.056959792669997e-06, "loss": 0.0048, "num_tokens": 177930611.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1399.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 444.3169860839844, "completions/mean_terminated_length": 444.3169860839844, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 1.2255285826155051, "frac_reward_zero_std": 1.0, "grad_norm": 0.003558945143595338, "kl": 0.0247802734375, "learning_rate": 1.0473961644101856e-06, "loss": 0.0002, "num_tokens": 178393137.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1185.0, "completions/max_terminated_length": 1185.0, "completions/mean_length": 472.7076110839844, "completions/mean_terminated_length": 472.7076110839844, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 1.2286609240407205, "frac_reward_zero_std": 1.0, "grad_norm": 0.003436759114265442, "kl": 0.02459716796875, "learning_rate": 1.037903973997345e-06, "loss": 0.0002, "num_tokens": 178894522.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 464.4531555175781, "completions/mean_terminated_length": 460.9105224609375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 1.2317932654659358, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.05089332163333893, "kl": 0.025665283203125, "learning_rate": 1.0284836197047737e-06, "loss": 0.0109, "num_tokens": 179367097.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1600.0, "completions/max_terminated_length": 1600.0, "completions/mean_length": 467.7745666503906, "completions/mean_terminated_length": 467.7745666503906, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 1.2349256068911512, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.053179092705249786, "kl": 0.027313232421875, "learning_rate": 1.0191354967916712e-06, "loss": -0.0063, "num_tokens": 179851872.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1411.0, "completions/max_terminated_length": 1411.0, "completions/mean_length": 443.89288330078125, "completions/mean_terminated_length": 443.89288330078125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 1.2380579483163665, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04853790998458862, "kl": 0.025360107421875, "learning_rate": 1.0098599974865515e-06, "loss": 0.0019, "num_tokens": 180318912.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1518.0, "completions/max_terminated_length": 1518.0, "completions/mean_length": 458.3326110839844, "completions/mean_terminated_length": 458.3326110839844, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 1.2411902897415819, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.04246772453188896, "kl": 0.02435302734375, "learning_rate": 1.0006575109707898e-06, "loss": 0.0059, "num_tokens": 180793209.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1476.0, "completions/mean_length": 459.79913330078125, "completions/mean_terminated_length": 456.24609375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 1.2443226311667972, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.04254293814301491, "kl": 0.026397705078125, "learning_rate": 9.915284233622877e-07, "loss": 0.0026, "num_tokens": 181250279.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 469.3594055175781, "completions/mean_terminated_length": 465.8277282714844, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 1.2474549725920125, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.09480379521846771, "kl": 0.027374267578125, "learning_rate": 9.824731176992796e-07, "loss": 0.0158, "num_tokens": 181730088.0, "reward": 0.09866072982549667, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507844179868698, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1504.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 463.50225830078125, "completions/mean_terminated_length": 463.50225830078125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 1.250587314017228, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033475097734481096, "kl": 0.0240478515625, "learning_rate": 9.734919739242543e-07, "loss": 0.0002, "num_tokens": 182211061.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1689.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 467.4062805175781, "completions/mean_terminated_length": 467.4062805175781, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 1.2537196554424432, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.042104050517082214, "kl": 0.026763916015625, "learning_rate": 9.645853688680177e-07, "loss": 0.0047, "num_tokens": 182686855.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1914.0, "completions/max_terminated_length": 1914.0, "completions/mean_length": 440.43304443359375, "completions/mean_terminated_length": 440.43304443359375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 1.2568519968676586, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.055505525320768356, "kl": 0.027862548828125, "learning_rate": 9.557536762338786e-07, "loss": 0.0044, "num_tokens": 183136037.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1598.0, "completions/max_terminated_length": 1598.0, "completions/mean_length": 446.26116943359375, "completions/mean_terminated_length": 446.26116943359375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 1.259984338292874, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.0523519329726696, "kl": 0.027008056640625, "learning_rate": 9.46997266581973e-07, "loss": 0.0039, "num_tokens": 183598978.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1727.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 454.18304443359375, "completions/mean_terminated_length": 454.18304443359375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.2631166797180893, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03365373983979225, "kl": 0.024200439453125, "learning_rate": 9.383165073137115e-07, "loss": 0.0, "num_tokens": 184057156.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1565.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 441.3192138671875, "completions/mean_terminated_length": 441.3192138671875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 1.2662490211433046, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.05644859001040459, "kl": 0.0255126953125, "learning_rate": 9.297117626563687e-07, "loss": -0.0012, "num_tokens": 184513503.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1695.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 445.8370666503906, "completions/mean_terminated_length": 445.8370666503906, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 1.26938136256852, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.048435747623443604, "kl": 0.026153564453125, "learning_rate": 9.211833936477957e-07, "loss": 0.0029, "num_tokens": 184972842.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1136.0, "completions/max_terminated_length": 1136.0, "completions/mean_length": 459.3906555175781, "completions/mean_terminated_length": 459.3906555175781, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 1.2725137039937353, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03401060029864311, "kl": 0.02435302734375, "learning_rate": 9.127317581212753e-07, "loss": 0.0004, "num_tokens": 185448277.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1911.0, "completions/max_terminated_length": 1911.0, "completions/mean_length": 441.9263610839844, "completions/mean_terminated_length": 441.9263610839844, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 1.2756460454189507, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.07122519612312317, "kl": 0.025299072265625, "learning_rate": 9.043572106905084e-07, "loss": 0.0053, "num_tokens": 185896976.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1441.0, "completions/max_terminated_length": 1441.0, "completions/mean_length": 453.7410888671875, "completions/mean_terminated_length": 453.7410888671875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 1.278778386844166, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.27351024746894836, "kl": 0.085845947265625, "learning_rate": 8.960601027347321e-07, "loss": 0.0038, "num_tokens": 186366140.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1471.0, "completions/mean_length": 453.0870666503906, "completions/mean_terminated_length": 449.5190124511719, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.2819107282693813, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.06908712536096573, "kl": 0.0272216796875, "learning_rate": 8.878407823839788e-07, "loss": 0.0082, "num_tokens": 186852767.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1666.0, "completions/mean_length": 466.5781555175781, "completions/mean_terminated_length": 463.040283203125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 1.2850430696945967, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03923846408724785, "kl": 0.02655029296875, "learning_rate": 8.796995945044689e-07, "loss": 0.0038, "num_tokens": 187332070.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1423.0, "completions/max_terminated_length": 1423.0, "completions/mean_length": 453.7254638671875, "completions/mean_terminated_length": 453.7254638671875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 1.288175411119812, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.05868186056613922, "kl": 0.02703857421875, "learning_rate": 8.716368806841405e-07, "loss": 0.0028, "num_tokens": 187800359.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1432.0, "completions/max_terminated_length": 1432.0, "completions/mean_length": 436.4933166503906, "completions/mean_terminated_length": 436.4933166503906, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.2913077525450274, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03568020090460777, "kl": 0.024658203125, "learning_rate": 8.636529792183171e-07, "loss": 0.0021, "num_tokens": 188251892.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1418.0, "completions/max_terminated_length": 1418.0, "completions/mean_length": 432.0535888671875, "completions/mean_terminated_length": 432.0535888671875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 1.2944400939702427, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.07019329816102982, "kl": 0.026947021484375, "learning_rate": 8.557482250955144e-07, "loss": 0.0023, "num_tokens": 188693628.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1373.0, "completions/max_terminated_length": 1373.0, "completions/mean_length": 454.9598388671875, "completions/mean_terminated_length": 454.9598388671875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 1.297572435395458, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.0755876675248146, "kl": 0.026885986328125, "learning_rate": 8.479229499833844e-07, "loss": 0.0005, "num_tokens": 189176158.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1480.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 426.8571472167969, "completions/mean_terminated_length": 426.8571472167969, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 1.3007047768206734, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.0657104030251503, "kl": 0.027923583984375, "learning_rate": 8.401774822147976e-07, "loss": 0.0033, "num_tokens": 189626370.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1534.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 442.7276916503906, "completions/mean_terminated_length": 442.7276916503906, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 1.3038371182458888, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.0672403946518898, "kl": 0.026824951171875, "learning_rate": 8.325121467740695e-07, "loss": 0.0041, "num_tokens": 190094472.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1937.0, "completions/mean_length": 434.4844055175781, "completions/mean_terminated_length": 430.8747253417969, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 1.3069694596711041, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.07434650510549545, "kl": 0.02740478515625, "learning_rate": 8.249272652833226e-07, "loss": 0.0014, "num_tokens": 190554113.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1387.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 428.8638610839844, "completions/mean_terminated_length": 428.8638610839844, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 1.3101018010963195, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.07310064136981964, "kl": 0.02801513671875, "learning_rate": 8.174231559889931e-07, "loss": 0.0111, "num_tokens": 191002764.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1473.0, "completions/mean_length": 430.83929443359375, "completions/mean_terminated_length": 427.2214660644531, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 1.3132341425215348, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.08830343186855316, "kl": 0.02764892578125, "learning_rate": 8.100001337484787e-07, "loss": 0.0065, "num_tokens": 191448872.0, "reward": 0.09866072237491608, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507843434810638, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1621.0, "completions/max_terminated_length": 1621.0, "completions/mean_length": 407.5535888671875, "completions/mean_terminated_length": 407.5535888671875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 1.3163664839467502, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.10848723351955414, "kl": 0.03021240234375, "learning_rate": 8.026585100169251e-07, "loss": 0.0144, "num_tokens": 191893124.0, "reward": 0.09866072237491608, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507844179868698, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1200.0, "completions/max_terminated_length": 1200.0, "completions/mean_length": 431.12725830078125, "completions/mean_terminated_length": 431.12725830078125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 1.3194988253719655, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.07124645262956619, "kl": 0.02691650390625, "learning_rate": 7.953985928341601e-07, "loss": 0.0034, "num_tokens": 192346897.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1556.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 438.53350830078125, "completions/mean_terminated_length": 438.53350830078125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 1.3226311667971808, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.06490432471036911, "kl": 0.027740478515625, "learning_rate": 7.882206868117693e-07, "loss": 0.0038, "num_tokens": 192815312.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1598.0, "completions/max_terminated_length": 1598.0, "completions/mean_length": 420.66741943359375, "completions/mean_terminated_length": 420.66741943359375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 1.3257635082223962, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.05128738284111023, "kl": 0.027435302734375, "learning_rate": 7.81125093120313e-07, "loss": 0.0026, "num_tokens": 193260719.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1324.0, "completions/max_terminated_length": 1324.0, "completions/mean_length": 414.3415222167969, "completions/mean_terminated_length": 414.3415222167969, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 1.3288958496476115, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.0690830796957016, "kl": 0.0296630859375, "learning_rate": 7.741121094766916e-07, "loss": 0.0044, "num_tokens": 193698564.0, "reward": 0.09866072237491608, "reward_std": 0.0023012058809399605, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507844179868698, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1258.0, "completions/max_terminated_length": 1258.0, "completions/mean_length": 435.1451110839844, "completions/mean_terminated_length": 435.1451110839844, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 1.3320281910728269, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04790590703487396, "kl": 0.027008056640625, "learning_rate": 7.671820301316532e-07, "loss": 0.0036, "num_tokens": 194148169.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1393.0, "completions/max_terminated_length": 1393.0, "completions/mean_length": 422.7901916503906, "completions/mean_terminated_length": 422.7901916503906, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 1.3351605324980422, "frac_reward_zero_std": 0.9375000596046448, "grad_norm": 0.08871312439441681, "kl": 0.03045654296875, "learning_rate": 7.603351458574474e-07, "loss": 0.0053, "num_tokens": 194604583.0, "reward": 0.09843750298023224, "reward_std": 0.0031250000465661287, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12415824085474014, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1548.0, "completions/max_terminated_length": 1548.0, "completions/mean_length": 432.79913330078125, "completions/mean_terminated_length": 432.79913330078125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 1.3382928739232576, "frac_reward_zero_std": 0.9196429252624512, "grad_norm": 0.11320719867944717, "kl": 0.028656005859375, "learning_rate": 7.535717439356255e-07, "loss": 0.0158, "num_tokens": 195061789.0, "reward": 0.09799107909202576, "reward_std": 0.004017857369035482, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9799107313156128, "rewards/format_reward/std": 0.14046260714530945, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1466.0, "completions/max_terminated_length": 1466.0, "completions/mean_length": 421.4531555175781, "completions/mean_terminated_length": 421.4531555175781, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 1.341425215348473, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.06767603754997253, "kl": 0.02557373046875, "learning_rate": 7.46892108144986e-07, "loss": 0.0004, "num_tokens": 195498096.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1220.0, "completions/max_terminated_length": 1220.0, "completions/mean_length": 431.27679443359375, "completions/mean_terminated_length": 431.27679443359375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 1.3445575567736883, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.06242584437131882, "kl": 0.027191162109375, "learning_rate": 7.402965187496697e-07, "loss": 0.0034, "num_tokens": 195961772.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1308.0, "completions/mean_length": 413.92413330078125, "completions/mean_terminated_length": 410.2684631347656, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 1.3476898981989036, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.0716230496764183, "kl": 0.029052734375, "learning_rate": 7.337852524873974e-07, "loss": 0.0103, "num_tokens": 196416974.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1725.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 422.93304443359375, "completions/mean_terminated_length": 422.93304443359375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 1.350822239624119, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.08740809559822083, "kl": 0.028961181640625, "learning_rate": 7.273585825578608e-07, "loss": 0.0011, "num_tokens": 196877768.0, "reward": 0.09866072982549667, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507844179868698, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1283.0, "completions/max_terminated_length": 1283.0, "completions/mean_length": 412.18975830078125, "completions/mean_terminated_length": 412.18975830078125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 1.3539545810493343, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.06361445039510727, "kl": 0.02813720703125, "learning_rate": 7.21016778611259e-07, "loss": 0.0004, "num_tokens": 197316421.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1471.0, "completions/max_terminated_length": 1471.0, "completions/mean_length": 403.0937805175781, "completions/mean_terminated_length": 403.0937805175781, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 1.3570869224745497, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.08929287642240524, "kl": 0.027557373046875, "learning_rate": 7.147601067369835e-07, "loss": 0.0099, "num_tokens": 197739623.0, "reward": 0.09866072982549667, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507843434810638, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1685.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 416.4576110839844, "completions/mean_terminated_length": 416.4576110839844, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 1.360219263899765, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.06961924582719803, "kl": 0.02838134765625, "learning_rate": 7.085888294524561e-07, "loss": -0.0013, "num_tokens": 198192680.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1282.0, "completions/max_terminated_length": 1282.0, "completions/mean_length": 442.9219055175781, "completions/mean_terminated_length": 442.9219055175781, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 1.3633516053249803, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.06558452546596527, "kl": 0.02557373046875, "learning_rate": 7.025032056921117e-07, "loss": 0.0042, "num_tokens": 198674609.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1676.0, "completions/max_terminated_length": 1676.0, "completions/mean_length": 427.9888610839844, "completions/mean_terminated_length": 427.9888610839844, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 1.3664839467501957, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.09540320932865143, "kl": 0.02960205078125, "learning_rate": 6.965034907965349e-07, "loss": 0.0093, "num_tokens": 199136684.0, "reward": 0.09866072982549667, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507843434810638, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1388.0, "completions/mean_length": 436.1919860839844, "completions/mean_terminated_length": 432.58612060546875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 1.3696162881754113, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.0718950480222702, "kl": 0.0272216796875, "learning_rate": 6.905899365017462e-07, "loss": 0.0129, "num_tokens": 199607510.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1306.0, "completions/max_terminated_length": 1306.0, "completions/mean_length": 405.8727722167969, "completions/mean_terminated_length": 405.8727722167969, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 1.3727486296006264, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.024640727788209915, "kl": 0.026275634765625, "learning_rate": 6.847627909286409e-07, "loss": -0.0006, "num_tokens": 200041333.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1917.0, "completions/max_terminated_length": 1917.0, "completions/mean_length": 415.66741943359375, "completions/mean_terminated_length": 415.66741943359375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 1.375880971025842, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.05953597277402878, "kl": 0.02655029296875, "learning_rate": 6.790222985725761e-07, "loss": 0.0028, "num_tokens": 200496668.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1555.0, "completions/max_terminated_length": 1555.0, "completions/mean_length": 427.0535888671875, "completions/mean_terminated_length": 427.0535888671875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 1.379013312451057, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.07108840346336365, "kl": 0.0296630859375, "learning_rate": 6.733687002931141e-07, "loss": 0.0071, "num_tokens": 200950320.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1217.0, "completions/mean_length": 426.72991943359375, "completions/mean_terminated_length": 423.1029052734375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 1.3821456538762726, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.0630938857793808, "kl": 0.027557373046875, "learning_rate": 6.678022333039158e-07, "loss": 0.0088, "num_tokens": 201403815.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1401.0, "completions/max_terminated_length": 1401.0, "completions/mean_length": 409.4665222167969, "completions/mean_terminated_length": 409.4665222167969, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 1.3852779953014878, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.07846879959106445, "kl": 0.03106689453125, "learning_rate": 6.623231311627876e-07, "loss": 0.0054, "num_tokens": 201850076.0, "reward": 0.09888394176959991, "reward_std": 0.0018547771032899618, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1874.0, "completions/max_terminated_length": 1874.0, "completions/mean_length": 426.5067138671875, "completions/mean_terminated_length": 426.5067138671875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 1.3884103367267033, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.08503103256225586, "kl": 0.037628173828125, "learning_rate": 6.569316237618811e-07, "loss": -0.0032, "num_tokens": 202305903.0, "reward": 0.09866072237491608, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507843434810638, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1520.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 421.99554443359375, "completions/mean_terminated_length": 421.99554443359375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 1.3915426781519185, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.059757303446531296, "kl": 0.02886962890625, "learning_rate": 6.516279373180499e-07, "loss": 0.0054, "num_tokens": 202759353.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1546.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 424.49554443359375, "completions/mean_terminated_length": 424.49554443359375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.394675019577134, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.05640324950218201, "kl": 0.025787353515625, "learning_rate": 6.464122943633543e-07, "loss": 0.0031, "num_tokens": 203206075.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1529.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 444.4375305175781, "completions/mean_terminated_length": 444.4375305175781, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 1.3978073610023491, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.08702774345874786, "kl": 0.026580810546875, "learning_rate": 6.412849137357271e-07, "loss": 0.0094, "num_tokens": 203663507.0, "reward": 0.09866072237491608, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507844179868698, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1431.0, "completions/mean_length": 455.7857360839844, "completions/mean_terminated_length": 452.2237243652344, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 1.4009397024275647, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.07445820420980453, "kl": 0.023345947265625, "learning_rate": 6.3624601056979e-07, "loss": 0.0104, "num_tokens": 204122319.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1444.0, "completions/mean_length": 442.6562805175781, "completions/mean_terminated_length": 439.06488037109375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 1.4040720438527798, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.049604322761297226, "kl": 0.025115966796875, "learning_rate": 6.312957962878278e-07, "loss": 0.0074, "num_tokens": 204571741.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1397.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 437.9821472167969, "completions/mean_terminated_length": 437.9821472167969, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 1.4072043852779954, "frac_reward_zero_std": 0.9285714626312256, "grad_norm": 0.11870156228542328, "kl": 0.045867919921875, "learning_rate": 6.264344785909181e-07, "loss": 0.0063, "num_tokens": 205017297.0, "reward": 0.098214291036129, "reward_std": 0.0035714288242161274, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.13258016109466553, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1462.0, "completions/max_terminated_length": 1462.0, "completions/mean_length": 450.0848388671875, "completions/mean_terminated_length": 450.0848388671875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 1.4103367267032105, "frac_reward_zero_std": 1.0, "grad_norm": 0.010148593224585056, "kl": 0.02569580078125, "learning_rate": 6.216622614502149e-07, "loss": 0.0003, "num_tokens": 205487851.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1321.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 458.7723388671875, "completions/mean_terminated_length": 458.7723388671875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 1.413469068128426, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.059165503829717636, "kl": 0.02703857421875, "learning_rate": 6.169793450983916e-07, "loss": 0.0051, "num_tokens": 205959241.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1461.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 445.497802734375, "completions/mean_terminated_length": 445.497802734375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 1.4166014095536412, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.06916095316410065, "kl": 0.0283203125, "learning_rate": 6.123859260212393e-07, "loss": 0.0019, "num_tokens": 206432024.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349845170975, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1439.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 460.4687805175781, "completions/mean_terminated_length": 460.4687805175781, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 1.4197337509788568, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04256457835435867, "kl": 0.02606201171875, "learning_rate": 6.07882196949423e-07, "loss": 0.0029, "num_tokens": 206913842.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1470.0, "completions/max_terminated_length": 1470.0, "completions/mean_length": 447.9419860839844, "completions/mean_terminated_length": 447.9419860839844, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 1.422866092404072, "frac_reward_zero_std": 0.9375000596046448, "grad_norm": 0.10990621149539948, "kl": 0.029937744140625, "learning_rate": 6.034683468503948e-07, "loss": 0.0065, "num_tokens": 207367460.0, "reward": 0.09843751043081284, "reward_std": 0.0031250002793967724, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12415824085474014, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1863.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 468.3995666503906, "completions/mean_terminated_length": 468.3995666503906, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 1.4259984338292875, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.06326734274625778, "kl": 0.027252197265625, "learning_rate": 5.991445609204641e-07, "loss": 0.0085, "num_tokens": 207852623.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1436.0, "completions/mean_length": 446.8482360839844, "completions/mean_terminated_length": 443.2662353515625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 1.4291307752545026, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.06092901900410652, "kl": 0.026092529296875, "learning_rate": 5.949110205770292e-07, "loss": 0.0076, "num_tokens": 208313895.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 468.3281555175781, "completions/mean_terminated_length": 464.794189453125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 1.4322631166797182, "frac_reward_zero_std": 0.9375000596046448, "grad_norm": 0.08804488182067871, "kl": 0.0316162109375, "learning_rate": 5.90767903450964e-07, "loss": 0.0099, "num_tokens": 208794674.0, "reward": 0.098214291036129, "reward_std": 0.0031940629705786705, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.13258016109466553, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1345.0, "completions/max_terminated_length": 1345.0, "completions/mean_length": 464.2901916503906, "completions/mean_terminated_length": 464.2901916503906, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 1.4353954581049335, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.0663062185049057, "kl": 0.02667236328125, "learning_rate": 5.867153833791652e-07, "loss": 0.0029, "num_tokens": 209281904.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 465.3504638671875, "completions/mean_terminated_length": 461.8098449707031, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 1.4385277995301489, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04055793955922127, "kl": 0.025177001953125, "learning_rate": 5.827536303972587e-07, "loss": 0.0069, "num_tokens": 209754565.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1382.0, "completions/max_terminated_length": 1382.0, "completions/mean_length": 443.779052734375, "completions/mean_terminated_length": 443.779052734375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 1.4416601409553642, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.06736601144075394, "kl": 0.02728271484375, "learning_rate": 5.78882810732465e-07, "loss": -0.0009, "num_tokens": 210207590.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1353.0, "completions/max_terminated_length": 1353.0, "completions/mean_length": 440.22991943359375, "completions/mean_terminated_length": 440.22991943359375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 1.4447924823805796, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.07810962945222855, "kl": 0.0262451171875, "learning_rate": 5.75103086796625e-07, "loss": 0.005, "num_tokens": 210664017.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1930.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 461.6785888671875, "completions/mean_terminated_length": 461.6785888671875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 1.447924823805795, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04274836927652359, "kl": 0.02386474609375, "learning_rate": 5.714146171793846e-07, "loss": 0.0007, "num_tokens": 211127353.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1662.0, "completions/max_terminated_length": 1662.0, "completions/mean_length": 449.32366943359375, "completions/mean_terminated_length": 449.32366943359375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 1.4510571652310102, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.05472171679139137, "kl": 0.026123046875, "learning_rate": 5.678175566415422e-07, "loss": 0.0037, "num_tokens": 211594386.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1532.0, "completions/max_terminated_length": 1532.0, "completions/mean_length": 474.3214416503906, "completions/mean_terminated_length": 474.3214416503906, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 1.4541895066562256, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.07211575657129288, "kl": 0.026641845703125, "learning_rate": 5.643120561085528e-07, "loss": 0.0065, "num_tokens": 212081818.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1256.0, "completions/max_terminated_length": 1256.0, "completions/mean_length": 443.19866943359375, "completions/mean_terminated_length": 443.19866943359375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 1.457321848081441, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.047176580876111984, "kl": 0.028289794921875, "learning_rate": 5.608982626641991e-07, "loss": -0.0003, "num_tokens": 212553635.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1588.0, "completions/mean_length": 455.575927734375, "completions/mean_terminated_length": 452.013427734375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 1.4604541895066563, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.08354708552360535, "kl": 0.0257568359375, "learning_rate": 5.575763195444166e-07, "loss": 0.0119, "num_tokens": 213017729.0, "reward": 0.09866072982549667, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507843434810638, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1544.0, "completions/mean_length": 469.3058166503906, "completions/mean_terminated_length": 465.7740478515625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 1.4635865309318716, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.06769128888845444, "kl": 0.02813720703125, "learning_rate": 5.543463661312847e-07, "loss": 0.0052, "num_tokens": 213491974.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1441.0, "completions/max_terminated_length": 1441.0, "completions/mean_length": 425.0781555175781, "completions/mean_terminated_length": 425.0781555175781, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 1.466718872357087, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.057655010372400284, "kl": 0.026214599609375, "learning_rate": 5.512085379471808e-07, "loss": 0.0018, "num_tokens": 213934205.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1512.0, "completions/max_terminated_length": 1512.0, "completions/mean_length": 416.90179443359375, "completions/mean_terminated_length": 416.90179443359375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 1.4698512137823023, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.08374013751745224, "kl": 0.027679443359375, "learning_rate": 5.481629666490903e-07, "loss": 0.0034, "num_tokens": 214365541.0, "reward": 0.09866072237491608, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507843434810638, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1216.0, "completions/max_terminated_length": 1216.0, "completions/mean_length": 445.8437805175781, "completions/mean_terminated_length": 445.8437805175781, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 1.4729835552075177, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04593599587678909, "kl": 0.025390625, "learning_rate": 5.452097800230853e-07, "loss": 0.0005, "num_tokens": 214838603.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 420.32366943359375, "completions/mean_terminated_length": 420.32366943359375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 1.476115896632733, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.0593700185418129, "kl": 0.03009033203125, "learning_rate": 5.423491019789623e-07, "loss": 0.0026, "num_tokens": 215286008.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843171834946, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1451.0, "completions/max_terminated_length": 1451.0, "completions/mean_length": 439.75225830078125, "completions/mean_terminated_length": 439.75225830078125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 1.4792482380579484, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.07813020050525665, "kl": 0.027099609375, "learning_rate": 5.395810525450425e-07, "loss": 0.0033, "num_tokens": 215745513.0, "reward": 0.09888393431901932, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1208.0, "completions/max_terminated_length": 1208.0, "completions/mean_length": 428.3058166503906, "completions/mean_terminated_length": 428.3058166503906, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 1.4823805794831637, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.0518001914024353, "kl": 0.027008056640625, "learning_rate": 5.369057478631359e-07, "loss": 0.0011, "num_tokens": 216187810.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1829.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 443.8482360839844, "completions/mean_terminated_length": 443.8482360839844, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 1.485512920908379, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.06536412239074707, "kl": 0.030029296875, "learning_rate": 5.343233001836694e-07, "loss": 0.0049, "num_tokens": 216641034.0, "reward": 0.09888393431901932, "reward_std": 0.0018547771032899618, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1488.0, "completions/mean_length": 435.6451110839844, "completions/mean_terminated_length": 432.03802490234375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 1.4886452623335944, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.08661390095949173, "kl": 0.02862548828125, "learning_rate": 5.318338178609754e-07, "loss": 0.0169, "num_tokens": 217096699.0, "reward": 0.09866072237491608, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507843434810638, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1377.0, "completions/max_terminated_length": 1377.0, "completions/mean_length": 437.72100830078125, "completions/mean_terminated_length": 437.72100830078125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 1.4917776037588097, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.058107640594244, "kl": 0.0294189453125, "learning_rate": 5.294374053487459e-07, "loss": 0.0052, "num_tokens": 217550698.0, "reward": 0.09910715371370316, "reward_std": 0.0014083485584706068, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1534.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 436.2165222167969, "completions/mean_terminated_length": 436.2165222167969, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.494909945184025, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.042302459478378296, "kl": 0.029144287109375, "learning_rate": 5.271341631956511e-07, "loss": 0.0007, "num_tokens": 217994795.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1716.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 437.2723388671875, "completions/mean_terminated_length": 437.2723388671875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.4980422866092404, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.05731819570064545, "kl": 0.030487060546875, "learning_rate": 5.249241880411181e-07, "loss": 0.0024, "num_tokens": 218446421.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1569.0, "completions/max_terminated_length": 1569.0, "completions/mean_length": 430.51788330078125, "completions/mean_terminated_length": 430.51788330078125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 1.5011746280344558, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.05495809391140938, "kl": 0.02838134765625, "learning_rate": 5.228075726112785e-07, "loss": 0.0044, "num_tokens": 218906733.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1399.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 453.7879638671875, "completions/mean_terminated_length": 453.7879638671875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.5043069694596711, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.07989178597927094, "kl": 0.026092529296875, "learning_rate": 5.207844057150768e-07, "loss": 0.0053, "num_tokens": 219383226.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1314.0, "completions/mean_length": 459.0089416503906, "completions/mean_terminated_length": 455.4541320800781, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 1.5074393108848865, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.07920684665441513, "kl": 0.024871826171875, "learning_rate": 5.188547722405437e-07, "loss": 0.0108, "num_tokens": 219854206.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1673.0, "completions/max_terminated_length": 1673.0, "completions/mean_length": 425.8526916503906, "completions/mean_terminated_length": 425.8526916503906, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 1.5105716523101018, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.05114280432462692, "kl": 0.030029296875, "learning_rate": 5.170187531512351e-07, "loss": 0.0016, "num_tokens": 220304880.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1553.0, "completions/max_terminated_length": 1553.0, "completions/mean_length": 436.6719055175781, "completions/mean_terminated_length": 436.6719055175781, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 1.5137039937353172, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.047800514847040176, "kl": 0.025115966796875, "learning_rate": 5.152764254828348e-07, "loss": 0.0014, "num_tokens": 220768993.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1508.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 431.56475830078125, "completions/mean_terminated_length": 431.56475830078125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 1.5168363351605325, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.05218655616044998, "kl": 0.028350830078125, "learning_rate": 5.136278623399225e-07, "loss": 0.0014, "num_tokens": 221222558.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1376.0, "completions/max_terminated_length": 1376.0, "completions/mean_length": 458.716552734375, "completions/mean_terminated_length": 458.716552734375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 1.5199686765857479, "frac_reward_zero_std": 0.9285714626312256, "grad_norm": 0.08404982089996338, "kl": 0.02899169921875, "learning_rate": 5.120731328929058e-07, "loss": 0.0061, "num_tokens": 221687423.0, "reward": 0.098214291036129, "reward_std": 0.0035714288242161274, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.13258016109466553, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1209.0, "completions/mean_length": 433.43304443359375, "completions/mean_terminated_length": 429.821044921875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 1.5231010180109632, "frac_reward_zero_std": 0.9375000596046448, "grad_norm": 0.1080755963921547, "kl": 0.029510498046875, "learning_rate": 5.106123023751187e-07, "loss": 0.0192, "num_tokens": 222141357.0, "reward": 0.09843750298023224, "reward_std": 0.0031250000465661287, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12415824085474014, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1452.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 442.9375305175781, "completions/mean_terminated_length": 442.9375305175781, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 1.5262333594361785, "frac_reward_zero_std": 0.973214328289032, "grad_norm": 0.05659080296754837, "kl": 0.02691650390625, "learning_rate": 5.092454320800833e-07, "loss": -0.0002, "num_tokens": 222609185.0, "reward": 0.0993303582072258, "reward_std": 0.0013392858672887087, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9933035969734192, "rewards/format_reward/std": 0.08164843916893005, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 451.060302734375, "completions/mean_terminated_length": 447.4877014160156, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 1.529365700861394, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.06833017617464066, "kl": 0.0281982421875, "learning_rate": 5.079725793589405e-07, "loss": 0.0011, "num_tokens": 223069096.0, "reward": 0.09888394176959991, "reward_std": 0.0018547771032899618, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1223.0, "completions/max_terminated_length": 1223.0, "completions/mean_length": 431.68975830078125, "completions/mean_terminated_length": 431.68975830078125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 1.5324980422866092, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.07133956998586655, "kl": 0.032928466796875, "learning_rate": 5.067937976180407e-07, "loss": 0.0037, "num_tokens": 223514925.0, "reward": 0.09866072237491608, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507843434810638, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1961.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 428.14288330078125, "completions/mean_terminated_length": 428.14288330078125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 1.5356303837118246, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.07541512697935104, "kl": 0.02886962890625, "learning_rate": 5.057091363167046e-07, "loss": 0.0006, "num_tokens": 223969965.0, "reward": 0.09866072237491608, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507844179868698, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1469.0, "completions/max_terminated_length": 1469.0, "completions/mean_length": 430.8906555175781, "completions/mean_terminated_length": 430.8906555175781, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 1.53876272513704, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.0861428752541542, "kl": 0.0291748046875, "learning_rate": 5.047186409651489e-07, "loss": 0.0059, "num_tokens": 224415660.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1295.0, "completions/max_terminated_length": 1295.0, "completions/mean_length": 432.9754638671875, "completions/mean_terminated_length": 432.9754638671875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 1.5418950665622553, "frac_reward_zero_std": 0.9642857313156128, "grad_norm": 0.0699593797326088, "kl": 0.028656005859375, "learning_rate": 5.038223531225742e-07, "loss": 0.0056, "num_tokens": 224868149.0, "reward": 0.09910715371370316, "reward_std": 0.0017857144121080637, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09417349100112915, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 2013.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 444.3035888671875, "completions/mean_terminated_length": 444.3035888671875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 1.5450274079874706, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.07217267155647278, "kl": 0.029327392578125, "learning_rate": 5.030203103954232e-07, "loss": -0.0008, "num_tokens": 225334545.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1624.0, "completions/max_terminated_length": 1624.0, "completions/mean_length": 447.482177734375, "completions/mean_terminated_length": 447.482177734375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 1.548159749412686, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.057098113000392914, "kl": 0.026824951171875, "learning_rate": 5.023125464358026e-07, "loss": 0.0046, "num_tokens": 225817161.0, "reward": 0.09955357760190964, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 2043.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 446.466552734375, "completions/mean_terminated_length": 446.466552734375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 1.5512920908379013, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.04939984530210495, "kl": 0.025482177734375, "learning_rate": 5.016990909400709e-07, "loss": 0.0021, "num_tokens": 226293090.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1311.0, "completions/max_terminated_length": 1311.0, "completions/mean_length": 432.0446472167969, "completions/mean_terminated_length": 432.0446472167969, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 1.5544244322631167, "frac_reward_zero_std": 0.9553571939468384, "grad_norm": 0.09384603798389435, "kl": 0.028228759765625, "learning_rate": 5.011799696475915e-07, "loss": 0.0054, "num_tokens": 226745698.0, "reward": 0.09888394176959991, "reward_std": 0.0022321429569274187, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9888392686843872, "rewards/format_reward/std": 0.10517053306102753, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1795.0, "completions/max_terminated_length": 1795.0, "completions/mean_length": 441.36163330078125, "completions/mean_terminated_length": 441.36163330078125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 1.557556773688332, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.03569803759455681, "kl": 0.02642822265625, "learning_rate": 5.007552043396547e-07, "loss": -0.0011, "num_tokens": 227203936.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1356.0, "completions/max_terminated_length": 1356.0, "completions/mean_length": 434.9151916503906, "completions/mean_terminated_length": 434.9151916503906, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 1.5606891151135474, "frac_reward_zero_std": 0.9464285969734192, "grad_norm": 0.07235594838857651, "kl": 0.027801513671875, "learning_rate": 5.004248128385618e-07, "loss": -0.0044, "num_tokens": 227652210.0, "reward": 0.09866072237491608, "reward_std": 0.0026785717345774174, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9866071343421936, "rewards/format_reward/std": 0.11507843434810638, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1384.0, "completions/max_terminated_length": 1384.0, "completions/mean_length": 445.2656555175781, "completions/mean_terminated_length": 445.2656555175781, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 1.5638214565387627, "frac_reward_zero_std": 0.9910714626312256, "grad_norm": 0.024015244096517563, "kl": 0.028961181640625, "learning_rate": 5.001888090068784e-07, "loss": -0.0016, "num_tokens": 228113421.0, "reward": 0.09977678954601288, "reward_std": 0.0004464286030270159, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9977678656578064, "rewards/format_reward/std": 0.047245558351278305, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 1287.0, "completions/max_terminated_length": 1287.0, "completions/mean_length": 424.9977722167969, "completions/mean_terminated_length": 424.9977722167969, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 1.566953797963978, "frac_reward_zero_std": 0.9821429252624512, "grad_norm": 0.05101495981216431, "kl": 0.0252685546875, "learning_rate": 5.000472027468528e-07, "loss": 0.0024, "num_tokens": 228561876.0, "reward": 0.09955357015132904, "reward_std": 0.0008928572060540318, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.9955357313156128, "rewards/format_reward/std": 0.06674052774906158, "step": 500 }, { "epoch": 1.566953797963978, "step": 500, "total_flos": 0.0, "train_loss": 0.007788732183169486, "train_runtime": 41670.176, "train_samples_per_second": 5.376, "train_steps_per_second": 0.012 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 228561876, "num_train_epochs": 2, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }