{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6078943749140421, "eval_steps": 500, "global_step": 8840, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 422.65, "completions/clipped_ratio": 0.025, "completions/max_length": 835.6, "completions/max_terminated_length": 738.6, "completions/mean_length": 422.65, "completions/mean_terminated_length": 395.875, "completions/min_length": 131.9, "completions/min_terminated_length": 131.9, "epoch": 0.0006876633200385092, "frac_reward_zero_std": 0.3, "grad_norm": 2.2393298149108887, "kl": 1.021408660709858, "learning_rate": 4.722222222222222e-06, "loss": 0.001, "num_tokens": 26778.0, "reward": 4.25, "reward_std": 0.35469383001327515, "rewards/check_coherence/mean": 0.5625, "rewards/check_coherence/std": 0.3129152894020081, "rewards/check_response_quality/mean": 2.0875, "rewards/check_response_quality/std": 0.17355985641479493, "rewards/match_format_approximately/mean": 0.6, "rewards/match_format_approximately/std": 0.15773502588272095, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 10 }, { "completion_length": 334.6, "completions/clipped_ratio": 0.025, "completions/max_length": 824.9, "completions/max_terminated_length": 713.2, "completions/mean_length": 334.6, "completions/mean_terminated_length": 299.05833435058594, "completions/min_length": 50.2, "completions/min_terminated_length": 50.2, "epoch": 0.0013753266400770184, "frac_reward_zero_std": 0.1, "grad_norm": 5.82886266708374, "kl": 1.5751606613397597, "learning_rate": 2.3750000000000003e-07, "loss": 0.0016, "num_tokens": 51474.0, "reward": 4.4375, "reward_std": 0.4622055947780609, "rewards/check_coherence/mean": 0.4875, "rewards/check_coherence/std": 0.33231321573257444, "rewards/check_response_quality/mean": 2.2125, "rewards/check_response_quality/std": 0.2633414089679718, "rewards/match_format_approximately/mean": 0.7375, "rewards/match_format_approximately/std": 0.21933756470680238, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 20 }, { "completion_length": 299.05, "completions/clipped_ratio": 0.025, "completions/max_length": 684.5, "completions/max_terminated_length": 582.1, "completions/mean_length": 299.05, "completions/mean_terminated_length": 268.775, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0020629899601155273, "frac_reward_zero_std": 0.0, "grad_norm": 1.6133402585983276, "kl": 1.2915641874074937, "learning_rate": 4.1428571428571435e-06, "loss": 0.0013, "num_tokens": 75404.0, "reward": 4.5625, "reward_std": 0.6107304871082306, "rewards/check_coherence/mean": 0.6125, "rewards/check_coherence/std": 0.403445702791214, "rewards/check_response_quality/mean": 2.2125, "rewards/check_response_quality/std": 0.2752987265586853, "rewards/match_format_approximately/mean": 0.7375, "rewards/match_format_approximately/std": 0.2404700517654419, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 30 }, { "completion_length": 347.025, "completions/clipped_ratio": 0.075, "completions/max_length": 862.6, "completions/max_terminated_length": 509.7, "completions/mean_length": 347.025, "completions/mean_terminated_length": 242.34166870117187, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.0027506532801540367, "frac_reward_zero_std": 0.1, "grad_norm": 2.190549850463867, "kl": 0.8691788390278816, "learning_rate": 4.936507936507937e-06, "loss": 0.0009, "num_tokens": 102089.0, "reward": 4.775, "reward_std": 0.5918105363845825, "rewards/check_coherence/mean": 0.85, "rewards/check_coherence/std": 0.345650315284729, "rewards/check_response_quality/mean": 2.2, "rewards/check_response_quality/std": 0.20347774028778076, "rewards/match_format_approximately/mean": 0.725, "rewards/match_format_approximately/std": 0.16547005176544188, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 40 }, { "completion_length": 278.275, "completions/clipped_ratio": 0.025, "completions/max_length": 695.3, "completions/max_terminated_length": 556.3, "completions/mean_length": 278.275, "completions/mean_terminated_length": 240.375, "completions/min_length": 19.8, "completions/min_terminated_length": 19.8, "epoch": 0.0034383166001925457, "frac_reward_zero_std": 0.0, "grad_norm": 1.1459013223648071, "kl": 0.7370456486940384, "learning_rate": 4.793650793650794e-06, "loss": 0.0007, "num_tokens": 125192.0, "reward": 4.7375, "reward_std": 0.9061064124107361, "rewards/check_coherence/mean": 0.8625, "rewards/check_coherence/std": 0.4861203670501709, "rewards/check_response_quality/mean": 2.175, "rewards/check_response_quality/std": 0.34289742112159727, "rewards/match_format_approximately/mean": 0.725, "rewards/match_format_approximately/std": 0.2654700517654419, "rewards/match_format_exactly/mean": 0.975, "rewards/match_format_exactly/std": 0.05, "step": 50 }, { "completion_length": 127.625, "completions/clipped_ratio": 0.0, "completions/max_length": 249.7, "completions/max_terminated_length": 249.7, "completions/mean_length": 127.625, "completions/mean_terminated_length": 127.625, "completions/min_length": 11.6, "completions/min_terminated_length": 11.6, "epoch": 0.004125979920231055, "frac_reward_zero_std": 0.1, "grad_norm": 1.772578477859497, "kl": 1.2247508466243744, "learning_rate": 4.634920634920635e-06, "loss": 0.0012, "num_tokens": 140377.0, "reward": 5.075, "reward_std": 0.583526349067688, "rewards/check_coherence/mean": 0.925, "rewards/check_coherence/std": 0.4689477920532227, "rewards/check_response_quality/mean": 2.325, "rewards/check_response_quality/std": 0.17886751294136047, "rewards/match_format_approximately/mean": 0.825, "rewards/match_format_approximately/std": 0.17886751294136047, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 60 }, { "completion_length": 161.4, "completions/clipped_ratio": 0.0, "completions/max_length": 423.5, "completions/max_terminated_length": 423.5, "completions/mean_length": 161.4, "completions/mean_terminated_length": 161.4, "completions/min_length": 17.8, "completions/min_terminated_length": 17.8, "epoch": 0.004813643240269564, "frac_reward_zero_std": 0.0, "grad_norm": 3.128584861755371, "kl": 0.8261611372232437, "learning_rate": 4.476190476190477e-06, "loss": 0.0008, "num_tokens": 156317.0, "reward": 5.1625, "reward_std": 0.6853798747062683, "rewards/check_coherence/mean": 1.025, "rewards/check_coherence/std": 0.4077350258827209, "rewards/check_response_quality/mean": 2.3125, "rewards/check_response_quality/std": 0.21582483053207396, "rewards/match_format_approximately/mean": 0.825, "rewards/match_format_approximately/std": 0.2, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 70 }, { "completion_length": 120.125, "completions/clipped_ratio": 0.0, "completions/max_length": 248.3, "completions/max_terminated_length": 248.3, "completions/mean_length": 120.125, "completions/mean_terminated_length": 120.125, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.0055013065603080735, "frac_reward_zero_std": 0.4, "grad_norm": 2.667182683944702, "kl": 0.8017235696315765, "learning_rate": 4.317460317460318e-06, "loss": 0.0008, "num_tokens": 171822.0, "reward": 5.4, "reward_std": 0.41746232509613035, "rewards/check_coherence/mean": 1.2625, "rewards/check_coherence/std": 0.2978713572025299, "rewards/check_response_quality/mean": 2.3125, "rewards/check_response_quality/std": 0.11582483053207397, "rewards/match_format_approximately/mean": 0.825, "rewards/match_format_approximately/std": 0.1, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 80 }, { "completion_length": 164.875, "completions/clipped_ratio": 0.0, "completions/max_length": 490.9, "completions/max_terminated_length": 490.9, "completions/mean_length": 164.875, "completions/mean_terminated_length": 164.875, "completions/min_length": 13.5, "completions/min_terminated_length": 13.5, "epoch": 0.006188969880346582, "frac_reward_zero_std": 0.0, "grad_norm": 1.8317680358886719, "kl": 0.8454075694084168, "learning_rate": 4.158730158730159e-06, "loss": 0.0008, "num_tokens": 189509.0, "reward": 5.325, "reward_std": 0.7779198408126831, "rewards/check_coherence/mean": 1.15, "rewards/check_coherence/std": 0.47320507764816283, "rewards/check_response_quality/mean": 2.3375, "rewards/check_response_quality/std": 0.21933756470680238, "rewards/match_format_approximately/mean": 0.8375, "rewards/match_format_approximately/std": 0.21933756470680238, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 90 }, { "completion_length": 128.325, "completions/clipped_ratio": 0.0, "completions/max_length": 322.5, "completions/max_terminated_length": 322.5, "completions/mean_length": 128.325, "completions/mean_terminated_length": 128.325, "completions/min_length": 46.3, "completions/min_terminated_length": 46.3, "epoch": 0.006876633200385091, "frac_reward_zero_std": 0.3, "grad_norm": 0.003549874061718583, "kl": 0.820201675593853, "learning_rate": 4.000000000000001e-06, "loss": 0.0008, "num_tokens": 205998.0, "reward": 5.45, "reward_std": 0.5343478560447693, "rewards/check_coherence/mean": 1.225, "rewards/check_coherence/std": 0.3809401035308838, "rewards/check_response_quality/mean": 2.3625, "rewards/check_response_quality/std": 0.10386751294136047, "rewards/match_format_approximately/mean": 0.8625, "rewards/match_format_approximately/std": 0.10386751294136047, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 100 }, { "completion_length": 147.575, "completions/clipped_ratio": 0.0, "completions/max_length": 452.6, "completions/max_terminated_length": 452.6, "completions/mean_length": 147.575, "completions/mean_terminated_length": 147.575, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.007564296520423601, "frac_reward_zero_std": 0.1, "grad_norm": 2.094895124435425, "kl": 1.165158998966217, "learning_rate": 3.857142857142858e-06, "loss": 0.0012, "num_tokens": 224325.0, "reward": 5.3, "reward_std": 0.8748959302902222, "rewards/check_coherence/mean": 1.2125, "rewards/check_coherence/std": 0.38273502588272096, "rewards/check_response_quality/mean": 2.2875, "rewards/check_response_quality/std": 0.28898603916168214, "rewards/match_format_approximately/mean": 0.825, "rewards/match_format_approximately/std": 0.22320507764816283, "rewards/match_format_exactly/mean": 0.975, "rewards/match_format_exactly/std": 0.05, "step": 110 }, { "completion_length": 93.475, "completions/clipped_ratio": 0.0, "completions/max_length": 232.9, "completions/max_terminated_length": 232.9, "completions/mean_length": 93.475, "completions/mean_terminated_length": 93.475, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.00825195984046211, "frac_reward_zero_std": 0.3, "grad_norm": 1.8320742845535278, "kl": 0.8784003466367721, "learning_rate": 3.6984126984126987e-06, "loss": 0.0009, "num_tokens": 238872.0, "reward": 5.6125, "reward_std": 0.5023834943771363, "rewards/check_coherence/mean": 1.3875, "rewards/check_coherence/std": 0.1978713572025299, "rewards/check_response_quality/mean": 2.3625, "rewards/check_response_quality/std": 0.18273502588272095, "rewards/match_format_approximately/mean": 0.8625, "rewards/match_format_approximately/std": 0.18273502588272095, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 120 }, { "completion_length": 63.425, "completions/clipped_ratio": 0.0, "completions/max_length": 180.7, "completions/max_terminated_length": 180.7, "completions/mean_length": 63.425, "completions/mean_terminated_length": 63.425, "completions/min_length": 9.8, "completions/min_terminated_length": 9.8, "epoch": 0.008939623160500619, "frac_reward_zero_std": 0.5, "grad_norm": 1.388489007949829, "kl": 0.855793622136116, "learning_rate": 3.53968253968254e-06, "loss": 0.0009, "num_tokens": 252433.0, "reward": 5.75, "reward_std": 0.4457427144050598, "rewards/check_coherence/mean": 1.4, "rewards/check_coherence/std": 0.2, "rewards/check_response_quality/mean": 2.425, "rewards/check_response_quality/std": 0.12886751294136048, "rewards/match_format_approximately/mean": 0.925, "rewards/match_format_approximately/std": 0.12886751294136048, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 130 }, { "completion_length": 41.35, "completions/clipped_ratio": 0.0, "completions/max_length": 127.7, "completions/max_terminated_length": 127.7, "completions/mean_length": 41.35, "completions/mean_terminated_length": 41.35, "completions/min_length": 7.7, "completions/min_terminated_length": 7.7, "epoch": 0.009627286480539128, "frac_reward_zero_std": 0.0, "grad_norm": 2.7645645141601562, "kl": 1.5349723994731903, "learning_rate": 3.3809523809523814e-06, "loss": 0.0015, "num_tokens": 264579.0, "reward": 5.0875, "reward_std": 1.1462337374687195, "rewards/check_coherence/mean": 1.15, "rewards/check_coherence/std": 0.48867512941360475, "rewards/check_response_quality/mean": 2.1875, "rewards/check_response_quality/std": 0.3914190471172333, "rewards/match_format_approximately/mean": 0.85, "rewards/match_format_approximately/std": 0.20773502588272094, "rewards/match_format_exactly/mean": 0.9, "rewards/match_format_exactly/std": 0.1154700517654419, "step": 140 }, { "completion_length": 46.625, "completions/clipped_ratio": 0.0, "completions/max_length": 135.3, "completions/max_terminated_length": 135.3, "completions/mean_length": 46.625, "completions/mean_terminated_length": 46.625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.010314949800577638, "frac_reward_zero_std": 0.4, "grad_norm": 0.36925405263900757, "kl": 1.2759959518909454, "learning_rate": 3.2222222222222227e-06, "loss": 0.0013, "num_tokens": 278280.0, "reward": 5.7, "reward_std": 0.4686140716075897, "rewards/check_coherence/mean": 1.3875, "rewards/check_coherence/std": 0.225, "rewards/check_response_quality/mean": 2.4, "rewards/check_response_quality/std": 0.15173887014389037, "rewards/match_format_approximately/mean": 0.9125, "rewards/match_format_approximately/std": 0.13273502588272096, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 150 }, { "completion_length": 37.85, "completions/clipped_ratio": 0.0, "completions/max_length": 108.9, "completions/max_terminated_length": 108.9, "completions/mean_length": 37.85, "completions/mean_terminated_length": 37.85, "completions/min_length": 10.3, "completions/min_terminated_length": 10.3, "epoch": 0.011002613120616147, "frac_reward_zero_std": 0.5, "grad_norm": 0.003718956606462598, "kl": 1.1356925666332245, "learning_rate": 3.063492063492064e-06, "loss": 0.0011, "num_tokens": 289698.0, "reward": 5.8625, "reward_std": 0.275, "rewards/check_coherence/mean": 1.4, "rewards/check_coherence/std": 0.2, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 160 }, { "completion_length": 24.475, "completions/clipped_ratio": 0.0, "completions/max_length": 58.6, "completions/max_terminated_length": 58.6, "completions/mean_length": 24.475, "completions/mean_terminated_length": 24.475, "completions/min_length": 10.1, "completions/min_terminated_length": 10.1, "epoch": 0.011690276440654656, "frac_reward_zero_std": 0.8, "grad_norm": 0.0026451745070517063, "kl": 1.1320037961006164, "learning_rate": 2.9047619047619053e-06, "loss": 0.0011, "num_tokens": 302601.0, "reward": 5.9, "reward_std": 0.14574271440505981, "rewards/check_coherence/mean": 1.425, "rewards/check_coherence/std": 0.10773502588272095, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 170 }, { "completion_length": 65.375, "completions/clipped_ratio": 0.025, "completions/max_length": 216.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 65.375, "completions/mean_terminated_length": 26.866666793823242, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.012377939760693164, "frac_reward_zero_std": 0.3, "grad_norm": 2.4312984943389893, "kl": 1.0950138330459596, "learning_rate": 2.7460317460317466e-06, "loss": 0.0011, "num_tokens": 317912.0, "reward": 5.775, "reward_std": 0.3957427144050598, "rewards/check_coherence/mean": 1.4, "rewards/check_coherence/std": 0.2, "rewards/check_response_quality/mean": 2.4375, "rewards/check_response_quality/std": 0.10386751294136047, "rewards/match_format_approximately/mean": 0.9375, "rewards/match_format_approximately/std": 0.10386751294136047, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 180 }, { "completion_length": 16.275, "completions/clipped_ratio": 0.0, "completions/max_length": 25.2, "completions/max_terminated_length": 25.2, "completions/mean_length": 16.275, "completions/mean_terminated_length": 16.275, "completions/min_length": 9.8, "completions/min_terminated_length": 9.8, "epoch": 0.013065603080731673, "frac_reward_zero_std": 0.6, "grad_norm": 0.004265956114977598, "kl": 1.1890327751636505, "learning_rate": 2.587301587301588e-06, "loss": 0.0012, "num_tokens": 332055.0, "reward": 5.8875, "reward_std": 0.18273502588272095, "rewards/check_coherence/mean": 1.3875, "rewards/check_coherence/std": 0.18273502588272095, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 190 }, { "completion_length": 37.9, "completions/clipped_ratio": 0.0, "completions/max_length": 112.8, "completions/max_terminated_length": 112.8, "completions/mean_length": 37.9, "completions/mean_terminated_length": 37.9, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.013753266400770183, "frac_reward_zero_std": 0.4, "grad_norm": 16.73357582092285, "kl": 1.1858276724815369, "learning_rate": 2.428571428571429e-06, "loss": 0.0012, "num_tokens": 344875.0, "reward": 5.75, "reward_std": 0.4457427144050598, "rewards/check_coherence/mean": 1.35, "rewards/check_coherence/std": 0.25773502588272096, "rewards/check_response_quality/mean": 2.45, "rewards/check_response_quality/std": 0.1, "rewards/match_format_approximately/mean": 0.95, "rewards/match_format_approximately/std": 0.1, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 200 }, { "completion_length": 57.225, "completions/clipped_ratio": 0.0, "completions/max_length": 142.9, "completions/max_terminated_length": 142.9, "completions/mean_length": 57.225, "completions/mean_terminated_length": 57.225, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.014440929720808692, "frac_reward_zero_std": 0.6, "grad_norm": 0.006797971669584513, "kl": 1.0238984107971192, "learning_rate": 2.26984126984127e-06, "loss": 0.001, "num_tokens": 359308.0, "reward": 5.7625, "reward_std": 0.2946484684944153, "rewards/check_coherence/mean": 1.375, "rewards/check_coherence/std": 0.20773502588272094, "rewards/check_response_quality/mean": 2.4375, "rewards/check_response_quality/std": 0.06582483053207397, "rewards/match_format_approximately/mean": 0.95, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 210 }, { "completion_length": 60.225, "completions/clipped_ratio": 0.0, "completions/max_length": 198.7, "completions/max_terminated_length": 198.7, "completions/mean_length": 60.225, "completions/mean_terminated_length": 60.225, "completions/min_length": 10.7, "completions/min_terminated_length": 10.7, "epoch": 0.015128593040847202, "frac_reward_zero_std": 0.4, "grad_norm": 1.2490910291671753, "kl": 1.139722502231598, "learning_rate": 2.1111111111111114e-06, "loss": 0.0011, "num_tokens": 372089.0, "reward": 5.8, "reward_std": 0.4, "rewards/check_coherence/mean": 1.375, "rewards/check_coherence/std": 0.25, "rewards/check_response_quality/mean": 2.4625, "rewards/check_response_quality/std": 0.075, "rewards/match_format_approximately/mean": 0.9625, "rewards/match_format_approximately/std": 0.075, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 220 }, { "completion_length": 30.075, "completions/clipped_ratio": 0.0, "completions/max_length": 84.5, "completions/max_terminated_length": 84.5, "completions/mean_length": 30.075, "completions/mean_terminated_length": 30.075, "completions/min_length": 9.8, "completions/min_terminated_length": 9.8, "epoch": 0.01581625636088571, "frac_reward_zero_std": 0.7, "grad_norm": 0.004948398098349571, "kl": 1.1260765612125396, "learning_rate": 1.9523809523809527e-06, "loss": 0.0011, "num_tokens": 384276.0, "reward": 5.875, "reward_std": 0.1728713572025299, "rewards/check_coherence/mean": 1.4375, "rewards/check_coherence/std": 0.0978713572025299, "rewards/check_response_quality/mean": 2.4625, "rewards/check_response_quality/std": 0.075, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 230 }, { "completion_length": 65.25, "completions/clipped_ratio": 0.025, "completions/max_length": 223.1, "completions/max_terminated_length": 65.9, "completions/mean_length": 65.25, "completions/mean_terminated_length": 25.908333396911623, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.01650391968092422, "frac_reward_zero_std": 0.6, "grad_norm": 0.0049158609472215176, "kl": 1.044507622718811, "learning_rate": 1.7936507936507938e-06, "loss": 0.001, "num_tokens": 399686.0, "reward": 5.8375, "reward_std": 0.325, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.1, "rewards/check_response_quality/mean": 2.4375, "rewards/check_response_quality/std": 0.125, "rewards/match_format_approximately/mean": 0.95, "rewards/match_format_approximately/std": 0.1, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 240 }, { "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 97.6, "completions/max_terminated_length": 97.6, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 9.6, "completions/min_terminated_length": 9.6, "epoch": 0.01719158300096273, "frac_reward_zero_std": 0.6, "grad_norm": 0.0035998751409351826, "kl": 1.131579464673996, "learning_rate": 1.6349206349206351e-06, "loss": 0.0011, "num_tokens": 412936.0, "reward": 5.875, "reward_std": 0.25, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.1, "rewards/check_response_quality/mean": 2.4625, "rewards/check_response_quality/std": 0.075, "rewards/match_format_approximately/mean": 0.9625, "rewards/match_format_approximately/std": 0.075, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 250 }, { "completion_length": 44.525, "completions/clipped_ratio": 0.0, "completions/max_length": 138.9, "completions/max_terminated_length": 138.9, "completions/mean_length": 44.525, "completions/mean_terminated_length": 44.525, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.017879246321001237, "frac_reward_zero_std": 0.8, "grad_norm": 0.003912392072379589, "kl": 1.0597735822200776, "learning_rate": 1.4761904761904762e-06, "loss": 0.0011, "num_tokens": 427709.0, "reward": 5.9375, "reward_std": 0.125, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 260 }, { "completion_length": 13.175, "completions/clipped_ratio": 0.0, "completions/max_length": 22.9, "completions/max_terminated_length": 22.9, "completions/mean_length": 13.175, "completions/mean_terminated_length": 13.175, "completions/min_length": 7.3, "completions/min_terminated_length": 7.3, "epoch": 0.01856690964103975, "frac_reward_zero_std": 0.6, "grad_norm": 4.749101638793945, "kl": 1.352827501296997, "learning_rate": 1.3174603174603175e-06, "loss": 0.0014, "num_tokens": 437888.0, "reward": 5.7625, "reward_std": 0.41304759979248046, "rewards/check_coherence/mean": 1.4125, "rewards/check_coherence/std": 0.175, "rewards/check_response_quality/mean": 2.4125, "rewards/check_response_quality/std": 0.14464847445487977, "rewards/match_format_approximately/mean": 0.9625, "rewards/match_format_approximately/std": 0.053867512941360475, "rewards/match_format_exactly/mean": 0.975, "rewards/match_format_exactly/std": 0.05, "step": 270 }, { "completion_length": 29.05, "completions/clipped_ratio": 0.0, "completions/max_length": 75.4, "completions/max_terminated_length": 75.4, "completions/mean_length": 29.05, "completions/mean_terminated_length": 29.05, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.019254572961078256, "frac_reward_zero_std": 0.7, "grad_norm": 0.0032444808166474104, "kl": 1.0743911743164063, "learning_rate": 1.1587301587301589e-06, "loss": 0.0011, "num_tokens": 450566.0, "reward": 5.8875, "reward_std": 0.225, "rewards/check_coherence/mean": 1.4375, "rewards/check_coherence/std": 0.125, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 280 }, { "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 100.7, "completions/max_terminated_length": 100.7, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 9.7, "completions/min_terminated_length": 9.7, "epoch": 0.019942236281116764, "frac_reward_zero_std": 0.7, "grad_norm": 4.597572326660156, "kl": 9.488562166690826, "learning_rate": 1.0000000000000002e-06, "loss": 0.0095, "num_tokens": 462366.0, "reward": 5.875, "reward_std": 0.20773502588272094, "rewards/check_coherence/mean": 1.4, "rewards/check_coherence/std": 0.15773502588272095, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 290 }, { "completion_length": 16.875, "completions/clipped_ratio": 0.0, "completions/max_length": 31.8, "completions/max_terminated_length": 31.8, "completions/mean_length": 16.875, "completions/mean_terminated_length": 16.875, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.020629899601155275, "frac_reward_zero_std": 0.8, "grad_norm": 0.0025285985320806503, "kl": 1.0927811563014984, "learning_rate": 8.412698412698414e-07, "loss": 0.0011, "num_tokens": 473501.0, "reward": 5.9375, "reward_std": 0.125, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4625, "rewards/check_response_quality/std": 0.075, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 300 }, { "completion_length": 34.825, "completions/clipped_ratio": 0.0, "completions/max_length": 104.8, "completions/max_terminated_length": 104.8, "completions/mean_length": 34.825, "completions/mean_terminated_length": 34.825, "completions/min_length": 9.6, "completions/min_terminated_length": 9.6, "epoch": 0.021317562921193783, "frac_reward_zero_std": 0.9, "grad_norm": 0.004729569889605045, "kl": 1.0428565800189973, "learning_rate": 6.825396825396826e-07, "loss": 0.001, "num_tokens": 487282.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 310 }, { "completion_length": 28.475, "completions/clipped_ratio": 0.0, "completions/max_length": 81.7, "completions/max_terminated_length": 81.7, "completions/mean_length": 28.475, "completions/mean_terminated_length": 28.475, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.022005226241232294, "frac_reward_zero_std": 0.8, "grad_norm": 6.385797500610352, "kl": 1.2766858220100403, "learning_rate": 5.238095238095239e-07, "loss": 0.0013, "num_tokens": 500121.0, "reward": 5.925, "reward_std": 0.15, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.1, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 320 }, { "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.8, "completions/max_terminated_length": 33.8, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.0226928895612708, "frac_reward_zero_std": 0.7, "grad_norm": 0.010602269321680069, "kl": 1.1651100397109986, "learning_rate": 3.6507936507936514e-07, "loss": 0.0012, "num_tokens": 514331.0, "reward": 5.8875, "reward_std": 0.18273502588272095, "rewards/check_coherence/mean": 1.4625, "rewards/check_coherence/std": 0.075, "rewards/check_response_quality/mean": 2.4625, "rewards/check_response_quality/std": 0.053867512941360475, "rewards/match_format_approximately/mean": 0.9625, "rewards/match_format_approximately/std": 0.053867512941360475, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 330 }, { "completion_length": 13.8, "completions/clipped_ratio": 0.0, "completions/max_length": 20.7, "completions/max_terminated_length": 20.7, "completions/mean_length": 13.8, "completions/mean_terminated_length": 13.8, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.023380552881309313, "frac_reward_zero_std": 0.9, "grad_norm": 0.007794269360601902, "kl": 1.1532993257045745, "learning_rate": 2.0634920634920635e-07, "loss": 0.0012, "num_tokens": 528547.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 340 }, { "completion_length": 13.9, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 13.9, "completions/mean_terminated_length": 13.9, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.02406821620134782, "frac_reward_zero_std": 0.8, "grad_norm": 3.0694658756256104, "kl": 1.2572884202003478, "learning_rate": 4.7619047619047627e-08, "loss": 0.0013, "num_tokens": 540139.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.1, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 350 }, { "completion_length": 15.875, "completions/clipped_ratio": 0.0, "completions/max_length": 28.5, "completions/max_terminated_length": 28.5, "completions/mean_length": 15.875, "completions/mean_terminated_length": 15.875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.024755879521386328, "frac_reward_zero_std": 0.7, "grad_norm": 0.00623699277639389, "kl": 1.0916778862476348, "learning_rate": 1.588888888888889e-06, "loss": 0.0011, "num_tokens": 553702.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 360 }, { "completion_length": 22.4, "completions/clipped_ratio": 0.0, "completions/max_length": 54.6, "completions/max_terminated_length": 54.6, "completions/mean_length": 22.4, "completions/mean_terminated_length": 22.4, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.02544354284142484, "frac_reward_zero_std": 0.7, "grad_norm": 0.007368564605712891, "kl": 1.159883439540863, "learning_rate": 1.477777777777778e-06, "loss": 0.0012, "num_tokens": 566342.0, "reward": 5.9, "reward_std": 0.2, "rewards/check_coherence/mean": 1.425, "rewards/check_coherence/std": 0.15, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 370 }, { "completion_length": 42.2, "completions/clipped_ratio": 0.0, "completions/max_length": 131.7, "completions/max_terminated_length": 131.7, "completions/mean_length": 42.2, "completions/mean_terminated_length": 42.2, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.026131206161463347, "frac_reward_zero_std": 0.6, "grad_norm": 1.8988958597183228, "kl": 1.0139860570430757, "learning_rate": 1.3666666666666668e-06, "loss": 0.001, "num_tokens": 581202.0, "reward": 5.85, "reward_std": 0.3, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.1, "rewards/check_response_quality/mean": 2.4375, "rewards/check_response_quality/std": 0.125, "rewards/match_format_approximately/mean": 0.9625, "rewards/match_format_approximately/std": 0.075, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 380 }, { "completion_length": 18.65, "completions/clipped_ratio": 0.0, "completions/max_length": 39.9, "completions/max_terminated_length": 39.9, "completions/mean_length": 18.65, "completions/mean_terminated_length": 18.65, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.026818869481501858, "frac_reward_zero_std": 0.8, "grad_norm": 0.006959094665944576, "kl": 1.2067798852920533, "learning_rate": 1.2555555555555557e-06, "loss": 0.0012, "num_tokens": 593448.0, "reward": 5.925, "reward_std": 0.15, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.1, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 390 }, { "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 52.6, "completions/max_terminated_length": 52.6, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.027506532801540366, "frac_reward_zero_std": 0.8, "grad_norm": 0.0012704557739198208, "kl": 0.9775669932365417, "learning_rate": 1.1444444444444446e-06, "loss": 0.001, "num_tokens": 607732.0, "reward": 5.925, "reward_std": 0.15, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.1, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 400 }, { "completion_length": 38.85, "completions/clipped_ratio": 0.0, "completions/max_length": 123.2, "completions/max_terminated_length": 123.2, "completions/mean_length": 38.85, "completions/mean_terminated_length": 38.85, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.028194196121578877, "frac_reward_zero_std": 0.7, "grad_norm": 0.004135606344789267, "kl": 1.141333144903183, "learning_rate": 1.0333333333333333e-06, "loss": 0.0011, "num_tokens": 620094.0, "reward": 5.9125, "reward_std": 0.175, "rewards/check_coherence/mean": 1.4625, "rewards/check_coherence/std": 0.075, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 410 }, { "completion_length": 33.075, "completions/clipped_ratio": 0.0, "completions/max_length": 96.7, "completions/max_terminated_length": 96.7, "completions/mean_length": 33.075, "completions/mean_terminated_length": 33.075, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.028881859441617384, "frac_reward_zero_std": 0.7, "grad_norm": 0.01156042329967022, "kl": 1.4050394296646118, "learning_rate": 9.222222222222222e-07, "loss": 0.0014, "num_tokens": 633025.0, "reward": 5.875, "reward_std": 0.25, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.1, "rewards/check_response_quality/mean": 2.4625, "rewards/check_response_quality/std": 0.075, "rewards/match_format_approximately/mean": 0.9625, "rewards/match_format_approximately/std": 0.075, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 420 }, { "completion_length": 30.55, "completions/clipped_ratio": 0.0, "completions/max_length": 80.9, "completions/max_terminated_length": 80.9, "completions/mean_length": 30.55, "completions/mean_terminated_length": 30.55, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.029569522761655892, "frac_reward_zero_std": 0.7, "grad_norm": 2.049229621887207, "kl": 0.9671716094017029, "learning_rate": 8.111111111111112e-07, "loss": 0.001, "num_tokens": 645383.0, "reward": 5.8625, "reward_std": 0.175, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.05773502588272095, "rewards/check_response_quality/mean": 2.45, "rewards/check_response_quality/std": 0.1, "rewards/match_format_approximately/mean": 0.9625, "rewards/match_format_approximately/std": 0.075, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 430 }, { "completion_length": 32.375, "completions/clipped_ratio": 0.0, "completions/max_length": 96.9, "completions/max_terminated_length": 96.9, "completions/mean_length": 32.375, "completions/mean_terminated_length": 32.375, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.030257186081694403, "frac_reward_zero_std": 0.9, "grad_norm": 0.00446512084454298, "kl": 1.2508110523223877, "learning_rate": 7.000000000000001e-07, "loss": 0.0013, "num_tokens": 659314.0, "reward": 5.9375, "reward_std": 0.125, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 440 }, { "completion_length": 27.725, "completions/clipped_ratio": 0.0, "completions/max_length": 77.2, "completions/max_terminated_length": 77.2, "completions/mean_length": 27.725, "completions/mean_terminated_length": 27.725, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.03094484940173291, "frac_reward_zero_std": 0.6, "grad_norm": 0.0031550778076052666, "kl": 1.1202711045742035, "learning_rate": 5.888888888888889e-07, "loss": 0.0011, "num_tokens": 671575.0, "reward": 5.8625, "reward_std": 0.275, "rewards/check_coherence/mean": 1.425, "rewards/check_coherence/std": 0.15, "rewards/check_response_quality/mean": 2.4625, "rewards/check_response_quality/std": 0.075, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 450 }, { "completion_length": 13.225, "completions/clipped_ratio": 0.0, "completions/max_length": 21.6, "completions/max_terminated_length": 21.6, "completions/mean_length": 13.225, "completions/mean_terminated_length": 13.225, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.03163251272177142, "frac_reward_zero_std": 0.9, "grad_norm": 0.0025077220052480698, "kl": 1.1845026671886445, "learning_rate": 4.777777777777778e-07, "loss": 0.0012, "num_tokens": 680956.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 460 }, { "completion_length": 14.7, "completions/clipped_ratio": 0.0, "completions/max_length": 22.4, "completions/max_terminated_length": 22.4, "completions/mean_length": 14.7, "completions/mean_terminated_length": 14.7, "completions/min_length": 9.9, "completions/min_terminated_length": 9.9, "epoch": 0.03232017604180993, "frac_reward_zero_std": 0.9, "grad_norm": 0.0026869364082813263, "kl": 1.0701819598674773, "learning_rate": 3.666666666666667e-07, "loss": 0.0011, "num_tokens": 694668.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 470 }, { "completion_length": 26.9, "completions/clipped_ratio": 0.0, "completions/max_length": 74.9, "completions/max_terminated_length": 74.9, "completions/mean_length": 26.9, "completions/mean_terminated_length": 26.9, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.03300783936184844, "frac_reward_zero_std": 0.8, "grad_norm": 0.0027040443383157253, "kl": 1.3269443392753602, "learning_rate": 2.555555555555556e-07, "loss": 0.0013, "num_tokens": 707508.0, "reward": 5.925, "reward_std": 0.15, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.1, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 480 }, { "completion_length": 27.025, "completions/clipped_ratio": 0.0, "completions/max_length": 60.3, "completions/max_terminated_length": 60.3, "completions/mean_length": 27.025, "completions/mean_terminated_length": 27.025, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.033695502681886945, "frac_reward_zero_std": 0.8, "grad_norm": 0.0057538580149412155, "kl": 1.142683470249176, "learning_rate": 1.4444444444444445e-07, "loss": 0.0011, "num_tokens": 717845.0, "reward": 5.875, "reward_std": 0.13164966106414794, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.05773502588272095, "rewards/check_response_quality/mean": 2.4625, "rewards/check_response_quality/std": 0.053867512941360475, "rewards/match_format_approximately/mean": 0.9625, "rewards/match_format_approximately/std": 0.053867512941360475, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 490 }, { "completion_length": 17.275, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 17.275, "completions/mean_terminated_length": 17.275, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.03438316600192546, "frac_reward_zero_std": 0.8, "grad_norm": 0.005735357291996479, "kl": 1.0870766162872314, "learning_rate": 3.333333333333334e-08, "loss": 0.0011, "num_tokens": 729556.0, "reward": 5.9125, "reward_std": 0.175, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4625, "rewards/check_response_quality/std": 0.075, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 500 }, { "completion_length": 65.975, "completions/clipped_ratio": 0.0, "completions/max_length": 135.7, "completions/max_terminated_length": 135.7, "completions/mean_length": 65.975, "completions/mean_terminated_length": 65.975, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.03507082932196397, "frac_reward_zero_std": 0.7, "grad_norm": 3.473423719406128, "kl": 1.3229190528392791, "learning_rate": 4.330917874396136e-06, "loss": 0.0013, "num_tokens": 740839.0, "reward": 5.8375, "reward_std": 0.153445702791214, "rewards/check_coherence/mean": 1.4125, "rewards/check_coherence/std": 0.13273502588272096, "rewards/check_response_quality/mean": 2.45, "rewards/check_response_quality/std": 0.05773502588272095, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.028867512941360474, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 510 }, { "completion_length": 51.975, "completions/clipped_ratio": 0.025, "completions/max_length": 175.6, "completions/max_terminated_length": 19.1, "completions/mean_length": 51.975, "completions/mean_terminated_length": 12.7, "completions/min_length": 7.6, "completions/min_terminated_length": 7.6, "epoch": 0.035758492642002475, "frac_reward_zero_std": 0.9, "grad_norm": 0.005841626785695553, "kl": 3.1703031241893767, "learning_rate": 4.3067632850241545e-06, "loss": 0.0032, "num_tokens": 754322.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 520 }, { "completion_length": 26.65, "completions/clipped_ratio": 0.0, "completions/max_length": 69.3, "completions/max_terminated_length": 69.3, "completions/mean_length": 26.65, "completions/mean_terminated_length": 26.65, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.03644615596204098, "frac_reward_zero_std": 0.8, "grad_norm": 0.0034844442270696163, "kl": 1.1955626547336577, "learning_rate": 4.282608695652175e-06, "loss": 0.0012, "num_tokens": 768516.0, "reward": 5.9125, "reward_std": 0.12074271440505982, "rewards/check_coherence/mean": 1.4375, "rewards/check_coherence/std": 0.08273502588272094, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 530 }, { "completion_length": 49.6, "completions/clipped_ratio": 0.0, "completions/max_length": 158.8, "completions/max_terminated_length": 158.8, "completions/mean_length": 49.6, "completions/mean_terminated_length": 49.6, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.0371338192820795, "frac_reward_zero_std": 0.7, "grad_norm": 2.5357494354248047, "kl": 1.0726108729839325, "learning_rate": 4.2584541062801936e-06, "loss": 0.0011, "num_tokens": 783124.0, "reward": 5.8125, "reward_std": 0.24571067690849305, "rewards/check_coherence/mean": 1.425, "rewards/check_coherence/std": 0.09082483053207398, "rewards/check_response_quality/mean": 2.4375, "rewards/check_response_quality/std": 0.10386751294136047, "rewards/match_format_approximately/mean": 0.95, "rewards/match_format_approximately/std": 0.07886751294136048, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 540 }, { "completion_length": 10.4, "completions/clipped_ratio": 0.0, "completions/max_length": 13.5, "completions/max_terminated_length": 13.5, "completions/mean_length": 10.4, "completions/mean_terminated_length": 10.4, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.037821482602118005, "frac_reward_zero_std": 1.0, "grad_norm": 0.007602162193506956, "kl": 1.2256620168685912, "learning_rate": 4.234299516908213e-06, "loss": 0.0012, "num_tokens": 793232.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 550 }, { "completion_length": 11.275, "completions/clipped_ratio": 0.0, "completions/max_length": 15.3, "completions/max_terminated_length": 15.3, "completions/mean_length": 11.275, "completions/mean_terminated_length": 11.275, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.03850914592215651, "frac_reward_zero_std": 0.9, "grad_norm": 0.005174298770725727, "kl": 1.4170250117778778, "learning_rate": 4.210144927536232e-06, "loss": 0.0014, "num_tokens": 804267.0, "reward": 5.9875, "reward_std": 0.025, "rewards/check_coherence/mean": 1.4875, "rewards/check_coherence/std": 0.025, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 560 }, { "completion_length": 31.3, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 31.3, "completions/mean_terminated_length": 31.3, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.03919680924219502, "frac_reward_zero_std": 0.7, "grad_norm": 4.267595291137695, "kl": 1.1769875526428222, "learning_rate": 4.185990338164251e-06, "loss": 0.0012, "num_tokens": 817063.0, "reward": 5.875, "reward_std": 0.25, "rewards/check_coherence/mean": 1.4375, "rewards/check_coherence/std": 0.125, "rewards/check_response_quality/mean": 2.4625, "rewards/check_response_quality/std": 0.075, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 570 }, { "completion_length": 32.925, "completions/clipped_ratio": 0.0, "completions/max_length": 99.6, "completions/max_terminated_length": 99.6, "completions/mean_length": 32.925, "completions/mean_terminated_length": 32.925, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.03988447256223353, "frac_reward_zero_std": 0.6, "grad_norm": 4.495794773101807, "kl": 1.3460102200508117, "learning_rate": 4.161835748792271e-06, "loss": 0.0013, "num_tokens": 830080.0, "reward": 5.9125, "reward_std": 0.175, "rewards/check_coherence/mean": 1.4375, "rewards/check_coherence/std": 0.125, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 580 }, { "completion_length": 64.65, "completions/clipped_ratio": 0.0, "completions/max_length": 202.7, "completions/max_terminated_length": 202.7, "completions/mean_length": 64.65, "completions/mean_terminated_length": 64.65, "completions/min_length": 7.1, "completions/min_terminated_length": 7.1, "epoch": 0.04057213588227204, "frac_reward_zero_std": 0.6, "grad_norm": 2.4681625366210938, "kl": 1.3417995631694795, "learning_rate": 4.13768115942029e-06, "loss": 0.0013, "num_tokens": 843546.0, "reward": 5.725, "reward_std": 0.4217355728149414, "rewards/check_coherence/mean": 1.4375, "rewards/check_coherence/std": 0.125, "rewards/check_response_quality/mean": 2.3875, "rewards/check_response_quality/std": 0.16160253882408143, "rewards/match_format_approximately/mean": 0.925, "rewards/match_format_approximately/std": 0.1, "rewards/match_format_exactly/mean": 0.975, "rewards/match_format_exactly/std": 0.05, "step": 590 }, { "completion_length": 10.9, "completions/clipped_ratio": 0.0, "completions/max_length": 14.8, "completions/max_terminated_length": 14.8, "completions/mean_length": 10.9, "completions/mean_terminated_length": 10.9, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.04125979920231055, "frac_reward_zero_std": 0.9, "grad_norm": 0.002406905870884657, "kl": 1.624447101354599, "learning_rate": 4.11352657004831e-06, "loss": 0.0016, "num_tokens": 854534.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 600 }, { "completion_length": 10.725, "completions/clipped_ratio": 0.0, "completions/max_length": 13.4, "completions/max_terminated_length": 13.4, "completions/mean_length": 10.725, "completions/mean_terminated_length": 10.725, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.04194746252234906, "frac_reward_zero_std": 1.0, "grad_norm": 0.003911010455340147, "kl": 1.4192960679531097, "learning_rate": 4.0893719806763285e-06, "loss": 0.0014, "num_tokens": 865203.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 610 }, { "completion_length": 12.425, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 12.425, "completions/mean_terminated_length": 12.425, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.042635125842387565, "frac_reward_zero_std": 0.9, "grad_norm": 0.0038089642766863108, "kl": 1.4190670549869537, "learning_rate": 4.065217391304348e-06, "loss": 0.0014, "num_tokens": 876636.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 620 }, { "completion_length": 17.1, "completions/clipped_ratio": 0.0, "completions/max_length": 31.9, "completions/max_terminated_length": 31.9, "completions/mean_length": 17.1, "completions/mean_terminated_length": 17.1, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.04332278916242607, "frac_reward_zero_std": 0.9, "grad_norm": 0.004204964730888605, "kl": 1.2702333629131317, "learning_rate": 4.0410628019323675e-06, "loss": 0.0013, "num_tokens": 887312.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 630 }, { "completion_length": 11.675, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 11.675, "completions/mean_terminated_length": 11.675, "completions/min_length": 7.8, "completions/min_terminated_length": 7.8, "epoch": 0.04401045248246459, "frac_reward_zero_std": 0.9, "grad_norm": 0.013301965780556202, "kl": 1.4880116164684296, "learning_rate": 4.016908212560387e-06, "loss": 0.0015, "num_tokens": 900155.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 640 }, { "completion_length": 11.975, "completions/clipped_ratio": 0.0, "completions/max_length": 16.3, "completions/max_terminated_length": 16.3, "completions/mean_length": 11.975, "completions/mean_terminated_length": 11.975, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.044698115802503095, "frac_reward_zero_std": 0.9, "grad_norm": 0.003986676223576069, "kl": 1.375555509328842, "learning_rate": 3.9927536231884065e-06, "loss": 0.0014, "num_tokens": 913378.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 650 }, { "completion_length": 12.375, "completions/clipped_ratio": 0.0, "completions/max_length": 20.2, "completions/max_terminated_length": 20.2, "completions/mean_length": 12.375, "completions/mean_terminated_length": 12.375, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.0453857791225416, "frac_reward_zero_std": 0.9, "grad_norm": 0.0072197564877569675, "kl": 1.4778401851654053, "learning_rate": 3.968599033816425e-06, "loss": 0.0015, "num_tokens": 925801.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 660 }, { "completion_length": 10.1, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.1, "completions/mean_terminated_length": 10.1, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.04607344244258011, "frac_reward_zero_std": 1.0, "grad_norm": 0.00877147726714611, "kl": 1.5386210203170776, "learning_rate": 3.944444444444445e-06, "loss": 0.0015, "num_tokens": 938605.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 670 }, { "completion_length": 26.9, "completions/clipped_ratio": 0.0, "completions/max_length": 79.7, "completions/max_terminated_length": 79.7, "completions/mean_length": 26.9, "completions/mean_terminated_length": 26.9, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.046761105762618625, "frac_reward_zero_std": 0.8, "grad_norm": 0.0056604305282235146, "kl": 1.3617631494998932, "learning_rate": 3.920289855072464e-06, "loss": 0.0014, "num_tokens": 951505.0, "reward": 5.925, "reward_std": 0.15, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 680 }, { "completion_length": 11.65, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 11.65, "completions/mean_terminated_length": 11.65, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.04744876908265713, "frac_reward_zero_std": 1.0, "grad_norm": 0.011466645635664463, "kl": 1.4758114337921142, "learning_rate": 3.896135265700484e-06, "loss": 0.0015, "num_tokens": 965291.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 690 }, { "completion_length": 11.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 11.6, "completions/mean_terminated_length": 11.6, "completions/min_length": 7.7, "completions/min_terminated_length": 7.7, "epoch": 0.04813643240269564, "frac_reward_zero_std": 0.9, "grad_norm": 0.004098635632544756, "kl": 1.9950886607170104, "learning_rate": 3.871980676328503e-06, "loss": 0.002, "num_tokens": 980115.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 700 }, { "completion_length": 11.625, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 11.625, "completions/mean_terminated_length": 11.625, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.04882409572273415, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038452409207820892, "kl": 1.3746231377124787, "learning_rate": 3.847826086956522e-06, "loss": 0.0014, "num_tokens": 992504.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 710 }, { "completion_length": 10.875, "completions/clipped_ratio": 0.0, "completions/max_length": 13.6, "completions/max_terminated_length": 13.6, "completions/mean_length": 10.875, "completions/mean_terminated_length": 10.875, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.049511759042772656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027600303292274475, "kl": 1.4807618498802184, "learning_rate": 3.8236714975845414e-06, "loss": 0.0015, "num_tokens": 1005663.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 720 }, { "completion_length": 12.375, "completions/clipped_ratio": 0.0, "completions/max_length": 20.2, "completions/max_terminated_length": 20.2, "completions/mean_length": 12.375, "completions/mean_terminated_length": 12.375, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.05019942236281117, "frac_reward_zero_std": 0.9, "grad_norm": 0.006777458358556032, "kl": 1.5830685496330261, "learning_rate": 3.7995169082125605e-06, "loss": 0.0016, "num_tokens": 1018350.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 730 }, { "completion_length": 12.125, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 12.125, "completions/mean_terminated_length": 12.125, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.05088708568284968, "frac_reward_zero_std": 0.9, "grad_norm": 0.003976897802203894, "kl": 1.3298546731472016, "learning_rate": 3.7753623188405805e-06, "loss": 0.0013, "num_tokens": 1030623.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 740 }, { "completion_length": 10.35, "completions/clipped_ratio": 0.0, "completions/max_length": 12.9, "completions/max_terminated_length": 12.9, "completions/mean_length": 10.35, "completions/mean_terminated_length": 10.35, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.051574749002888186, "frac_reward_zero_std": 1.0, "grad_norm": 0.004374953452497721, "kl": 1.4491690695285797, "learning_rate": 3.7512077294685995e-06, "loss": 0.0014, "num_tokens": 1040569.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 750 }, { "completion_length": 13.875, "completions/clipped_ratio": 0.0, "completions/max_length": 23.8, "completions/max_terminated_length": 23.8, "completions/mean_length": 13.875, "completions/mean_terminated_length": 13.875, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.052262412322926693, "frac_reward_zero_std": 0.9, "grad_norm": 0.005473052617162466, "kl": 1.305114781856537, "learning_rate": 3.7270531400966186e-06, "loss": 0.0013, "num_tokens": 1051800.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 760 }, { "completion_length": 10.35, "completions/clipped_ratio": 0.0, "completions/max_length": 14.2, "completions/max_terminated_length": 14.2, "completions/mean_length": 10.35, "completions/mean_terminated_length": 10.35, "completions/min_length": 7.8, "completions/min_terminated_length": 7.8, "epoch": 0.0529500756429652, "frac_reward_zero_std": 1.0, "grad_norm": 0.00831334013491869, "kl": 1.5019812345504762, "learning_rate": 3.7028985507246377e-06, "loss": 0.0015, "num_tokens": 1063438.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 770 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 49.6, "completions/max_terminated_length": 49.6, "completions/mean_length": 19.4, "completions/mean_terminated_length": 19.4, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.053637738963003716, "frac_reward_zero_std": 0.9, "grad_norm": 0.005551347974687815, "kl": 1.6162667870521545, "learning_rate": 3.6787439613526572e-06, "loss": 0.0016, "num_tokens": 1075242.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 780 }, { "completion_length": 12.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 12.1, "completions/mean_terminated_length": 12.1, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.05432540228304222, "frac_reward_zero_std": 1.0, "grad_norm": 0.006306353025138378, "kl": 1.3515527844429016, "learning_rate": 3.6545893719806768e-06, "loss": 0.0014, "num_tokens": 1089426.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 790 }, { "completion_length": 11.2, "completions/clipped_ratio": 0.0, "completions/max_length": 14.8, "completions/max_terminated_length": 14.8, "completions/mean_length": 11.2, "completions/mean_terminated_length": 11.2, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.05501306560308073, "frac_reward_zero_std": 0.9, "grad_norm": 0.006242894567549229, "kl": 1.4019363343715667, "learning_rate": 3.6304347826086963e-06, "loss": 0.0014, "num_tokens": 1099998.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 800 }, { "completion_length": 12.55, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 12.55, "completions/mean_terminated_length": 12.55, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.05570072892311924, "frac_reward_zero_std": 1.0, "grad_norm": 0.006163584999740124, "kl": 1.3902488827705384, "learning_rate": 3.6062801932367154e-06, "loss": 0.0014, "num_tokens": 1111480.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 810 }, { "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 7.6, "completions/min_terminated_length": 7.6, "epoch": 0.05638839224315775, "frac_reward_zero_std": 1.0, "grad_norm": 0.006469434592872858, "kl": 1.574440735578537, "learning_rate": 3.5821256038647344e-06, "loss": 0.0016, "num_tokens": 1123270.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 820 }, { "completion_length": 11.225, "completions/clipped_ratio": 0.0, "completions/max_length": 16.6, "completions/max_terminated_length": 16.6, "completions/mean_length": 11.225, "completions/mean_terminated_length": 11.225, "completions/min_length": 7.6, "completions/min_terminated_length": 7.6, "epoch": 0.05707605556319626, "frac_reward_zero_std": 1.0, "grad_norm": 0.009432986378669739, "kl": 1.5892924427986146, "learning_rate": 3.5579710144927535e-06, "loss": 0.0016, "num_tokens": 1133435.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 830 }, { "completion_length": 14.325, "completions/clipped_ratio": 0.0, "completions/max_length": 27.8, "completions/max_terminated_length": 27.8, "completions/mean_length": 14.325, "completions/mean_terminated_length": 14.325, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.05776371888323477, "frac_reward_zero_std": 0.8, "grad_norm": 0.005321599077433348, "kl": 1.4528217315673828, "learning_rate": 3.5338164251207735e-06, "loss": 0.0015, "num_tokens": 1145400.0, "reward": 5.9625, "reward_std": 0.075, "rewards/check_coherence/mean": 1.4875, "rewards/check_coherence/std": 0.025, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 840 }, { "completion_length": 11.3, "completions/clipped_ratio": 0.0, "completions/max_length": 15.1, "completions/max_terminated_length": 15.1, "completions/mean_length": 11.3, "completions/mean_terminated_length": 11.3, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.058451382203273276, "frac_reward_zero_std": 1.0, "grad_norm": 0.009609498083591461, "kl": 1.5806709051132202, "learning_rate": 3.5096618357487926e-06, "loss": 0.0016, "num_tokens": 1159344.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 850 }, { "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.5, "completions/max_terminated_length": 14.5, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.059139045523311784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030479375272989273, "kl": 1.3322069704532624, "learning_rate": 3.485507246376812e-06, "loss": 0.0013, "num_tokens": 1171028.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 860 }, { "completion_length": 12.6, "completions/clipped_ratio": 0.0, "completions/max_length": 21.6, "completions/max_terminated_length": 21.6, "completions/mean_length": 12.6, "completions/mean_terminated_length": 12.6, "completions/min_length": 7.7, "completions/min_terminated_length": 7.7, "epoch": 0.0598267088433503, "frac_reward_zero_std": 0.9, "grad_norm": 0.004412870854139328, "kl": 1.3979312300682067, "learning_rate": 3.461352657004831e-06, "loss": 0.0014, "num_tokens": 1184724.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 870 }, { "completion_length": 11.825, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 11.825, "completions/mean_terminated_length": 11.825, "completions/min_length": 7.4, "completions/min_terminated_length": 7.4, "epoch": 0.060514372163388806, "frac_reward_zero_std": 0.9, "grad_norm": 0.006056824699044228, "kl": 1.382024598121643, "learning_rate": 3.4371980676328503e-06, "loss": 0.0014, "num_tokens": 1195929.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 880 }, { "completion_length": 10.875, "completions/clipped_ratio": 0.0, "completions/max_length": 14.4, "completions/max_terminated_length": 14.4, "completions/mean_length": 10.875, "completions/mean_terminated_length": 10.875, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.061202035483427314, "frac_reward_zero_std": 1.0, "grad_norm": 0.005581828765571117, "kl": 1.3684062242507935, "learning_rate": 3.4130434782608698e-06, "loss": 0.0014, "num_tokens": 1207208.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 890 }, { "completion_length": 12.825, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 12.825, "completions/mean_terminated_length": 12.825, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.06188969880346582, "frac_reward_zero_std": 0.9, "grad_norm": 0.0034383470192551613, "kl": 1.2667877137660981, "learning_rate": 3.3888888888888893e-06, "loss": 0.0013, "num_tokens": 1220125.0, "reward": 5.9875, "reward_std": 0.025, "rewards/check_coherence/mean": 1.4875, "rewards/check_coherence/std": 0.025, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 900 }, { "completion_length": 10.625, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.625, "completions/mean_terminated_length": 10.625, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.06257736212350433, "frac_reward_zero_std": 0.9, "grad_norm": 0.009170152246952057, "kl": 3.0979028224945067, "learning_rate": 3.3647342995169084e-06, "loss": 0.0031, "num_tokens": 1232450.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 910 }, { "completion_length": 10.425, "completions/clipped_ratio": 0.0, "completions/max_length": 13.1, "completions/max_terminated_length": 13.1, "completions/mean_length": 10.425, "completions/mean_terminated_length": 10.425, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.06326502544354284, "frac_reward_zero_std": 1.0, "grad_norm": 0.011768506839871407, "kl": 1.4945785045623778, "learning_rate": 3.340579710144928e-06, "loss": 0.0015, "num_tokens": 1244439.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 920 }, { "completion_length": 12.525, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 12.525, "completions/mean_terminated_length": 12.525, "completions/min_length": 7.4, "completions/min_terminated_length": 7.4, "epoch": 0.06395268876358134, "frac_reward_zero_std": 0.9, "grad_norm": 0.005672871135175228, "kl": 1.5882929801940917, "learning_rate": 3.316425120772947e-06, "loss": 0.0016, "num_tokens": 1256788.0, "reward": 5.875, "reward_std": 0.25, "rewards/check_coherence/mean": 1.4625, "rewards/check_coherence/std": 0.075, "rewards/check_response_quality/mean": 2.45, "rewards/check_response_quality/std": 0.1, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 0.975, "rewards/match_format_exactly/std": 0.05, "step": 930 }, { "completion_length": 11.65, "completions/clipped_ratio": 0.0, "completions/max_length": 16.6, "completions/max_terminated_length": 16.6, "completions/mean_length": 11.65, "completions/mean_terminated_length": 11.65, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.06464035208361986, "frac_reward_zero_std": 0.9, "grad_norm": 0.008376842364668846, "kl": 1.31221883893013, "learning_rate": 3.292270531400966e-06, "loss": 0.0013, "num_tokens": 1268566.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 940 }, { "completion_length": 41.625, "completions/clipped_ratio": 0.0, "completions/max_length": 136.3, "completions/max_terminated_length": 136.3, "completions/mean_length": 41.625, "completions/mean_terminated_length": 41.625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.06532801540365837, "frac_reward_zero_std": 0.9, "grad_norm": 0.0034077914897352457, "kl": 1.3816728472709656, "learning_rate": 3.268115942028986e-06, "loss": 0.0014, "num_tokens": 1280407.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 950 }, { "completion_length": 12.025, "completions/clipped_ratio": 0.0, "completions/max_length": 16.3, "completions/max_terminated_length": 16.3, "completions/mean_length": 12.025, "completions/mean_terminated_length": 12.025, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.06601567872369687, "frac_reward_zero_std": 1.0, "grad_norm": 0.003883879631757736, "kl": 1.327423983812332, "learning_rate": 3.243961352657005e-06, "loss": 0.0013, "num_tokens": 1290632.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 960 }, { "completion_length": 11.825, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 11.825, "completions/mean_terminated_length": 11.825, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.06670334204373539, "frac_reward_zero_std": 1.0, "grad_norm": 0.005790382158011198, "kl": 1.360637903213501, "learning_rate": 3.219806763285024e-06, "loss": 0.0014, "num_tokens": 1303109.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 970 }, { "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.06739100536377389, "frac_reward_zero_std": 1.0, "grad_norm": 0.009436859749257565, "kl": 1.261417853832245, "learning_rate": 3.1956521739130437e-06, "loss": 0.0013, "num_tokens": 1315829.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 980 }, { "completion_length": 11.9, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 11.9, "completions/mean_terminated_length": 11.9, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.0680786686838124, "frac_reward_zero_std": 0.9, "grad_norm": 0.023381376639008522, "kl": 1.452295684814453, "learning_rate": 3.171497584541063e-06, "loss": 0.0015, "num_tokens": 1327605.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 990 }, { "completion_length": 10.85, "completions/clipped_ratio": 0.0, "completions/max_length": 13.9, "completions/max_terminated_length": 13.9, "completions/mean_length": 10.85, "completions/mean_terminated_length": 10.85, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.06876633200385092, "frac_reward_zero_std": 1.0, "grad_norm": 0.021764662116765976, "kl": 1.3832166135311126, "learning_rate": 3.1473429951690827e-06, "loss": 0.0014, "num_tokens": 1340799.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1000 }, { "completion_length": 11.1, "completions/clipped_ratio": 0.0, "completions/max_length": 14.9, "completions/max_terminated_length": 14.9, "completions/mean_length": 11.1, "completions/mean_terminated_length": 11.1, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.06945399532388942, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027048310730606318, "kl": 1.2940570950508117, "learning_rate": 3.123188405797102e-06, "loss": 0.0013, "num_tokens": 1353767.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1010 }, { "completion_length": 25.825, "completions/clipped_ratio": 0.0, "completions/max_length": 72.7, "completions/max_terminated_length": 72.7, "completions/mean_length": 25.825, "completions/mean_terminated_length": 25.825, "completions/min_length": 9.6, "completions/min_terminated_length": 9.6, "epoch": 0.07014165864392793, "frac_reward_zero_std": 0.9, "grad_norm": 0.0019137284252792597, "kl": 1.2903836131095887, "learning_rate": 3.099033816425121e-06, "loss": 0.0013, "num_tokens": 1366024.0, "reward": 5.9625, "reward_std": 0.075, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1020 }, { "completion_length": 11.875, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 11.875, "completions/mean_terminated_length": 11.875, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.07082932196396644, "frac_reward_zero_std": 1.0, "grad_norm": 0.004841749556362629, "kl": 1.242459374666214, "learning_rate": 3.0748792270531404e-06, "loss": 0.0012, "num_tokens": 1377851.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1030 }, { "completion_length": 11.925, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 11.925, "completions/mean_terminated_length": 11.925, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.07151698528400495, "frac_reward_zero_std": 1.0, "grad_norm": 0.00714073283597827, "kl": 1.2855034828186036, "learning_rate": 3.0507246376811595e-06, "loss": 0.0013, "num_tokens": 1389808.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1040 }, { "completion_length": 10.125, "completions/clipped_ratio": 0.0, "completions/max_length": 12.2, "completions/max_terminated_length": 12.2, "completions/mean_length": 10.125, "completions/mean_terminated_length": 10.125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.07220464860404346, "frac_reward_zero_std": 1.0, "grad_norm": 0.00511928740888834, "kl": 1.4257906317710876, "learning_rate": 3.026570048309179e-06, "loss": 0.0014, "num_tokens": 1400881.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1050 }, { "completion_length": 22.875, "completions/clipped_ratio": 0.0, "completions/max_length": 61.2, "completions/max_terminated_length": 61.2, "completions/mean_length": 22.875, "completions/mean_terminated_length": 22.875, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.07289231192408197, "frac_reward_zero_std": 0.7, "grad_norm": 3.7891669273376465, "kl": 1.345647120475769, "learning_rate": 3.0024154589371985e-06, "loss": 0.0013, "num_tokens": 1414468.0, "reward": 5.9, "reward_std": 0.2, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.1, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1060 }, { "completion_length": 9.65, "completions/clipped_ratio": 0.0, "completions/max_length": 12.7, "completions/max_terminated_length": 12.7, "completions/mean_length": 9.65, "completions/mean_terminated_length": 9.65, "completions/min_length": 7.6, "completions/min_terminated_length": 7.6, "epoch": 0.07357997524412048, "frac_reward_zero_std": 1.0, "grad_norm": 0.03541218861937523, "kl": 1.4368322610855102, "learning_rate": 2.9782608695652176e-06, "loss": 0.0014, "num_tokens": 1425650.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1070 }, { "completion_length": 10.525, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.525, "completions/mean_terminated_length": 10.525, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.074267638564159, "frac_reward_zero_std": 1.0, "grad_norm": 0.005114713683724403, "kl": 1.6687398076057434, "learning_rate": 2.9541062801932367e-06, "loss": 0.0017, "num_tokens": 1437255.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1080 }, { "completion_length": 20.95, "completions/clipped_ratio": 0.0, "completions/max_length": 52.2, "completions/max_terminated_length": 52.2, "completions/mean_length": 20.95, "completions/mean_terminated_length": 20.95, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.0749553018841975, "frac_reward_zero_std": 0.9, "grad_norm": 0.005882755853235722, "kl": 2.042826807498932, "learning_rate": 2.9299516908212562e-06, "loss": 0.002, "num_tokens": 1450757.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1090 }, { "completion_length": 10.225, "completions/clipped_ratio": 0.0, "completions/max_length": 15.4, "completions/max_terminated_length": 15.4, "completions/mean_length": 10.225, "completions/mean_terminated_length": 10.225, "completions/min_length": 7.6, "completions/min_terminated_length": 7.6, "epoch": 0.07564296520423601, "frac_reward_zero_std": 1.0, "grad_norm": 0.006175467278808355, "kl": 1.8858429431915282, "learning_rate": 2.9057971014492758e-06, "loss": 0.0019, "num_tokens": 1465250.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1100 }, { "completion_length": 10.7, "completions/clipped_ratio": 0.0, "completions/max_length": 13.2, "completions/max_terminated_length": 13.2, "completions/mean_length": 10.7, "completions/mean_terminated_length": 10.7, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.07633062852427451, "frac_reward_zero_std": 1.0, "grad_norm": 0.01212387252599001, "kl": 1.431203842163086, "learning_rate": 2.8816425120772953e-06, "loss": 0.0014, "num_tokens": 1476954.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1110 }, { "completion_length": 10.95, "completions/clipped_ratio": 0.0, "completions/max_length": 14.5, "completions/max_terminated_length": 14.5, "completions/mean_length": 10.95, "completions/mean_terminated_length": 10.95, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.07701829184431302, "frac_reward_zero_std": 1.0, "grad_norm": 0.007129206787794828, "kl": 1.5234606921672822, "learning_rate": 2.8574879227053144e-06, "loss": 0.0015, "num_tokens": 1489896.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1120 }, { "completion_length": 9.95, "completions/clipped_ratio": 0.0, "completions/max_length": 11.9, "completions/max_terminated_length": 11.9, "completions/mean_length": 9.95, "completions/mean_terminated_length": 9.95, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.07770595516435154, "frac_reward_zero_std": 1.0, "grad_norm": 0.005986523348838091, "kl": 1.9270170032978058, "learning_rate": 2.8333333333333335e-06, "loss": 0.0019, "num_tokens": 1500578.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1130 }, { "completion_length": 14.4, "completions/clipped_ratio": 0.0, "completions/max_length": 25.3, "completions/max_terminated_length": 25.3, "completions/mean_length": 14.4, "completions/mean_terminated_length": 14.4, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.07839361848439004, "frac_reward_zero_std": 0.9, "grad_norm": 0.010054023936390877, "kl": 1.3340931117534638, "learning_rate": 2.8091787439613525e-06, "loss": 0.0013, "num_tokens": 1512906.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1140 }, { "completion_length": 20.15, "completions/clipped_ratio": 0.0, "completions/max_length": 49.2, "completions/max_terminated_length": 49.2, "completions/mean_length": 20.15, "completions/mean_terminated_length": 20.15, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.07908128180442855, "frac_reward_zero_std": 0.9, "grad_norm": 0.0031711028423160315, "kl": 1.3866869628429412, "learning_rate": 2.7850241545893725e-06, "loss": 0.0014, "num_tokens": 1524192.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1150 }, { "completion_length": 13.175, "completions/clipped_ratio": 0.0, "completions/max_length": 23.6, "completions/max_terminated_length": 23.6, "completions/mean_length": 13.175, "completions/mean_terminated_length": 13.175, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.07976894512446706, "frac_reward_zero_std": 0.9, "grad_norm": 0.01567765325307846, "kl": 1.2489181756973267, "learning_rate": 2.7608695652173916e-06, "loss": 0.0012, "num_tokens": 1536839.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1160 }, { "completion_length": 12.175, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 12.175, "completions/mean_terminated_length": 12.175, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.08045660844450557, "frac_reward_zero_std": 1.0, "grad_norm": 0.01604825258255005, "kl": 1.3663283944129945, "learning_rate": 2.736714975845411e-06, "loss": 0.0014, "num_tokens": 1550150.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1170 }, { "completion_length": 11.4, "completions/clipped_ratio": 0.0, "completions/max_length": 16.2, "completions/max_terminated_length": 16.2, "completions/mean_length": 11.4, "completions/mean_terminated_length": 11.4, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.08114427176454408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025910416152328253, "kl": 1.2825160801410675, "learning_rate": 2.71256038647343e-06, "loss": 0.0013, "num_tokens": 1562550.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1180 }, { "completion_length": 11.35, "completions/clipped_ratio": 0.0, "completions/max_length": 14.4, "completions/max_terminated_length": 14.4, "completions/mean_length": 11.35, "completions/mean_terminated_length": 11.35, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.08183193508458259, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024119107984006405, "kl": 1.2274268567562103, "learning_rate": 2.6884057971014493e-06, "loss": 0.0012, "num_tokens": 1575764.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1190 }, { "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.9, "completions/max_terminated_length": 12.9, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.0825195984046211, "frac_reward_zero_std": 0.9, "grad_norm": 0.0072606573812663555, "kl": 1.4195207893848418, "learning_rate": 2.6642512077294684e-06, "loss": 0.0014, "num_tokens": 1589350.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1200 }, { "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.1, "completions/max_terminated_length": 13.1, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 7.8, "completions/min_terminated_length": 7.8, "epoch": 0.0832072617246596, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019173582550138235, "kl": 1.3919144153594971, "learning_rate": 2.6400966183574883e-06, "loss": 0.0014, "num_tokens": 1602132.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1210 }, { "completion_length": 11.15, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 11.15, "completions/mean_terminated_length": 11.15, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.08389492504469812, "frac_reward_zero_std": 1.0, "grad_norm": 0.004679285455495119, "kl": 1.2476289927959443, "learning_rate": 2.6159420289855074e-06, "loss": 0.0012, "num_tokens": 1612766.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1220 }, { "completion_length": 11.35, "completions/clipped_ratio": 0.0, "completions/max_length": 15.4, "completions/max_terminated_length": 15.4, "completions/mean_length": 11.35, "completions/mean_terminated_length": 11.35, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.08458258836473663, "frac_reward_zero_std": 1.0, "grad_norm": 0.002475257497280836, "kl": 1.4096310913562775, "learning_rate": 2.591787439613527e-06, "loss": 0.0014, "num_tokens": 1625160.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1230 }, { "completion_length": 10.875, "completions/clipped_ratio": 0.0, "completions/max_length": 13.6, "completions/max_terminated_length": 13.6, "completions/mean_length": 10.875, "completions/mean_terminated_length": 10.875, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.08527025168477513, "frac_reward_zero_std": 0.9, "grad_norm": 0.00401447806507349, "kl": 1.3029845595359801, "learning_rate": 2.567632850241546e-06, "loss": 0.0013, "num_tokens": 1637763.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1240 }, { "completion_length": 16.95, "completions/clipped_ratio": 0.0, "completions/max_length": 39.6, "completions/max_terminated_length": 39.6, "completions/mean_length": 16.95, "completions/mean_terminated_length": 16.95, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.08595791500481365, "frac_reward_zero_std": 0.9, "grad_norm": 0.005188316572457552, "kl": 1.3585187375545502, "learning_rate": 2.543478260869565e-06, "loss": 0.0014, "num_tokens": 1649345.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1250 }, { "completion_length": 11.125, "completions/clipped_ratio": 0.0, "completions/max_length": 14.1, "completions/max_terminated_length": 14.1, "completions/mean_length": 11.125, "completions/mean_terminated_length": 11.125, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.08664557832485215, "frac_reward_zero_std": 1.0, "grad_norm": 0.025956762954592705, "kl": 1.5334448099136353, "learning_rate": 2.519323671497585e-06, "loss": 0.0015, "num_tokens": 1661062.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1260 }, { "completion_length": 11.275, "completions/clipped_ratio": 0.0, "completions/max_length": 16.5, "completions/max_terminated_length": 16.5, "completions/mean_length": 11.275, "completions/mean_terminated_length": 11.275, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.08733324164489066, "frac_reward_zero_std": 1.0, "grad_norm": 0.004098770674318075, "kl": 1.2529529988765717, "learning_rate": 2.495169082125604e-06, "loss": 0.0013, "num_tokens": 1673301.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1270 }, { "completion_length": 10.525, "completions/clipped_ratio": 0.0, "completions/max_length": 13.1, "completions/max_terminated_length": 13.1, "completions/mean_length": 10.525, "completions/mean_terminated_length": 10.525, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.08802090496492918, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032903961837291718, "kl": 1.2799896121025085, "learning_rate": 2.471014492753623e-06, "loss": 0.0013, "num_tokens": 1685302.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1280 }, { "completion_length": 10.35, "completions/clipped_ratio": 0.0, "completions/max_length": 13.5, "completions/max_terminated_length": 13.5, "completions/mean_length": 10.35, "completions/mean_terminated_length": 10.35, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.08870856828496768, "frac_reward_zero_std": 1.0, "grad_norm": 0.00576725322753191, "kl": 1.3937119126319886, "learning_rate": 2.4468599033816427e-06, "loss": 0.0014, "num_tokens": 1697512.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1290 }, { "completion_length": 10.725, "completions/clipped_ratio": 0.0, "completions/max_length": 14.5, "completions/max_terminated_length": 14.5, "completions/mean_length": 10.725, "completions/mean_terminated_length": 10.725, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.08939623160500619, "frac_reward_zero_std": 1.0, "grad_norm": 0.009266716428101063, "kl": 1.3509972453117371, "learning_rate": 2.4227053140096622e-06, "loss": 0.0014, "num_tokens": 1708789.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1300 }, { "completion_length": 10.9, "completions/clipped_ratio": 0.0, "completions/max_length": 14.1, "completions/max_terminated_length": 14.1, "completions/mean_length": 10.9, "completions/mean_terminated_length": 10.9, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.09008389492504469, "frac_reward_zero_std": 1.0, "grad_norm": 0.004442331846803427, "kl": 1.4136140465736389, "learning_rate": 2.3985507246376813e-06, "loss": 0.0014, "num_tokens": 1721465.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1310 }, { "completion_length": 12.85, "completions/clipped_ratio": 0.0, "completions/max_length": 21.4, "completions/max_terminated_length": 21.4, "completions/mean_length": 12.85, "completions/mean_terminated_length": 12.85, "completions/min_length": 7.8, "completions/min_terminated_length": 7.8, "epoch": 0.0907715582450832, "frac_reward_zero_std": 0.9, "grad_norm": 0.00520370714366436, "kl": 1.6327512860298157, "learning_rate": 2.374396135265701e-06, "loss": 0.0016, "num_tokens": 1732827.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1320 }, { "completion_length": 11.45, "completions/clipped_ratio": 0.0, "completions/max_length": 15.6, "completions/max_terminated_length": 15.6, "completions/mean_length": 11.45, "completions/mean_terminated_length": 11.45, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.09145922156512172, "frac_reward_zero_std": 1.0, "grad_norm": 0.15331777930259705, "kl": 1.3064857959747314, "learning_rate": 2.35024154589372e-06, "loss": 0.0013, "num_tokens": 1744241.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1330 }, { "completion_length": 10.125, "completions/clipped_ratio": 0.0, "completions/max_length": 12.5, "completions/max_terminated_length": 12.5, "completions/mean_length": 10.125, "completions/mean_terminated_length": 10.125, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.09214688488516022, "frac_reward_zero_std": 1.0, "grad_norm": 0.01661134511232376, "kl": 1.4703512012958526, "learning_rate": 2.326086956521739e-06, "loss": 0.0015, "num_tokens": 1757074.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1340 }, { "completion_length": 12.15, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 12.15, "completions/mean_terminated_length": 12.15, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.09283454820519874, "frac_reward_zero_std": 1.0, "grad_norm": 0.002100709592923522, "kl": 1.1705379903316497, "learning_rate": 2.3019323671497585e-06, "loss": 0.0012, "num_tokens": 1768636.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1350 }, { "completion_length": 15.125, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 15.125, "completions/mean_terminated_length": 15.125, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.09352221152523725, "frac_reward_zero_std": 0.9, "grad_norm": 0.0030410720501095057, "kl": 1.1578946471214295, "learning_rate": 2.277777777777778e-06, "loss": 0.0012, "num_tokens": 1781393.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1360 }, { "completion_length": 9.95, "completions/clipped_ratio": 0.0, "completions/max_length": 12.6, "completions/max_terminated_length": 12.6, "completions/mean_length": 9.95, "completions/mean_terminated_length": 9.95, "completions/min_length": 7.8, "completions/min_terminated_length": 7.8, "epoch": 0.09420987484527575, "frac_reward_zero_std": 1.0, "grad_norm": 0.005324806552380323, "kl": 1.351249760389328, "learning_rate": 2.2536231884057976e-06, "loss": 0.0014, "num_tokens": 1794791.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1370 }, { "completion_length": 13.075, "completions/clipped_ratio": 0.0, "completions/max_length": 22.4, "completions/max_terminated_length": 22.4, "completions/mean_length": 13.075, "completions/mean_terminated_length": 13.075, "completions/min_length": 7.8, "completions/min_terminated_length": 7.8, "epoch": 0.09489753816531427, "frac_reward_zero_std": 0.9, "grad_norm": 0.004393610171973705, "kl": 1.335606962442398, "learning_rate": 2.2294685990338166e-06, "loss": 0.0013, "num_tokens": 1807182.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1380 }, { "completion_length": 10.375, "completions/clipped_ratio": 0.0, "completions/max_length": 14.3, "completions/max_terminated_length": 14.3, "completions/mean_length": 10.375, "completions/mean_terminated_length": 10.375, "completions/min_length": 7.8, "completions/min_terminated_length": 7.8, "epoch": 0.09558520148535277, "frac_reward_zero_std": 1.0, "grad_norm": 0.011187166906893253, "kl": 1.3981504678726195, "learning_rate": 2.2053140096618357e-06, "loss": 0.0014, "num_tokens": 1817889.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1390 }, { "completion_length": 11.125, "completions/clipped_ratio": 0.0, "completions/max_length": 14.3, "completions/max_terminated_length": 14.3, "completions/mean_length": 11.125, "completions/mean_terminated_length": 11.125, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.09627286480539128, "frac_reward_zero_std": 1.0, "grad_norm": 0.00982034020125866, "kl": 1.439992618560791, "learning_rate": 2.1811594202898552e-06, "loss": 0.0014, "num_tokens": 1830338.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1400 }, { "completion_length": 12.575, "completions/clipped_ratio": 0.0, "completions/max_length": 20.9, "completions/max_terminated_length": 20.9, "completions/mean_length": 12.575, "completions/mean_terminated_length": 12.575, "completions/min_length": 7.6, "completions/min_terminated_length": 7.6, "epoch": 0.0969605281254298, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037614547181874514, "kl": 1.459202778339386, "learning_rate": 2.1570048309178743e-06, "loss": 0.0015, "num_tokens": 1841853.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1410 }, { "completion_length": 11.325, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 11.325, "completions/mean_terminated_length": 11.325, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.0976481914454683, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022658687084913254, "kl": 1.3880295991897582, "learning_rate": 2.132850241545894e-06, "loss": 0.0014, "num_tokens": 1853382.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1420 }, { "completion_length": 12.575, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 12.575, "completions/mean_terminated_length": 12.575, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.09833585476550681, "frac_reward_zero_std": 0.9, "grad_norm": 0.0058380733244121075, "kl": 1.4471020340919494, "learning_rate": 2.1086956521739134e-06, "loss": 0.0014, "num_tokens": 1866417.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1430 }, { "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.3, "completions/max_terminated_length": 15.3, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.09902351808554531, "frac_reward_zero_std": 1.0, "grad_norm": 0.01883215643465519, "kl": 1.319870752096176, "learning_rate": 2.0845410628019325e-06, "loss": 0.0013, "num_tokens": 1877421.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1440 }, { "completion_length": 13.3, "completions/clipped_ratio": 0.0, "completions/max_length": 23.6, "completions/max_terminated_length": 23.6, "completions/mean_length": 13.3, "completions/mean_terminated_length": 13.3, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.09971118140558383, "frac_reward_zero_std": 0.9, "grad_norm": 2.100250482559204, "kl": 1.1677093982696534, "learning_rate": 2.060386473429952e-06, "loss": 0.0012, "num_tokens": 1890953.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1450 }, { "completion_length": 10.3, "completions/clipped_ratio": 0.0, "completions/max_length": 12.8, "completions/max_terminated_length": 12.8, "completions/mean_length": 10.3, "completions/mean_terminated_length": 10.3, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.10039884472562234, "frac_reward_zero_std": 1.0, "grad_norm": 0.003426405368372798, "kl": 1.348715353012085, "learning_rate": 2.036231884057971e-06, "loss": 0.0013, "num_tokens": 1902461.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1460 }, { "completion_length": 11.925, "completions/clipped_ratio": 0.0, "completions/max_length": 15.7, "completions/max_terminated_length": 15.7, "completions/mean_length": 11.925, "completions/mean_terminated_length": 11.925, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.10108650804566084, "frac_reward_zero_std": 1.0, "grad_norm": 0.004393596667796373, "kl": 1.2271339356899262, "learning_rate": 2.0120772946859906e-06, "loss": 0.0012, "num_tokens": 1914978.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1470 }, { "completion_length": 10.8, "completions/clipped_ratio": 0.0, "completions/max_length": 14.3, "completions/max_terminated_length": 14.3, "completions/mean_length": 10.8, "completions/mean_terminated_length": 10.8, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.10177417136569936, "frac_reward_zero_std": 0.9, "grad_norm": 7.2538251876831055, "kl": 1.4542439758777619, "learning_rate": 1.98792270531401e-06, "loss": 0.0015, "num_tokens": 1927398.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1480 }, { "completion_length": 10.375, "completions/clipped_ratio": 0.0, "completions/max_length": 13.6, "completions/max_terminated_length": 13.6, "completions/mean_length": 10.375, "completions/mean_terminated_length": 10.375, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.10246183468573786, "frac_reward_zero_std": 1.0, "grad_norm": 0.007453908212482929, "kl": 1.3577909409999847, "learning_rate": 1.963768115942029e-06, "loss": 0.0014, "num_tokens": 1939537.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1490 }, { "completion_length": 11.175, "completions/clipped_ratio": 0.0, "completions/max_length": 15.6, "completions/max_terminated_length": 15.6, "completions/mean_length": 11.175, "completions/mean_terminated_length": 11.175, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.10314949800577637, "frac_reward_zero_std": 1.0, "grad_norm": 0.017898349091410637, "kl": 1.2097175359725951, "learning_rate": 1.9396135265700487e-06, "loss": 0.0012, "num_tokens": 1950164.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1500 }, { "completion_length": 10.475, "completions/clipped_ratio": 0.0, "completions/max_length": 14.4, "completions/max_terminated_length": 14.4, "completions/mean_length": 10.475, "completions/mean_terminated_length": 10.475, "completions/min_length": 7.8, "completions/min_terminated_length": 7.8, "epoch": 0.10383716132581489, "frac_reward_zero_std": 1.0, "grad_norm": 0.02172171324491501, "kl": 1.461452579498291, "learning_rate": 1.9154589371980678e-06, "loss": 0.0015, "num_tokens": 1962247.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1510 }, { "completion_length": 11.625, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 11.625, "completions/mean_terminated_length": 11.625, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.10452482464585339, "frac_reward_zero_std": 0.9, "grad_norm": 4.611449241638184, "kl": 1.579625529050827, "learning_rate": 1.891304347826087e-06, "loss": 0.0016, "num_tokens": 1972028.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1520 }, { "completion_length": 20.1, "completions/clipped_ratio": 0.0, "completions/max_length": 50.9, "completions/max_terminated_length": 50.9, "completions/mean_length": 20.1, "completions/mean_terminated_length": 20.1, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.1052124879658919, "frac_reward_zero_std": 0.9, "grad_norm": 0.0037286400329321623, "kl": 1.2361969292163848, "learning_rate": 1.8671497584541066e-06, "loss": 0.0012, "num_tokens": 1985076.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1530 }, { "completion_length": 27.475, "completions/clipped_ratio": 0.0, "completions/max_length": 82.3, "completions/max_terminated_length": 82.3, "completions/mean_length": 27.475, "completions/mean_terminated_length": 27.475, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1059001512859304, "frac_reward_zero_std": 0.8, "grad_norm": 1.7765352725982666, "kl": 1.4994196772575379, "learning_rate": 1.8429951690821257e-06, "loss": 0.0015, "num_tokens": 1998887.0, "reward": 5.9, "reward_std": 0.2, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.1, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1540 }, { "completion_length": 10.325, "completions/clipped_ratio": 0.0, "completions/max_length": 12.7, "completions/max_terminated_length": 12.7, "completions/mean_length": 10.325, "completions/mean_terminated_length": 10.325, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.10658781460596892, "frac_reward_zero_std": 1.0, "grad_norm": 0.013411914929747581, "kl": 1.5437786877155304, "learning_rate": 1.818840579710145e-06, "loss": 0.0015, "num_tokens": 2010696.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1550 }, { "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.4, "completions/max_terminated_length": 24.4, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.10727547792600743, "frac_reward_zero_std": 0.9, "grad_norm": 0.002627303823828697, "kl": 1.1799913942813873, "learning_rate": 1.7946859903381645e-06, "loss": 0.0012, "num_tokens": 2024560.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1560 }, { "completion_length": 11.175, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.175, "completions/mean_terminated_length": 11.175, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.10796314124604593, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034348671324551105, "kl": 1.3212086796760558, "learning_rate": 1.7705314009661836e-06, "loss": 0.0013, "num_tokens": 2037779.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1570 }, { "completion_length": 10.175, "completions/clipped_ratio": 0.0, "completions/max_length": 12.6, "completions/max_terminated_length": 12.6, "completions/mean_length": 10.175, "completions/mean_terminated_length": 10.175, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.10865080456608445, "frac_reward_zero_std": 1.0, "grad_norm": 0.008126153610646725, "kl": 1.4890945374965667, "learning_rate": 1.7463768115942031e-06, "loss": 0.0015, "num_tokens": 2049986.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1580 }, { "completion_length": 11.55, "completions/clipped_ratio": 0.0, "completions/max_length": 16.5, "completions/max_terminated_length": 16.5, "completions/mean_length": 11.55, "completions/mean_terminated_length": 11.55, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.10933846788612295, "frac_reward_zero_std": 1.0, "grad_norm": 0.003164671128615737, "kl": 1.8153117537498473, "learning_rate": 1.7222222222222224e-06, "loss": 0.0018, "num_tokens": 2061292.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1590 }, { "completion_length": 11.8, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.8, "completions/mean_terminated_length": 11.8, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.11002613120616146, "frac_reward_zero_std": 1.0, "grad_norm": 0.008705941960215569, "kl": 1.372535276412964, "learning_rate": 1.6980676328502415e-06, "loss": 0.0014, "num_tokens": 2072076.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1600 }, { "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.7, "completions/max_terminated_length": 13.7, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.11071379452619998, "frac_reward_zero_std": 0.9, "grad_norm": 3.7534122467041016, "kl": 1.3290419816970824, "learning_rate": 1.673913043478261e-06, "loss": 0.0013, "num_tokens": 2084146.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1610 }, { "completion_length": 10.725, "completions/clipped_ratio": 0.0, "completions/max_length": 13.8, "completions/max_terminated_length": 13.8, "completions/mean_length": 10.725, "completions/mean_terminated_length": 10.725, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.11140145784623848, "frac_reward_zero_std": 1.0, "grad_norm": 0.007745617534965277, "kl": 1.4284833550453186, "learning_rate": 1.6497584541062803e-06, "loss": 0.0014, "num_tokens": 2094831.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1620 }, { "completion_length": 11.55, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.55, "completions/mean_terminated_length": 11.55, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.11208912116627699, "frac_reward_zero_std": 1.0, "grad_norm": 0.007192945573478937, "kl": 1.3611448645591735, "learning_rate": 1.6256038647342998e-06, "loss": 0.0014, "num_tokens": 2108805.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1630 }, { "completion_length": 54.85, "completions/clipped_ratio": 0.0, "completions/max_length": 184.4, "completions/max_terminated_length": 184.4, "completions/mean_length": 54.85, "completions/mean_terminated_length": 54.85, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.1127767844863155, "frac_reward_zero_std": 0.8, "grad_norm": 0.0031543634831905365, "kl": 1.1925908386707307, "learning_rate": 1.601449275362319e-06, "loss": 0.0012, "num_tokens": 2123923.0, "reward": 5.925, "reward_std": 0.10773502588272095, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4625, "rewards/check_response_quality/std": 0.053867512941360475, "rewards/match_format_approximately/mean": 0.9625, "rewards/match_format_approximately/std": 0.053867512941360475, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1640 }, { "completion_length": 10.875, "completions/clipped_ratio": 0.0, "completions/max_length": 13.8, "completions/max_terminated_length": 13.8, "completions/mean_length": 10.875, "completions/mean_terminated_length": 10.875, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.11346444780635401, "frac_reward_zero_std": 1.0, "grad_norm": 0.003872538451105356, "kl": 1.4870728313922883, "learning_rate": 1.5772946859903382e-06, "loss": 0.0015, "num_tokens": 2135622.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1650 }, { "completion_length": 11.275, "completions/clipped_ratio": 0.0, "completions/max_length": 15.4, "completions/max_terminated_length": 15.4, "completions/mean_length": 11.275, "completions/mean_terminated_length": 11.275, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.11415211112639252, "frac_reward_zero_std": 1.0, "grad_norm": 0.006047138012945652, "kl": 1.2302622616291046, "learning_rate": 1.5531400966183577e-06, "loss": 0.0012, "num_tokens": 2148629.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1660 }, { "completion_length": 10.575, "completions/clipped_ratio": 0.0, "completions/max_length": 15.2, "completions/max_terminated_length": 15.2, "completions/mean_length": 10.575, "completions/mean_terminated_length": 10.575, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.11483977444643102, "frac_reward_zero_std": 0.9, "grad_norm": 3.129744291305542, "kl": 1.347231537103653, "learning_rate": 1.5289855072463768e-06, "loss": 0.0013, "num_tokens": 2157484.0, "reward": 5.9875, "reward_std": 0.025, "rewards/check_coherence/mean": 1.4875, "rewards/check_coherence/std": 0.025, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1670 }, { "completion_length": 10.85, "completions/clipped_ratio": 0.0, "completions/max_length": 13.7, "completions/max_terminated_length": 13.7, "completions/mean_length": 10.85, "completions/mean_terminated_length": 10.85, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.11552743776646954, "frac_reward_zero_std": 1.0, "grad_norm": 0.005663453601300716, "kl": 1.3255208492279054, "learning_rate": 1.5048309178743963e-06, "loss": 0.0013, "num_tokens": 2172046.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1680 }, { "completion_length": 10.4, "completions/clipped_ratio": 0.0, "completions/max_length": 13.1, "completions/max_terminated_length": 13.1, "completions/mean_length": 10.4, "completions/mean_terminated_length": 10.4, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.11621510108650805, "frac_reward_zero_std": 1.0, "grad_norm": 0.007839322090148926, "kl": 1.516604733467102, "learning_rate": 1.4806763285024156e-06, "loss": 0.0015, "num_tokens": 2184446.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1690 }, { "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.7, "completions/max_terminated_length": 15.7, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.11690276440654655, "frac_reward_zero_std": 1.0, "grad_norm": 0.0044697728008031845, "kl": 1.2774751663208008, "learning_rate": 1.4565217391304347e-06, "loss": 0.0013, "num_tokens": 2198042.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1700 }, { "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.3, "completions/max_terminated_length": 13.3, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 7.6, "completions/min_terminated_length": 7.6, "epoch": 0.11759042772658507, "frac_reward_zero_std": 1.0, "grad_norm": 0.008361663669347763, "kl": 1.3569441199302674, "learning_rate": 1.4323671497584543e-06, "loss": 0.0014, "num_tokens": 2210628.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1710 }, { "completion_length": 12.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 12.8, "completions/mean_terminated_length": 12.8, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.11827809104662357, "frac_reward_zero_std": 0.9, "grad_norm": 2.7055230140686035, "kl": 1.339830869436264, "learning_rate": 1.4082125603864736e-06, "loss": 0.0013, "num_tokens": 2223380.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1720 }, { "completion_length": 13.175, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 13.175, "completions/mean_terminated_length": 13.175, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.11896575436666208, "frac_reward_zero_std": 1.0, "grad_norm": 0.005280562676489353, "kl": 1.2809099555015564, "learning_rate": 1.3840579710144926e-06, "loss": 0.0013, "num_tokens": 2235851.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1730 }, { "completion_length": 12.1, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 12.1, "completions/mean_terminated_length": 12.1, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.1196534176867006, "frac_reward_zero_std": 1.0, "grad_norm": 0.004883504938334227, "kl": 1.0983420848846435, "learning_rate": 1.3599033816425122e-06, "loss": 0.0011, "num_tokens": 2248511.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1740 }, { "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.8, "completions/max_terminated_length": 14.8, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.1203410810067391, "frac_reward_zero_std": 1.0, "grad_norm": 0.005267248023301363, "kl": 1.330749648809433, "learning_rate": 1.3357487922705315e-06, "loss": 0.0013, "num_tokens": 2259335.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1750 }, { "completion_length": 10.775, "completions/clipped_ratio": 0.0, "completions/max_length": 14.7, "completions/max_terminated_length": 14.7, "completions/mean_length": 10.775, "completions/mean_terminated_length": 10.775, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.12102874432677761, "frac_reward_zero_std": 0.9, "grad_norm": 5.972342491149902, "kl": 1.4707993149757386, "learning_rate": 1.311594202898551e-06, "loss": 0.0015, "num_tokens": 2273490.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1760 }, { "completion_length": 10.8, "completions/clipped_ratio": 0.0, "completions/max_length": 13.9, "completions/max_terminated_length": 13.9, "completions/mean_length": 10.8, "completions/mean_terminated_length": 10.8, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.12171640764681611, "frac_reward_zero_std": 1.0, "grad_norm": 0.008044744841754436, "kl": 1.393112576007843, "learning_rate": 1.28743961352657e-06, "loss": 0.0014, "num_tokens": 2285606.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1770 }, { "completion_length": 11.7, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 11.7, "completions/mean_terminated_length": 11.7, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.12240407096685463, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036649706307798624, "kl": 1.138010984659195, "learning_rate": 1.2632850241545894e-06, "loss": 0.0011, "num_tokens": 2297982.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1780 }, { "completion_length": 9.925, "completions/clipped_ratio": 0.0, "completions/max_length": 12.3, "completions/max_terminated_length": 12.3, "completions/mean_length": 9.925, "completions/mean_terminated_length": 9.925, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.12309173428689314, "frac_reward_zero_std": 1.0, "grad_norm": 0.00226943870075047, "kl": 1.3314465701580047, "learning_rate": 1.2391304347826089e-06, "loss": 0.0013, "num_tokens": 2309427.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1790 }, { "completion_length": 20.775, "completions/clipped_ratio": 0.0, "completions/max_length": 52.6, "completions/max_terminated_length": 52.6, "completions/mean_length": 20.775, "completions/mean_terminated_length": 20.775, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.12377939760693164, "frac_reward_zero_std": 0.9, "grad_norm": 0.0048567261546850204, "kl": 1.1917468369007111, "learning_rate": 1.214975845410628e-06, "loss": 0.0012, "num_tokens": 2321950.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1800 }, { "completion_length": 10.1, "completions/clipped_ratio": 0.0, "completions/max_length": 12.8, "completions/max_terminated_length": 12.8, "completions/mean_length": 10.1, "completions/mean_terminated_length": 10.1, "completions/min_length": 7.8, "completions/min_terminated_length": 7.8, "epoch": 0.12446706092697016, "frac_reward_zero_std": 1.0, "grad_norm": 0.004037019796669483, "kl": 1.2711975991725921, "learning_rate": 1.1908212560386475e-06, "loss": 0.0013, "num_tokens": 2333018.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1810 }, { "completion_length": 11.6, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 11.6, "completions/mean_terminated_length": 11.6, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.12515472424700866, "frac_reward_zero_std": 1.0, "grad_norm": 0.004209557548165321, "kl": 1.2455637753009796, "learning_rate": 1.1666666666666668e-06, "loss": 0.0012, "num_tokens": 2345706.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1820 }, { "completion_length": 10.9, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.9, "completions/mean_terminated_length": 10.9, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.1258423875670472, "frac_reward_zero_std": 1.0, "grad_norm": 0.004164872225373983, "kl": 1.3472298622131347, "learning_rate": 1.142512077294686e-06, "loss": 0.0013, "num_tokens": 2357826.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1830 }, { "completion_length": 10.525, "completions/clipped_ratio": 0.0, "completions/max_length": 13.6, "completions/max_terminated_length": 13.6, "completions/mean_length": 10.525, "completions/mean_terminated_length": 10.525, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.1265300508870857, "frac_reward_zero_std": 1.0, "grad_norm": 0.010473374277353287, "kl": 1.3082951486110688, "learning_rate": 1.1183574879227054e-06, "loss": 0.0013, "num_tokens": 2369667.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1840 }, { "completion_length": 11.05, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.05, "completions/mean_terminated_length": 11.05, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.1272177142071242, "frac_reward_zero_std": 1.0, "grad_norm": 0.004580930341035128, "kl": 1.3786839723587037, "learning_rate": 1.0942028985507247e-06, "loss": 0.0014, "num_tokens": 2381753.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1850 }, { "completion_length": 11.175, "completions/clipped_ratio": 0.0, "completions/max_length": 13.9, "completions/max_terminated_length": 13.9, "completions/mean_length": 11.175, "completions/mean_terminated_length": 11.175, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.1279053775271627, "frac_reward_zero_std": 1.0, "grad_norm": 0.00417534913867712, "kl": 1.2470466375350953, "learning_rate": 1.070048309178744e-06, "loss": 0.0012, "num_tokens": 2392604.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1860 }, { "completion_length": 11.1, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.1, "completions/mean_terminated_length": 11.1, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.12859304084720122, "frac_reward_zero_std": 1.0, "grad_norm": 0.006656737998127937, "kl": 1.3809801578521728, "learning_rate": 1.0458937198067635e-06, "loss": 0.0014, "num_tokens": 2404688.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1870 }, { "completion_length": 23.325, "completions/clipped_ratio": 0.0, "completions/max_length": 64.4, "completions/max_terminated_length": 64.4, "completions/mean_length": 23.325, "completions/mean_terminated_length": 23.325, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.12928070416723972, "frac_reward_zero_std": 0.8, "grad_norm": 0.0029542380943894386, "kl": 1.4932059407234193, "learning_rate": 1.0217391304347828e-06, "loss": 0.0015, "num_tokens": 2416525.0, "reward": 5.925, "reward_std": 0.15, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.1, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1880 }, { "completion_length": 11.075, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 11.075, "completions/mean_terminated_length": 11.075, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.12996836748727822, "frac_reward_zero_std": 1.0, "grad_norm": 0.004283738322556019, "kl": 1.4605644285678863, "learning_rate": 9.97584541062802e-07, "loss": 0.0015, "num_tokens": 2428684.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1890 }, { "completion_length": 10.925, "completions/clipped_ratio": 0.0, "completions/max_length": 14.4, "completions/max_terminated_length": 14.4, "completions/mean_length": 10.925, "completions/mean_terminated_length": 10.925, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.13065603080731675, "frac_reward_zero_std": 1.0, "grad_norm": 0.004885092377662659, "kl": 1.2866985321044921, "learning_rate": 9.734299516908214e-07, "loss": 0.0013, "num_tokens": 2440733.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1900 }, { "completion_length": 11.4, "completions/clipped_ratio": 0.0, "completions/max_length": 15.7, "completions/max_terminated_length": 15.7, "completions/mean_length": 11.4, "completions/mean_terminated_length": 11.4, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.13134369412735525, "frac_reward_zero_std": 1.0, "grad_norm": 0.006722769699990749, "kl": 1.2210811614990233, "learning_rate": 9.492753623188407e-07, "loss": 0.0012, "num_tokens": 2451929.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1910 }, { "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.7, "completions/max_terminated_length": 14.7, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.13203135744739375, "frac_reward_zero_std": 1.0, "grad_norm": 0.0048691569827497005, "kl": 1.236713171005249, "learning_rate": 9.2512077294686e-07, "loss": 0.0012, "num_tokens": 2462035.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1920 }, { "completion_length": 10.6, "completions/clipped_ratio": 0.0, "completions/max_length": 13.4, "completions/max_terminated_length": 13.4, "completions/mean_length": 10.6, "completions/mean_terminated_length": 10.6, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.13271902076743228, "frac_reward_zero_std": 1.0, "grad_norm": 0.021696312353014946, "kl": 1.2257360517978668, "learning_rate": 9.009661835748792e-07, "loss": 0.0012, "num_tokens": 2473739.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1930 }, { "completion_length": 11.2, "completions/clipped_ratio": 0.0, "completions/max_length": 14.1, "completions/max_terminated_length": 14.1, "completions/mean_length": 11.2, "completions/mean_terminated_length": 11.2, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.13340668408747078, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032455010805279016, "kl": 1.325300359725952, "learning_rate": 8.768115942028986e-07, "loss": 0.0013, "num_tokens": 2485935.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1940 }, { "completion_length": 10.775, "completions/clipped_ratio": 0.0, "completions/max_length": 13.2, "completions/max_terminated_length": 13.2, "completions/mean_length": 10.775, "completions/mean_terminated_length": 10.775, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.13409434740750928, "frac_reward_zero_std": 1.0, "grad_norm": 0.002819440560415387, "kl": 1.2611912727355956, "learning_rate": 8.526570048309179e-07, "loss": 0.0013, "num_tokens": 2498274.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1950 }, { "completion_length": 9.65, "completions/clipped_ratio": 0.0, "completions/max_length": 12.7, "completions/max_terminated_length": 12.7, "completions/mean_length": 9.65, "completions/mean_terminated_length": 9.65, "completions/min_length": 7.3, "completions/min_terminated_length": 7.3, "epoch": 0.13478201072754778, "frac_reward_zero_std": 1.0, "grad_norm": 0.0049956305883824825, "kl": 1.4177262127399444, "learning_rate": 8.285024154589373e-07, "loss": 0.0014, "num_tokens": 2510556.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1960 }, { "completion_length": 11.2, "completions/clipped_ratio": 0.0, "completions/max_length": 15.6, "completions/max_terminated_length": 15.6, "completions/mean_length": 11.2, "completions/mean_terminated_length": 11.2, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.1354696740475863, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032518147490918636, "kl": 1.2991495847702026, "learning_rate": 8.043478260869565e-07, "loss": 0.0013, "num_tokens": 2525016.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1970 }, { "completion_length": 11.475, "completions/clipped_ratio": 0.0, "completions/max_length": 14.9, "completions/max_terminated_length": 14.9, "completions/mean_length": 11.475, "completions/mean_terminated_length": 11.475, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.1361573373676248, "frac_reward_zero_std": 1.0, "grad_norm": 0.00288589671254158, "kl": 1.1318634927272797, "learning_rate": 7.801932367149758e-07, "loss": 0.0011, "num_tokens": 2538403.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1980 }, { "completion_length": 22.075, "completions/clipped_ratio": 0.0, "completions/max_length": 57.1, "completions/max_terminated_length": 57.1, "completions/mean_length": 22.075, "completions/mean_terminated_length": 22.075, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.1368450006876633, "frac_reward_zero_std": 0.7, "grad_norm": 1.648767113685608, "kl": 1.2390470504760742, "learning_rate": 7.560386473429952e-07, "loss": 0.0012, "num_tokens": 2551406.0, "reward": 5.925, "reward_std": 0.15, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4625, "rewards/check_response_quality/std": 0.075, "rewards/match_format_approximately/mean": 0.9625, "rewards/match_format_approximately/std": 0.075, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 1990 }, { "completion_length": 10.525, "completions/clipped_ratio": 0.0, "completions/max_length": 13.3, "completions/max_terminated_length": 13.3, "completions/mean_length": 10.525, "completions/mean_terminated_length": 10.525, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.13753266400770184, "frac_reward_zero_std": 1.0, "grad_norm": 0.01037998590618372, "kl": 1.352639377117157, "learning_rate": 7.318840579710145e-07, "loss": 0.0014, "num_tokens": 2562439.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2000 }, { "completion_length": 11.425, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 11.425, "completions/mean_terminated_length": 11.425, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.13822032732774034, "frac_reward_zero_std": 1.0, "grad_norm": 0.003507354063913226, "kl": 1.4222690105438232, "learning_rate": 7.07729468599034e-07, "loss": 0.0014, "num_tokens": 2575488.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2010 }, { "completion_length": 22.575, "completions/clipped_ratio": 0.0, "completions/max_length": 57.5, "completions/max_terminated_length": 57.5, "completions/mean_length": 22.575, "completions/mean_terminated_length": 22.575, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.13890799064777884, "frac_reward_zero_std": 0.9, "grad_norm": 0.026074331253767014, "kl": 1.262561959028244, "learning_rate": 6.835748792270532e-07, "loss": 0.0013, "num_tokens": 2588159.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2020 }, { "completion_length": 10.575, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.575, "completions/mean_terminated_length": 10.575, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.13959565396781737, "frac_reward_zero_std": 1.0, "grad_norm": 0.00410475442185998, "kl": 1.2806087374687194, "learning_rate": 6.594202898550725e-07, "loss": 0.0013, "num_tokens": 2599894.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2030 }, { "completion_length": 11.825, "completions/clipped_ratio": 0.0, "completions/max_length": 16.5, "completions/max_terminated_length": 16.5, "completions/mean_length": 11.825, "completions/mean_terminated_length": 11.825, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.14028331728785587, "frac_reward_zero_std": 1.0, "grad_norm": 0.015047146007418633, "kl": 21.605943036079406, "learning_rate": 6.352657004830919e-07, "loss": 0.0216, "num_tokens": 2613991.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2040 }, { "completion_length": 10.65, "completions/clipped_ratio": 0.0, "completions/max_length": 13.2, "completions/max_terminated_length": 13.2, "completions/mean_length": 10.65, "completions/mean_terminated_length": 10.65, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.14097098060789437, "frac_reward_zero_std": 1.0, "grad_norm": 0.0043119946494698524, "kl": 1.3894463539123536, "learning_rate": 6.111111111111112e-07, "loss": 0.0014, "num_tokens": 2627701.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2050 }, { "completion_length": 11.175, "completions/clipped_ratio": 0.0, "completions/max_length": 15.2, "completions/max_terminated_length": 15.2, "completions/mean_length": 11.175, "completions/mean_terminated_length": 11.175, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.14165864392793287, "frac_reward_zero_std": 1.0, "grad_norm": 0.0072251055389642715, "kl": 1.2303254783153534, "learning_rate": 5.869565217391305e-07, "loss": 0.0012, "num_tokens": 2639680.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2060 }, { "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.1423463072479714, "frac_reward_zero_std": 0.9, "grad_norm": 0.004525719676166773, "kl": 1.2622820734977722, "learning_rate": 5.628019323671498e-07, "loss": 0.0013, "num_tokens": 2652124.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2070 }, { "completion_length": 11.625, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 11.625, "completions/mean_terminated_length": 11.625, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.1430339705680099, "frac_reward_zero_std": 1.0, "grad_norm": 0.004848666954785585, "kl": 1.3604639172554016, "learning_rate": 5.386473429951692e-07, "loss": 0.0014, "num_tokens": 2664773.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2080 }, { "completion_length": 10.625, "completions/clipped_ratio": 0.0, "completions/max_length": 14.7, "completions/max_terminated_length": 14.7, "completions/mean_length": 10.625, "completions/mean_terminated_length": 10.625, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.1437216338880484, "frac_reward_zero_std": 0.9, "grad_norm": 0.003022987162694335, "kl": 1.539460152387619, "learning_rate": 5.144927536231884e-07, "loss": 0.0015, "num_tokens": 2677914.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2090 }, { "completion_length": 11.35, "completions/clipped_ratio": 0.0, "completions/max_length": 14.4, "completions/max_terminated_length": 14.4, "completions/mean_length": 11.35, "completions/mean_terminated_length": 11.35, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.14440929720808693, "frac_reward_zero_std": 1.0, "grad_norm": 0.003124531824141741, "kl": 1.2718017101287842, "learning_rate": 4.903381642512078e-07, "loss": 0.0013, "num_tokens": 2690576.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2100 }, { "completion_length": 11.725, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.725, "completions/mean_terminated_length": 11.725, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.14509696052812543, "frac_reward_zero_std": 1.0, "grad_norm": 0.002378766192123294, "kl": 1.2865601122379302, "learning_rate": 4.6618357487922714e-07, "loss": 0.0013, "num_tokens": 2702989.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2110 }, { "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.7, "completions/max_terminated_length": 13.7, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.14578462384816393, "frac_reward_zero_std": 1.0, "grad_norm": 0.004591071512550116, "kl": 1.3285470128059387, "learning_rate": 4.420289855072464e-07, "loss": 0.0013, "num_tokens": 2715139.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2120 }, { "completion_length": 10.675, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.675, "completions/mean_terminated_length": 10.675, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.14647228716820246, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032588632311671972, "kl": 1.2768325805664062, "learning_rate": 4.1787439613526574e-07, "loss": 0.0013, "num_tokens": 2728638.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2130 }, { "completion_length": 12.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 12.1, "completions/mean_terminated_length": 12.1, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.14715995048824096, "frac_reward_zero_std": 0.9, "grad_norm": 0.008527855388820171, "kl": 1.3150019347667694, "learning_rate": 3.9371980676328504e-07, "loss": 0.0013, "num_tokens": 2742582.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2140 }, { "completion_length": 11.3, "completions/clipped_ratio": 0.0, "completions/max_length": 15.7, "completions/max_terminated_length": 15.7, "completions/mean_length": 11.3, "completions/mean_terminated_length": 11.3, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.14784761380827946, "frac_reward_zero_std": 0.9, "grad_norm": 0.005529398564249277, "kl": 1.3183214008808135, "learning_rate": 3.695652173913044e-07, "loss": 0.0013, "num_tokens": 2753762.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2150 }, { "completion_length": 11.425, "completions/clipped_ratio": 0.0, "completions/max_length": 14.4, "completions/max_terminated_length": 14.4, "completions/mean_length": 11.425, "completions/mean_terminated_length": 11.425, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.148535277128318, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021252967417240143, "kl": 1.1668545484542847, "learning_rate": 3.454106280193237e-07, "loss": 0.0012, "num_tokens": 2767079.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2160 }, { "completion_length": 11.4, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 11.4, "completions/mean_terminated_length": 11.4, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.1492229404483565, "frac_reward_zero_std": 1.0, "grad_norm": 0.015458406880497932, "kl": 1.3273634731769561, "learning_rate": 3.2125603864734306e-07, "loss": 0.0013, "num_tokens": 2780155.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2170 }, { "completion_length": 13.45, "completions/clipped_ratio": 0.0, "completions/max_length": 20.7, "completions/max_terminated_length": 20.7, "completions/mean_length": 13.45, "completions/mean_terminated_length": 13.45, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.149910603768395, "frac_reward_zero_std": 0.9, "grad_norm": 0.0029216071125119925, "kl": 2.837045794725418, "learning_rate": 2.9710144927536236e-07, "loss": 0.0028, "num_tokens": 2793497.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2180 }, { "completion_length": 11.3, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 11.3, "completions/mean_terminated_length": 11.3, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.1505982670884335, "frac_reward_zero_std": 1.0, "grad_norm": 0.004856933373957872, "kl": 1.2759633779525756, "learning_rate": 2.7294685990338166e-07, "loss": 0.0013, "num_tokens": 2805209.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2190 }, { "completion_length": 11.625, "completions/clipped_ratio": 0.0, "completions/max_length": 15.8, "completions/max_terminated_length": 15.8, "completions/mean_length": 11.625, "completions/mean_terminated_length": 11.625, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.15128593040847202, "frac_reward_zero_std": 1.0, "grad_norm": 0.005369469057768583, "kl": 1.2020570576190948, "learning_rate": 2.4879227053140096e-07, "loss": 0.0012, "num_tokens": 2819046.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2200 }, { "completion_length": 10.075, "completions/clipped_ratio": 0.0, "completions/max_length": 13.4, "completions/max_terminated_length": 13.4, "completions/mean_length": 10.075, "completions/mean_terminated_length": 10.075, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.15197359372851052, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032669377978891134, "kl": 1.4382840514183044, "learning_rate": 2.2463768115942032e-07, "loss": 0.0014, "num_tokens": 2830509.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2210 }, { "completion_length": 10.575, "completions/clipped_ratio": 0.0, "completions/max_length": 13.1, "completions/max_terminated_length": 13.1, "completions/mean_length": 10.575, "completions/mean_terminated_length": 10.575, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.15266125704854902, "frac_reward_zero_std": 1.0, "grad_norm": 0.011143822222948074, "kl": 1.4830769181251526, "learning_rate": 2.0048309178743962e-07, "loss": 0.0015, "num_tokens": 2842048.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2220 }, { "completion_length": 11.65, "completions/clipped_ratio": 0.0, "completions/max_length": 16.2, "completions/max_terminated_length": 16.2, "completions/mean_length": 11.65, "completions/mean_terminated_length": 11.65, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.15334892036858755, "frac_reward_zero_std": 1.0, "grad_norm": 0.004490590654313564, "kl": 1.174393892288208, "learning_rate": 1.7632850241545895e-07, "loss": 0.0012, "num_tokens": 2854086.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2230 }, { "completion_length": 11.625, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 11.625, "completions/mean_terminated_length": 11.625, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.15403658368862605, "frac_reward_zero_std": 1.0, "grad_norm": 0.004963410086929798, "kl": 1.3155929028987885, "learning_rate": 1.5217391304347828e-07, "loss": 0.0013, "num_tokens": 2867383.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2240 }, { "completion_length": 11.15, "completions/clipped_ratio": 0.0, "completions/max_length": 14.6, "completions/max_terminated_length": 14.6, "completions/mean_length": 11.15, "completions/mean_terminated_length": 11.15, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.15472424700866455, "frac_reward_zero_std": 1.0, "grad_norm": 0.004094480536878109, "kl": 1.2090741574764252, "learning_rate": 1.280193236714976e-07, "loss": 0.0012, "num_tokens": 2878613.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2250 }, { "completion_length": 41.475, "completions/clipped_ratio": 0.0, "completions/max_length": 134.7, "completions/max_terminated_length": 134.7, "completions/mean_length": 41.475, "completions/mean_terminated_length": 41.475, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.15541191032870308, "frac_reward_zero_std": 0.6, "grad_norm": 0.008804809302091599, "kl": 1.232531774044037, "learning_rate": 1.0386473429951691e-07, "loss": 0.0012, "num_tokens": 2891968.0, "reward": 5.8875, "reward_std": 0.225, "rewards/check_coherence/mean": 1.4625, "rewards/check_coherence/std": 0.075, "rewards/check_response_quality/mean": 2.4625, "rewards/check_response_quality/std": 0.075, "rewards/match_format_approximately/mean": 0.9625, "rewards/match_format_approximately/std": 0.075, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2260 }, { "completion_length": 11.4, "completions/clipped_ratio": 0.0, "completions/max_length": 15.4, "completions/max_terminated_length": 15.4, "completions/mean_length": 11.4, "completions/mean_terminated_length": 11.4, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.15609957364874158, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023921902757138014, "kl": 1.363280749320984, "learning_rate": 7.971014492753624e-08, "loss": 0.0014, "num_tokens": 2905756.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2270 }, { "completion_length": 15.475, "completions/clipped_ratio": 0.0, "completions/max_length": 30.2, "completions/max_terminated_length": 30.2, "completions/mean_length": 15.475, "completions/mean_terminated_length": 15.475, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.15678723696878008, "frac_reward_zero_std": 0.9, "grad_norm": 0.0035021391231566668, "kl": 1.1597087323665618, "learning_rate": 5.555555555555556e-08, "loss": 0.0012, "num_tokens": 2917635.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2280 }, { "completion_length": 13.9, "completions/clipped_ratio": 0.0, "completions/max_length": 24.6, "completions/max_terminated_length": 24.6, "completions/mean_length": 13.9, "completions/mean_terminated_length": 13.9, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.15747490028881858, "frac_reward_zero_std": 0.9, "grad_norm": 0.0036871584597975016, "kl": 1.1678650498390197, "learning_rate": 3.140096618357488e-08, "loss": 0.0012, "num_tokens": 2931199.0, "reward": 5.9625, "reward_std": 0.075, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2290 }, { "completion_length": 50.375, "completions/clipped_ratio": 0.025, "completions/max_length": 170.5, "completions/max_terminated_length": 13.9, "completions/mean_length": 50.375, "completions/mean_terminated_length": 11.13333339691162, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.1581625636088571, "frac_reward_zero_std": 0.9, "grad_norm": 0.004820572212338448, "kl": 1.4776630043983459, "learning_rate": 7.246376811594204e-09, "loss": 0.0015, "num_tokens": 2943782.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2300 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 19.6, "completions/mean_terminated_length": 19.6, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.1588502269288956, "frac_reward_zero_std": 0.9, "grad_norm": 1.7149800062179565, "kl": 1.1576267778873444, "learning_rate": 4.115480649188515e-06, "loss": 0.0012, "num_tokens": 2956230.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2310 }, { "completion_length": 11.2, "completions/clipped_ratio": 0.0, "completions/max_length": 13.7, "completions/max_terminated_length": 13.7, "completions/mean_length": 11.2, "completions/mean_terminated_length": 11.2, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.1595378902489341, "frac_reward_zero_std": 1.0, "grad_norm": 0.003225781721994281, "kl": 1.2905008971691132, "learning_rate": 4.109238451935081e-06, "loss": 0.0013, "num_tokens": 2966358.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2320 }, { "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.8, "completions/max_terminated_length": 15.8, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.16022555356897264, "frac_reward_zero_std": 1.0, "grad_norm": 0.004192621912807226, "kl": 1.300399947166443, "learning_rate": 4.102996254681649e-06, "loss": 0.0013, "num_tokens": 2978250.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2330 }, { "completion_length": 11.05, "completions/clipped_ratio": 0.0, "completions/max_length": 14.3, "completions/max_terminated_length": 14.3, "completions/mean_length": 11.05, "completions/mean_terminated_length": 11.05, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.16091321688901114, "frac_reward_zero_std": 1.0, "grad_norm": 0.0060669067315757275, "kl": 1.0718920350074768, "learning_rate": 4.096754057428215e-06, "loss": 0.0011, "num_tokens": 2990024.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2340 }, { "completion_length": 10.55, "completions/clipped_ratio": 0.0, "completions/max_length": 12.9, "completions/max_terminated_length": 12.9, "completions/mean_length": 10.55, "completions/mean_terminated_length": 10.55, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.16160088020904964, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020677910652011633, "kl": 1.2670764803886414, "learning_rate": 4.090511860174782e-06, "loss": 0.0013, "num_tokens": 3002978.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2350 }, { "completion_length": 10.05, "completions/clipped_ratio": 0.0, "completions/max_length": 11.6, "completions/max_terminated_length": 11.6, "completions/mean_length": 10.05, "completions/mean_terminated_length": 10.05, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.16228854352908817, "frac_reward_zero_std": 1.0, "grad_norm": 0.005457987543195486, "kl": 1.3371248841285706, "learning_rate": 4.0842696629213485e-06, "loss": 0.0013, "num_tokens": 3016648.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2360 }, { "completion_length": 10.8, "completions/clipped_ratio": 0.0, "completions/max_length": 13.7, "completions/max_terminated_length": 13.7, "completions/mean_length": 10.8, "completions/mean_terminated_length": 10.8, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.16297620684912667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008317215833812952, "kl": 1.2839708745479583, "learning_rate": 4.078027465667916e-06, "loss": 0.0013, "num_tokens": 3026820.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2370 }, { "completion_length": 11.7, "completions/clipped_ratio": 0.0, "completions/max_length": 15.4, "completions/max_terminated_length": 15.4, "completions/mean_length": 11.7, "completions/mean_terminated_length": 11.7, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.16366387016916517, "frac_reward_zero_std": 1.0, "grad_norm": 0.004411312751471996, "kl": 1.2436644613742829, "learning_rate": 4.071785268414482e-06, "loss": 0.0012, "num_tokens": 3037648.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2380 }, { "completion_length": 10.55, "completions/clipped_ratio": 0.0, "completions/max_length": 13.3, "completions/max_terminated_length": 13.3, "completions/mean_length": 10.55, "completions/mean_terminated_length": 10.55, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.1643515334892037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036025703884661198, "kl": 1.3522222876548766, "learning_rate": 4.0655430711610484e-06, "loss": 0.0014, "num_tokens": 3049514.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2390 }, { "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.5, "completions/max_terminated_length": 14.5, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1650391968092422, "frac_reward_zero_std": 1.0, "grad_norm": 0.005733081139624119, "kl": 1.279357409477234, "learning_rate": 4.059300873907616e-06, "loss": 0.0013, "num_tokens": 3061816.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2400 }, { "completion_length": 11.95, "completions/clipped_ratio": 0.0, "completions/max_length": 15.4, "completions/max_terminated_length": 15.4, "completions/mean_length": 11.95, "completions/mean_terminated_length": 11.95, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.1657268601292807, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027056564576923847, "kl": 3.6889904975891112, "learning_rate": 4.053058676654182e-06, "loss": 0.0037, "num_tokens": 3075390.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2410 }, { "completion_length": 64.775, "completions/clipped_ratio": 0.025, "completions/max_length": 226.5, "completions/max_terminated_length": 70.1, "completions/mean_length": 64.775, "completions/mean_terminated_length": 25.483333396911622, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.1664145234493192, "frac_reward_zero_std": 0.7, "grad_norm": 3.775110960006714, "kl": 1.2242095589637756, "learning_rate": 4.046816479400749e-06, "loss": 0.0012, "num_tokens": 3090241.0, "reward": 5.8875, "reward_std": 0.225, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.45, "rewards/check_response_quality/std": 0.1, "rewards/match_format_approximately/mean": 0.9625, "rewards/match_format_approximately/std": 0.075, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2420 }, { "completion_length": 30.7, "completions/clipped_ratio": 0.0, "completions/max_length": 94.8, "completions/max_terminated_length": 94.8, "completions/mean_length": 30.7, "completions/mean_terminated_length": 30.7, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.16710218676935773, "frac_reward_zero_std": 0.9, "grad_norm": 0.008342845365405083, "kl": 1.3111794650554658, "learning_rate": 4.0405742821473155e-06, "loss": 0.0013, "num_tokens": 3103937.0, "reward": 5.9375, "reward_std": 0.125, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2430 }, { "completion_length": 11.45, "completions/clipped_ratio": 0.0, "completions/max_length": 14.6, "completions/max_terminated_length": 14.6, "completions/mean_length": 11.45, "completions/mean_terminated_length": 11.45, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.16778985008939623, "frac_reward_zero_std": 1.0, "grad_norm": 0.006231814622879028, "kl": 1.4186553835868836, "learning_rate": 4.034332084893883e-06, "loss": 0.0014, "num_tokens": 3116879.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2440 }, { "completion_length": 11.375, "completions/clipped_ratio": 0.0, "completions/max_length": 15.3, "completions/max_terminated_length": 15.3, "completions/mean_length": 11.375, "completions/mean_terminated_length": 11.375, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.16847751340943473, "frac_reward_zero_std": 1.0, "grad_norm": 0.00336334272287786, "kl": 1.346349060535431, "learning_rate": 4.028089887640449e-06, "loss": 0.0013, "num_tokens": 3129722.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2450 }, { "completion_length": 11.475, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 11.475, "completions/mean_terminated_length": 11.475, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.16916517672947326, "frac_reward_zero_std": 1.0, "grad_norm": 0.003932368475943804, "kl": 1.185975819826126, "learning_rate": 4.021847690387017e-06, "loss": 0.0012, "num_tokens": 3141045.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2460 }, { "completion_length": 11.475, "completions/clipped_ratio": 0.0, "completions/max_length": 15.3, "completions/max_terminated_length": 15.3, "completions/mean_length": 11.475, "completions/mean_terminated_length": 11.475, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.16985284004951176, "frac_reward_zero_std": 1.0, "grad_norm": 0.002778848400339484, "kl": 1.2687766671180725, "learning_rate": 4.0156054931335835e-06, "loss": 0.0013, "num_tokens": 3155076.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2470 }, { "completion_length": 11.825, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 11.825, "completions/mean_terminated_length": 11.825, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.17054050336955026, "frac_reward_zero_std": 1.0, "grad_norm": 0.002489407081156969, "kl": 1.169280767440796, "learning_rate": 4.009363295880151e-06, "loss": 0.0012, "num_tokens": 3167025.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2480 }, { "completion_length": 18.075, "completions/clipped_ratio": 0.0, "completions/max_length": 44.3, "completions/max_terminated_length": 44.3, "completions/mean_length": 18.075, "completions/mean_terminated_length": 18.075, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.1712281666895888, "frac_reward_zero_std": 0.9, "grad_norm": 0.010691997595131397, "kl": 1.4809851229190827, "learning_rate": 4.003121098626717e-06, "loss": 0.0015, "num_tokens": 3177980.0, "reward": 5.9625, "reward_std": 0.075, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2490 }, { "completion_length": 14.9, "completions/clipped_ratio": 0.0, "completions/max_length": 31.8, "completions/max_terminated_length": 31.8, "completions/mean_length": 14.9, "completions/mean_terminated_length": 14.9, "completions/min_length": 7.6, "completions/min_terminated_length": 7.6, "epoch": 0.1719158300096273, "frac_reward_zero_std": 0.8, "grad_norm": 0.002949868328869343, "kl": 1.39860680103302, "learning_rate": 3.2043314500941624e-06, "loss": 0.0014, "num_tokens": 3189244.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2500 }, { "completion_length": 10.625, "completions/clipped_ratio": 0.0, "completions/max_length": 13.7, "completions/max_terminated_length": 13.7, "completions/mean_length": 10.625, "completions/mean_terminated_length": 10.625, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.1726034933296658, "frac_reward_zero_std": 1.0, "grad_norm": 0.010366762056946754, "kl": 1.3766119718551635, "learning_rate": 3.1949152542372884e-06, "loss": 0.0014, "num_tokens": 3202885.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2510 }, { "completion_length": 10.275, "completions/clipped_ratio": 0.0, "completions/max_length": 12.5, "completions/max_terminated_length": 12.5, "completions/mean_length": 10.275, "completions/mean_terminated_length": 10.275, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.1732911566497043, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029948921874165535, "kl": 1.4934549808502198, "learning_rate": 3.1854990583804148e-06, "loss": 0.0015, "num_tokens": 3214960.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2520 }, { "completion_length": 11.15, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 11.15, "completions/mean_terminated_length": 11.15, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.17397881996974282, "frac_reward_zero_std": 0.9, "grad_norm": 0.0014775346498936415, "kl": 1.3120596766471864, "learning_rate": 3.1760828625235407e-06, "loss": 0.0013, "num_tokens": 3225346.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2530 }, { "completion_length": 10.675, "completions/clipped_ratio": 0.0, "completions/max_length": 13.2, "completions/max_terminated_length": 13.2, "completions/mean_length": 10.675, "completions/mean_terminated_length": 10.675, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.17466648328978132, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037268514279276133, "kl": 1.3505820691585542, "learning_rate": 3.1666666666666667e-06, "loss": 0.0014, "num_tokens": 3237365.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2540 }, { "completion_length": 10.6, "completions/clipped_ratio": 0.0, "completions/max_length": 12.7, "completions/max_terminated_length": 12.7, "completions/mean_length": 10.6, "completions/mean_terminated_length": 10.6, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.17535414660981982, "frac_reward_zero_std": 1.0, "grad_norm": 0.002531415317207575, "kl": 1.456881034374237, "learning_rate": 3.1572504708097927e-06, "loss": 0.0015, "num_tokens": 3251221.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2550 }, { "completion_length": 11.725, "completions/clipped_ratio": 0.0, "completions/max_length": 14.5, "completions/max_terminated_length": 14.5, "completions/mean_length": 11.725, "completions/mean_terminated_length": 11.725, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.17604180992985835, "frac_reward_zero_std": 1.0, "grad_norm": 0.009061750024557114, "kl": 1.244165402650833, "learning_rate": 3.1478342749529195e-06, "loss": 0.0012, "num_tokens": 3263818.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2560 }, { "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.4, "completions/max_terminated_length": 15.4, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.17672947324989685, "frac_reward_zero_std": 1.0, "grad_norm": 0.00531205628067255, "kl": 1.3328250467777252, "learning_rate": 3.1384180790960454e-06, "loss": 0.0013, "num_tokens": 3276140.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2570 }, { "completion_length": 10.15, "completions/clipped_ratio": 0.0, "completions/max_length": 11.5, "completions/max_terminated_length": 11.5, "completions/mean_length": 10.15, "completions/mean_terminated_length": 10.15, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.17741713656993535, "frac_reward_zero_std": 0.9, "grad_norm": 11.290862083435059, "kl": 1.4745511174201966, "learning_rate": 3.129001883239172e-06, "loss": 0.0015, "num_tokens": 3288702.0, "reward": 5.9625, "reward_std": 0.075, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2580 }, { "completion_length": 10.475, "completions/clipped_ratio": 0.0, "completions/max_length": 12.8, "completions/max_terminated_length": 12.8, "completions/mean_length": 10.475, "completions/mean_terminated_length": 10.475, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.17810479988997388, "frac_reward_zero_std": 1.0, "grad_norm": 0.004790943581610918, "kl": 1.3869511127471923, "learning_rate": 3.1195856873822978e-06, "loss": 0.0014, "num_tokens": 3301205.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2590 }, { "completion_length": 10.85, "completions/clipped_ratio": 0.0, "completions/max_length": 13.3, "completions/max_terminated_length": 13.3, "completions/mean_length": 10.85, "completions/mean_terminated_length": 10.85, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.17879246321001238, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015610884875059128, "kl": 1.197289276123047, "learning_rate": 3.1101694915254237e-06, "loss": 0.0012, "num_tokens": 3313747.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2600 }, { "completion_length": 10.675, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.675, "completions/mean_terminated_length": 10.675, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.17948012653005088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018645133823156357, "kl": 1.4583391726016999, "learning_rate": 3.1007532956685505e-06, "loss": 0.0015, "num_tokens": 3325470.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2610 }, { "completion_length": 16.625, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 16.625, "completions/mean_terminated_length": 16.625, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.18016778985008938, "frac_reward_zero_std": 0.9, "grad_norm": 0.004674810450524092, "kl": 1.2932902693748474, "learning_rate": 3.0913370998116765e-06, "loss": 0.0013, "num_tokens": 3337531.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2620 }, { "completion_length": 11.575, "completions/clipped_ratio": 0.0, "completions/max_length": 14.6, "completions/max_terminated_length": 14.6, "completions/mean_length": 11.575, "completions/mean_terminated_length": 11.575, "completions/min_length": 9.6, "completions/min_terminated_length": 9.6, "epoch": 0.1808554531701279, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030683784279972315, "kl": 1.154135423898697, "learning_rate": 3.0819209039548024e-06, "loss": 0.0012, "num_tokens": 3350250.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2630 }, { "completion_length": 11.3, "completions/clipped_ratio": 0.0, "completions/max_length": 15.3, "completions/max_terminated_length": 15.3, "completions/mean_length": 11.3, "completions/mean_terminated_length": 11.3, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.1815431164901664, "frac_reward_zero_std": 1.0, "grad_norm": 0.005663893185555935, "kl": 1.3627763926982879, "learning_rate": 3.072504708097929e-06, "loss": 0.0014, "num_tokens": 3362334.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2640 }, { "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.3, "completions/max_terminated_length": 13.3, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.1822307798102049, "frac_reward_zero_std": 0.9, "grad_norm": 0.004335007164627314, "kl": 1.2993446350097657, "learning_rate": 3.0630885122410548e-06, "loss": 0.0013, "num_tokens": 3376026.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2650 }, { "completion_length": 11.275, "completions/clipped_ratio": 0.0, "completions/max_length": 14.3, "completions/max_terminated_length": 14.3, "completions/mean_length": 11.275, "completions/mean_terminated_length": 11.275, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.18291844313024344, "frac_reward_zero_std": 1.0, "grad_norm": 0.010090747848153114, "kl": 1.2555712342262269, "learning_rate": 3.0536723163841807e-06, "loss": 0.0013, "num_tokens": 3388197.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2660 }, { "completion_length": 10.475, "completions/clipped_ratio": 0.0, "completions/max_length": 12.5, "completions/max_terminated_length": 12.5, "completions/mean_length": 10.475, "completions/mean_terminated_length": 10.475, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.18360610645028194, "frac_reward_zero_std": 1.0, "grad_norm": 0.005461522843688726, "kl": 1.423725974559784, "learning_rate": 3.0442561205273075e-06, "loss": 0.0014, "num_tokens": 3399652.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2670 }, { "completion_length": 10.425, "completions/clipped_ratio": 0.0, "completions/max_length": 13.2, "completions/max_terminated_length": 13.2, "completions/mean_length": 10.425, "completions/mean_terminated_length": 10.425, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.18429376977032044, "frac_reward_zero_std": 1.0, "grad_norm": 0.002112050075083971, "kl": 1.3453264117240906, "learning_rate": 3.0348399246704335e-06, "loss": 0.0013, "num_tokens": 3411169.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2680 }, { "completion_length": 11.2, "completions/clipped_ratio": 0.0, "completions/max_length": 14.6, "completions/max_terminated_length": 14.6, "completions/mean_length": 11.2, "completions/mean_terminated_length": 11.2, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.18498143309035897, "frac_reward_zero_std": 1.0, "grad_norm": 0.004399535246193409, "kl": 1.5060070991516112, "learning_rate": 3.0254237288135594e-06, "loss": 0.0015, "num_tokens": 3423749.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2690 }, { "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.7, "completions/max_terminated_length": 12.7, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.18566909641039747, "frac_reward_zero_std": 1.0, "grad_norm": 0.005135955289006233, "kl": 1.2228712022304535, "learning_rate": 3.016007532956686e-06, "loss": 0.0012, "num_tokens": 3434905.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2700 }, { "completion_length": 10.625, "completions/clipped_ratio": 0.0, "completions/max_length": 12.9, "completions/max_terminated_length": 12.9, "completions/mean_length": 10.625, "completions/mean_terminated_length": 10.625, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.18635675973043597, "frac_reward_zero_std": 1.0, "grad_norm": 0.008232400752604008, "kl": 1.2177106857299804, "learning_rate": 3.0065913370998118e-06, "loss": 0.0012, "num_tokens": 3446182.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2710 }, { "completion_length": 10.875, "completions/clipped_ratio": 0.0, "completions/max_length": 13.6, "completions/max_terminated_length": 13.6, "completions/mean_length": 10.875, "completions/mean_terminated_length": 10.875, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.1870444230504745, "frac_reward_zero_std": 1.0, "grad_norm": 0.0071622999384999275, "kl": 1.387217903137207, "learning_rate": 2.9971751412429377e-06, "loss": 0.0014, "num_tokens": 3457733.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2720 }, { "completion_length": 10.675, "completions/clipped_ratio": 0.0, "completions/max_length": 13.1, "completions/max_terminated_length": 13.1, "completions/mean_length": 10.675, "completions/mean_terminated_length": 10.675, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.187732086370513, "frac_reward_zero_std": 1.0, "grad_norm": 0.003097102278843522, "kl": 1.302090060710907, "learning_rate": 2.9877589453860645e-06, "loss": 0.0013, "num_tokens": 3470068.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2730 }, { "completion_length": 10.825, "completions/clipped_ratio": 0.0, "completions/max_length": 14.1, "completions/max_terminated_length": 14.1, "completions/mean_length": 10.825, "completions/mean_terminated_length": 10.825, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1884197496905515, "frac_reward_zero_std": 1.0, "grad_norm": 0.003183470806106925, "kl": 1.4097931861877442, "learning_rate": 2.9783427495291905e-06, "loss": 0.0014, "num_tokens": 3480493.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2740 }, { "completion_length": 11.175, "completions/clipped_ratio": 0.0, "completions/max_length": 14.6, "completions/max_terminated_length": 14.6, "completions/mean_length": 11.175, "completions/mean_terminated_length": 11.175, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.18910741301059, "frac_reward_zero_std": 1.0, "grad_norm": 0.004561976063996553, "kl": 1.1230164349079133, "learning_rate": 2.9689265536723165e-06, "loss": 0.0011, "num_tokens": 3490128.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2750 }, { "completion_length": 11.275, "completions/clipped_ratio": 0.0, "completions/max_length": 15.7, "completions/max_terminated_length": 15.7, "completions/mean_length": 11.275, "completions/mean_terminated_length": 11.275, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.18979507633062853, "frac_reward_zero_std": 0.9, "grad_norm": 0.005841500591486692, "kl": 1.239791786670685, "learning_rate": 2.959510357815443e-06, "loss": 0.0012, "num_tokens": 3501943.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2760 }, { "completion_length": 11.125, "completions/clipped_ratio": 0.0, "completions/max_length": 13.9, "completions/max_terminated_length": 13.9, "completions/mean_length": 11.125, "completions/mean_terminated_length": 11.125, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.19048273965066703, "frac_reward_zero_std": 1.0, "grad_norm": 0.018330121412873268, "kl": 2.2330337703227996, "learning_rate": 2.950094161958569e-06, "loss": 0.0022, "num_tokens": 3514268.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2770 }, { "completion_length": 10.575, "completions/clipped_ratio": 0.0, "completions/max_length": 12.7, "completions/max_terminated_length": 12.7, "completions/mean_length": 10.575, "completions/mean_terminated_length": 10.575, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.19117040297070553, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025839856825768948, "kl": 1.3854608416557312, "learning_rate": 2.9406779661016956e-06, "loss": 0.0014, "num_tokens": 3525975.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2780 }, { "completion_length": 11.625, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 11.625, "completions/mean_terminated_length": 11.625, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.19185806629074406, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035424588713794947, "kl": 1.3310083508491517, "learning_rate": 2.9312617702448216e-06, "loss": 0.0013, "num_tokens": 3538256.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2790 }, { "completion_length": 10.425, "completions/clipped_ratio": 0.0, "completions/max_length": 13.2, "completions/max_terminated_length": 13.2, "completions/mean_length": 10.425, "completions/mean_terminated_length": 10.425, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.19254572961078256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014532285276800394, "kl": 1.4109968423843384, "learning_rate": 2.9218455743879475e-06, "loss": 0.0014, "num_tokens": 3549189.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2800 }, { "completion_length": 12.45, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 12.45, "completions/mean_terminated_length": 12.45, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.19323339293082106, "frac_reward_zero_std": 0.8, "grad_norm": 0.004302640911191702, "kl": 1.2859423279762268, "learning_rate": 2.9124293785310735e-06, "loss": 0.0013, "num_tokens": 3561783.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2810 }, { "completion_length": 10.525, "completions/clipped_ratio": 0.0, "completions/max_length": 12.7, "completions/max_terminated_length": 12.7, "completions/mean_length": 10.525, "completions/mean_terminated_length": 10.525, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.1939210562508596, "frac_reward_zero_std": 1.0, "grad_norm": 0.007680397480726242, "kl": 1.4603121995925903, "learning_rate": 2.9030131826742e-06, "loss": 0.0015, "num_tokens": 3574500.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2820 }, { "completion_length": 10.775, "completions/clipped_ratio": 0.0, "completions/max_length": 12.9, "completions/max_terminated_length": 12.9, "completions/mean_length": 10.775, "completions/mean_terminated_length": 10.775, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.1946087195708981, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020181452855467796, "kl": 1.3944905817508697, "learning_rate": 2.893596986817326e-06, "loss": 0.0014, "num_tokens": 3587815.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2830 }, { "completion_length": 11.1, "completions/clipped_ratio": 0.0, "completions/max_length": 17.1, "completions/max_terminated_length": 17.1, "completions/mean_length": 11.1, "completions/mean_terminated_length": 11.1, "completions/min_length": 7.6, "completions/min_terminated_length": 7.6, "epoch": 0.1952963828909366, "frac_reward_zero_std": 1.0, "grad_norm": 0.006148161832243204, "kl": 1.2499525606632234, "learning_rate": 2.8841807909604526e-06, "loss": 0.0012, "num_tokens": 3599027.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2840 }, { "completion_length": 11.1, "completions/clipped_ratio": 0.0, "completions/max_length": 15.2, "completions/max_terminated_length": 15.2, "completions/mean_length": 11.1, "completions/mean_terminated_length": 11.1, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.1959840462109751, "frac_reward_zero_std": 1.0, "grad_norm": 0.0890081599354744, "kl": 1.3890787661075592, "learning_rate": 2.8747645951035786e-06, "loss": 0.0014, "num_tokens": 3611155.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2850 }, { "completion_length": 11.65, "completions/clipped_ratio": 0.0, "completions/max_length": 15.2, "completions/max_terminated_length": 15.2, "completions/mean_length": 11.65, "completions/mean_terminated_length": 11.65, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.19667170953101362, "frac_reward_zero_std": 1.0, "grad_norm": 0.004152151755988598, "kl": 1.1795121252536773, "learning_rate": 2.8653483992467045e-06, "loss": 0.0012, "num_tokens": 3622457.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2860 }, { "completion_length": 11.475, "completions/clipped_ratio": 0.0, "completions/max_length": 13.7, "completions/max_terminated_length": 13.7, "completions/mean_length": 11.475, "completions/mean_terminated_length": 11.475, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.19735937285105212, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033720259089022875, "kl": 1.2966312885284423, "learning_rate": 2.8559322033898305e-06, "loss": 0.0013, "num_tokens": 3635224.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2870 }, { "completion_length": 10.65, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.65, "completions/mean_terminated_length": 10.65, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.19804703617109062, "frac_reward_zero_std": 0.9, "grad_norm": 0.02166939526796341, "kl": 1.1902290284633636, "learning_rate": 2.846516007532957e-06, "loss": 0.0012, "num_tokens": 3648742.0, "reward": 5.9875, "reward_std": 0.025, "rewards/check_coherence/mean": 1.4875, "rewards/check_coherence/std": 0.025, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2880 }, { "completion_length": 17.05, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 17.05, "completions/mean_terminated_length": 17.05, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.19873469949112915, "frac_reward_zero_std": 0.8, "grad_norm": 0.003716163570061326, "kl": 1.2905597269535065, "learning_rate": 2.837099811676083e-06, "loss": 0.0013, "num_tokens": 3662064.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2890 }, { "completion_length": 20.15, "completions/clipped_ratio": 0.0, "completions/max_length": 49.5, "completions/max_terminated_length": 49.5, "completions/mean_length": 20.15, "completions/mean_terminated_length": 20.15, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.19942236281116765, "frac_reward_zero_std": 0.9, "grad_norm": 0.0016892965650185943, "kl": 1.356507009267807, "learning_rate": 2.8276836158192096e-06, "loss": 0.0014, "num_tokens": 3676226.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2900 }, { "completion_length": 10.425, "completions/clipped_ratio": 0.0, "completions/max_length": 12.2, "completions/max_terminated_length": 12.2, "completions/mean_length": 10.425, "completions/mean_terminated_length": 10.425, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.20011002613120615, "frac_reward_zero_std": 1.0, "grad_norm": 0.007044503930956125, "kl": 1.2502503633499145, "learning_rate": 2.8182674199623356e-06, "loss": 0.0013, "num_tokens": 3688579.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2910 }, { "completion_length": 11.475, "completions/clipped_ratio": 0.0, "completions/max_length": 13.7, "completions/max_terminated_length": 13.7, "completions/mean_length": 11.475, "completions/mean_terminated_length": 11.475, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.20079768945124468, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026405714452266693, "kl": 1.3286872804164886, "learning_rate": 2.8088512241054615e-06, "loss": 0.0013, "num_tokens": 3702350.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2920 }, { "completion_length": 11.625, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 11.625, "completions/mean_terminated_length": 11.625, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.20148535277128318, "frac_reward_zero_std": 1.0, "grad_norm": 0.002148184459656477, "kl": 1.135231328010559, "learning_rate": 2.799435028248588e-06, "loss": 0.0011, "num_tokens": 3713775.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2930 }, { "completion_length": 10.8, "completions/clipped_ratio": 0.0, "completions/max_length": 12.4, "completions/max_terminated_length": 12.4, "completions/mean_length": 10.8, "completions/mean_terminated_length": 10.8, "completions/min_length": 9.8, "completions/min_terminated_length": 9.8, "epoch": 0.20217301609132168, "frac_reward_zero_std": 1.0, "grad_norm": 0.004673804622143507, "kl": 1.3920660734176635, "learning_rate": 2.790018832391714e-06, "loss": 0.0014, "num_tokens": 3725491.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2940 }, { "completion_length": 12.475, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 12.475, "completions/mean_terminated_length": 12.475, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.2028606794113602, "frac_reward_zero_std": 1.0, "grad_norm": 0.004096169024705887, "kl": 1.3312494993209838, "learning_rate": 2.7806026365348403e-06, "loss": 0.0013, "num_tokens": 3738282.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2950 }, { "completion_length": 12.775, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 12.775, "completions/mean_terminated_length": 12.775, "completions/min_length": 9.7, "completions/min_terminated_length": 9.7, "epoch": 0.2035483427313987, "frac_reward_zero_std": 0.9, "grad_norm": 0.00869634561240673, "kl": 1.067715847492218, "learning_rate": 2.7711864406779666e-06, "loss": 0.0011, "num_tokens": 3752477.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2960 }, { "completion_length": 13.55, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 13.55, "completions/mean_terminated_length": 13.55, "completions/min_length": 9.9, "completions/min_terminated_length": 9.9, "epoch": 0.2042360060514372, "frac_reward_zero_std": 1.0, "grad_norm": 0.003822652855888009, "kl": 2.798942339420319, "learning_rate": 2.7617702448210926e-06, "loss": 0.0028, "num_tokens": 3765251.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2970 }, { "completion_length": 11.325, "completions/clipped_ratio": 0.0, "completions/max_length": 14.7, "completions/max_terminated_length": 14.7, "completions/mean_length": 11.325, "completions/mean_terminated_length": 11.325, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.20492366937147571, "frac_reward_zero_std": 1.0, "grad_norm": 0.003919885028153658, "kl": 1.5202113151550294, "learning_rate": 2.7523540489642185e-06, "loss": 0.0015, "num_tokens": 3778692.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2980 }, { "completion_length": 13.125, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 13.125, "completions/mean_terminated_length": 13.125, "completions/min_length": 9.7, "completions/min_terminated_length": 9.7, "epoch": 0.20561133269151424, "frac_reward_zero_std": 1.0, "grad_norm": 0.10944122821092606, "kl": 1.2642498075962068, "learning_rate": 2.742937853107345e-06, "loss": 0.0013, "num_tokens": 3791293.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 2990 }, { "completion_length": 13.525, "completions/clipped_ratio": 0.0, "completions/max_length": 21.3, "completions/max_terminated_length": 21.3, "completions/mean_length": 13.525, "completions/mean_terminated_length": 13.525, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.20629899601155274, "frac_reward_zero_std": 1.0, "grad_norm": 0.006753021385520697, "kl": 1.4397485315799714, "learning_rate": 2.733521657250471e-06, "loss": 0.0014, "num_tokens": 3803098.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3000 }, { "completion_length": 12.425, "completions/clipped_ratio": 0.0, "completions/max_length": 15.7, "completions/max_terminated_length": 15.7, "completions/mean_length": 12.425, "completions/mean_terminated_length": 12.425, "completions/min_length": 9.9, "completions/min_terminated_length": 9.9, "epoch": 0.20698665933159124, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026066696736961603, "kl": 1.0516700327396393, "learning_rate": 2.7241054613935973e-06, "loss": 0.0011, "num_tokens": 3815979.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3010 }, { "completion_length": 11.35, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.35, "completions/mean_terminated_length": 11.35, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.20767432265162977, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022212343756109476, "kl": 1.3609875798225404, "learning_rate": 2.7146892655367236e-06, "loss": 0.0014, "num_tokens": 3828709.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3020 }, { "completion_length": 15.95, "completions/clipped_ratio": 0.0, "completions/max_length": 28.1, "completions/max_terminated_length": 28.1, "completions/mean_length": 15.95, "completions/mean_terminated_length": 15.95, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.20836198597166827, "frac_reward_zero_std": 0.9, "grad_norm": 0.0025453646667301655, "kl": 1.0688377380371095, "learning_rate": 2.7052730696798496e-06, "loss": 0.0011, "num_tokens": 3839227.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3030 }, { "completion_length": 12.575, "completions/clipped_ratio": 0.0, "completions/max_length": 20.9, "completions/max_terminated_length": 20.9, "completions/mean_length": 12.575, "completions/mean_terminated_length": 12.575, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.20904964929170677, "frac_reward_zero_std": 0.9, "grad_norm": 0.0028345719911158085, "kl": 1.3270336389541626, "learning_rate": 2.6958568738229756e-06, "loss": 0.0013, "num_tokens": 3851634.0, "reward": 5.9625, "reward_std": 0.075, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3040 }, { "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.8, "completions/max_terminated_length": 14.8, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.2097373126117453, "frac_reward_zero_std": 1.0, "grad_norm": 0.004961980972439051, "kl": 1.3371050000190734, "learning_rate": 2.686440677966102e-06, "loss": 0.0013, "num_tokens": 3865056.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3050 }, { "completion_length": 11.1, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.1, "completions/mean_terminated_length": 11.1, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.2104249759317838, "frac_reward_zero_std": 0.9, "grad_norm": 0.003095800057053566, "kl": 1.2146135807037353, "learning_rate": 2.677024482109228e-06, "loss": 0.0012, "num_tokens": 3876756.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3060 }, { "completion_length": 11.575, "completions/clipped_ratio": 0.0, "completions/max_length": 14.1, "completions/max_terminated_length": 14.1, "completions/mean_length": 11.575, "completions/mean_terminated_length": 11.575, "completions/min_length": 9.9, "completions/min_terminated_length": 9.9, "epoch": 0.2111126392518223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020866338163614273, "kl": 1.2059853732585908, "learning_rate": 2.6676082862523543e-06, "loss": 0.0012, "num_tokens": 3891107.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3070 }, { "completion_length": 11.575, "completions/clipped_ratio": 0.0, "completions/max_length": 14.9, "completions/max_terminated_length": 14.9, "completions/mean_length": 11.575, "completions/mean_terminated_length": 11.575, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.2118003025718608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037311518099159002, "kl": 1.2837676525115966, "learning_rate": 2.6581920903954807e-06, "loss": 0.0013, "num_tokens": 3903030.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3080 }, { "completion_length": 12.2, "completions/clipped_ratio": 0.0, "completions/max_length": 16.2, "completions/max_terminated_length": 16.2, "completions/mean_length": 12.2, "completions/mean_terminated_length": 12.2, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.21248796589189933, "frac_reward_zero_std": 1.0, "grad_norm": 0.004318287596106529, "kl": 1.1657697916030885, "learning_rate": 2.6487758945386066e-06, "loss": 0.0012, "num_tokens": 3916166.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3090 }, { "completion_length": 11.875, "completions/clipped_ratio": 0.0, "completions/max_length": 15.3, "completions/max_terminated_length": 15.3, "completions/mean_length": 11.875, "completions/mean_terminated_length": 11.875, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.21317562921193783, "frac_reward_zero_std": 1.0, "grad_norm": 0.004012678749859333, "kl": 1.1232018172740936, "learning_rate": 2.6393596986817326e-06, "loss": 0.0011, "num_tokens": 3927217.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3100 }, { "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.21386329253197633, "frac_reward_zero_std": 1.0, "grad_norm": 0.13298767805099487, "kl": 1.5185560762882233, "learning_rate": 2.629943502824859e-06, "loss": 0.0015, "num_tokens": 3938505.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3110 }, { "completion_length": 11.725, "completions/clipped_ratio": 0.0, "completions/max_length": 15.2, "completions/max_terminated_length": 15.2, "completions/mean_length": 11.725, "completions/mean_terminated_length": 11.725, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.21455095585201486, "frac_reward_zero_std": 1.0, "grad_norm": 0.006646719295531511, "kl": 1.2882408559322358, "learning_rate": 2.6205273069679853e-06, "loss": 0.0013, "num_tokens": 3951066.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3120 }, { "completion_length": 12.225, "completions/clipped_ratio": 0.0, "completions/max_length": 16.5, "completions/max_terminated_length": 16.5, "completions/mean_length": 12.225, "completions/mean_terminated_length": 12.225, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.21523861917205336, "frac_reward_zero_std": 1.0, "grad_norm": 0.007941076532006264, "kl": 1.1894244372844696, "learning_rate": 2.6111111111111113e-06, "loss": 0.0012, "num_tokens": 3962491.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3130 }, { "completion_length": 14.025, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 14.025, "completions/mean_terminated_length": 14.025, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.21592628249209186, "frac_reward_zero_std": 0.8, "grad_norm": 2.529683828353882, "kl": 1.2520922303199769, "learning_rate": 2.6016949152542377e-06, "loss": 0.0013, "num_tokens": 3974576.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3140 }, { "completion_length": 13.15, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 13.15, "completions/mean_terminated_length": 13.15, "completions/min_length": 10.2, "completions/min_terminated_length": 10.2, "epoch": 0.2166139458121304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029135840013623238, "kl": 1.076604688167572, "learning_rate": 2.5922787193973636e-06, "loss": 0.0011, "num_tokens": 3986806.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3150 }, { "completion_length": 13.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 13.6, "completions/mean_terminated_length": 13.6, "completions/min_length": 10.1, "completions/min_terminated_length": 10.1, "epoch": 0.2173016091321689, "frac_reward_zero_std": 1.0, "grad_norm": 0.00409655412659049, "kl": 1.1538120567798615, "learning_rate": 2.5828625235404896e-06, "loss": 0.0012, "num_tokens": 3999858.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3160 }, { "completion_length": 11.875, "completions/clipped_ratio": 0.0, "completions/max_length": 14.8, "completions/max_terminated_length": 14.8, "completions/mean_length": 11.875, "completions/mean_terminated_length": 11.875, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.2179892724522074, "frac_reward_zero_std": 1.0, "grad_norm": 0.003736306680366397, "kl": 1.0594617307186127, "learning_rate": 2.573446327683616e-06, "loss": 0.0011, "num_tokens": 4012533.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3170 }, { "completion_length": 11.4, "completions/clipped_ratio": 0.0, "completions/max_length": 15.3, "completions/max_terminated_length": 15.3, "completions/mean_length": 11.4, "completions/mean_terminated_length": 11.4, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.2186769357722459, "frac_reward_zero_std": 1.0, "grad_norm": 0.004909292794764042, "kl": 1.300600254535675, "learning_rate": 2.5640301318267423e-06, "loss": 0.0013, "num_tokens": 4025381.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3180 }, { "completion_length": 11.575, "completions/clipped_ratio": 0.0, "completions/max_length": 14.6, "completions/max_terminated_length": 14.6, "completions/mean_length": 11.575, "completions/mean_terminated_length": 11.575, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.21936459909228442, "frac_reward_zero_std": 1.0, "grad_norm": 0.003566068597137928, "kl": 1.2702523350715638, "learning_rate": 2.5546139359698683e-06, "loss": 0.0013, "num_tokens": 4036896.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3190 }, { "completion_length": 11.65, "completions/clipped_ratio": 0.0, "completions/max_length": 15.2, "completions/max_terminated_length": 15.2, "completions/mean_length": 11.65, "completions/mean_terminated_length": 11.65, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.22005226241232292, "frac_reward_zero_std": 1.0, "grad_norm": 0.020172731950879097, "kl": 1.0868790686130523, "learning_rate": 2.5451977401129947e-06, "loss": 0.0011, "num_tokens": 4050002.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3200 }, { "completion_length": 11.325, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.325, "completions/mean_terminated_length": 11.325, "completions/min_length": 9.7, "completions/min_terminated_length": 9.7, "epoch": 0.22073992573236143, "frac_reward_zero_std": 1.0, "grad_norm": 0.004111476242542267, "kl": 1.1443917155265808, "learning_rate": 2.5357815442561206e-06, "loss": 0.0011, "num_tokens": 4064027.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3210 }, { "completion_length": 15.975, "completions/clipped_ratio": 0.0, "completions/max_length": 32.1, "completions/max_terminated_length": 32.1, "completions/mean_length": 15.975, "completions/mean_terminated_length": 15.975, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.22142758905239995, "frac_reward_zero_std": 0.9, "grad_norm": 2.2132339477539062, "kl": 1.1676890075206756, "learning_rate": 2.5263653483992466e-06, "loss": 0.0012, "num_tokens": 4077274.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3220 }, { "completion_length": 10.9, "completions/clipped_ratio": 0.0, "completions/max_length": 14.1, "completions/max_terminated_length": 14.1, "completions/mean_length": 10.9, "completions/mean_terminated_length": 10.9, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.22211525237243845, "frac_reward_zero_std": 1.0, "grad_norm": 0.017527904361486435, "kl": 1.3145560443401336, "learning_rate": 2.516949152542373e-06, "loss": 0.0013, "num_tokens": 4089186.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3230 }, { "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.4, "completions/max_terminated_length": 15.4, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.22280291569247695, "frac_reward_zero_std": 1.0, "grad_norm": 0.005846535321325064, "kl": 1.8368690073490144, "learning_rate": 2.5075329566854994e-06, "loss": 0.0018, "num_tokens": 4101546.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3240 }, { "completion_length": 11.375, "completions/clipped_ratio": 0.0, "completions/max_length": 15.3, "completions/max_terminated_length": 15.3, "completions/mean_length": 11.375, "completions/mean_terminated_length": 11.375, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.22349057901251548, "frac_reward_zero_std": 1.0, "grad_norm": 0.06038391590118408, "kl": 1.3158239006996155, "learning_rate": 2.4981167608286257e-06, "loss": 0.0013, "num_tokens": 4114641.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3250 }, { "completion_length": 11.575, "completions/clipped_ratio": 0.0, "completions/max_length": 14.4, "completions/max_terminated_length": 14.4, "completions/mean_length": 11.575, "completions/mean_terminated_length": 11.575, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.22417824233255398, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027491487562656403, "kl": 1.2638065814971924, "learning_rate": 2.4887005649717517e-06, "loss": 0.0013, "num_tokens": 4126064.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3260 }, { "completion_length": 11.175, "completions/clipped_ratio": 0.0, "completions/max_length": 13.6, "completions/max_terminated_length": 13.6, "completions/mean_length": 11.175, "completions/mean_terminated_length": 11.175, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.22486590565259248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021324739791452885, "kl": 1.341496205329895, "learning_rate": 2.4792843691148776e-06, "loss": 0.0013, "num_tokens": 4137723.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3270 }, { "completion_length": 11.625, "completions/clipped_ratio": 0.0, "completions/max_length": 14.7, "completions/max_terminated_length": 14.7, "completions/mean_length": 11.625, "completions/mean_terminated_length": 11.625, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.225553568972631, "frac_reward_zero_std": 1.0, "grad_norm": 0.002356611890718341, "kl": 1.2188643753528594, "learning_rate": 2.469868173258004e-06, "loss": 0.0012, "num_tokens": 4149288.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3280 }, { "completion_length": 10.975, "completions/clipped_ratio": 0.0, "completions/max_length": 13.7, "completions/max_terminated_length": 13.7, "completions/mean_length": 10.975, "completions/mean_terminated_length": 10.975, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.22624123229266951, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019462181953713298, "kl": 1.0421675980091094, "learning_rate": 2.46045197740113e-06, "loss": 0.001, "num_tokens": 4162103.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3290 }, { "completion_length": 20.7, "completions/clipped_ratio": 0.0, "completions/max_length": 49.1, "completions/max_terminated_length": 49.1, "completions/mean_length": 20.7, "completions/mean_terminated_length": 20.7, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.22692889561270801, "frac_reward_zero_std": 0.9, "grad_norm": 0.004338334780186415, "kl": 1.2982787430286407, "learning_rate": 2.4510357815442564e-06, "loss": 0.0013, "num_tokens": 4174743.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3300 }, { "completion_length": 11.275, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.275, "completions/mean_terminated_length": 11.275, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.22761655893274652, "frac_reward_zero_std": 1.0, "grad_norm": 0.005578485317528248, "kl": 1.3092613160610198, "learning_rate": 2.4416195856873827e-06, "loss": 0.0013, "num_tokens": 4186514.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3310 }, { "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.6, "completions/max_terminated_length": 15.6, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.22830422225278504, "frac_reward_zero_std": 1.0, "grad_norm": 0.03091367706656456, "kl": 1.1664791226387023, "learning_rate": 2.4322033898305087e-06, "loss": 0.0012, "num_tokens": 4199232.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3320 }, { "completion_length": 11.85, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 11.85, "completions/mean_terminated_length": 11.85, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.22899188557282354, "frac_reward_zero_std": 1.0, "grad_norm": 0.007058318238705397, "kl": 1.1536859154701233, "learning_rate": 2.422787193973635e-06, "loss": 0.0012, "num_tokens": 4212258.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3330 }, { "completion_length": 10.4, "completions/clipped_ratio": 0.0, "completions/max_length": 12.8, "completions/max_terminated_length": 12.8, "completions/mean_length": 10.4, "completions/mean_terminated_length": 10.4, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.22967954889286205, "frac_reward_zero_std": 1.0, "grad_norm": 0.004380435682833195, "kl": 1.3366443276405335, "learning_rate": 2.413370998116761e-06, "loss": 0.0013, "num_tokens": 4225234.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3340 }, { "completion_length": 11.375, "completions/clipped_ratio": 0.0, "completions/max_length": 14.3, "completions/max_terminated_length": 14.3, "completions/mean_length": 11.375, "completions/mean_terminated_length": 11.375, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.23036721221290057, "frac_reward_zero_std": 1.0, "grad_norm": 0.004277032800018787, "kl": 1.1180653333663941, "learning_rate": 2.403954802259887e-06, "loss": 0.0011, "num_tokens": 4235825.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3350 }, { "completion_length": 11.675, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.675, "completions/mean_terminated_length": 11.675, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.23105487553293907, "frac_reward_zero_std": 1.0, "grad_norm": 0.005623528268188238, "kl": 1.3497309505939483, "learning_rate": 2.3945386064030134e-06, "loss": 0.0013, "num_tokens": 4249708.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3360 }, { "completion_length": 11.85, "completions/clipped_ratio": 0.0, "completions/max_length": 16.3, "completions/max_terminated_length": 16.3, "completions/mean_length": 11.85, "completions/mean_terminated_length": 11.85, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.23174253885297758, "frac_reward_zero_std": 0.9, "grad_norm": 0.0034109626431018114, "kl": 1.3263630867004395, "learning_rate": 2.3851224105461398e-06, "loss": 0.0013, "num_tokens": 4262546.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3370 }, { "completion_length": 11.7, "completions/clipped_ratio": 0.0, "completions/max_length": 15.7, "completions/max_terminated_length": 15.7, "completions/mean_length": 11.7, "completions/mean_terminated_length": 11.7, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.2324302021730161, "frac_reward_zero_std": 1.0, "grad_norm": 0.002905854256823659, "kl": 1.2896348893642426, "learning_rate": 2.3757062146892657e-06, "loss": 0.0013, "num_tokens": 4276578.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3380 }, { "completion_length": 12.325, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 12.325, "completions/mean_terminated_length": 12.325, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.2331178654930546, "frac_reward_zero_std": 1.0, "grad_norm": 0.002098673954606056, "kl": 1.4132690966129302, "learning_rate": 2.366290018832392e-06, "loss": 0.0014, "num_tokens": 4287999.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3390 }, { "completion_length": 14.6, "completions/clipped_ratio": 0.0, "completions/max_length": 30.1, "completions/max_terminated_length": 30.1, "completions/mean_length": 14.6, "completions/mean_terminated_length": 14.6, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2338055288130931, "frac_reward_zero_std": 0.9, "grad_norm": 0.0030663420911878347, "kl": 1.9233894765377044, "learning_rate": 2.356873822975518e-06, "loss": 0.0019, "num_tokens": 4299531.0, "reward": 5.85, "reward_std": 0.12247449159622192, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.05773502588272095, "rewards/check_response_quality/mean": 2.4375, "rewards/check_response_quality/std": 0.047871357202529906, "rewards/match_format_approximately/mean": 0.9625, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3400 }, { "completion_length": 11.8, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 11.8, "completions/mean_terminated_length": 11.8, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.2344931921331316, "frac_reward_zero_std": 1.0, "grad_norm": 0.009669655933976173, "kl": 1.1667460262775422, "learning_rate": 2.347457627118644e-06, "loss": 0.0012, "num_tokens": 4311467.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3410 }, { "completion_length": 10.8, "completions/clipped_ratio": 0.0, "completions/max_length": 14.5, "completions/max_terminated_length": 14.5, "completions/mean_length": 10.8, "completions/mean_terminated_length": 10.8, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.23518085545317013, "frac_reward_zero_std": 1.0, "grad_norm": 0.004496072884649038, "kl": 1.3729178309440613, "learning_rate": 2.3380414312617704e-06, "loss": 0.0014, "num_tokens": 4322375.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3420 }, { "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.23586851877320864, "frac_reward_zero_std": 1.0, "grad_norm": 0.003123059868812561, "kl": 1.3280323922634125, "learning_rate": 2.3286252354048968e-06, "loss": 0.0013, "num_tokens": 4334663.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3430 }, { "completion_length": 12.95, "completions/clipped_ratio": 0.0, "completions/max_length": 20.7, "completions/max_terminated_length": 20.7, "completions/mean_length": 12.95, "completions/mean_terminated_length": 12.95, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.23655618209324714, "frac_reward_zero_std": 1.0, "grad_norm": 0.00793770793825388, "kl": 1.1402764916419983, "learning_rate": 2.3192090395480227e-06, "loss": 0.0011, "num_tokens": 4346017.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3440 }, { "completion_length": 11.9, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 11.9, "completions/mean_terminated_length": 11.9, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.23724384541328566, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027919018175452948, "kl": 1.099853754043579, "learning_rate": 2.309792843691149e-06, "loss": 0.0011, "num_tokens": 4359793.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3450 }, { "completion_length": 10.85, "completions/clipped_ratio": 0.0, "completions/max_length": 13.8, "completions/max_terminated_length": 13.8, "completions/mean_length": 10.85, "completions/mean_terminated_length": 10.85, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.23793150873332417, "frac_reward_zero_std": 1.0, "grad_norm": 0.0067101349122822285, "kl": 1.4456099629402162, "learning_rate": 2.300376647834275e-06, "loss": 0.0014, "num_tokens": 4373115.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3460 }, { "completion_length": 13.125, "completions/clipped_ratio": 0.0, "completions/max_length": 23.1, "completions/max_terminated_length": 23.1, "completions/mean_length": 13.125, "completions/mean_terminated_length": 13.125, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.23861917205336267, "frac_reward_zero_std": 0.9, "grad_norm": 0.07878517359495163, "kl": 2.221970522403717, "learning_rate": 2.290960451977401e-06, "loss": 0.0022, "num_tokens": 4386292.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3470 }, { "completion_length": 12.45, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 12.45, "completions/mean_terminated_length": 12.45, "completions/min_length": 9.7, "completions/min_terminated_length": 9.7, "epoch": 0.2393068353734012, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027336543425917625, "kl": 1.2365751266479492, "learning_rate": 2.2815442561205274e-06, "loss": 0.0012, "num_tokens": 4398422.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3480 }, { "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.2399944986934397, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021766237914562225, "kl": 1.2284077286720276, "learning_rate": 2.2721280602636538e-06, "loss": 0.0012, "num_tokens": 4410376.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3490 }, { "completion_length": 10.925, "completions/clipped_ratio": 0.0, "completions/max_length": 13.6, "completions/max_terminated_length": 13.6, "completions/mean_length": 10.925, "completions/mean_terminated_length": 10.925, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.2406821620134782, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021278143394738436, "kl": 1.2818358659744262, "learning_rate": 2.26271186440678e-06, "loss": 0.0013, "num_tokens": 4422425.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3500 }, { "completion_length": 11.3, "completions/clipped_ratio": 0.0, "completions/max_length": 14.6, "completions/max_terminated_length": 14.6, "completions/mean_length": 11.3, "completions/mean_terminated_length": 11.3, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.2413698253335167, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027870163321495056, "kl": 1.248415756225586, "learning_rate": 2.253295668549906e-06, "loss": 0.0012, "num_tokens": 4433021.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3510 }, { "completion_length": 12.675, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 12.675, "completions/mean_terminated_length": 12.675, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.24205748865355523, "frac_reward_zero_std": 0.9, "grad_norm": 0.009928030893206596, "kl": 1.2978311777114868, "learning_rate": 2.243879472693032e-06, "loss": 0.0013, "num_tokens": 4446060.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3520 }, { "completion_length": 13.65, "completions/clipped_ratio": 0.0, "completions/max_length": 21.2, "completions/max_terminated_length": 21.2, "completions/mean_length": 13.65, "completions/mean_terminated_length": 13.65, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.24274515197359373, "frac_reward_zero_std": 0.8, "grad_norm": 0.028688477352261543, "kl": 1.277863186597824, "learning_rate": 2.2344632768361585e-06, "loss": 0.0013, "num_tokens": 4458562.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3530 }, { "completion_length": 11.65, "completions/clipped_ratio": 0.0, "completions/max_length": 15.8, "completions/max_terminated_length": 15.8, "completions/mean_length": 11.65, "completions/mean_terminated_length": 11.65, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.24343281529363223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037820255383849144, "kl": 1.2518242657184602, "learning_rate": 2.2250470809792844e-06, "loss": 0.0013, "num_tokens": 4469756.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3540 }, { "completion_length": 50.75, "completions/clipped_ratio": 0.025, "completions/max_length": 171.3, "completions/max_terminated_length": 13.9, "completions/mean_length": 50.75, "completions/mean_terminated_length": 11.4, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.24412047861367075, "frac_reward_zero_std": 0.9, "grad_norm": 0.0025780112482607365, "kl": 1.1971634149551391, "learning_rate": 2.215630885122411e-06, "loss": 0.0012, "num_tokens": 4484142.0, "reward": 5.9375, "reward_std": 0.125, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3550 }, { "completion_length": 15.85, "completions/clipped_ratio": 0.0, "completions/max_length": 30.9, "completions/max_terminated_length": 30.9, "completions/mean_length": 15.85, "completions/mean_terminated_length": 15.85, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.24480814193370926, "frac_reward_zero_std": 0.9, "grad_norm": 0.0080100167542696, "kl": 1.3040795743465423, "learning_rate": 2.206214689265537e-06, "loss": 0.0013, "num_tokens": 4497160.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3560 }, { "completion_length": 10.525, "completions/clipped_ratio": 0.0, "completions/max_length": 12.6, "completions/max_terminated_length": 12.6, "completions/mean_length": 10.525, "completions/mean_terminated_length": 10.525, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.24549580525374776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027778451330959797, "kl": 1.283364176750183, "learning_rate": 2.196798493408663e-06, "loss": 0.0013, "num_tokens": 4507981.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3570 }, { "completion_length": 11.325, "completions/clipped_ratio": 0.0, "completions/max_length": 14.9, "completions/max_terminated_length": 14.9, "completions/mean_length": 11.325, "completions/mean_terminated_length": 11.325, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.24618346857378628, "frac_reward_zero_std": 0.9, "grad_norm": 0.005003831349313259, "kl": 1.2212236881256104, "learning_rate": 2.187382297551789e-06, "loss": 0.0012, "num_tokens": 4520882.0, "reward": 5.9875, "reward_std": 0.025, "rewards/check_coherence/mean": 1.4875, "rewards/check_coherence/std": 0.025, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3580 }, { "completion_length": 12.175, "completions/clipped_ratio": 0.0, "completions/max_length": 16.6, "completions/max_terminated_length": 16.6, "completions/mean_length": 12.175, "completions/mean_terminated_length": 12.175, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.24687113189382479, "frac_reward_zero_std": 1.0, "grad_norm": 0.011205293238162994, "kl": 1.1657124042510987, "learning_rate": 2.1779661016949155e-06, "loss": 0.0012, "num_tokens": 4530921.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3590 }, { "completion_length": 10.525, "completions/clipped_ratio": 0.0, "completions/max_length": 12.5, "completions/max_terminated_length": 12.5, "completions/mean_length": 10.525, "completions/mean_terminated_length": 10.525, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.2475587952138633, "frac_reward_zero_std": 1.0, "grad_norm": 0.003226573346182704, "kl": 1.4166472613811494, "learning_rate": 2.1685499058380414e-06, "loss": 0.0014, "num_tokens": 4545070.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3600 }, { "completion_length": 11.425, "completions/clipped_ratio": 0.0, "completions/max_length": 14.7, "completions/max_terminated_length": 14.7, "completions/mean_length": 11.425, "completions/mean_terminated_length": 11.425, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.24824645853390181, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015919512370601296, "kl": 1.390343391895294, "learning_rate": 2.159133709981168e-06, "loss": 0.0014, "num_tokens": 4557447.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3610 }, { "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.7, "completions/max_terminated_length": 15.7, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.24893412185394032, "frac_reward_zero_std": 1.0, "grad_norm": 0.003537840908393264, "kl": 1.4276194095611572, "learning_rate": 2.149717514124294e-06, "loss": 0.0014, "num_tokens": 4569357.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3620 }, { "completion_length": 11.775, "completions/clipped_ratio": 0.0, "completions/max_length": 15.1, "completions/max_terminated_length": 15.1, "completions/mean_length": 11.775, "completions/mean_terminated_length": 11.775, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.24962178517397882, "frac_reward_zero_std": 1.0, "grad_norm": 0.010827281512320042, "kl": 1.2425177216529846, "learning_rate": 2.14030131826742e-06, "loss": 0.0012, "num_tokens": 4583028.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3630 }, { "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.2, "completions/max_terminated_length": 31.2, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.2503094484940173, "frac_reward_zero_std": 0.9, "grad_norm": 0.019427252933382988, "kl": 1.2630140125751494, "learning_rate": 2.130885122410546e-06, "loss": 0.0013, "num_tokens": 4595408.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3640 }, { "completion_length": 13.7, "completions/clipped_ratio": 0.0, "completions/max_length": 20.3, "completions/max_terminated_length": 20.3, "completions/mean_length": 13.7, "completions/mean_terminated_length": 13.7, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.2509971118140558, "frac_reward_zero_std": 1.0, "grad_norm": 0.003992835059762001, "kl": 1.285544329881668, "learning_rate": 2.1214689265536725e-06, "loss": 0.0013, "num_tokens": 4607900.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3650 }, { "completion_length": 14.2, "completions/clipped_ratio": 0.0, "completions/max_length": 21.8, "completions/max_terminated_length": 21.8, "completions/mean_length": 14.2, "completions/mean_terminated_length": 14.2, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2516847751340944, "frac_reward_zero_std": 0.9, "grad_norm": 0.0025455814320594072, "kl": 1.118562602996826, "learning_rate": 2.1120527306967984e-06, "loss": 0.0011, "num_tokens": 4619776.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3660 }, { "completion_length": 13.925, "completions/clipped_ratio": 0.0, "completions/max_length": 21.9, "completions/max_terminated_length": 21.9, "completions/mean_length": 13.925, "completions/mean_terminated_length": 13.925, "completions/min_length": 9.9, "completions/min_terminated_length": 9.9, "epoch": 0.2523724384541329, "frac_reward_zero_std": 1.0, "grad_norm": 0.005908417049795389, "kl": 1.1965693056583404, "learning_rate": 2.102636534839925e-06, "loss": 0.0012, "num_tokens": 4632933.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3670 }, { "completion_length": 12.15, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 12.15, "completions/mean_terminated_length": 12.15, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.2530601017741714, "frac_reward_zero_std": 1.0, "grad_norm": 0.004306701943278313, "kl": 1.1101278483867645, "learning_rate": 2.093220338983051e-06, "loss": 0.0011, "num_tokens": 4646659.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3680 }, { "completion_length": 30.85, "completions/clipped_ratio": 0.0, "completions/max_length": 90.2, "completions/max_terminated_length": 90.2, "completions/mean_length": 30.85, "completions/mean_terminated_length": 30.85, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.2537477650942099, "frac_reward_zero_std": 0.7, "grad_norm": 0.0031726714223623276, "kl": 0.928079804778099, "learning_rate": 2.083804143126177e-06, "loss": 0.0009, "num_tokens": 4661733.0, "reward": 5.9125, "reward_std": 0.175, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.45, "rewards/check_response_quality/std": 0.1, "rewards/match_format_approximately/mean": 0.9625, "rewards/match_format_approximately/std": 0.075, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3690 }, { "completion_length": 18.325, "completions/clipped_ratio": 0.0, "completions/max_length": 40.9, "completions/max_terminated_length": 40.9, "completions/mean_length": 18.325, "completions/mean_terminated_length": 18.325, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.2544354284142484, "frac_reward_zero_std": 0.9, "grad_norm": 0.003087216755375266, "kl": 1.2326574087142945, "learning_rate": 2.0743879472693035e-06, "loss": 0.0012, "num_tokens": 4672266.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3700 }, { "completion_length": 11.525, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 11.525, "completions/mean_terminated_length": 11.525, "completions/min_length": 7.8, "completions/min_terminated_length": 7.8, "epoch": 0.2551230917342869, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020335062872618437, "kl": 1.298705518245697, "learning_rate": 2.0649717514124295e-06, "loss": 0.0013, "num_tokens": 4685299.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3710 }, { "completion_length": 16.55, "completions/clipped_ratio": 0.0, "completions/max_length": 37.3, "completions/max_terminated_length": 37.3, "completions/mean_length": 16.55, "completions/mean_terminated_length": 16.55, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.2558107550543254, "frac_reward_zero_std": 0.9, "grad_norm": 0.01009492576122284, "kl": 1.3413202583789825, "learning_rate": 2.0555555555555555e-06, "loss": 0.0013, "num_tokens": 4698157.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3720 }, { "completion_length": 10.7, "completions/clipped_ratio": 0.0, "completions/max_length": 14.6, "completions/max_terminated_length": 14.6, "completions/mean_length": 10.7, "completions/mean_terminated_length": 10.7, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.25649841837436393, "frac_reward_zero_std": 1.0, "grad_norm": 0.003856506897136569, "kl": 1.3558381140232085, "learning_rate": 2.046139359698682e-06, "loss": 0.0014, "num_tokens": 4708737.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3730 }, { "completion_length": 10.65, "completions/clipped_ratio": 0.0, "completions/max_length": 14.6, "completions/max_terminated_length": 14.6, "completions/mean_length": 10.65, "completions/mean_terminated_length": 10.65, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.25718608169440244, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032971419859677553, "kl": 1.7668309211730957, "learning_rate": 2.036723163841808e-06, "loss": 0.0018, "num_tokens": 4720975.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3740 }, { "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.25787374501444094, "frac_reward_zero_std": 0.8, "grad_norm": 0.016410939395427704, "kl": 1.5257445216178893, "learning_rate": 2.027306967984934e-06, "loss": 0.0015, "num_tokens": 4732599.0, "reward": 5.9625, "reward_std": 0.075, "rewards/check_coherence/mean": 1.4875, "rewards/check_coherence/std": 0.025, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3750 }, { "completion_length": 10.575, "completions/clipped_ratio": 0.0, "completions/max_length": 13.4, "completions/max_terminated_length": 13.4, "completions/mean_length": 10.575, "completions/mean_terminated_length": 10.575, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.25856140833447944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026505778077989817, "kl": 1.4891623914241792, "learning_rate": 2.0178907721280605e-06, "loss": 0.0015, "num_tokens": 4745006.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3760 }, { "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.1, "completions/max_terminated_length": 15.1, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.25924907165451794, "frac_reward_zero_std": 0.9, "grad_norm": 0.0036013510543853045, "kl": 1.337304413318634, "learning_rate": 2.0084745762711865e-06, "loss": 0.0013, "num_tokens": 4758060.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3770 }, { "completion_length": 10.95, "completions/clipped_ratio": 0.0, "completions/max_length": 13.5, "completions/max_terminated_length": 13.5, "completions/mean_length": 10.95, "completions/mean_terminated_length": 10.95, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.25993673497455644, "frac_reward_zero_std": 1.0, "grad_norm": 0.0065917568281292915, "kl": 1.379406750202179, "learning_rate": 1.9990583804143125e-06, "loss": 0.0014, "num_tokens": 4770086.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3780 }, { "completion_length": 16.125, "completions/clipped_ratio": 0.0, "completions/max_length": 33.4, "completions/max_terminated_length": 33.4, "completions/mean_length": 16.125, "completions/mean_terminated_length": 16.125, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.26062439829459494, "frac_reward_zero_std": 0.9, "grad_norm": 2.601729393005371, "kl": 1.3989966630935669, "learning_rate": 1.989642184557439e-06, "loss": 0.0014, "num_tokens": 4782599.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3790 }, { "completion_length": 11.025, "completions/clipped_ratio": 0.0, "completions/max_length": 14.7, "completions/max_terminated_length": 14.7, "completions/mean_length": 11.025, "completions/mean_terminated_length": 11.025, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.2613120616146335, "frac_reward_zero_std": 1.0, "grad_norm": 0.006292569916695356, "kl": 1.5066055417060853, "learning_rate": 1.9802259887005652e-06, "loss": 0.0015, "num_tokens": 4793128.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3800 }, { "completion_length": 11.35, "completions/clipped_ratio": 0.0, "completions/max_length": 14.1, "completions/max_terminated_length": 14.1, "completions/mean_length": 11.35, "completions/mean_terminated_length": 11.35, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.261999724934672, "frac_reward_zero_std": 1.0, "grad_norm": 0.013005614280700684, "kl": 1.2771723389625549, "learning_rate": 1.970809792843691e-06, "loss": 0.0013, "num_tokens": 4806370.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3810 }, { "completion_length": 10.1, "completions/clipped_ratio": 0.0, "completions/max_length": 11.9, "completions/max_terminated_length": 11.9, "completions/mean_length": 10.1, "completions/mean_terminated_length": 10.1, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.2626873882547105, "frac_reward_zero_std": 1.0, "grad_norm": 0.003991563804447651, "kl": 1.386504888534546, "learning_rate": 1.9613935969868176e-06, "loss": 0.0014, "num_tokens": 4817862.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3820 }, { "completion_length": 24.625, "completions/clipped_ratio": 0.0, "completions/max_length": 66.4, "completions/max_terminated_length": 66.4, "completions/mean_length": 24.625, "completions/mean_terminated_length": 24.625, "completions/min_length": 7.7, "completions/min_terminated_length": 7.7, "epoch": 0.263375051574749, "frac_reward_zero_std": 0.9, "grad_norm": 0.007266217842698097, "kl": 1.4715389907360077, "learning_rate": 1.9519774011299435e-06, "loss": 0.0015, "num_tokens": 4830747.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3830 }, { "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.2, "completions/max_terminated_length": 16.2, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.2640627148947875, "frac_reward_zero_std": 0.9, "grad_norm": 0.0025213544722646475, "kl": 1.2671421408653258, "learning_rate": 1.94256120527307e-06, "loss": 0.0013, "num_tokens": 4841201.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3840 }, { "completion_length": 11.275, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 11.275, "completions/mean_terminated_length": 11.275, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.264750378214826, "frac_reward_zero_std": 0.9, "grad_norm": 0.0049027493223547935, "kl": 1.5967715799808502, "learning_rate": 1.933145009416196e-06, "loss": 0.0016, "num_tokens": 4852968.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3850 }, { "completion_length": 11.125, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 11.125, "completions/mean_terminated_length": 11.125, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.26543804153486455, "frac_reward_zero_std": 0.9, "grad_norm": 0.005673303734511137, "kl": 1.341913378238678, "learning_rate": 1.9237288135593222e-06, "loss": 0.0013, "num_tokens": 4865945.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3860 }, { "completion_length": 11.6, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 11.6, "completions/mean_terminated_length": 11.6, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.26612570485490306, "frac_reward_zero_std": 1.0, "grad_norm": 0.002212796825915575, "kl": 1.410479474067688, "learning_rate": 1.9143126177024486e-06, "loss": 0.0014, "num_tokens": 4878421.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3870 }, { "completion_length": 10.625, "completions/clipped_ratio": 0.0, "completions/max_length": 13.5, "completions/max_terminated_length": 13.5, "completions/mean_length": 10.625, "completions/mean_terminated_length": 10.625, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.26681336817494156, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033141798339784145, "kl": 1.7209568858146667, "learning_rate": 1.9048964218455746e-06, "loss": 0.0017, "num_tokens": 4892258.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3880 }, { "completion_length": 11.275, "completions/clipped_ratio": 0.0, "completions/max_length": 16.3, "completions/max_terminated_length": 16.3, "completions/mean_length": 11.275, "completions/mean_terminated_length": 11.275, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.26750103149498006, "frac_reward_zero_std": 1.0, "grad_norm": 0.007818554528057575, "kl": 1.5238799691200255, "learning_rate": 1.8954802259887005e-06, "loss": 0.0015, "num_tokens": 4904961.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3890 }, { "completion_length": 10.875, "completions/clipped_ratio": 0.0, "completions/max_length": 14.3, "completions/max_terminated_length": 14.3, "completions/mean_length": 10.875, "completions/mean_terminated_length": 10.875, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.26818869481501856, "frac_reward_zero_std": 1.0, "grad_norm": 0.00452169356867671, "kl": 1.5174754858016968, "learning_rate": 1.886064030131827e-06, "loss": 0.0015, "num_tokens": 4916848.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3900 }, { "completion_length": 11.3, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 11.3, "completions/mean_terminated_length": 11.3, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.26887635813505706, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015657467301934958, "kl": 1.440461552143097, "learning_rate": 1.876647834274953e-06, "loss": 0.0014, "num_tokens": 4929564.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3910 }, { "completion_length": 10.175, "completions/clipped_ratio": 0.0, "completions/max_length": 13.1, "completions/max_terminated_length": 13.1, "completions/mean_length": 10.175, "completions/mean_terminated_length": 10.175, "completions/min_length": 7.6, "completions/min_terminated_length": 7.6, "epoch": 0.26956402145509556, "frac_reward_zero_std": 1.0, "grad_norm": 0.002364358166232705, "kl": 1.3664440631866455, "learning_rate": 1.867231638418079e-06, "loss": 0.0014, "num_tokens": 4941655.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3920 }, { "completion_length": 10.65, "completions/clipped_ratio": 0.0, "completions/max_length": 13.7, "completions/max_terminated_length": 13.7, "completions/mean_length": 10.65, "completions/mean_terminated_length": 10.65, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2702516847751341, "frac_reward_zero_std": 1.0, "grad_norm": 0.003972493577748537, "kl": 1.4133961200714111, "learning_rate": 1.8578154425612054e-06, "loss": 0.0014, "num_tokens": 4954969.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3930 }, { "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.2, "completions/max_terminated_length": 40.2, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 7.7, "completions/min_terminated_length": 7.7, "epoch": 0.2709393480951726, "frac_reward_zero_std": 0.8, "grad_norm": 0.008966933004558086, "kl": 1.941703236103058, "learning_rate": 1.8483992467043316e-06, "loss": 0.0019, "num_tokens": 4967071.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3940 }, { "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 7.8, "completions/min_terminated_length": 7.8, "epoch": 0.2716270114152111, "frac_reward_zero_std": 1.0, "grad_norm": 0.010819694958627224, "kl": 1.5476664900779724, "learning_rate": 1.8389830508474578e-06, "loss": 0.0015, "num_tokens": 4980575.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3950 }, { "completion_length": 10.725, "completions/clipped_ratio": 0.0, "completions/max_length": 14.1, "completions/max_terminated_length": 14.1, "completions/mean_length": 10.725, "completions/mean_terminated_length": 10.725, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.2723146747352496, "frac_reward_zero_std": 1.0, "grad_norm": 0.003213522955775261, "kl": 1.3780342757701873, "learning_rate": 1.829566854990584e-06, "loss": 0.0014, "num_tokens": 4993112.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3960 }, { "completion_length": 10.775, "completions/clipped_ratio": 0.0, "completions/max_length": 13.8, "completions/max_terminated_length": 13.8, "completions/mean_length": 10.775, "completions/mean_terminated_length": 10.775, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.2730023380552881, "frac_reward_zero_std": 1.0, "grad_norm": 0.0077873156405985355, "kl": 1.4218901097774506, "learning_rate": 1.82015065913371e-06, "loss": 0.0014, "num_tokens": 5004699.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3970 }, { "completion_length": 10.475, "completions/clipped_ratio": 0.0, "completions/max_length": 14.6, "completions/max_terminated_length": 14.6, "completions/mean_length": 10.475, "completions/mean_terminated_length": 10.475, "completions/min_length": 7.3, "completions/min_terminated_length": 7.3, "epoch": 0.2736900013753266, "frac_reward_zero_std": 1.0, "grad_norm": 0.00725904107093811, "kl": 1.928431499004364, "learning_rate": 1.8107344632768365e-06, "loss": 0.0019, "num_tokens": 5018150.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3980 }, { "completion_length": 9.925, "completions/clipped_ratio": 0.0, "completions/max_length": 12.6, "completions/max_terminated_length": 12.6, "completions/mean_length": 9.925, "completions/mean_terminated_length": 9.925, "completions/min_length": 7.6, "completions/min_terminated_length": 7.6, "epoch": 0.2743776646953652, "frac_reward_zero_std": 1.0, "grad_norm": 0.01053251139819622, "kl": 1.701702892780304, "learning_rate": 1.8013182674199624e-06, "loss": 0.0017, "num_tokens": 5030671.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 3990 }, { "completion_length": 11.9, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 11.9, "completions/mean_terminated_length": 11.9, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.2750653280154037, "frac_reward_zero_std": 1.0, "grad_norm": 0.00460396520793438, "kl": 1.322218155860901, "learning_rate": 1.7919020715630886e-06, "loss": 0.0013, "num_tokens": 5043143.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4000 }, { "completion_length": 19.275, "completions/clipped_ratio": 0.0, "completions/max_length": 48.9, "completions/max_terminated_length": 48.9, "completions/mean_length": 19.275, "completions/mean_terminated_length": 19.275, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2757529913354422, "frac_reward_zero_std": 0.8, "grad_norm": 0.04520512372255325, "kl": 1.6543179631233216, "learning_rate": 1.782485875706215e-06, "loss": 0.0017, "num_tokens": 5056954.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4010 }, { "completion_length": 11.4, "completions/clipped_ratio": 0.0, "completions/max_length": 15.3, "completions/max_terminated_length": 15.3, "completions/mean_length": 11.4, "completions/mean_terminated_length": 11.4, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.2764406546554807, "frac_reward_zero_std": 1.0, "grad_norm": 0.04019185155630112, "kl": 1.6368863046169282, "learning_rate": 1.773069679849341e-06, "loss": 0.0016, "num_tokens": 5067586.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4020 }, { "completion_length": 11.85, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 11.85, "completions/mean_terminated_length": 11.85, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.2771283179755192, "frac_reward_zero_std": 0.8, "grad_norm": 0.003386999014765024, "kl": 1.5015459895133971, "learning_rate": 1.763653483992467e-06, "loss": 0.0015, "num_tokens": 5079692.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.1, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4030 }, { "completion_length": 10.275, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.275, "completions/mean_terminated_length": 10.275, "completions/min_length": 7.4, "completions/min_terminated_length": 7.4, "epoch": 0.2778159812955577, "frac_reward_zero_std": 1.0, "grad_norm": 0.005510615184903145, "kl": 1.6972654938697815, "learning_rate": 1.7542372881355935e-06, "loss": 0.0017, "num_tokens": 5091063.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4040 }, { "completion_length": 10.975, "completions/clipped_ratio": 0.0, "completions/max_length": 15.4, "completions/max_terminated_length": 15.4, "completions/mean_length": 10.975, "completions/mean_terminated_length": 10.975, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.2785036446155962, "frac_reward_zero_std": 1.0, "grad_norm": 0.004958089906722307, "kl": 1.593740427494049, "learning_rate": 1.7448210922787194e-06, "loss": 0.0016, "num_tokens": 5103514.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4050 }, { "completion_length": 9.2, "completions/clipped_ratio": 0.0, "completions/max_length": 12.9, "completions/max_terminated_length": 12.9, "completions/mean_length": 9.2, "completions/mean_terminated_length": 9.2, "completions/min_length": 6.9, "completions/min_terminated_length": 6.9, "epoch": 0.27919130793563474, "frac_reward_zero_std": 1.0, "grad_norm": 0.005375854205340147, "kl": 1.9296481251716613, "learning_rate": 1.7354048964218456e-06, "loss": 0.0019, "num_tokens": 5114922.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4060 }, { "completion_length": 10.025, "completions/clipped_ratio": 0.0, "completions/max_length": 14.5, "completions/max_terminated_length": 14.5, "completions/mean_length": 10.025, "completions/mean_terminated_length": 10.025, "completions/min_length": 7.2, "completions/min_terminated_length": 7.2, "epoch": 0.27987897125567324, "frac_reward_zero_std": 0.9, "grad_norm": 0.027488671243190765, "kl": 1.7945331156253814, "learning_rate": 1.725988700564972e-06, "loss": 0.0018, "num_tokens": 5127735.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4070 }, { "completion_length": 11.3, "completions/clipped_ratio": 0.0, "completions/max_length": 14.8, "completions/max_terminated_length": 14.8, "completions/mean_length": 11.3, "completions/mean_terminated_length": 11.3, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.28056663457571174, "frac_reward_zero_std": 1.0, "grad_norm": 0.005040109623223543, "kl": 2.0170140087604524, "learning_rate": 1.7165725047080982e-06, "loss": 0.002, "num_tokens": 5140555.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4080 }, { "completion_length": 10.825, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.825, "completions/mean_terminated_length": 10.825, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.28125429789575024, "frac_reward_zero_std": 1.0, "grad_norm": 0.008321479894220829, "kl": 1.7255867421627045, "learning_rate": 1.7071563088512241e-06, "loss": 0.0017, "num_tokens": 5152752.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4090 }, { "completion_length": 14.425, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 14.425, "completions/mean_terminated_length": 14.425, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.28194196121578874, "frac_reward_zero_std": 0.8, "grad_norm": 5.667783737182617, "kl": 1.611900508403778, "learning_rate": 1.6977401129943505e-06, "loss": 0.0016, "num_tokens": 5164429.0, "reward": 5.9625, "reward_std": 0.075, "rewards/check_coherence/mean": 1.4875, "rewards/check_coherence/std": 0.025, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4100 }, { "completion_length": 10.2, "completions/clipped_ratio": 0.0, "completions/max_length": 12.7, "completions/max_terminated_length": 12.7, "completions/mean_length": 10.2, "completions/mean_terminated_length": 10.2, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.28262962453582724, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034870856907218695, "kl": 1.443020796775818, "learning_rate": 1.6883239171374767e-06, "loss": 0.0014, "num_tokens": 5176221.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4110 }, { "completion_length": 10.35, "completions/clipped_ratio": 0.0, "completions/max_length": 13.6, "completions/max_terminated_length": 13.6, "completions/mean_length": 10.35, "completions/mean_terminated_length": 10.35, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.28331728785586574, "frac_reward_zero_std": 1.0, "grad_norm": 0.006099638994783163, "kl": 1.7443130373954774, "learning_rate": 1.6789077212806026e-06, "loss": 0.0017, "num_tokens": 5189343.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4120 }, { "completion_length": 9.775, "completions/clipped_ratio": 0.0, "completions/max_length": 14.2, "completions/max_terminated_length": 14.2, "completions/mean_length": 9.775, "completions/mean_terminated_length": 9.775, "completions/min_length": 7.2, "completions/min_terminated_length": 7.2, "epoch": 0.2840049511759043, "frac_reward_zero_std": 1.0, "grad_norm": 0.006439814809709787, "kl": 1.7517601370811462, "learning_rate": 1.669491525423729e-06, "loss": 0.0018, "num_tokens": 5200602.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4130 }, { "completion_length": 13.075, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 13.075, "completions/mean_terminated_length": 13.075, "completions/min_length": 7.4, "completions/min_terminated_length": 7.4, "epoch": 0.2846926144959428, "frac_reward_zero_std": 0.7, "grad_norm": 0.003682193113490939, "kl": 1.5702327251434327, "learning_rate": 1.6600753295668552e-06, "loss": 0.0016, "num_tokens": 5212097.0, "reward": 5.925, "reward_std": 0.15, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.1, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4140 }, { "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.6, "completions/max_terminated_length": 40.6, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.2853802778159813, "frac_reward_zero_std": 0.9, "grad_norm": 0.007733450271189213, "kl": 1.5168497264385223, "learning_rate": 1.6506591337099813e-06, "loss": 0.0015, "num_tokens": 5223511.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4150 }, { "completion_length": 11.075, "completions/clipped_ratio": 0.0, "completions/max_length": 15.4, "completions/max_terminated_length": 15.4, "completions/mean_length": 11.075, "completions/mean_terminated_length": 11.075, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.2860679411360198, "frac_reward_zero_std": 1.0, "grad_norm": 0.005073005799204111, "kl": 1.4336557388305664, "learning_rate": 1.6412429378531075e-06, "loss": 0.0014, "num_tokens": 5235182.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4160 }, { "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.2867556044560583, "frac_reward_zero_std": 1.0, "grad_norm": 0.0042126537300646305, "kl": 1.6670551657676698, "learning_rate": 1.6318267419962337e-06, "loss": 0.0017, "num_tokens": 5246558.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4170 }, { "completion_length": 10.825, "completions/clipped_ratio": 0.0, "completions/max_length": 13.9, "completions/max_terminated_length": 13.9, "completions/mean_length": 10.825, "completions/mean_terminated_length": 10.825, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.2874432677760968, "frac_reward_zero_std": 1.0, "grad_norm": 0.003699904540553689, "kl": 1.4559585928916932, "learning_rate": 1.6224105461393598e-06, "loss": 0.0015, "num_tokens": 5257303.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4180 }, { "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 7.7, "completions/min_terminated_length": 7.7, "epoch": 0.28813093109613536, "frac_reward_zero_std": 1.0, "grad_norm": 0.010834389366209507, "kl": 1.4884744346141816, "learning_rate": 1.612994350282486e-06, "loss": 0.0015, "num_tokens": 5269937.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4190 }, { "completion_length": 12.875, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 12.875, "completions/mean_terminated_length": 12.875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.28881859441617386, "frac_reward_zero_std": 0.9, "grad_norm": 0.006072077434509993, "kl": 1.4151581585407258, "learning_rate": 1.6035781544256122e-06, "loss": 0.0014, "num_tokens": 5284716.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4200 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.6, "completions/max_terminated_length": 46.6, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 7.8, "completions/min_terminated_length": 7.8, "epoch": 0.28950625773621236, "frac_reward_zero_std": 0.8, "grad_norm": 0.00288102007471025, "kl": 1.5231776535511017, "learning_rate": 1.5941619585687384e-06, "loss": 0.0015, "num_tokens": 5298764.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4210 }, { "completion_length": 10.2, "completions/clipped_ratio": 0.0, "completions/max_length": 14.3, "completions/max_terminated_length": 14.3, "completions/mean_length": 10.2, "completions/mean_terminated_length": 10.2, "completions/min_length": 7.6, "completions/min_terminated_length": 7.6, "epoch": 0.29019392105625086, "frac_reward_zero_std": 1.0, "grad_norm": 0.004693563561886549, "kl": 1.6336079835891724, "learning_rate": 1.5847457627118645e-06, "loss": 0.0016, "num_tokens": 5309980.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4220 }, { "completion_length": 11.675, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 11.675, "completions/mean_terminated_length": 11.675, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.29088158437628936, "frac_reward_zero_std": 1.0, "grad_norm": 0.02345423959195614, "kl": 1.402549785375595, "learning_rate": 1.5753295668549907e-06, "loss": 0.0014, "num_tokens": 5321771.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4230 }, { "completion_length": 11.95, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 11.95, "completions/mean_terminated_length": 11.95, "completions/min_length": 7.8, "completions/min_terminated_length": 7.8, "epoch": 0.29156924769632786, "frac_reward_zero_std": 0.9, "grad_norm": 0.005161237437278032, "kl": 1.5061214089393615, "learning_rate": 1.565913370998117e-06, "loss": 0.0015, "num_tokens": 5334045.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4240 }, { "completion_length": 10.7, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 10.7, "completions/mean_terminated_length": 10.7, "completions/min_length": 6.8, "completions/min_terminated_length": 6.8, "epoch": 0.29225691101636636, "frac_reward_zero_std": 1.0, "grad_norm": 0.008897802792489529, "kl": 2.1344063758850096, "learning_rate": 1.556497175141243e-06, "loss": 0.0021, "num_tokens": 5345873.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4250 }, { "completion_length": 11.3, "completions/clipped_ratio": 0.0, "completions/max_length": 15.8, "completions/max_terminated_length": 15.8, "completions/mean_length": 11.3, "completions/mean_terminated_length": 11.3, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.2929445743364049, "frac_reward_zero_std": 1.0, "grad_norm": 0.015760373324155807, "kl": 1.5703859090805055, "learning_rate": 1.5470809792843692e-06, "loss": 0.0016, "num_tokens": 5358625.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4260 }, { "completion_length": 10.95, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 10.95, "completions/mean_terminated_length": 10.95, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.2936322376564434, "frac_reward_zero_std": 1.0, "grad_norm": 0.04361697658896446, "kl": 1.5343574345111848, "learning_rate": 1.5376647834274956e-06, "loss": 0.0015, "num_tokens": 5370723.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4270 }, { "completion_length": 10.925, "completions/clipped_ratio": 0.0, "completions/max_length": 15.3, "completions/max_terminated_length": 15.3, "completions/mean_length": 10.925, "completions/mean_terminated_length": 10.925, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.2943199009764819, "frac_reward_zero_std": 1.0, "grad_norm": 0.00989339780062437, "kl": 1.8978851437568665, "learning_rate": 1.5282485875706215e-06, "loss": 0.0019, "num_tokens": 5383060.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4280 }, { "completion_length": 50.4, "completions/clipped_ratio": 0.025, "completions/max_length": 170.8, "completions/max_terminated_length": 14.2, "completions/mean_length": 50.4, "completions/mean_terminated_length": 11.15, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.2950075642965204, "frac_reward_zero_std": 0.9, "grad_norm": 0.0025704088620841503, "kl": 1.435192084312439, "learning_rate": 1.5188323917137477e-06, "loss": 0.0014, "num_tokens": 5397712.0, "reward": 5.9375, "reward_std": 0.125, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4290 }, { "completion_length": 11.05, "completions/clipped_ratio": 0.0, "completions/max_length": 14.1, "completions/max_terminated_length": 14.1, "completions/mean_length": 11.05, "completions/mean_terminated_length": 11.05, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2956952276165589, "frac_reward_zero_std": 1.0, "grad_norm": 0.016429847106337547, "kl": 2.077840727567673, "learning_rate": 1.509416195856874e-06, "loss": 0.0021, "num_tokens": 5408798.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4300 }, { "completion_length": 11.775, "completions/clipped_ratio": 0.0, "completions/max_length": 16.6, "completions/max_terminated_length": 16.6, "completions/mean_length": 11.775, "completions/mean_terminated_length": 11.775, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.2963828909365974, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027806151192635298, "kl": 1.5844416201114655, "learning_rate": 1.5e-06, "loss": 0.0016, "num_tokens": 5421021.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4310 }, { "completion_length": 20.375, "completions/clipped_ratio": 0.0, "completions/max_length": 51.5, "completions/max_terminated_length": 51.5, "completions/mean_length": 20.375, "completions/mean_terminated_length": 20.375, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.297070554256636, "frac_reward_zero_std": 0.9, "grad_norm": 0.005918384529650211, "kl": 1.2520611345767976, "learning_rate": 1.4905838041431264e-06, "loss": 0.0013, "num_tokens": 5435752.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4320 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 48.8, "completions/max_terminated_length": 48.8, "completions/mean_length": 19.6, "completions/mean_terminated_length": 19.6, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.2977582175766745, "frac_reward_zero_std": 0.8, "grad_norm": 0.004772708751261234, "kl": 1.4595943570137024, "learning_rate": 1.4811676082862526e-06, "loss": 0.0015, "num_tokens": 5449672.0, "reward": 5.925, "reward_std": 0.15, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.1, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4330 }, { "completion_length": 12.225, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 12.225, "completions/mean_terminated_length": 12.225, "completions/min_length": 7.7, "completions/min_terminated_length": 7.7, "epoch": 0.298445880896713, "frac_reward_zero_std": 1.0, "grad_norm": 0.018670905381441116, "kl": 1.2710513174533844, "learning_rate": 1.4717514124293785e-06, "loss": 0.0013, "num_tokens": 5460621.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4340 }, { "completion_length": 13.4, "completions/clipped_ratio": 0.0, "completions/max_length": 20.6, "completions/max_terminated_length": 20.6, "completions/mean_length": 13.4, "completions/mean_terminated_length": 13.4, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.2991335442167515, "frac_reward_zero_std": 0.9, "grad_norm": 2.123333692550659, "kl": 1.4008326411247254, "learning_rate": 1.462335216572505e-06, "loss": 0.0014, "num_tokens": 5472757.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4350 }, { "completion_length": 11.875, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 11.875, "completions/mean_terminated_length": 11.875, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.29982120753679, "frac_reward_zero_std": 1.0, "grad_norm": 0.12061981111764908, "kl": 1.3641066908836366, "learning_rate": 1.452919020715631e-06, "loss": 0.0014, "num_tokens": 5483908.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4360 }, { "completion_length": 12.925, "completions/clipped_ratio": 0.0, "completions/max_length": 21.7, "completions/max_terminated_length": 21.7, "completions/mean_length": 12.925, "completions/mean_terminated_length": 12.925, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.3005088708568285, "frac_reward_zero_std": 1.0, "grad_norm": 0.004400401841849089, "kl": 1.4162483930587768, "learning_rate": 1.443502824858757e-06, "loss": 0.0014, "num_tokens": 5495553.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4370 }, { "completion_length": 12.775, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 12.775, "completions/mean_terminated_length": 12.775, "completions/min_length": 7.7, "completions/min_terminated_length": 7.7, "epoch": 0.301196534176867, "frac_reward_zero_std": 1.0, "grad_norm": 0.01428204495459795, "kl": 1.3453535318374634, "learning_rate": 1.4340866290018834e-06, "loss": 0.0013, "num_tokens": 5505996.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4380 }, { "completion_length": 12.525, "completions/clipped_ratio": 0.0, "completions/max_length": 21.9, "completions/max_terminated_length": 21.9, "completions/mean_length": 12.525, "completions/mean_terminated_length": 12.525, "completions/min_length": 7.8, "completions/min_terminated_length": 7.8, "epoch": 0.30188419749690554, "frac_reward_zero_std": 0.8, "grad_norm": 3.6220338344573975, "kl": 1.4408978760242461, "learning_rate": 1.4246704331450096e-06, "loss": 0.0014, "num_tokens": 5517849.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4390 }, { "completion_length": 12.7, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 12.7, "completions/mean_terminated_length": 12.7, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.30257186081694404, "frac_reward_zero_std": 0.9, "grad_norm": 0.009245552122592926, "kl": 1.8541507005691529, "learning_rate": 1.4152542372881356e-06, "loss": 0.0019, "num_tokens": 5531473.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4400 }, { "completion_length": 10.775, "completions/clipped_ratio": 0.0, "completions/max_length": 15.1, "completions/max_terminated_length": 15.1, "completions/mean_length": 10.775, "completions/mean_terminated_length": 10.775, "completions/min_length": 7.4, "completions/min_terminated_length": 7.4, "epoch": 0.30325952413698254, "frac_reward_zero_std": 1.0, "grad_norm": 0.0041836886666715145, "kl": 1.533377969264984, "learning_rate": 1.405838041431262e-06, "loss": 0.0015, "num_tokens": 5542716.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4410 }, { "completion_length": 12.05, "completions/clipped_ratio": 0.0, "completions/max_length": 16.3, "completions/max_terminated_length": 16.3, "completions/mean_length": 12.05, "completions/mean_terminated_length": 12.05, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.30394718745702104, "frac_reward_zero_std": 1.0, "grad_norm": 0.003924133721739054, "kl": 1.1724998950958252, "learning_rate": 1.396421845574388e-06, "loss": 0.0012, "num_tokens": 5556754.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4420 }, { "completion_length": 11.575, "completions/clipped_ratio": 0.0, "completions/max_length": 16.5, "completions/max_terminated_length": 16.5, "completions/mean_length": 11.575, "completions/mean_terminated_length": 11.575, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.30463485077705954, "frac_reward_zero_std": 1.0, "grad_norm": 0.006332984194159508, "kl": 1.4067680716514588, "learning_rate": 1.387005649717514e-06, "loss": 0.0014, "num_tokens": 5571041.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4430 }, { "completion_length": 13.4, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 13.4, "completions/mean_terminated_length": 13.4, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.30532251409709804, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028947910759598017, "kl": 1.2639997959136964, "learning_rate": 1.3775894538606404e-06, "loss": 0.0013, "num_tokens": 5583229.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4440 }, { "completion_length": 15.325, "completions/clipped_ratio": 0.0, "completions/max_length": 31.9, "completions/max_terminated_length": 31.9, "completions/mean_length": 15.325, "completions/mean_terminated_length": 15.325, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.3060101774171366, "frac_reward_zero_std": 0.9, "grad_norm": 0.004690113477408886, "kl": 1.286442184448242, "learning_rate": 1.3681732580037666e-06, "loss": 0.0013, "num_tokens": 5593870.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4450 }, { "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.7, "completions/max_terminated_length": 21.7, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.3066978407371751, "frac_reward_zero_std": 0.9, "grad_norm": 0.011687826365232468, "kl": 1.7261317849159241, "learning_rate": 1.358757062146893e-06, "loss": 0.0017, "num_tokens": 5608202.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4460 }, { "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 9.6, "completions/min_terminated_length": 9.6, "epoch": 0.3073855040572136, "frac_reward_zero_std": 1.0, "grad_norm": 0.00487670348957181, "kl": 1.3319970309734344, "learning_rate": 1.349340866290019e-06, "loss": 0.0013, "num_tokens": 5620112.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4470 }, { "completion_length": 9.975, "completions/clipped_ratio": 0.0, "completions/max_length": 12.8, "completions/max_terminated_length": 12.8, "completions/mean_length": 9.975, "completions/mean_terminated_length": 9.975, "completions/min_length": 7.7, "completions/min_terminated_length": 7.7, "epoch": 0.3080731673772521, "frac_reward_zero_std": 1.0, "grad_norm": 0.0082427728921175, "kl": 1.5232180416584016, "learning_rate": 1.3399246704331451e-06, "loss": 0.0015, "num_tokens": 5632167.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4480 }, { "completion_length": 13.2, "completions/clipped_ratio": 0.0, "completions/max_length": 20.6, "completions/max_terminated_length": 20.6, "completions/mean_length": 13.2, "completions/mean_terminated_length": 13.2, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3087608306972906, "frac_reward_zero_std": 0.9, "grad_norm": 0.0026437605265527964, "kl": 1.2098506152629853, "learning_rate": 1.3305084745762715e-06, "loss": 0.0012, "num_tokens": 5644367.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4490 }, { "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.3094484940173291, "frac_reward_zero_std": 1.0, "grad_norm": 0.019921699538826942, "kl": 1.5676705598831178, "learning_rate": 1.3210922787193975e-06, "loss": 0.0016, "num_tokens": 5656659.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4500 }, { "completion_length": 11.675, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 11.675, "completions/mean_terminated_length": 11.675, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.3101361573373676, "frac_reward_zero_std": 1.0, "grad_norm": 0.019338268786668777, "kl": 2.288919413089752, "learning_rate": 1.3116760828625236e-06, "loss": 0.0023, "num_tokens": 5668666.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4510 }, { "completion_length": 10.1, "completions/clipped_ratio": 0.0, "completions/max_length": 13.1, "completions/max_terminated_length": 13.1, "completions/mean_length": 10.1, "completions/mean_terminated_length": 10.1, "completions/min_length": 7.7, "completions/min_terminated_length": 7.7, "epoch": 0.31082382065740616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0050545958802104, "kl": 1.5871056616306305, "learning_rate": 1.30225988700565e-06, "loss": 0.0016, "num_tokens": 5683262.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4520 }, { "completion_length": 10.4, "completions/clipped_ratio": 0.0, "completions/max_length": 13.1, "completions/max_terminated_length": 13.1, "completions/mean_length": 10.4, "completions/mean_terminated_length": 10.4, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.31151148397744466, "frac_reward_zero_std": 1.0, "grad_norm": 0.011630339547991753, "kl": 1.5014687776565552, "learning_rate": 1.292843691148776e-06, "loss": 0.0015, "num_tokens": 5695350.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4530 }, { "completion_length": 11.475, "completions/clipped_ratio": 0.0, "completions/max_length": 15.6, "completions/max_terminated_length": 15.6, "completions/mean_length": 11.475, "completions/mean_terminated_length": 11.475, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.31219914729748316, "frac_reward_zero_std": 1.0, "grad_norm": 0.004649888724088669, "kl": 1.3500673830509187, "learning_rate": 1.2834274952919021e-06, "loss": 0.0014, "num_tokens": 5706957.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4540 }, { "completion_length": 10.475, "completions/clipped_ratio": 0.0, "completions/max_length": 13.2, "completions/max_terminated_length": 13.2, "completions/mean_length": 10.475, "completions/mean_terminated_length": 10.475, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.31288681061752166, "frac_reward_zero_std": 1.0, "grad_norm": 0.001390202553011477, "kl": 1.491336989402771, "learning_rate": 1.2740112994350285e-06, "loss": 0.0015, "num_tokens": 5718220.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4550 }, { "completion_length": 12.775, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 12.775, "completions/mean_terminated_length": 12.775, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.31357447393756016, "frac_reward_zero_std": 1.0, "grad_norm": 0.004471197724342346, "kl": 1.2512777149677277, "learning_rate": 1.2645951035781545e-06, "loss": 0.0013, "num_tokens": 5730955.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4560 }, { "completion_length": 12.2, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 12.2, "completions/mean_terminated_length": 12.2, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.31426213725759866, "frac_reward_zero_std": 1.0, "grad_norm": 0.021901508793234825, "kl": 1.457699954509735, "learning_rate": 1.2551789077212806e-06, "loss": 0.0015, "num_tokens": 5743123.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4570 }, { "completion_length": 11.3, "completions/clipped_ratio": 0.0, "completions/max_length": 16.2, "completions/max_terminated_length": 16.2, "completions/mean_length": 11.3, "completions/mean_terminated_length": 11.3, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.31494980057763716, "frac_reward_zero_std": 1.0, "grad_norm": 0.005899836774915457, "kl": 1.377420848608017, "learning_rate": 1.245762711864407e-06, "loss": 0.0014, "num_tokens": 5755139.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4580 }, { "completion_length": 26.875, "completions/clipped_ratio": 0.0, "completions/max_length": 75.9, "completions/max_terminated_length": 75.9, "completions/mean_length": 26.875, "completions/mean_terminated_length": 26.875, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.3156374638976757, "frac_reward_zero_std": 0.9, "grad_norm": 0.006933213677257299, "kl": 1.3808945536613464, "learning_rate": 1.236346516007533e-06, "loss": 0.0014, "num_tokens": 5766890.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4590 }, { "completion_length": 10.975, "completions/clipped_ratio": 0.0, "completions/max_length": 13.4, "completions/max_terminated_length": 13.4, "completions/mean_length": 10.975, "completions/mean_terminated_length": 10.975, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.3163251272177142, "frac_reward_zero_std": 1.0, "grad_norm": 0.008102341555058956, "kl": 1.4617763698101043, "learning_rate": 1.2269303201506591e-06, "loss": 0.0015, "num_tokens": 5777961.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4600 }, { "completion_length": 19.825, "completions/clipped_ratio": 0.0, "completions/max_length": 45.8, "completions/max_terminated_length": 45.8, "completions/mean_length": 19.825, "completions/mean_terminated_length": 19.825, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.3170127905377527, "frac_reward_zero_std": 0.8, "grad_norm": 0.007490513846278191, "kl": 1.2782706379890443, "learning_rate": 1.2175141242937855e-06, "loss": 0.0013, "num_tokens": 5790118.0, "reward": 5.9375, "reward_std": 0.125, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4625, "rewards/check_response_quality/std": 0.075, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4610 }, { "completion_length": 13.075, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 13.075, "completions/mean_terminated_length": 13.075, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.3177004538577912, "frac_reward_zero_std": 1.0, "grad_norm": 0.004942750558257103, "kl": 1.2929556488990783, "learning_rate": 1.2080979284369115e-06, "loss": 0.0013, "num_tokens": 5801985.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4620 }, { "completion_length": 18.35, "completions/clipped_ratio": 0.0, "completions/max_length": 42.6, "completions/max_terminated_length": 42.6, "completions/mean_length": 18.35, "completions/mean_terminated_length": 18.35, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.3183881171778297, "frac_reward_zero_std": 0.9, "grad_norm": 1.5946123600006104, "kl": 1.4938054442405702, "learning_rate": 1.1986817325800379e-06, "loss": 0.0015, "num_tokens": 5815619.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4630 }, { "completion_length": 11.825, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 11.825, "completions/mean_terminated_length": 11.825, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.3190757804978682, "frac_reward_zero_std": 0.9, "grad_norm": 0.00890259351581335, "kl": 1.2599179446697235, "learning_rate": 1.189265536723164e-06, "loss": 0.0013, "num_tokens": 5826956.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4640 }, { "completion_length": 11.6, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 11.6, "completions/mean_terminated_length": 11.6, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.3197634438179068, "frac_reward_zero_std": 1.0, "grad_norm": 0.011387999169528484, "kl": 2.2636526942253115, "learning_rate": 1.17984934086629e-06, "loss": 0.0023, "num_tokens": 5839092.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4650 }, { "completion_length": 12.825, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 12.825, "completions/mean_terminated_length": 12.825, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.3204511071379453, "frac_reward_zero_std": 1.0, "grad_norm": 0.004085191525518894, "kl": 1.345226788520813, "learning_rate": 1.1704331450094164e-06, "loss": 0.0013, "num_tokens": 5852409.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4660 }, { "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.3211387704579838, "frac_reward_zero_std": 0.9, "grad_norm": 0.0036149087827652693, "kl": 1.3768569946289062, "learning_rate": 1.1610169491525425e-06, "loss": 0.0014, "num_tokens": 5866463.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4670 }, { "completion_length": 21.925, "completions/clipped_ratio": 0.0, "completions/max_length": 57.8, "completions/max_terminated_length": 57.8, "completions/mean_length": 21.925, "completions/mean_terminated_length": 21.925, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.3218264337780223, "frac_reward_zero_std": 0.9, "grad_norm": 0.05678132548928261, "kl": 1.2918590784072876, "learning_rate": 1.1516007532956687e-06, "loss": 0.0013, "num_tokens": 5878300.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4680 }, { "completion_length": 11.1, "completions/clipped_ratio": 0.0, "completions/max_length": 14.1, "completions/max_terminated_length": 14.1, "completions/mean_length": 11.1, "completions/mean_terminated_length": 11.1, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.3225140970980608, "frac_reward_zero_std": 0.9, "grad_norm": 0.005522090010344982, "kl": 1.47247673869133, "learning_rate": 1.1421845574387949e-06, "loss": 0.0015, "num_tokens": 5890220.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4690 }, { "completion_length": 11.775, "completions/clipped_ratio": 0.0, "completions/max_length": 15.8, "completions/max_terminated_length": 15.8, "completions/mean_length": 11.775, "completions/mean_terminated_length": 11.775, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.3232017604180993, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025551936123520136, "kl": 1.3015658378601074, "learning_rate": 1.132768361581921e-06, "loss": 0.0013, "num_tokens": 5903907.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4700 }, { "completion_length": 10.85, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.85, "completions/mean_terminated_length": 10.85, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.3238894237381378, "frac_reward_zero_std": 1.0, "grad_norm": 0.00985870510339737, "kl": 1.5268153309822083, "learning_rate": 1.1233521657250472e-06, "loss": 0.0015, "num_tokens": 5914369.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4710 }, { "completion_length": 13.325, "completions/clipped_ratio": 0.0, "completions/max_length": 20.7, "completions/max_terminated_length": 20.7, "completions/mean_length": 13.325, "completions/mean_terminated_length": 13.325, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.32457708705817634, "frac_reward_zero_std": 1.0, "grad_norm": 0.0380832813680172, "kl": 1.440810000896454, "learning_rate": 1.1139359698681734e-06, "loss": 0.0014, "num_tokens": 5926126.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4720 }, { "completion_length": 10.3, "completions/clipped_ratio": 0.0, "completions/max_length": 13.1, "completions/max_terminated_length": 13.1, "completions/mean_length": 10.3, "completions/mean_terminated_length": 10.3, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.32526475037821484, "frac_reward_zero_std": 1.0, "grad_norm": 0.00719305407255888, "kl": 1.5471572160720826, "learning_rate": 1.1045197740112995e-06, "loss": 0.0015, "num_tokens": 5939830.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4730 }, { "completion_length": 11.45, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 11.45, "completions/mean_terminated_length": 11.45, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.32595241369825334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0119554428383708, "kl": 1.56171954870224, "learning_rate": 1.0951035781544257e-06, "loss": 0.0016, "num_tokens": 5953144.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4740 }, { "completion_length": 10.775, "completions/clipped_ratio": 0.0, "completions/max_length": 14.7, "completions/max_terminated_length": 14.7, "completions/mean_length": 10.775, "completions/mean_terminated_length": 10.775, "completions/min_length": 7.8, "completions/min_terminated_length": 7.8, "epoch": 0.32664007701829184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0054775746539235115, "kl": 1.531346207857132, "learning_rate": 1.0856873822975519e-06, "loss": 0.0015, "num_tokens": 5965163.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4750 }, { "completion_length": 11.55, "completions/clipped_ratio": 0.0, "completions/max_length": 15.4, "completions/max_terminated_length": 15.4, "completions/mean_length": 11.55, "completions/mean_terminated_length": 11.55, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.32732774033833034, "frac_reward_zero_std": 1.0, "grad_norm": 0.020217234268784523, "kl": 1.3433250963687897, "learning_rate": 1.076271186440678e-06, "loss": 0.0013, "num_tokens": 5977441.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4760 }, { "completion_length": 26.275, "completions/clipped_ratio": 0.0, "completions/max_length": 71.5, "completions/max_terminated_length": 71.5, "completions/mean_length": 26.275, "completions/mean_terminated_length": 26.275, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.32801540365836884, "frac_reward_zero_std": 0.8, "grad_norm": 0.012751337140798569, "kl": 1.5057712554931642, "learning_rate": 1.0668549905838042e-06, "loss": 0.0015, "num_tokens": 5989348.0, "reward": 5.9, "reward_std": 0.14574271440505981, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4625, "rewards/check_response_quality/std": 0.053867512941360475, "rewards/match_format_approximately/mean": 0.9625, "rewards/match_format_approximately/std": 0.053867512941360475, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4770 }, { "completion_length": 16.825, "completions/clipped_ratio": 0.0, "completions/max_length": 36.5, "completions/max_terminated_length": 36.5, "completions/mean_length": 16.825, "completions/mean_terminated_length": 16.825, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.3287030669784074, "frac_reward_zero_std": 0.9, "grad_norm": 0.008169161155819893, "kl": 1.307063925266266, "learning_rate": 1.0574387947269304e-06, "loss": 0.0013, "num_tokens": 6001573.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4780 }, { "completion_length": 9.875, "completions/clipped_ratio": 0.0, "completions/max_length": 12.9, "completions/max_terminated_length": 12.9, "completions/mean_length": 9.875, "completions/mean_terminated_length": 9.875, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.3293907302984459, "frac_reward_zero_std": 0.9, "grad_norm": 0.022803228348493576, "kl": 1.3791126608848572, "learning_rate": 1.0480225988700566e-06, "loss": 0.0014, "num_tokens": 6013696.0, "reward": 5.9875, "reward_std": 0.025, "rewards/check_coherence/mean": 1.4875, "rewards/check_coherence/std": 0.025, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4790 }, { "completion_length": 11.45, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.45, "completions/mean_terminated_length": 11.45, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.3300783936184844, "frac_reward_zero_std": 1.0, "grad_norm": 0.01593186892569065, "kl": 1.3368689715862274, "learning_rate": 1.0386064030131827e-06, "loss": 0.0013, "num_tokens": 6026230.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4800 }, { "completion_length": 19.125, "completions/clipped_ratio": 0.0, "completions/max_length": 48.1, "completions/max_terminated_length": 48.1, "completions/mean_length": 19.125, "completions/mean_terminated_length": 19.125, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.3307660569385229, "frac_reward_zero_std": 0.9, "grad_norm": 0.002272400539368391, "kl": 2.082442098855972, "learning_rate": 1.029190207156309e-06, "loss": 0.0021, "num_tokens": 6038639.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4810 }, { "completion_length": 11.2, "completions/clipped_ratio": 0.0, "completions/max_length": 15.6, "completions/max_terminated_length": 15.6, "completions/mean_length": 11.2, "completions/mean_terminated_length": 11.2, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.3314537202585614, "frac_reward_zero_std": 1.0, "grad_norm": 0.004652642644941807, "kl": 1.4440054893493652, "learning_rate": 1.0197740112994353e-06, "loss": 0.0014, "num_tokens": 6050235.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4820 }, { "completion_length": 11.075, "completions/clipped_ratio": 0.0, "completions/max_length": 15.3, "completions/max_terminated_length": 15.3, "completions/mean_length": 11.075, "completions/mean_terminated_length": 11.075, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.3321413835785999, "frac_reward_zero_std": 1.0, "grad_norm": 0.00583495432510972, "kl": 1.6347231984138488, "learning_rate": 1.0103578154425612e-06, "loss": 0.0016, "num_tokens": 6061518.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4830 }, { "completion_length": 10.85, "completions/clipped_ratio": 0.0, "completions/max_length": 13.3, "completions/max_terminated_length": 13.3, "completions/mean_length": 10.85, "completions/mean_terminated_length": 10.85, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.3328290468986384, "frac_reward_zero_std": 1.0, "grad_norm": 0.003644515760242939, "kl": 1.3461124837398528, "learning_rate": 1.0009416195856874e-06, "loss": 0.0013, "num_tokens": 6073212.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4840 }, { "completion_length": 11.425, "completions/clipped_ratio": 0.0, "completions/max_length": 15.1, "completions/max_terminated_length": 15.1, "completions/mean_length": 11.425, "completions/mean_terminated_length": 11.425, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.33351671021867696, "frac_reward_zero_std": 1.0, "grad_norm": 0.006971648428589106, "kl": 1.4714319944381713, "learning_rate": 9.915254237288138e-07, "loss": 0.0015, "num_tokens": 6085061.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4850 }, { "completion_length": 32.725, "completions/clipped_ratio": 0.0, "completions/max_length": 100.4, "completions/max_terminated_length": 100.4, "completions/mean_length": 32.725, "completions/mean_terminated_length": 32.725, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.33420437353871546, "frac_reward_zero_std": 0.9, "grad_norm": 0.010788694955408573, "kl": 1.3329748511314392, "learning_rate": 9.821092278719397e-07, "loss": 0.0013, "num_tokens": 6099130.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4860 }, { "completion_length": 10.875, "completions/clipped_ratio": 0.0, "completions/max_length": 13.7, "completions/max_terminated_length": 13.7, "completions/mean_length": 10.875, "completions/mean_terminated_length": 10.875, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.33489203685875396, "frac_reward_zero_std": 0.9, "grad_norm": 0.002744455123320222, "kl": 2.1530386984348295, "learning_rate": 9.72693032015066e-07, "loss": 0.0022, "num_tokens": 6111409.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4870 }, { "completion_length": 12.9, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 12.9, "completions/mean_terminated_length": 12.9, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.33557970017879246, "frac_reward_zero_std": 0.9, "grad_norm": 0.004031331278383732, "kl": 1.3381536304950714, "learning_rate": 9.632768361581923e-07, "loss": 0.0013, "num_tokens": 6123997.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4880 }, { "completion_length": 10.7, "completions/clipped_ratio": 0.0, "completions/max_length": 14.3, "completions/max_terminated_length": 14.3, "completions/mean_length": 10.7, "completions/mean_terminated_length": 10.7, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.33626736349883096, "frac_reward_zero_std": 0.9, "grad_norm": 0.03266019746661186, "kl": 1.422250461578369, "learning_rate": 9.538606403013182e-07, "loss": 0.0014, "num_tokens": 6137029.0, "reward": 5.9875, "reward_std": 0.025, "rewards/check_coherence/mean": 1.4875, "rewards/check_coherence/std": 0.025, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4890 }, { "completion_length": 26.85, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 26.85, "completions/mean_terminated_length": 26.85, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.33695502681886946, "frac_reward_zero_std": 0.9, "grad_norm": 0.004609998781234026, "kl": 1.1960768818855285, "learning_rate": 9.444444444444445e-07, "loss": 0.0012, "num_tokens": 6151579.0, "reward": 5.9625, "reward_std": 0.075, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4900 }, { "completion_length": 12.125, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.125, "completions/mean_terminated_length": 12.125, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.33764269013890796, "frac_reward_zero_std": 1.0, "grad_norm": 0.007132112048566341, "kl": 1.325413703918457, "learning_rate": 9.350282485875707e-07, "loss": 0.0013, "num_tokens": 6164076.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4910 }, { "completion_length": 11.825, "completions/clipped_ratio": 0.0, "completions/max_length": 15.1, "completions/max_terminated_length": 15.1, "completions/mean_length": 11.825, "completions/mean_terminated_length": 11.825, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3383303534589465, "frac_reward_zero_std": 1.0, "grad_norm": 0.005040779244154692, "kl": 1.277837687730789, "learning_rate": 9.25612052730697e-07, "loss": 0.0013, "num_tokens": 6175141.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4920 }, { "completion_length": 11.825, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 11.825, "completions/mean_terminated_length": 11.825, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.339018016778985, "frac_reward_zero_std": 1.0, "grad_norm": 0.004855050239712, "kl": 1.2279843807220459, "learning_rate": 9.16195856873823e-07, "loss": 0.0012, "num_tokens": 6187590.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4930 }, { "completion_length": 11.85, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 11.85, "completions/mean_terminated_length": 11.85, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.3397056800990235, "frac_reward_zero_std": 1.0, "grad_norm": 0.004113995004445314, "kl": 1.2801987171173095, "learning_rate": 9.067796610169492e-07, "loss": 0.0013, "num_tokens": 6199832.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4940 }, { "completion_length": 10.65, "completions/clipped_ratio": 0.0, "completions/max_length": 13.6, "completions/max_terminated_length": 13.6, "completions/mean_length": 10.65, "completions/mean_terminated_length": 10.65, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.340393343419062, "frac_reward_zero_std": 1.0, "grad_norm": 0.04914846643805504, "kl": 1.4876422524452209, "learning_rate": 8.973634651600755e-07, "loss": 0.0015, "num_tokens": 6213162.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4950 }, { "completion_length": 15.8, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 15.8, "completions/mean_terminated_length": 15.8, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.3410810067391005, "frac_reward_zero_std": 0.9, "grad_norm": 0.011781705543398857, "kl": 1.1131984174251557, "learning_rate": 8.879472693032015e-07, "loss": 0.0011, "num_tokens": 6223426.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4960 }, { "completion_length": 11.375, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 11.375, "completions/mean_terminated_length": 11.375, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.341768670059139, "frac_reward_zero_std": 0.9, "grad_norm": 0.008672394789755344, "kl": 1.2104420125484467, "learning_rate": 8.785310734463277e-07, "loss": 0.0012, "num_tokens": 6235529.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4970 }, { "completion_length": 10.95, "completions/clipped_ratio": 0.0, "completions/max_length": 16.3, "completions/max_terminated_length": 16.3, "completions/mean_length": 10.95, "completions/mean_terminated_length": 10.95, "completions/min_length": 7.7, "completions/min_terminated_length": 7.7, "epoch": 0.3424563333791776, "frac_reward_zero_std": 1.0, "grad_norm": 0.006266096141189337, "kl": 1.310611402988434, "learning_rate": 8.69114877589454e-07, "loss": 0.0013, "num_tokens": 6245659.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4980 }, { "completion_length": 11.475, "completions/clipped_ratio": 0.0, "completions/max_length": 14.7, "completions/max_terminated_length": 14.7, "completions/mean_length": 11.475, "completions/mean_terminated_length": 11.475, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.3431439966992161, "frac_reward_zero_std": 1.0, "grad_norm": 0.012767767533659935, "kl": 1.3206363081932069, "learning_rate": 8.596986817325801e-07, "loss": 0.0013, "num_tokens": 6257054.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 4990 }, { "completion_length": 12.575, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 12.575, "completions/mean_terminated_length": 12.575, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.3438316600192546, "frac_reward_zero_std": 0.9, "grad_norm": 0.006154041737318039, "kl": 1.1578357517719269, "learning_rate": 8.502824858757062e-07, "loss": 0.0012, "num_tokens": 6270365.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5000 }, { "completion_length": 12.45, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.45, "completions/mean_terminated_length": 12.45, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.3445193233392931, "frac_reward_zero_std": 1.0, "grad_norm": 0.0072873965837061405, "kl": 1.4725398778915406, "learning_rate": 8.408662900188325e-07, "loss": 0.0015, "num_tokens": 6281839.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5010 }, { "completion_length": 21.55, "completions/clipped_ratio": 0.0, "completions/max_length": 53.4, "completions/max_terminated_length": 53.4, "completions/mean_length": 21.55, "completions/mean_terminated_length": 21.55, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.3452069866593316, "frac_reward_zero_std": 0.9, "grad_norm": 0.009922947734594345, "kl": 1.309710693359375, "learning_rate": 8.314500941619586e-07, "loss": 0.0013, "num_tokens": 6295025.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5020 }, { "completion_length": 19.225, "completions/clipped_ratio": 0.0, "completions/max_length": 42.8, "completions/max_terminated_length": 42.8, "completions/mean_length": 19.225, "completions/mean_terminated_length": 19.225, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.3458946499793701, "frac_reward_zero_std": 0.9, "grad_norm": 0.00749584287405014, "kl": 1.1847262263298035, "learning_rate": 8.220338983050847e-07, "loss": 0.0012, "num_tokens": 6307442.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5030 }, { "completion_length": 12.65, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 12.65, "completions/mean_terminated_length": 12.65, "completions/min_length": 9.6, "completions/min_terminated_length": 9.6, "epoch": 0.3465823132994086, "frac_reward_zero_std": 0.9, "grad_norm": 0.004699906334280968, "kl": 1.1936538338661193, "learning_rate": 8.12617702448211e-07, "loss": 0.0012, "num_tokens": 6320480.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5040 }, { "completion_length": 27.425, "completions/clipped_ratio": 0.0, "completions/max_length": 75.3, "completions/max_terminated_length": 75.3, "completions/mean_length": 27.425, "completions/mean_terminated_length": 27.425, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.34726997661944714, "frac_reward_zero_std": 0.8, "grad_norm": 0.004290629643946886, "kl": 1.093562251329422, "learning_rate": 8.032015065913372e-07, "loss": 0.0011, "num_tokens": 6332481.0, "reward": 5.925, "reward_std": 0.15, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.1, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5050 }, { "completion_length": 11.525, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 11.525, "completions/mean_terminated_length": 11.525, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.34795763993948564, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031757554970681667, "kl": 1.2823837637901305, "learning_rate": 7.937853107344634e-07, "loss": 0.0013, "num_tokens": 6343942.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5060 }, { "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.7, "completions/max_terminated_length": 13.7, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.34864530325952414, "frac_reward_zero_std": 1.0, "grad_norm": 0.005417166743427515, "kl": 1.0797785520553589, "learning_rate": 7.843691148775895e-07, "loss": 0.0011, "num_tokens": 6356732.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5070 }, { "completion_length": 18.375, "completions/clipped_ratio": 0.0, "completions/max_length": 41.7, "completions/max_terminated_length": 41.7, "completions/mean_length": 18.375, "completions/mean_terminated_length": 18.375, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.34933296657956264, "frac_reward_zero_std": 0.9, "grad_norm": 0.005895074922591448, "kl": 1.1690425515174865, "learning_rate": 7.749529190207157e-07, "loss": 0.0012, "num_tokens": 6366627.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5080 }, { "completion_length": 11.925, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 11.925, "completions/mean_terminated_length": 11.925, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.35002062989960114, "frac_reward_zero_std": 1.0, "grad_norm": 0.005460694897919893, "kl": 1.1566421210765838, "learning_rate": 7.655367231638419e-07, "loss": 0.0012, "num_tokens": 6378400.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5090 }, { "completion_length": 20.2, "completions/clipped_ratio": 0.0, "completions/max_length": 49.7, "completions/max_terminated_length": 49.7, "completions/mean_length": 20.2, "completions/mean_terminated_length": 20.2, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.35070829321963964, "frac_reward_zero_std": 0.9, "grad_norm": 0.02746504172682762, "kl": 1.2156450688838958, "learning_rate": 7.56120527306968e-07, "loss": 0.0012, "num_tokens": 6389916.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5100 }, { "completion_length": 28.125, "completions/clipped_ratio": 0.0, "completions/max_length": 78.9, "completions/max_terminated_length": 78.9, "completions/mean_length": 28.125, "completions/mean_terminated_length": 28.125, "completions/min_length": 9.6, "completions/min_terminated_length": 9.6, "epoch": 0.3513959565396782, "frac_reward_zero_std": 0.9, "grad_norm": 0.002399625489488244, "kl": 1.255337220430374, "learning_rate": 7.467043314500942e-07, "loss": 0.0013, "num_tokens": 6403225.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5110 }, { "completion_length": 11.9, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 11.9, "completions/mean_terminated_length": 11.9, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.3520836198597167, "frac_reward_zero_std": 0.9, "grad_norm": 0.008576685562729836, "kl": 1.2706177592277528, "learning_rate": 7.372881355932204e-07, "loss": 0.0013, "num_tokens": 6413773.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5120 }, { "completion_length": 12.125, "completions/clipped_ratio": 0.0, "completions/max_length": 16.6, "completions/max_terminated_length": 16.6, "completions/mean_length": 12.125, "completions/mean_terminated_length": 12.125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3527712831797552, "frac_reward_zero_std": 1.0, "grad_norm": 0.004991778638213873, "kl": 1.2184751868247985, "learning_rate": 7.278719397363465e-07, "loss": 0.0012, "num_tokens": 6426346.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5130 }, { "completion_length": 12.025, "completions/clipped_ratio": 0.0, "completions/max_length": 15.2, "completions/max_terminated_length": 15.2, "completions/mean_length": 12.025, "completions/mean_terminated_length": 12.025, "completions/min_length": 9.7, "completions/min_terminated_length": 9.7, "epoch": 0.3534589464997937, "frac_reward_zero_std": 1.0, "grad_norm": 0.005585167091339827, "kl": 1.2197207391262055, "learning_rate": 7.184557438794728e-07, "loss": 0.0012, "num_tokens": 6439211.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5140 }, { "completion_length": 40.525, "completions/clipped_ratio": 0.0, "completions/max_length": 127.6, "completions/max_terminated_length": 127.6, "completions/mean_length": 40.525, "completions/mean_terminated_length": 40.525, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.3541466098198322, "frac_reward_zero_std": 0.8, "grad_norm": 0.007972132414579391, "kl": 1.2606733858585357, "learning_rate": 7.09039548022599e-07, "loss": 0.0013, "num_tokens": 6453252.0, "reward": 5.925, "reward_std": 0.15, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5150 }, { "completion_length": 11.075, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.075, "completions/mean_terminated_length": 11.075, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.3548342731398707, "frac_reward_zero_std": 1.0, "grad_norm": 0.009181777015328407, "kl": 1.3541767716407775, "learning_rate": 6.996233521657251e-07, "loss": 0.0014, "num_tokens": 6464707.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5160 }, { "completion_length": 21.575, "completions/clipped_ratio": 0.0, "completions/max_length": 53.3, "completions/max_terminated_length": 53.3, "completions/mean_length": 21.575, "completions/mean_terminated_length": 21.575, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.3555219364599092, "frac_reward_zero_std": 0.9, "grad_norm": 0.005340268835425377, "kl": 1.1702833890914917, "learning_rate": 6.902071563088513e-07, "loss": 0.0012, "num_tokens": 6476706.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5170 }, { "completion_length": 11.4, "completions/clipped_ratio": 0.0, "completions/max_length": 14.8, "completions/max_terminated_length": 14.8, "completions/mean_length": 11.4, "completions/mean_terminated_length": 11.4, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.35620959977994776, "frac_reward_zero_std": 1.0, "grad_norm": 0.00588555634021759, "kl": 1.3390805840492248, "learning_rate": 6.807909604519775e-07, "loss": 0.0013, "num_tokens": 6488342.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5180 }, { "completion_length": 11.225, "completions/clipped_ratio": 0.0, "completions/max_length": 14.3, "completions/max_terminated_length": 14.3, "completions/mean_length": 11.225, "completions/mean_terminated_length": 11.225, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.35689726309998626, "frac_reward_zero_std": 1.0, "grad_norm": 0.007043247576802969, "kl": 1.256647562980652, "learning_rate": 6.713747645951036e-07, "loss": 0.0013, "num_tokens": 6500363.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5190 }, { "completion_length": 11.3, "completions/clipped_ratio": 0.0, "completions/max_length": 13.3, "completions/max_terminated_length": 13.3, "completions/mean_length": 11.3, "completions/mean_terminated_length": 11.3, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.35758492642002476, "frac_reward_zero_std": 1.0, "grad_norm": 0.005169827025383711, "kl": 1.0854641497135162, "learning_rate": 6.619585687382298e-07, "loss": 0.0011, "num_tokens": 6513667.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5200 }, { "completion_length": 13.6, "completions/clipped_ratio": 0.0, "completions/max_length": 24.6, "completions/max_terminated_length": 24.6, "completions/mean_length": 13.6, "completions/mean_terminated_length": 13.6, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.35827258974006326, "frac_reward_zero_std": 0.8, "grad_norm": 0.02052394114434719, "kl": 1.5007223725318908, "learning_rate": 6.52542372881356e-07, "loss": 0.0015, "num_tokens": 6526107.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5210 }, { "completion_length": 11.9, "completions/clipped_ratio": 0.0, "completions/max_length": 15.6, "completions/max_terminated_length": 15.6, "completions/mean_length": 11.9, "completions/mean_terminated_length": 11.9, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.35896025306010176, "frac_reward_zero_std": 1.0, "grad_norm": 0.2504121959209442, "kl": 1.3355840384960174, "learning_rate": 6.431261770244822e-07, "loss": 0.0013, "num_tokens": 6538363.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5220 }, { "completion_length": 10.95, "completions/clipped_ratio": 0.0, "completions/max_length": 14.2, "completions/max_terminated_length": 14.2, "completions/mean_length": 10.95, "completions/mean_terminated_length": 10.95, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.35964791638014026, "frac_reward_zero_std": 1.0, "grad_norm": 0.006526256911456585, "kl": 1.2021831452846528, "learning_rate": 6.337099811676084e-07, "loss": 0.0012, "num_tokens": 6551289.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5230 }, { "completion_length": 29.4, "completions/clipped_ratio": 0.0, "completions/max_length": 82.8, "completions/max_terminated_length": 82.8, "completions/mean_length": 29.4, "completions/mean_terminated_length": 29.4, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.36033557970017877, "frac_reward_zero_std": 0.7, "grad_norm": 0.00306605058722198, "kl": 1.1230719089508057, "learning_rate": 6.242937853107346e-07, "loss": 0.0011, "num_tokens": 6563953.0, "reward": 5.9, "reward_std": 0.2, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.1, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5240 }, { "completion_length": 12.65, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 12.65, "completions/mean_terminated_length": 12.65, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.3610232430202173, "frac_reward_zero_std": 1.0, "grad_norm": 0.005555626004934311, "kl": 1.14589102268219, "learning_rate": 6.148775894538607e-07, "loss": 0.0011, "num_tokens": 6576639.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5250 }, { "completion_length": 12.325, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 12.325, "completions/mean_terminated_length": 12.325, "completions/min_length": 9.8, "completions/min_terminated_length": 9.8, "epoch": 0.3617109063402558, "frac_reward_zero_std": 1.0, "grad_norm": 0.006481971126049757, "kl": 1.2317140579223633, "learning_rate": 6.054613935969868e-07, "loss": 0.0012, "num_tokens": 6589136.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5260 }, { "completion_length": 63.85, "completions/clipped_ratio": 0.0, "completions/max_length": 219.6, "completions/max_terminated_length": 219.6, "completions/mean_length": 63.85, "completions/mean_terminated_length": 63.85, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.3623985696602943, "frac_reward_zero_std": 0.8, "grad_norm": 1.8743151426315308, "kl": 1.0735913693904877, "learning_rate": 5.960451977401131e-07, "loss": 0.0011, "num_tokens": 6602826.0, "reward": 5.925, "reward_std": 0.15, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5270 }, { "completion_length": 12.425, "completions/clipped_ratio": 0.0, "completions/max_length": 16.3, "completions/max_terminated_length": 16.3, "completions/mean_length": 12.425, "completions/mean_terminated_length": 12.425, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.3630862329803328, "frac_reward_zero_std": 1.0, "grad_norm": 0.00245496304705739, "kl": 1.2225584924221038, "learning_rate": 5.866290018832392e-07, "loss": 0.0012, "num_tokens": 6615731.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5280 }, { "completion_length": 14.025, "completions/clipped_ratio": 0.0, "completions/max_length": 26.3, "completions/max_terminated_length": 26.3, "completions/mean_length": 14.025, "completions/mean_terminated_length": 14.025, "completions/min_length": 7.7, "completions/min_terminated_length": 7.7, "epoch": 0.3637738963003713, "frac_reward_zero_std": 0.8, "grad_norm": 22.05070686340332, "kl": 1.3892334163188935, "learning_rate": 5.772128060263654e-07, "loss": 0.0014, "num_tokens": 6627700.0, "reward": 5.9125, "reward_std": 0.175, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4625, "rewards/check_response_quality/std": 0.075, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5290 }, { "completion_length": 12.275, "completions/clipped_ratio": 0.0, "completions/max_length": 15.4, "completions/max_terminated_length": 15.4, "completions/mean_length": 12.275, "completions/mean_terminated_length": 12.275, "completions/min_length": 10.2, "completions/min_terminated_length": 10.2, "epoch": 0.3644615596204098, "frac_reward_zero_std": 1.0, "grad_norm": 0.0061513264663517475, "kl": 1.1569065511226655, "learning_rate": 5.677966101694916e-07, "loss": 0.0012, "num_tokens": 6641163.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5300 }, { "completion_length": 12.275, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 12.275, "completions/mean_terminated_length": 12.275, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.3651492229404484, "frac_reward_zero_std": 1.0, "grad_norm": 0.004866019356995821, "kl": 1.1956783890724183, "learning_rate": 5.583804143126178e-07, "loss": 0.0012, "num_tokens": 6653154.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5310 }, { "completion_length": 19.55, "completions/clipped_ratio": 0.0, "completions/max_length": 46.2, "completions/max_terminated_length": 46.2, "completions/mean_length": 19.55, "completions/mean_terminated_length": 19.55, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.3658368862604869, "frac_reward_zero_std": 0.9, "grad_norm": 0.0047823116183280945, "kl": 1.1544413030147553, "learning_rate": 5.489642184557439e-07, "loss": 0.0012, "num_tokens": 6665856.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5320 }, { "completion_length": 16.125, "completions/clipped_ratio": 0.0, "completions/max_length": 32.2, "completions/max_terminated_length": 32.2, "completions/mean_length": 16.125, "completions/mean_terminated_length": 16.125, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.3665245495805254, "frac_reward_zero_std": 0.9, "grad_norm": 0.002742278855293989, "kl": 1.186799842119217, "learning_rate": 5.395480225988701e-07, "loss": 0.0012, "num_tokens": 6678137.0, "reward": 5.9625, "reward_std": 0.075, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5330 }, { "completion_length": 11.825, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 11.825, "completions/mean_terminated_length": 11.825, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.3672122129005639, "frac_reward_zero_std": 1.0, "grad_norm": 0.008778350427746773, "kl": 1.0832504153251648, "learning_rate": 5.301318267419963e-07, "loss": 0.0011, "num_tokens": 6690458.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5340 }, { "completion_length": 15.9, "completions/clipped_ratio": 0.0, "completions/max_length": 31.8, "completions/max_terminated_length": 31.8, "completions/mean_length": 15.9, "completions/mean_terminated_length": 15.9, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3678998762206024, "frac_reward_zero_std": 0.9, "grad_norm": 0.004558028653264046, "kl": 1.1966825664043426, "learning_rate": 5.207156308851224e-07, "loss": 0.0012, "num_tokens": 6704106.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5350 }, { "completion_length": 12.55, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 12.55, "completions/mean_terminated_length": 12.55, "completions/min_length": 9.9, "completions/min_terminated_length": 9.9, "epoch": 0.3685875395406409, "frac_reward_zero_std": 1.0, "grad_norm": 0.004179758485406637, "kl": 1.0407968640327454, "learning_rate": 5.112994350282487e-07, "loss": 0.001, "num_tokens": 6716772.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5360 }, { "completion_length": 16.3, "completions/clipped_ratio": 0.0, "completions/max_length": 35.9, "completions/max_terminated_length": 35.9, "completions/mean_length": 16.3, "completions/mean_terminated_length": 16.3, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.3692752028606794, "frac_reward_zero_std": 0.9, "grad_norm": 3.5104458332061768, "kl": 1.2622505128383636, "learning_rate": 5.018832391713748e-07, "loss": 0.0013, "num_tokens": 6731212.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5370 }, { "completion_length": 13.9, "completions/clipped_ratio": 0.0, "completions/max_length": 24.5, "completions/max_terminated_length": 24.5, "completions/mean_length": 13.9, "completions/mean_terminated_length": 13.9, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.36996286618071794, "frac_reward_zero_std": 0.9, "grad_norm": 0.0022497333120554686, "kl": 1.1641826272010802, "learning_rate": 4.924670433145009e-07, "loss": 0.0012, "num_tokens": 6742524.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5380 }, { "completion_length": 12.4, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 12.4, "completions/mean_terminated_length": 12.4, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.37065052950075644, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036960735451430082, "kl": 1.3138091266155243, "learning_rate": 4.830508474576272e-07, "loss": 0.0013, "num_tokens": 6755684.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5390 }, { "completion_length": 12.8, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.8, "completions/mean_terminated_length": 12.8, "completions/min_length": 10.1, "completions/min_terminated_length": 10.1, "epoch": 0.37133819282079494, "frac_reward_zero_std": 1.0, "grad_norm": 0.008950471878051758, "kl": 1.09179944396019, "learning_rate": 4.736346516007533e-07, "loss": 0.0011, "num_tokens": 6769472.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5400 }, { "completion_length": 11.425, "completions/clipped_ratio": 0.0, "completions/max_length": 13.8, "completions/max_terminated_length": 13.8, "completions/mean_length": 11.425, "completions/mean_terminated_length": 11.425, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.37202585614083344, "frac_reward_zero_std": 1.0, "grad_norm": 0.00552986329421401, "kl": 1.1067481756210327, "learning_rate": 4.6421845574387955e-07, "loss": 0.0011, "num_tokens": 6781257.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5410 }, { "completion_length": 25.55, "completions/clipped_ratio": 0.0, "completions/max_length": 68.4, "completions/max_terminated_length": 68.4, "completions/mean_length": 25.55, "completions/mean_terminated_length": 25.55, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.37271351946087194, "frac_reward_zero_std": 0.9, "grad_norm": 0.005075867287814617, "kl": 1.245941936969757, "learning_rate": 4.5480225988700566e-07, "loss": 0.0012, "num_tokens": 6794011.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5420 }, { "completion_length": 12.2, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 12.2, "completions/mean_terminated_length": 12.2, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.37340118278091045, "frac_reward_zero_std": 1.0, "grad_norm": 0.015166271477937698, "kl": 1.3145410895347596, "learning_rate": 4.4538606403013183e-07, "loss": 0.0013, "num_tokens": 6804963.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5430 }, { "completion_length": 12.175, "completions/clipped_ratio": 0.0, "completions/max_length": 15.3, "completions/max_terminated_length": 15.3, "completions/mean_length": 12.175, "completions/mean_terminated_length": 12.175, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.374088846100949, "frac_reward_zero_std": 1.0, "grad_norm": 0.006140697747468948, "kl": 1.1052743911743164, "learning_rate": 4.3596986817325805e-07, "loss": 0.0011, "num_tokens": 6817578.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5440 }, { "completion_length": 11.975, "completions/clipped_ratio": 0.0, "completions/max_length": 15.6, "completions/max_terminated_length": 15.6, "completions/mean_length": 11.975, "completions/mean_terminated_length": 11.975, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.3747765094209875, "frac_reward_zero_std": 1.0, "grad_norm": 0.011798047460615635, "kl": 1.282853376865387, "learning_rate": 4.265536723163842e-07, "loss": 0.0013, "num_tokens": 6827773.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5450 }, { "completion_length": 12.475, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 12.475, "completions/mean_terminated_length": 12.475, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.375464172741026, "frac_reward_zero_std": 1.0, "grad_norm": 0.004385827574878931, "kl": 1.091407561302185, "learning_rate": 4.1713747645951044e-07, "loss": 0.0011, "num_tokens": 6840336.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5460 }, { "completion_length": 12.9, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 12.9, "completions/mean_terminated_length": 12.9, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.3761518360610645, "frac_reward_zero_std": 1.0, "grad_norm": 0.004174704663455486, "kl": 1.1986376702785493, "learning_rate": 4.0772128060263656e-07, "loss": 0.0012, "num_tokens": 6851460.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5470 }, { "completion_length": 12.65, "completions/clipped_ratio": 0.0, "completions/max_length": 20.7, "completions/max_terminated_length": 20.7, "completions/mean_length": 12.65, "completions/mean_terminated_length": 12.65, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.376839499381103, "frac_reward_zero_std": 0.9, "grad_norm": 0.00416609225794673, "kl": 1.216666615009308, "learning_rate": 3.983050847457627e-07, "loss": 0.0012, "num_tokens": 6862858.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5480 }, { "completion_length": 10.725, "completions/clipped_ratio": 0.0, "completions/max_length": 13.6, "completions/max_terminated_length": 13.6, "completions/mean_length": 10.725, "completions/mean_terminated_length": 10.725, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.3775271627011415, "frac_reward_zero_std": 1.0, "grad_norm": 0.008793276734650135, "kl": 1.2551342070102691, "learning_rate": 3.8888888888888895e-07, "loss": 0.0013, "num_tokens": 6873611.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5490 }, { "completion_length": 12.025, "completions/clipped_ratio": 0.0, "completions/max_length": 15.8, "completions/max_terminated_length": 15.8, "completions/mean_length": 12.025, "completions/mean_terminated_length": 12.025, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.37821482602118, "frac_reward_zero_std": 1.0, "grad_norm": 0.004354639444500208, "kl": 1.259047031402588, "learning_rate": 3.7947269303201506e-07, "loss": 0.0013, "num_tokens": 6885284.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5500 }, { "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 60.3, "completions/max_terminated_length": 60.3, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.37890248934121856, "frac_reward_zero_std": 0.9, "grad_norm": 0.003756628604605794, "kl": 1.2296605169773103, "learning_rate": 3.700564971751413e-07, "loss": 0.0012, "num_tokens": 6898330.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5510 }, { "completion_length": 13.475, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 13.475, "completions/mean_terminated_length": 13.475, "completions/min_length": 9.8, "completions/min_terminated_length": 9.8, "epoch": 0.37959015266125706, "frac_reward_zero_std": 1.0, "grad_norm": 0.011304677464067936, "kl": 1.1572206735610961, "learning_rate": 3.6064030131826745e-07, "loss": 0.0012, "num_tokens": 6909365.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5520 }, { "completion_length": 12.85, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 12.85, "completions/mean_terminated_length": 12.85, "completions/min_length": 9.6, "completions/min_terminated_length": 9.6, "epoch": 0.38027781598129556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032178533729165792, "kl": 1.1290329098701477, "learning_rate": 3.512241054613937e-07, "loss": 0.0011, "num_tokens": 6921611.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5530 }, { "completion_length": 12.95, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 12.95, "completions/mean_terminated_length": 12.95, "completions/min_length": 9.6, "completions/min_terminated_length": 9.6, "epoch": 0.38096547930133406, "frac_reward_zero_std": 1.0, "grad_norm": 0.00934622623026371, "kl": 1.1119795739650726, "learning_rate": 3.418079096045198e-07, "loss": 0.0011, "num_tokens": 6933669.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5540 }, { "completion_length": 12.4, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 12.4, "completions/mean_terminated_length": 12.4, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.38165314262137257, "frac_reward_zero_std": 1.0, "grad_norm": 0.007171685341745615, "kl": 1.3122216761112213, "learning_rate": 3.3239171374764596e-07, "loss": 0.0013, "num_tokens": 6945577.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5550 }, { "completion_length": 42.975, "completions/clipped_ratio": 0.0, "completions/max_length": 135.7, "completions/max_terminated_length": 135.7, "completions/mean_length": 42.975, "completions/mean_terminated_length": 42.975, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.38234080594141107, "frac_reward_zero_std": 0.7, "grad_norm": 0.021711384877562523, "kl": 1.1385715395212173, "learning_rate": 3.229755178907722e-07, "loss": 0.0011, "num_tokens": 6958564.0, "reward": 5.9125, "reward_std": 0.175, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.45, "rewards/check_response_quality/std": 0.1, "rewards/match_format_approximately/mean": 0.9625, "rewards/match_format_approximately/std": 0.075, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5560 }, { "completion_length": 40.675, "completions/clipped_ratio": 0.0, "completions/max_length": 127.9, "completions/max_terminated_length": 127.9, "completions/mean_length": 40.675, "completions/mean_terminated_length": 40.675, "completions/min_length": 10.5, "completions/min_terminated_length": 10.5, "epoch": 0.38302846926144957, "frac_reward_zero_std": 0.7, "grad_norm": 0.0039871432818472385, "kl": 1.0618961095809936, "learning_rate": 3.135593220338983e-07, "loss": 0.0011, "num_tokens": 6971879.0, "reward": 5.9, "reward_std": 0.2, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4625, "rewards/check_response_quality/std": 0.075, "rewards/match_format_approximately/mean": 0.9625, "rewards/match_format_approximately/std": 0.075, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5570 }, { "completion_length": 12.3, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 12.3, "completions/mean_terminated_length": 12.3, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3837161325814881, "frac_reward_zero_std": 1.0, "grad_norm": 0.004354615230113268, "kl": 1.0903443574905396, "learning_rate": 3.041431261770245e-07, "loss": 0.0011, "num_tokens": 6982067.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5580 }, { "completion_length": 12.175, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 12.175, "completions/mean_terminated_length": 12.175, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.3844037959015266, "frac_reward_zero_std": 1.0, "grad_norm": 0.006406493950635195, "kl": 1.113939094543457, "learning_rate": 2.947269303201507e-07, "loss": 0.0011, "num_tokens": 6994550.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5590 }, { "completion_length": 12.4, "completions/clipped_ratio": 0.0, "completions/max_length": 17.1, "completions/max_terminated_length": 17.1, "completions/mean_length": 12.4, "completions/mean_terminated_length": 12.4, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.3850914592215651, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028774929232895374, "kl": 1.1482871413230895, "learning_rate": 2.8531073446327686e-07, "loss": 0.0011, "num_tokens": 7007206.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5600 }, { "completion_length": 25.925, "completions/clipped_ratio": 0.0, "completions/max_length": 68.3, "completions/max_terminated_length": 68.3, "completions/mean_length": 25.925, "completions/mean_terminated_length": 25.925, "completions/min_length": 9.9, "completions/min_terminated_length": 9.9, "epoch": 0.3857791225416036, "frac_reward_zero_std": 0.8, "grad_norm": 0.004740417003631592, "kl": 1.0405526280403137, "learning_rate": 2.75894538606403e-07, "loss": 0.001, "num_tokens": 7019915.0, "reward": 5.925, "reward_std": 0.15, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5610 }, { "completion_length": 17.9, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 17.9, "completions/mean_terminated_length": 17.9, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.3864667858616421, "frac_reward_zero_std": 0.9, "grad_norm": 0.014549007639288902, "kl": 1.0921140372753144, "learning_rate": 2.6647834274952925e-07, "loss": 0.0011, "num_tokens": 7034123.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5620 }, { "completion_length": 25.325, "completions/clipped_ratio": 0.0, "completions/max_length": 67.2, "completions/max_terminated_length": 67.2, "completions/mean_length": 25.325, "completions/mean_terminated_length": 25.325, "completions/min_length": 9.6, "completions/min_terminated_length": 9.6, "epoch": 0.3871544491816806, "frac_reward_zero_std": 0.7, "grad_norm": 0.002857876941561699, "kl": 1.0279980540275573, "learning_rate": 2.5706214689265536e-07, "loss": 0.001, "num_tokens": 7048884.0, "reward": 5.9, "reward_std": 0.2, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4625, "rewards/check_response_quality/std": 0.075, "rewards/match_format_approximately/mean": 0.9625, "rewards/match_format_approximately/std": 0.075, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5630 }, { "completion_length": 22.8, "completions/clipped_ratio": 0.0, "completions/max_length": 59.2, "completions/max_terminated_length": 59.2, "completions/mean_length": 22.8, "completions/mean_terminated_length": 22.8, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3878421125017192, "frac_reward_zero_std": 0.9, "grad_norm": 0.005176835227757692, "kl": 1.3074536800384522, "learning_rate": 2.4764595103578153e-07, "loss": 0.0013, "num_tokens": 7061932.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5640 }, { "completion_length": 11.6, "completions/clipped_ratio": 0.0, "completions/max_length": 14.5, "completions/max_terminated_length": 14.5, "completions/mean_length": 11.6, "completions/mean_terminated_length": 11.6, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.3885297758217577, "frac_reward_zero_std": 1.0, "grad_norm": 0.012251128442585468, "kl": 1.2243261456489563, "learning_rate": 2.3822975517890773e-07, "loss": 0.0012, "num_tokens": 7074416.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5650 }, { "completion_length": 11.85, "completions/clipped_ratio": 0.0, "completions/max_length": 14.3, "completions/max_terminated_length": 14.3, "completions/mean_length": 11.85, "completions/mean_terminated_length": 11.85, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.3892174391417962, "frac_reward_zero_std": 1.0, "grad_norm": 0.006726069375872612, "kl": 2.5275397300720215, "learning_rate": 2.2881355932203392e-07, "loss": 0.0025, "num_tokens": 7087462.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5660 }, { "completion_length": 12.1, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 12.1, "completions/mean_terminated_length": 12.1, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.3899051024618347, "frac_reward_zero_std": 1.0, "grad_norm": 0.004947757348418236, "kl": 1.1592617809772492, "learning_rate": 2.193973634651601e-07, "loss": 0.0012, "num_tokens": 7100302.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5670 }, { "completion_length": 14.325, "completions/clipped_ratio": 0.0, "completions/max_length": 22.6, "completions/max_terminated_length": 22.6, "completions/mean_length": 14.325, "completions/mean_terminated_length": 14.325, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.3905927657818732, "frac_reward_zero_std": 0.9, "grad_norm": 0.03539891913533211, "kl": 1.1650193750858306, "learning_rate": 2.0998116760828628e-07, "loss": 0.0012, "num_tokens": 7112795.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5680 }, { "completion_length": 19.675, "completions/clipped_ratio": 0.0, "completions/max_length": 45.7, "completions/max_terminated_length": 45.7, "completions/mean_length": 19.675, "completions/mean_terminated_length": 19.675, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3912804291019117, "frac_reward_zero_std": 0.7, "grad_norm": 1.8668094873428345, "kl": 1.1627625286579133, "learning_rate": 2.0056497175141243e-07, "loss": 0.0012, "num_tokens": 7125062.0, "reward": 5.925, "reward_std": 0.15, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.1, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5690 }, { "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.4, "completions/max_terminated_length": 15.4, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.3919680924219502, "frac_reward_zero_std": 1.0, "grad_norm": 0.00394619582220912, "kl": 1.174496626853943, "learning_rate": 1.9114877589453862e-07, "loss": 0.0012, "num_tokens": 7136598.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5700 }, { "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.7, "completions/min_terminated_length": 9.7, "epoch": 0.39265575574198874, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035702527966350317, "kl": 1.0594860434532165, "learning_rate": 1.817325800376648e-07, "loss": 0.0011, "num_tokens": 7148570.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5710 }, { "completion_length": 11.025, "completions/clipped_ratio": 0.0, "completions/max_length": 15.8, "completions/max_terminated_length": 15.8, "completions/mean_length": 11.025, "completions/mean_terminated_length": 11.025, "completions/min_length": 7.8, "completions/min_terminated_length": 7.8, "epoch": 0.39334341906202724, "frac_reward_zero_std": 1.0, "grad_norm": 0.005145955365151167, "kl": 1.3463842570781708, "learning_rate": 1.7231638418079099e-07, "loss": 0.0013, "num_tokens": 7159799.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5720 }, { "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 9.7, "completions/min_terminated_length": 9.7, "epoch": 0.39403108238206574, "frac_reward_zero_std": 1.0, "grad_norm": 0.027580684050917625, "kl": 1.1848535418510437, "learning_rate": 1.6290018832391715e-07, "loss": 0.0012, "num_tokens": 7173311.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5730 }, { "completion_length": 13.3, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 13.3, "completions/mean_terminated_length": 13.3, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.39471874570210425, "frac_reward_zero_std": 0.9, "grad_norm": 0.00894416868686676, "kl": 1.2177072703838348, "learning_rate": 1.5348399246704332e-07, "loss": 0.0012, "num_tokens": 7185047.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5740 }, { "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 53.7, "completions/max_terminated_length": 53.7, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.39540640902214275, "frac_reward_zero_std": 0.9, "grad_norm": 0.009776029735803604, "kl": 0.933041188120842, "learning_rate": 1.440677966101695e-07, "loss": 0.0009, "num_tokens": 7198301.0, "reward": 5.9, "reward_std": 0.1154700517654419, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.05773502588272095, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.028867512941360474, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.028867512941360474, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5750 }, { "completion_length": 29.575, "completions/clipped_ratio": 0.0, "completions/max_length": 83.1, "completions/max_terminated_length": 83.1, "completions/mean_length": 29.575, "completions/mean_terminated_length": 29.575, "completions/min_length": 9.8, "completions/min_terminated_length": 9.8, "epoch": 0.39609407234218125, "frac_reward_zero_std": 0.9, "grad_norm": 0.004383026156574488, "kl": 1.0633900821208955, "learning_rate": 1.346516007532957e-07, "loss": 0.0011, "num_tokens": 7212852.0, "reward": 5.925, "reward_std": 0.09574271440505981, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.05773502588272095, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5760 }, { "completion_length": 13.475, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 13.475, "completions/mean_terminated_length": 13.475, "completions/min_length": 9.8, "completions/min_terminated_length": 9.8, "epoch": 0.3967817356622198, "frac_reward_zero_std": 1.0, "grad_norm": 0.026582496240735054, "kl": 0.9946746349334716, "learning_rate": 1.2523540489642186e-07, "loss": 0.001, "num_tokens": 7225491.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5770 }, { "completion_length": 11.55, "completions/clipped_ratio": 0.0, "completions/max_length": 14.5, "completions/max_terminated_length": 14.5, "completions/mean_length": 11.55, "completions/mean_terminated_length": 11.55, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3974693989822583, "frac_reward_zero_std": 1.0, "grad_norm": 0.006764892488718033, "kl": 1.2381967782974244, "learning_rate": 1.1581920903954804e-07, "loss": 0.0012, "num_tokens": 7238005.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5780 }, { "completion_length": 12.475, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 12.475, "completions/mean_terminated_length": 12.475, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.3981570623022968, "frac_reward_zero_std": 1.0, "grad_norm": 0.009114019572734833, "kl": 1.2961783528327941, "learning_rate": 1.0640301318267422e-07, "loss": 0.0013, "num_tokens": 7249244.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5790 }, { "completion_length": 11.9, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 11.9, "completions/mean_terminated_length": 11.9, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.3988447256223353, "frac_reward_zero_std": 1.0, "grad_norm": 0.004210586193948984, "kl": 1.234499990940094, "learning_rate": 9.698681732580038e-08, "loss": 0.0012, "num_tokens": 7261796.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5800 }, { "completion_length": 11.725, "completions/clipped_ratio": 0.0, "completions/max_length": 16.6, "completions/max_terminated_length": 16.6, "completions/mean_length": 11.725, "completions/mean_terminated_length": 11.725, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.3995323889423738, "frac_reward_zero_std": 1.0, "grad_norm": 0.005776832811534405, "kl": 1.5726662933826447, "learning_rate": 8.757062146892656e-08, "loss": 0.0016, "num_tokens": 7272721.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5810 }, { "completion_length": 11.35, "completions/clipped_ratio": 0.0, "completions/max_length": 14.6, "completions/max_terminated_length": 14.6, "completions/mean_length": 11.35, "completions/mean_terminated_length": 11.35, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.4002200522624123, "frac_reward_zero_std": 1.0, "grad_norm": 0.009506648406386375, "kl": 1.2478809118270875, "learning_rate": 7.815442561205274e-08, "loss": 0.0012, "num_tokens": 7282155.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5820 }, { "completion_length": 11.8, "completions/clipped_ratio": 0.0, "completions/max_length": 14.4, "completions/max_terminated_length": 14.4, "completions/mean_length": 11.8, "completions/mean_terminated_length": 11.8, "completions/min_length": 9.8, "completions/min_terminated_length": 9.8, "epoch": 0.4009077155824508, "frac_reward_zero_std": 1.0, "grad_norm": 0.004437907133251429, "kl": 1.7026907682418824, "learning_rate": 6.873822975517891e-08, "loss": 0.0017, "num_tokens": 7294895.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5830 }, { "completion_length": 11.875, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.875, "completions/mean_terminated_length": 11.875, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.40159537890248936, "frac_reward_zero_std": 1.0, "grad_norm": 0.00963292270898819, "kl": 1.1885063588619231, "learning_rate": 5.932203389830509e-08, "loss": 0.0012, "num_tokens": 7304270.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5840 }, { "completion_length": 13.025, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 13.025, "completions/mean_terminated_length": 13.025, "completions/min_length": 9.7, "completions/min_terminated_length": 9.7, "epoch": 0.40228304222252786, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034977158065885305, "kl": 1.1110076546669005, "learning_rate": 4.9905838041431265e-08, "loss": 0.0011, "num_tokens": 7315871.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5850 }, { "completion_length": 30.975, "completions/clipped_ratio": 0.0, "completions/max_length": 90.9, "completions/max_terminated_length": 90.9, "completions/mean_length": 30.975, "completions/mean_terminated_length": 30.975, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.40297070554256637, "frac_reward_zero_std": 0.9, "grad_norm": 0.00785754807293415, "kl": 1.146417075395584, "learning_rate": 4.048964218455744e-08, "loss": 0.0011, "num_tokens": 7329542.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5860 }, { "completion_length": 56.375, "completions/clipped_ratio": 0.0, "completions/max_length": 189.6, "completions/max_terminated_length": 189.6, "completions/mean_length": 56.375, "completions/mean_terminated_length": 56.375, "completions/min_length": 10.2, "completions/min_terminated_length": 10.2, "epoch": 0.40365836886260487, "frac_reward_zero_std": 0.7, "grad_norm": 1.7976064682006836, "kl": 1.1094903528690339, "learning_rate": 3.107344632768362e-08, "loss": 0.0011, "num_tokens": 7343725.0, "reward": 5.925, "reward_std": 0.15, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4625, "rewards/check_response_quality/std": 0.075, "rewards/match_format_approximately/mean": 0.9625, "rewards/match_format_approximately/std": 0.075, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5870 }, { "completion_length": 12.7, "completions/clipped_ratio": 0.0, "completions/max_length": 16.2, "completions/max_terminated_length": 16.2, "completions/mean_length": 12.7, "completions/mean_terminated_length": 12.7, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.40434603218264337, "frac_reward_zero_std": 1.0, "grad_norm": 0.006050520576536655, "kl": 1.1388359487056732, "learning_rate": 2.1657250470809794e-08, "loss": 0.0011, "num_tokens": 7356937.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5880 }, { "completion_length": 16.025, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 16.025, "completions/mean_terminated_length": 16.025, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.40503369550268187, "frac_reward_zero_std": 0.9, "grad_norm": 4.21384334564209, "kl": 1.2012366831302643, "learning_rate": 1.2241054613935971e-08, "loss": 0.0012, "num_tokens": 7370634.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5890 }, { "completion_length": 12.95, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 12.95, "completions/mean_terminated_length": 12.95, "completions/min_length": 9.8, "completions/min_terminated_length": 9.8, "epoch": 0.4057213588227204, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032482228707522154, "kl": 1.1373628199100494, "learning_rate": 2.8248587570621472e-09, "loss": 0.0011, "num_tokens": 7382464.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5900 }, { "completion_length": 15.8, "completions/clipped_ratio": 0.0, "completions/max_length": 31.8, "completions/max_terminated_length": 31.8, "completions/mean_length": 15.8, "completions/mean_terminated_length": 15.8, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.4064090221427589, "frac_reward_zero_std": 0.9, "grad_norm": 0.007901502773165703, "kl": 1.0803104102611543, "learning_rate": 2.9302222222222227e-06, "loss": 0.0011, "num_tokens": 7394784.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5910 }, { "completion_length": 11.9, "completions/clipped_ratio": 0.0, "completions/max_length": 14.8, "completions/max_terminated_length": 14.8, "completions/mean_length": 11.9, "completions/mean_terminated_length": 11.9, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.4070966854627974, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035292813554406166, "kl": 1.12642440199852, "learning_rate": 2.925777777777778e-06, "loss": 0.0011, "num_tokens": 7406184.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5920 }, { "completion_length": 12.65, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 12.65, "completions/mean_terminated_length": 12.65, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.4077843487828359, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023179894778877497, "kl": 1.2278540432453156, "learning_rate": 2.9213333333333337e-06, "loss": 0.0012, "num_tokens": 7419590.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5930 }, { "completion_length": 19.65, "completions/clipped_ratio": 0.0, "completions/max_length": 49.9, "completions/max_terminated_length": 49.9, "completions/mean_length": 19.65, "completions/mean_terminated_length": 19.65, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.4084720121028744, "frac_reward_zero_std": 0.9, "grad_norm": 0.005237962584942579, "kl": 1.34213387966156, "learning_rate": 2.916888888888889e-06, "loss": 0.0013, "num_tokens": 7430864.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5940 }, { "completion_length": 12.275, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 12.275, "completions/mean_terminated_length": 12.275, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.4091596754229129, "frac_reward_zero_std": 1.0, "grad_norm": 0.05530301854014397, "kl": 1.1015527904033662, "learning_rate": 2.9124444444444442e-06, "loss": 0.0011, "num_tokens": 7443395.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5950 }, { "completion_length": 13.675, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 13.675, "completions/mean_terminated_length": 13.675, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.40984733874295143, "frac_reward_zero_std": 0.9, "grad_norm": 0.0057378592900931835, "kl": 1.3134727358818055, "learning_rate": 2.9080000000000004e-06, "loss": 0.0013, "num_tokens": 7457046.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5960 }, { "completion_length": 11.825, "completions/clipped_ratio": 0.0, "completions/max_length": 13.7, "completions/max_terminated_length": 13.7, "completions/mean_length": 11.825, "completions/mean_terminated_length": 11.825, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.41053500206299, "frac_reward_zero_std": 1.0, "grad_norm": 0.09183824807405472, "kl": 1.3441233158111572, "learning_rate": 2.9035555555555556e-06, "loss": 0.0013, "num_tokens": 7468995.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5970 }, { "completion_length": 11.225, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 11.225, "completions/mean_terminated_length": 11.225, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.4112226653830285, "frac_reward_zero_std": 1.0, "grad_norm": 0.00760465394705534, "kl": 1.5161000728607177, "learning_rate": 2.8991111111111113e-06, "loss": 0.0015, "num_tokens": 7481972.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5980 }, { "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.8, "completions/max_terminated_length": 22.8, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.411910328703067, "frac_reward_zero_std": 1.0, "grad_norm": 0.004714191425591707, "kl": 1.405648809671402, "learning_rate": 2.8946666666666666e-06, "loss": 0.0014, "num_tokens": 7494194.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 5990 }, { "completion_length": 17.175, "completions/clipped_ratio": 0.0, "completions/max_length": 39.2, "completions/max_terminated_length": 39.2, "completions/mean_length": 17.175, "completions/mean_terminated_length": 17.175, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.4125979920231055, "frac_reward_zero_std": 0.8, "grad_norm": 2.1586594581604004, "kl": 1.4109269559383393, "learning_rate": 2.8902222222222227e-06, "loss": 0.0014, "num_tokens": 7505437.0, "reward": 5.925, "reward_std": 0.15, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6000 }, { "completion_length": 24.45, "completions/clipped_ratio": 0.0, "completions/max_length": 68.7, "completions/max_terminated_length": 68.7, "completions/mean_length": 24.45, "completions/mean_terminated_length": 24.45, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.413285655343144, "frac_reward_zero_std": 0.9, "grad_norm": 0.041129227727651596, "kl": 1.3110809564590453, "learning_rate": 2.885777777777778e-06, "loss": 0.0013, "num_tokens": 7518291.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6010 }, { "completion_length": 11.525, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 11.525, "completions/mean_terminated_length": 11.525, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.4139733186631825, "frac_reward_zero_std": 1.0, "grad_norm": 0.004482824355363846, "kl": 1.3215213119983673, "learning_rate": 2.8813333333333337e-06, "loss": 0.0013, "num_tokens": 7530908.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6020 }, { "completion_length": 10.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 10.8, "completions/mean_terminated_length": 10.8, "completions/min_length": 7.1, "completions/min_terminated_length": 7.1, "epoch": 0.414660981983221, "frac_reward_zero_std": 0.9, "grad_norm": 0.004378788638859987, "kl": 1.6575611710548401, "learning_rate": 2.876888888888889e-06, "loss": 0.0017, "num_tokens": 7542956.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6030 }, { "completion_length": 9.875, "completions/clipped_ratio": 0.0, "completions/max_length": 12.3, "completions/max_terminated_length": 12.3, "completions/mean_length": 9.875, "completions/mean_terminated_length": 9.875, "completions/min_length": 7.8, "completions/min_terminated_length": 7.8, "epoch": 0.41534864530325954, "frac_reward_zero_std": 1.0, "grad_norm": 0.004252531565725803, "kl": 1.6062816500663757, "learning_rate": 2.872444444444445e-06, "loss": 0.0016, "num_tokens": 7555015.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6040 }, { "completion_length": 9.15, "completions/clipped_ratio": 0.0, "completions/max_length": 11.3, "completions/max_terminated_length": 11.3, "completions/mean_length": 9.15, "completions/mean_terminated_length": 9.15, "completions/min_length": 7.1, "completions/min_terminated_length": 7.1, "epoch": 0.41603630862329805, "frac_reward_zero_std": 1.0, "grad_norm": 0.008693459443747997, "kl": 1.7664753794670105, "learning_rate": 2.8680000000000003e-06, "loss": 0.0018, "num_tokens": 7567833.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6050 }, { "completion_length": 22.175, "completions/clipped_ratio": 0.0, "completions/max_length": 61.1, "completions/max_terminated_length": 61.1, "completions/mean_length": 22.175, "completions/mean_terminated_length": 22.175, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.41672397194333655, "frac_reward_zero_std": 0.9, "grad_norm": 0.025784730911254883, "kl": 1.5375929474830627, "learning_rate": 2.8635555555555556e-06, "loss": 0.0015, "num_tokens": 7580336.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6060 }, { "completion_length": 10.9, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.9, "completions/mean_terminated_length": 10.9, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.41741163526337505, "frac_reward_zero_std": 1.0, "grad_norm": 0.023190055042505264, "kl": 1.4921037673950195, "learning_rate": 2.8591111111111113e-06, "loss": 0.0015, "num_tokens": 7591768.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6070 }, { "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.8, "completions/max_terminated_length": 13.8, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 7.4, "completions/min_terminated_length": 7.4, "epoch": 0.41809929858341355, "frac_reward_zero_std": 1.0, "grad_norm": 0.01030003372579813, "kl": 1.9349348664283752, "learning_rate": 2.8546666666666666e-06, "loss": 0.0019, "num_tokens": 7603882.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6080 }, { "completion_length": 10.95, "completions/clipped_ratio": 0.0, "completions/max_length": 15.1, "completions/max_terminated_length": 15.1, "completions/mean_length": 10.95, "completions/mean_terminated_length": 10.95, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.41878696190345205, "frac_reward_zero_std": 1.0, "grad_norm": 0.006510532461106777, "kl": 1.429217952489853, "learning_rate": 2.8502222222222227e-06, "loss": 0.0014, "num_tokens": 7617252.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6090 }, { "completion_length": 10.875, "completions/clipped_ratio": 0.0, "completions/max_length": 13.9, "completions/max_terminated_length": 13.9, "completions/mean_length": 10.875, "completions/mean_terminated_length": 10.875, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.4194746252234906, "frac_reward_zero_std": 1.0, "grad_norm": 0.005413992330431938, "kl": 1.5536803781986237, "learning_rate": 2.845777777777778e-06, "loss": 0.0016, "num_tokens": 7629711.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6100 }, { "completion_length": 10.7, "completions/clipped_ratio": 0.0, "completions/max_length": 14.5, "completions/max_terminated_length": 14.5, "completions/mean_length": 10.7, "completions/mean_terminated_length": 10.7, "completions/min_length": 7.8, "completions/min_terminated_length": 7.8, "epoch": 0.4201622885435291, "frac_reward_zero_std": 1.0, "grad_norm": 0.004487996455281973, "kl": 1.3762087106704712, "learning_rate": 2.8413333333333336e-06, "loss": 0.0014, "num_tokens": 7642731.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6110 }, { "completion_length": 9.45, "completions/clipped_ratio": 0.0, "completions/max_length": 12.3, "completions/max_terminated_length": 12.3, "completions/mean_length": 9.45, "completions/mean_terminated_length": 9.45, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.4208499518635676, "frac_reward_zero_std": 1.0, "grad_norm": 0.016771214082837105, "kl": 1.5483428835868835, "learning_rate": 2.836888888888889e-06, "loss": 0.0015, "num_tokens": 7655069.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6120 }, { "completion_length": 10.6, "completions/clipped_ratio": 0.0, "completions/max_length": 16.3, "completions/max_terminated_length": 16.3, "completions/mean_length": 10.6, "completions/mean_terminated_length": 10.6, "completions/min_length": 6.9, "completions/min_terminated_length": 6.9, "epoch": 0.4215376151836061, "frac_reward_zero_std": 1.0, "grad_norm": 0.007195493672043085, "kl": 1.6185973286628723, "learning_rate": 2.832444444444445e-06, "loss": 0.0016, "num_tokens": 7667193.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6130 }, { "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.6, "completions/max_terminated_length": 14.6, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 7.6, "completions/min_terminated_length": 7.6, "epoch": 0.4222252785036446, "frac_reward_zero_std": 1.0, "grad_norm": 0.00659464905038476, "kl": 1.662790560722351, "learning_rate": 2.8280000000000003e-06, "loss": 0.0017, "num_tokens": 7679765.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6140 }, { "completion_length": 10.425, "completions/clipped_ratio": 0.0, "completions/max_length": 12.7, "completions/max_terminated_length": 12.7, "completions/mean_length": 10.425, "completions/mean_terminated_length": 10.425, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.4229129418236831, "frac_reward_zero_std": 1.0, "grad_norm": 0.015074046328663826, "kl": 1.298115348815918, "learning_rate": 2.823555555555556e-06, "loss": 0.0013, "num_tokens": 7690646.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6150 }, { "completion_length": 9.8, "completions/clipped_ratio": 0.0, "completions/max_length": 12.9, "completions/max_terminated_length": 12.9, "completions/mean_length": 9.8, "completions/mean_terminated_length": 9.8, "completions/min_length": 6.9, "completions/min_terminated_length": 6.9, "epoch": 0.4236006051437216, "frac_reward_zero_std": 1.0, "grad_norm": 0.03453448787331581, "kl": 1.4956003904342652, "learning_rate": 2.8191111111111112e-06, "loss": 0.0015, "num_tokens": 7701978.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6160 }, { "completion_length": 10.8, "completions/clipped_ratio": 0.0, "completions/max_length": 14.5, "completions/max_terminated_length": 14.5, "completions/mean_length": 10.8, "completions/mean_terminated_length": 10.8, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.42428826846376017, "frac_reward_zero_std": 1.0, "grad_norm": 0.02794322744011879, "kl": 1.452487486600876, "learning_rate": 2.8146666666666665e-06, "loss": 0.0015, "num_tokens": 7715074.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6170 }, { "completion_length": 10.05, "completions/clipped_ratio": 0.0, "completions/max_length": 12.5, "completions/max_terminated_length": 12.5, "completions/mean_length": 10.05, "completions/mean_terminated_length": 10.05, "completions/min_length": 7.7, "completions/min_terminated_length": 7.7, "epoch": 0.42497593178379867, "frac_reward_zero_std": 1.0, "grad_norm": 0.011381459422409534, "kl": 1.4053172051906586, "learning_rate": 2.8102222222222226e-06, "loss": 0.0014, "num_tokens": 7726628.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6180 }, { "completion_length": 10.925, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.925, "completions/mean_terminated_length": 10.925, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.42566359510383717, "frac_reward_zero_std": 1.0, "grad_norm": 0.003446686314418912, "kl": 1.3329946517944335, "learning_rate": 2.805777777777778e-06, "loss": 0.0013, "num_tokens": 7739049.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6190 }, { "completion_length": 11.125, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.125, "completions/mean_terminated_length": 11.125, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.42635125842387567, "frac_reward_zero_std": 0.9, "grad_norm": 0.018087368458509445, "kl": 1.4069275319576264, "learning_rate": 2.8013333333333336e-06, "loss": 0.0014, "num_tokens": 7750026.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6200 }, { "completion_length": 9.95, "completions/clipped_ratio": 0.0, "completions/max_length": 13.4, "completions/max_terminated_length": 13.4, "completions/mean_length": 9.95, "completions/mean_terminated_length": 9.95, "completions/min_length": 7.1, "completions/min_terminated_length": 7.1, "epoch": 0.42703892174391417, "frac_reward_zero_std": 1.0, "grad_norm": 0.006026132497936487, "kl": 1.6816380977630616, "learning_rate": 2.796888888888889e-06, "loss": 0.0017, "num_tokens": 7761324.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6210 }, { "completion_length": 12.05, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 12.05, "completions/mean_terminated_length": 12.05, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.42772658506395267, "frac_reward_zero_std": 1.0, "grad_norm": 0.004558009561151266, "kl": 1.3497009932994843, "learning_rate": 2.792444444444445e-06, "loss": 0.0013, "num_tokens": 7775018.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6220 }, { "completion_length": 11.725, "completions/clipped_ratio": 0.0, "completions/max_length": 16.3, "completions/max_terminated_length": 16.3, "completions/mean_length": 11.725, "completions/mean_terminated_length": 11.725, "completions/min_length": 7.9, "completions/min_terminated_length": 7.9, "epoch": 0.4284142483839912, "frac_reward_zero_std": 1.0, "grad_norm": 0.003908331040292978, "kl": 1.3773386299610137, "learning_rate": 2.7880000000000002e-06, "loss": 0.0014, "num_tokens": 7785839.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6230 }, { "completion_length": 11.925, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 11.925, "completions/mean_terminated_length": 11.925, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.4291019117040297, "frac_reward_zero_std": 0.9, "grad_norm": 0.0031500409822911024, "kl": 1.2545689225196839, "learning_rate": 2.783555555555556e-06, "loss": 0.0013, "num_tokens": 7797700.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6240 }, { "completion_length": 11.35, "completions/clipped_ratio": 0.0, "completions/max_length": 15.7, "completions/max_terminated_length": 15.7, "completions/mean_length": 11.35, "completions/mean_terminated_length": 11.35, "completions/min_length": 7.7, "completions/min_terminated_length": 7.7, "epoch": 0.4297895750240682, "frac_reward_zero_std": 0.9, "grad_norm": 0.004434330854564905, "kl": 1.3473502099514008, "learning_rate": 2.779111111111111e-06, "loss": 0.0013, "num_tokens": 7810890.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6250 }, { "completion_length": 12.175, "completions/clipped_ratio": 0.0, "completions/max_length": 14.9, "completions/max_terminated_length": 14.9, "completions/mean_length": 12.175, "completions/mean_terminated_length": 12.175, "completions/min_length": 9.9, "completions/min_terminated_length": 9.9, "epoch": 0.4304772383441067, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038921276573091745, "kl": 1.2112794041633606, "learning_rate": 2.7746666666666665e-06, "loss": 0.0012, "num_tokens": 7821885.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6260 }, { "completion_length": 10.9, "completions/clipped_ratio": 0.0, "completions/max_length": 13.5, "completions/max_terminated_length": 13.5, "completions/mean_length": 10.9, "completions/mean_terminated_length": 10.9, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.43116490166414523, "frac_reward_zero_std": 1.0, "grad_norm": 0.017070645466446877, "kl": 1.3707080781459808, "learning_rate": 2.7702222222222226e-06, "loss": 0.0014, "num_tokens": 7833037.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6270 }, { "completion_length": 11.8, "completions/clipped_ratio": 0.0, "completions/max_length": 15.6, "completions/max_terminated_length": 15.6, "completions/mean_length": 11.8, "completions/mean_terminated_length": 11.8, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.43185256498418373, "frac_reward_zero_std": 1.0, "grad_norm": 0.0044752685353159904, "kl": 1.2199202120304107, "learning_rate": 2.765777777777778e-06, "loss": 0.0012, "num_tokens": 7847013.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6280 }, { "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.2, "completions/max_terminated_length": 14.2, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.43254022830422223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020100600086152554, "kl": 1.2238180756568908, "learning_rate": 2.7613333333333335e-06, "loss": 0.0012, "num_tokens": 7859531.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6290 }, { "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.6, "completions/max_terminated_length": 68.6, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.4332278916242608, "frac_reward_zero_std": 0.9, "grad_norm": 2.2058627605438232, "kl": 1.1081747114658356, "learning_rate": 2.756888888888889e-06, "loss": 0.0011, "num_tokens": 7873755.0, "reward": 5.925, "reward_std": 0.09574271440505981, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.028867512941360474, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.028867512941360474, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6300 }, { "completion_length": 11.55, "completions/clipped_ratio": 0.0, "completions/max_length": 14.4, "completions/max_terminated_length": 14.4, "completions/mean_length": 11.55, "completions/mean_terminated_length": 11.55, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.4339155549442993, "frac_reward_zero_std": 1.0, "grad_norm": 0.003843477461487055, "kl": 1.2722107827663423, "learning_rate": 2.752444444444445e-06, "loss": 0.0013, "num_tokens": 7884969.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6310 }, { "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.9, "completions/max_terminated_length": 14.9, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 9.6, "completions/min_terminated_length": 9.6, "epoch": 0.4346032182643378, "frac_reward_zero_std": 1.0, "grad_norm": 0.005162264686077833, "kl": 1.1211694359779358, "learning_rate": 2.748e-06, "loss": 0.0011, "num_tokens": 7897721.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6320 }, { "completion_length": 11.825, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.825, "completions/mean_terminated_length": 11.825, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.4352908815843763, "frac_reward_zero_std": 1.0, "grad_norm": 0.001960468478500843, "kl": 1.2898571014404296, "learning_rate": 2.743555555555556e-06, "loss": 0.0013, "num_tokens": 7909910.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6330 }, { "completion_length": 13.7, "completions/clipped_ratio": 0.0, "completions/max_length": 22.1, "completions/max_terminated_length": 22.1, "completions/mean_length": 13.7, "completions/mean_terminated_length": 13.7, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.4359785449044148, "frac_reward_zero_std": 0.8, "grad_norm": 0.0032273821998387575, "kl": 1.246416300535202, "learning_rate": 2.739111111111111e-06, "loss": 0.0012, "num_tokens": 7922274.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6340 }, { "completion_length": 12.225, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 12.225, "completions/mean_terminated_length": 12.225, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.4366662082244533, "frac_reward_zero_std": 1.0, "grad_norm": 0.00453966436907649, "kl": 1.3314736366271973, "learning_rate": 2.7346666666666673e-06, "loss": 0.0013, "num_tokens": 7933375.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6350 }, { "completion_length": 11.675, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 11.675, "completions/mean_terminated_length": 11.675, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.4373538715444918, "frac_reward_zero_std": 1.0, "grad_norm": 0.003801640821620822, "kl": 1.244855809211731, "learning_rate": 2.7302222222222225e-06, "loss": 0.0012, "num_tokens": 7944086.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6360 }, { "completion_length": 16.425, "completions/clipped_ratio": 0.0, "completions/max_length": 30.4, "completions/max_terminated_length": 30.4, "completions/mean_length": 16.425, "completions/mean_terminated_length": 16.425, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.43804153486453035, "frac_reward_zero_std": 0.9, "grad_norm": 0.0055861701257526875, "kl": 1.2101807296276093, "learning_rate": 2.725777777777778e-06, "loss": 0.0012, "num_tokens": 7957839.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6370 }, { "completion_length": 12.2, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 12.2, "completions/mean_terminated_length": 12.2, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.43872919818456885, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032540559768676758, "kl": 1.1290572345256806, "learning_rate": 2.7213333333333335e-06, "loss": 0.0011, "num_tokens": 7969519.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6380 }, { "completion_length": 11.625, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 11.625, "completions/mean_terminated_length": 11.625, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.43941686150460735, "frac_reward_zero_std": 1.0, "grad_norm": 0.007687001954764128, "kl": 1.2707543969154358, "learning_rate": 2.7168888888888888e-06, "loss": 0.0013, "num_tokens": 7982380.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6390 }, { "completion_length": 11.275, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 11.275, "completions/mean_terminated_length": 11.275, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.44010452482464585, "frac_reward_zero_std": 1.0, "grad_norm": 0.003866757033392787, "kl": 1.3291922807693481, "learning_rate": 2.712444444444445e-06, "loss": 0.0013, "num_tokens": 7994443.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6400 }, { "completion_length": 11.675, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 11.675, "completions/mean_terminated_length": 11.675, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.44079218814468435, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025106698740273714, "kl": 1.2189786791801454, "learning_rate": 2.708e-06, "loss": 0.0012, "num_tokens": 8005686.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6410 }, { "completion_length": 10.975, "completions/clipped_ratio": 0.0, "completions/max_length": 14.3, "completions/max_terminated_length": 14.3, "completions/mean_length": 10.975, "completions/mean_terminated_length": 10.975, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.44147985146472285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029226518236100674, "kl": 1.3308802485466003, "learning_rate": 2.703555555555556e-06, "loss": 0.0013, "num_tokens": 8018109.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6420 }, { "completion_length": 12.175, "completions/clipped_ratio": 0.0, "completions/max_length": 16.6, "completions/max_terminated_length": 16.6, "completions/mean_length": 12.175, "completions/mean_terminated_length": 12.175, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.4421675147847614, "frac_reward_zero_std": 1.0, "grad_norm": 0.0057080830447375774, "kl": 1.3752940356731416, "learning_rate": 2.699111111111111e-06, "loss": 0.0014, "num_tokens": 8032504.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6430 }, { "completion_length": 13.825, "completions/clipped_ratio": 0.0, "completions/max_length": 21.7, "completions/max_terminated_length": 21.7, "completions/mean_length": 13.825, "completions/mean_terminated_length": 13.825, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.4428551781047999, "frac_reward_zero_std": 1.0, "grad_norm": 0.004137388430535793, "kl": 1.1381323873996734, "learning_rate": 2.6946666666666672e-06, "loss": 0.0011, "num_tokens": 8044841.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6440 }, { "completion_length": 12.025, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 12.025, "completions/mean_terminated_length": 12.025, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.4435428414248384, "frac_reward_zero_std": 1.0, "grad_norm": 0.004201957024633884, "kl": 1.29626407623291, "learning_rate": 2.6902222222222225e-06, "loss": 0.0013, "num_tokens": 8056430.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6450 }, { "completion_length": 12.725, "completions/clipped_ratio": 0.0, "completions/max_length": 17.1, "completions/max_terminated_length": 17.1, "completions/mean_length": 12.725, "completions/mean_terminated_length": 12.725, "completions/min_length": 9.9, "completions/min_terminated_length": 9.9, "epoch": 0.4442305047448769, "frac_reward_zero_std": 1.0, "grad_norm": 0.004895097576081753, "kl": 1.070450747013092, "learning_rate": 2.685777777777778e-06, "loss": 0.0011, "num_tokens": 8068211.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6460 }, { "completion_length": 14.125, "completions/clipped_ratio": 0.0, "completions/max_length": 22.3, "completions/max_terminated_length": 22.3, "completions/mean_length": 14.125, "completions/mean_terminated_length": 14.125, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.4449181680649154, "frac_reward_zero_std": 0.9, "grad_norm": 0.007197881117463112, "kl": 1.2501220405101776, "learning_rate": 2.6813333333333335e-06, "loss": 0.0013, "num_tokens": 8080248.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6470 }, { "completion_length": 12.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 12.9, "completions/mean_terminated_length": 12.9, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.4456058313849539, "frac_reward_zero_std": 1.0, "grad_norm": 0.003812718205153942, "kl": 1.3214125633239746, "learning_rate": 2.6768888888888887e-06, "loss": 0.0013, "num_tokens": 8094288.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6480 }, { "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.4462934947049924, "frac_reward_zero_std": 1.0, "grad_norm": 0.003874816931784153, "kl": 1.3365242898464202, "learning_rate": 2.672444444444445e-06, "loss": 0.0013, "num_tokens": 8108670.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6490 }, { "completion_length": 13.275, "completions/clipped_ratio": 0.0, "completions/max_length": 21.5, "completions/max_terminated_length": 21.5, "completions/mean_length": 13.275, "completions/mean_terminated_length": 13.275, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.44698115802503097, "frac_reward_zero_std": 0.9, "grad_norm": 0.002750998130068183, "kl": 1.2364072024822235, "learning_rate": 2.668e-06, "loss": 0.0012, "num_tokens": 8123205.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6500 }, { "completion_length": 10.525, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.525, "completions/mean_terminated_length": 10.525, "completions/min_length": 7.6, "completions/min_terminated_length": 7.6, "epoch": 0.44766882134506947, "frac_reward_zero_std": 1.0, "grad_norm": 0.006352982483804226, "kl": 1.334709495306015, "learning_rate": 2.663555555555556e-06, "loss": 0.0013, "num_tokens": 8136234.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6510 }, { "completion_length": 11.175, "completions/clipped_ratio": 0.0, "completions/max_length": 14.7, "completions/max_terminated_length": 14.7, "completions/mean_length": 11.175, "completions/mean_terminated_length": 11.175, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.44835648466510797, "frac_reward_zero_std": 1.0, "grad_norm": 0.005306031089276075, "kl": 1.203692877292633, "learning_rate": 2.659111111111111e-06, "loss": 0.0012, "num_tokens": 8148841.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6520 }, { "completion_length": 10.475, "completions/clipped_ratio": 0.0, "completions/max_length": 13.4, "completions/max_terminated_length": 13.4, "completions/mean_length": 10.475, "completions/mean_terminated_length": 10.475, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.44904414798514647, "frac_reward_zero_std": 1.0, "grad_norm": 0.006456505972892046, "kl": 1.4366894364356995, "learning_rate": 2.654666666666667e-06, "loss": 0.0014, "num_tokens": 8161592.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6530 }, { "completion_length": 10.65, "completions/clipped_ratio": 0.0, "completions/max_length": 12.9, "completions/max_terminated_length": 12.9, "completions/mean_length": 10.65, "completions/mean_terminated_length": 10.65, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.44973181130518497, "frac_reward_zero_std": 1.0, "grad_norm": 0.00415951618924737, "kl": 1.2912186980247498, "learning_rate": 2.6502222222222225e-06, "loss": 0.0013, "num_tokens": 8174226.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6540 }, { "completion_length": 12.875, "completions/clipped_ratio": 0.0, "completions/max_length": 20.7, "completions/max_terminated_length": 20.7, "completions/mean_length": 12.875, "completions/mean_terminated_length": 12.875, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.45041947462522347, "frac_reward_zero_std": 0.9, "grad_norm": 0.005188601557165384, "kl": 1.2176654636859894, "learning_rate": 2.645777777777778e-06, "loss": 0.0012, "num_tokens": 8187109.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6550 }, { "completion_length": 11.025, "completions/clipped_ratio": 0.0, "completions/max_length": 13.6, "completions/max_terminated_length": 13.6, "completions/mean_length": 11.025, "completions/mean_terminated_length": 11.025, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.451107137945262, "frac_reward_zero_std": 1.0, "grad_norm": 0.006083431653678417, "kl": 1.3448700129985809, "learning_rate": 2.6413333333333334e-06, "loss": 0.0013, "num_tokens": 8198434.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6560 }, { "completion_length": 11.725, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.725, "completions/mean_terminated_length": 11.725, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.4517948012653005, "frac_reward_zero_std": 1.0, "grad_norm": 0.003008403116837144, "kl": 1.1978749930858612, "learning_rate": 2.6368888888888887e-06, "loss": 0.0012, "num_tokens": 8209723.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6570 }, { "completion_length": 11.425, "completions/clipped_ratio": 0.0, "completions/max_length": 14.5, "completions/max_terminated_length": 14.5, "completions/mean_length": 11.425, "completions/mean_terminated_length": 11.425, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.45248246458533903, "frac_reward_zero_std": 1.0, "grad_norm": 0.008322266861796379, "kl": 1.235140424966812, "learning_rate": 2.632444444444445e-06, "loss": 0.0012, "num_tokens": 8223700.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6580 }, { "completion_length": 11.225, "completions/clipped_ratio": 0.0, "completions/max_length": 14.2, "completions/max_terminated_length": 14.2, "completions/mean_length": 11.225, "completions/mean_terminated_length": 11.225, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.45317012790537753, "frac_reward_zero_std": 1.0, "grad_norm": 0.002982344478368759, "kl": 1.4228580594062805, "learning_rate": 2.628e-06, "loss": 0.0014, "num_tokens": 8234605.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6590 }, { "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 76.9, "completions/max_terminated_length": 76.9, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.45385779122541603, "frac_reward_zero_std": 0.8, "grad_norm": 1.6949481964111328, "kl": 1.038349199295044, "learning_rate": 2.6235555555555558e-06, "loss": 0.001, "num_tokens": 8249379.0, "reward": 5.925, "reward_std": 0.15, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6600 }, { "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.5, "completions/max_terminated_length": 39.5, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.45454545454545453, "frac_reward_zero_std": 0.8, "grad_norm": 0.01325819082558155, "kl": 1.1541422367095948, "learning_rate": 2.619111111111111e-06, "loss": 0.0012, "num_tokens": 8260369.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6610 }, { "completion_length": 12.35, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 12.35, "completions/mean_terminated_length": 12.35, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.45523311786549303, "frac_reward_zero_std": 1.0, "grad_norm": 0.003837710013613105, "kl": 1.2479790687561034, "learning_rate": 2.614666666666667e-06, "loss": 0.0012, "num_tokens": 8271919.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6620 }, { "completion_length": 12.4, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 12.4, "completions/mean_terminated_length": 12.4, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.4559207811855316, "frac_reward_zero_std": 1.0, "grad_norm": 0.00758333969861269, "kl": 1.2035202145576478, "learning_rate": 2.6102222222222224e-06, "loss": 0.0012, "num_tokens": 8284087.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6630 }, { "completion_length": 11.875, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 11.875, "completions/mean_terminated_length": 11.875, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.4566084445055701, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032163290306925774, "kl": 1.258764785528183, "learning_rate": 2.605777777777778e-06, "loss": 0.0013, "num_tokens": 8296478.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6640 }, { "completion_length": 20.175, "completions/clipped_ratio": 0.0, "completions/max_length": 47.7, "completions/max_terminated_length": 47.7, "completions/mean_length": 20.175, "completions/mean_terminated_length": 20.175, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.4572961078256086, "frac_reward_zero_std": 0.9, "grad_norm": 0.0021373762283474207, "kl": 1.3757518768310546, "learning_rate": 2.6017777777777782e-06, "loss": 0.0014, "num_tokens": 8308309.0, "reward": 5.95, "reward_std": 0.05773502588272095, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6650 }, { "completion_length": 23.675, "completions/clipped_ratio": 0.0, "completions/max_length": 59.1, "completions/max_terminated_length": 59.1, "completions/mean_length": 23.675, "completions/mean_terminated_length": 23.675, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.4579837711456471, "frac_reward_zero_std": 0.9, "grad_norm": 0.004941616673022509, "kl": 0.9016536772251129, "learning_rate": 2.5973333333333335e-06, "loss": 0.0009, "num_tokens": 8321792.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6660 }, { "completion_length": 22.925, "completions/clipped_ratio": 0.0, "completions/max_length": 58.4, "completions/max_terminated_length": 58.4, "completions/mean_length": 22.925, "completions/mean_terminated_length": 22.925, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.4586714344656856, "frac_reward_zero_std": 0.9, "grad_norm": 0.014675181359052658, "kl": 1.3598309397697448, "learning_rate": 2.592888888888889e-06, "loss": 0.0014, "num_tokens": 8334013.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6670 }, { "completion_length": 12.525, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.525, "completions/mean_terminated_length": 12.525, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.4593590977857241, "frac_reward_zero_std": 1.0, "grad_norm": 0.0958554819226265, "kl": 1.2881322383880616, "learning_rate": 2.5884444444444445e-06, "loss": 0.0013, "num_tokens": 8345978.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6680 }, { "completion_length": 11.075, "completions/clipped_ratio": 0.0, "completions/max_length": 14.2, "completions/max_terminated_length": 14.2, "completions/mean_length": 11.075, "completions/mean_terminated_length": 11.075, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.4600467611057626, "frac_reward_zero_std": 1.0, "grad_norm": 0.007988972589373589, "kl": 1.558272522687912, "learning_rate": 2.5840000000000006e-06, "loss": 0.0016, "num_tokens": 8357717.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6690 }, { "completion_length": 11.175, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 11.175, "completions/mean_terminated_length": 11.175, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.46073442442580115, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023797177709639072, "kl": 1.2147677838802338, "learning_rate": 2.579555555555556e-06, "loss": 0.0012, "num_tokens": 8370428.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6700 }, { "completion_length": 11.225, "completions/clipped_ratio": 0.0, "completions/max_length": 14.1, "completions/max_terminated_length": 14.1, "completions/mean_length": 11.225, "completions/mean_terminated_length": 11.225, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.46142208774583965, "frac_reward_zero_std": 1.0, "grad_norm": 0.004716573283076286, "kl": 1.1666797995567322, "learning_rate": 2.5751111111111115e-06, "loss": 0.0012, "num_tokens": 8382905.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6710 }, { "completion_length": 12.525, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 12.525, "completions/mean_terminated_length": 12.525, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.46210975106587815, "frac_reward_zero_std": 1.0, "grad_norm": 0.004184249322861433, "kl": 1.1673298239707948, "learning_rate": 2.570666666666667e-06, "loss": 0.0012, "num_tokens": 8395142.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6720 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.6, "completions/max_terminated_length": 48.6, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.46279741438591665, "frac_reward_zero_std": 0.9, "grad_norm": 0.006774708162993193, "kl": 1.487388950586319, "learning_rate": 2.566222222222222e-06, "loss": 0.0015, "num_tokens": 8407102.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6730 }, { "completion_length": 13.8, "completions/clipped_ratio": 0.0, "completions/max_length": 21.2, "completions/max_terminated_length": 21.2, "completions/mean_length": 13.8, "completions/mean_terminated_length": 13.8, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.46348507770595515, "frac_reward_zero_std": 1.0, "grad_norm": 0.002235209569334984, "kl": 1.0196092247962951, "learning_rate": 2.561777777777778e-06, "loss": 0.001, "num_tokens": 8420438.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6740 }, { "completion_length": 12.125, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 12.125, "completions/mean_terminated_length": 12.125, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.46417274102599365, "frac_reward_zero_std": 0.8, "grad_norm": 0.0041185058653354645, "kl": 1.2029572665691375, "learning_rate": 2.5573333333333335e-06, "loss": 0.0012, "num_tokens": 8433695.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.1, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6750 }, { "completion_length": 10.825, "completions/clipped_ratio": 0.0, "completions/max_length": 13.3, "completions/max_terminated_length": 13.3, "completions/mean_length": 10.825, "completions/mean_terminated_length": 10.825, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.4648604043460322, "frac_reward_zero_std": 1.0, "grad_norm": 0.030052315443754196, "kl": 1.404645836353302, "learning_rate": 2.552888888888889e-06, "loss": 0.0014, "num_tokens": 8444880.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6760 }, { "completion_length": 12.35, "completions/clipped_ratio": 0.0, "completions/max_length": 15.3, "completions/max_terminated_length": 15.3, "completions/mean_length": 12.35, "completions/mean_terminated_length": 12.35, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.4655480676660707, "frac_reward_zero_std": 1.0, "grad_norm": 0.0051916250959038734, "kl": 1.1174580216407777, "learning_rate": 2.5484444444444444e-06, "loss": 0.0011, "num_tokens": 8456626.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6770 }, { "completion_length": 11.925, "completions/clipped_ratio": 0.0, "completions/max_length": 15.2, "completions/max_terminated_length": 15.2, "completions/mean_length": 11.925, "completions/mean_terminated_length": 11.925, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.4662357309861092, "frac_reward_zero_std": 1.0, "grad_norm": 0.005567606072872877, "kl": 1.062576812505722, "learning_rate": 2.5440000000000005e-06, "loss": 0.0011, "num_tokens": 8468635.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6780 }, { "completion_length": 23.6, "completions/clipped_ratio": 0.0, "completions/max_length": 60.1, "completions/max_terminated_length": 60.1, "completions/mean_length": 23.6, "completions/mean_terminated_length": 23.6, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.4669233943061477, "frac_reward_zero_std": 0.9, "grad_norm": 0.009036717936396599, "kl": 1.821528172492981, "learning_rate": 2.539555555555556e-06, "loss": 0.0018, "num_tokens": 8482275.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6790 }, { "completion_length": 12.025, "completions/clipped_ratio": 0.0, "completions/max_length": 14.5, "completions/max_terminated_length": 14.5, "completions/mean_length": 12.025, "completions/mean_terminated_length": 12.025, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.4676110576261862, "frac_reward_zero_std": 1.0, "grad_norm": 0.006006492301821709, "kl": 1.1518677949905396, "learning_rate": 2.5351111111111115e-06, "loss": 0.0012, "num_tokens": 8494032.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6800 }, { "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.4682987209462247, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030979763250797987, "kl": 1.0430574774742127, "learning_rate": 2.5306666666666668e-06, "loss": 0.001, "num_tokens": 8506708.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6810 }, { "completion_length": 16.8, "completions/clipped_ratio": 0.0, "completions/max_length": 33.8, "completions/max_terminated_length": 33.8, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.4689863842662632, "frac_reward_zero_std": 0.9, "grad_norm": 0.0050082216039299965, "kl": 1.1010195553302764, "learning_rate": 2.526222222222223e-06, "loss": 0.0011, "num_tokens": 8518388.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6820 }, { "completion_length": 13.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 13.7, "completions/mean_terminated_length": 13.7, "completions/min_length": 10.2, "completions/min_terminated_length": 10.2, "epoch": 0.46967404758630177, "frac_reward_zero_std": 1.0, "grad_norm": 0.013809357769787312, "kl": 0.9667928516864777, "learning_rate": 2.521777777777778e-06, "loss": 0.001, "num_tokens": 8529944.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6830 }, { "completion_length": 11.65, "completions/clipped_ratio": 0.0, "completions/max_length": 14.5, "completions/max_terminated_length": 14.5, "completions/mean_length": 11.65, "completions/mean_terminated_length": 11.65, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.47036171090634027, "frac_reward_zero_std": 1.0, "grad_norm": 0.004135849419981241, "kl": 1.0926547050476074, "learning_rate": 2.5173333333333334e-06, "loss": 0.0011, "num_tokens": 8541106.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6840 }, { "completion_length": 12.025, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 12.025, "completions/mean_terminated_length": 12.025, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.47104937422637877, "frac_reward_zero_std": 1.0, "grad_norm": 0.004002850037068129, "kl": 1.0938444793224336, "learning_rate": 2.512888888888889e-06, "loss": 0.0011, "num_tokens": 8552911.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6850 }, { "completion_length": 25.525, "completions/clipped_ratio": 0.0, "completions/max_length": 65.6, "completions/max_terminated_length": 65.6, "completions/mean_length": 25.525, "completions/mean_terminated_length": 25.525, "completions/min_length": 10.5, "completions/min_terminated_length": 10.5, "epoch": 0.47173703754641727, "frac_reward_zero_std": 0.8, "grad_norm": 0.004127295222133398, "kl": 1.0458847165107727, "learning_rate": 2.5084444444444444e-06, "loss": 0.001, "num_tokens": 8565248.0, "reward": 5.9625, "reward_std": 0.075, "rewards/check_coherence/mean": 1.4875, "rewards/check_coherence/std": 0.025, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6860 }, { "completion_length": 12.65, "completions/clipped_ratio": 0.0, "completions/max_length": 15.7, "completions/max_terminated_length": 15.7, "completions/mean_length": 12.65, "completions/mean_terminated_length": 12.65, "completions/min_length": 10.4, "completions/min_terminated_length": 10.4, "epoch": 0.47242470086645577, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025481998454779387, "kl": 1.1385599613189696, "learning_rate": 2.5040000000000005e-06, "loss": 0.0011, "num_tokens": 8575918.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6870 }, { "completion_length": 13.575, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 13.575, "completions/mean_terminated_length": 13.575, "completions/min_length": 9.7, "completions/min_terminated_length": 9.7, "epoch": 0.47311236418649427, "frac_reward_zero_std": 1.0, "grad_norm": 0.004273406229913235, "kl": 1.358880877494812, "learning_rate": 2.4995555555555558e-06, "loss": 0.0014, "num_tokens": 8590145.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6880 }, { "completion_length": 13.275, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 13.275, "completions/mean_terminated_length": 13.275, "completions/min_length": 9.9, "completions/min_terminated_length": 9.9, "epoch": 0.47380002750653283, "frac_reward_zero_std": 0.9, "grad_norm": 0.004883030895143747, "kl": 1.3985342502593994, "learning_rate": 2.495111111111111e-06, "loss": 0.0014, "num_tokens": 8602292.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6890 }, { "completion_length": 16.1, "completions/clipped_ratio": 0.0, "completions/max_length": 30.3, "completions/max_terminated_length": 30.3, "completions/mean_length": 16.1, "completions/mean_terminated_length": 16.1, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.47448769082657133, "frac_reward_zero_std": 0.9, "grad_norm": 0.004624954890459776, "kl": 1.0976074278354644, "learning_rate": 2.4906666666666667e-06, "loss": 0.0011, "num_tokens": 8614088.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6900 }, { "completion_length": 12.4, "completions/clipped_ratio": 0.0, "completions/max_length": 16.6, "completions/max_terminated_length": 16.6, "completions/mean_length": 12.4, "completions/mean_terminated_length": 12.4, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.47517535414660983, "frac_reward_zero_std": 1.0, "grad_norm": 0.028318747878074646, "kl": 1.1434998154640197, "learning_rate": 2.4862222222222224e-06, "loss": 0.0011, "num_tokens": 8626912.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6910 }, { "completion_length": 13.175, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 13.175, "completions/mean_terminated_length": 13.175, "completions/min_length": 9.6, "completions/min_terminated_length": 9.6, "epoch": 0.47586301746664833, "frac_reward_zero_std": 1.0, "grad_norm": 0.005463233217597008, "kl": 1.1536414802074433, "learning_rate": 2.481777777777778e-06, "loss": 0.0012, "num_tokens": 8639415.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6920 }, { "completion_length": 17.725, "completions/clipped_ratio": 0.0, "completions/max_length": 37.1, "completions/max_terminated_length": 37.1, "completions/mean_length": 17.725, "completions/mean_terminated_length": 17.725, "completions/min_length": 9.8, "completions/min_terminated_length": 9.8, "epoch": 0.47655068078668683, "frac_reward_zero_std": 0.8, "grad_norm": 0.004396663047373295, "kl": 1.224897998571396, "learning_rate": 2.4773333333333334e-06, "loss": 0.0012, "num_tokens": 8651924.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6930 }, { "completion_length": 11.426470588235293, "completions/clipped_ratio": 0.0, "completions/max_length": 14.61111111111111, "completions/max_terminated_length": 14.61111111111111, "completions/mean_length": 11.51388888888889, "completions/mean_terminated_length": 11.51388888888889, "completions/min_length": 9.11111111111111, "completions/min_terminated_length": 9.11111111111111, "epoch": 0.47723834410672533, "frac_reward_zero_std": 1.0, "grad_norm": 0.014061485417187214, "kl": 1.0381863713264465, "learning_rate": 2.472888888888889e-06, "loss": 0.001, "num_tokens": 8663938.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6940 }, { "completion_length": 12.9, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 12.9, "completions/mean_terminated_length": 12.9, "completions/min_length": 9.7, "completions/min_terminated_length": 9.7, "epoch": 0.47792600742676383, "frac_reward_zero_std": 1.0, "grad_norm": 0.017283568158745766, "kl": 1.2144360840320587, "learning_rate": 2.4684444444444448e-06, "loss": 0.0012, "num_tokens": 8676042.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6950 }, { "completion_length": 12.575, "completions/clipped_ratio": 0.0, "completions/max_length": 15.4, "completions/max_terminated_length": 15.4, "completions/mean_length": 12.575, "completions/mean_terminated_length": 12.575, "completions/min_length": 10.6, "completions/min_terminated_length": 10.6, "epoch": 0.4786136707468024, "frac_reward_zero_std": 1.0, "grad_norm": 0.006373599637299776, "kl": 1.0703495502471925, "learning_rate": 2.4640000000000005e-06, "loss": 0.0011, "num_tokens": 8689881.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6960 }, { "completion_length": 11.525, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 11.525, "completions/mean_terminated_length": 11.525, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.4793013340668409, "frac_reward_zero_std": 1.0, "grad_norm": 0.003493634983897209, "kl": 1.2606799006462097, "learning_rate": 2.4595555555555557e-06, "loss": 0.0013, "num_tokens": 8701782.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6970 }, { "completion_length": 12.45, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 12.45, "completions/mean_terminated_length": 12.45, "completions/min_length": 9.8, "completions/min_terminated_length": 9.8, "epoch": 0.4799889973868794, "frac_reward_zero_std": 1.0, "grad_norm": 0.011007771827280521, "kl": 1.0327221691608428, "learning_rate": 2.4551111111111114e-06, "loss": 0.001, "num_tokens": 8713824.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6980 }, { "completion_length": 14.05, "completions/clipped_ratio": 0.0, "completions/max_length": 23.4, "completions/max_terminated_length": 23.4, "completions/mean_length": 14.05, "completions/mean_terminated_length": 14.05, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.4806766607069179, "frac_reward_zero_std": 0.9, "grad_norm": 0.0025435087736696005, "kl": 1.0930546462535857, "learning_rate": 2.4506666666666667e-06, "loss": 0.0011, "num_tokens": 8728734.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 6990 }, { "completion_length": 12.425, "completions/clipped_ratio": 0.0, "completions/max_length": 17.1, "completions/max_terminated_length": 17.1, "completions/mean_length": 12.425, "completions/mean_terminated_length": 12.425, "completions/min_length": 9.7, "completions/min_terminated_length": 9.7, "epoch": 0.4813643240269564, "frac_reward_zero_std": 1.0, "grad_norm": 0.004313310608267784, "kl": 1.1628154873847962, "learning_rate": 2.4462222222222224e-06, "loss": 0.0012, "num_tokens": 8743119.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7000 }, { "completion_length": 16.1, "completions/clipped_ratio": 0.0, "completions/max_length": 31.8, "completions/max_terminated_length": 31.8, "completions/mean_length": 16.1, "completions/mean_terminated_length": 16.1, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.4820519873469949, "frac_reward_zero_std": 0.9, "grad_norm": 0.007840165868401527, "kl": 1.2261770963668823, "learning_rate": 2.441777777777778e-06, "loss": 0.0012, "num_tokens": 8755975.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7010 }, { "completion_length": 35.4, "completions/clipped_ratio": 0.0, "completions/max_length": 105.5, "completions/max_terminated_length": 105.5, "completions/mean_length": 35.4, "completions/mean_terminated_length": 35.4, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.4827396506670334, "frac_reward_zero_std": 0.9, "grad_norm": 0.003724177833646536, "kl": 1.0877568125724792, "learning_rate": 2.4373333333333333e-06, "loss": 0.0011, "num_tokens": 8768903.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7020 }, { "completion_length": 11.825, "completions/clipped_ratio": 0.0, "completions/max_length": 14.3, "completions/max_terminated_length": 14.3, "completions/mean_length": 11.825, "completions/mean_terminated_length": 11.825, "completions/min_length": 9.8, "completions/min_terminated_length": 9.8, "epoch": 0.48342731398707195, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032062076497823, "kl": 1.1118020951747893, "learning_rate": 2.432888888888889e-06, "loss": 0.0011, "num_tokens": 8780440.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7030 }, { "completion_length": 12.425, "completions/clipped_ratio": 0.0, "completions/max_length": 15.4, "completions/max_terminated_length": 15.4, "completions/mean_length": 12.425, "completions/mean_terminated_length": 12.425, "completions/min_length": 10.4, "completions/min_terminated_length": 10.4, "epoch": 0.48411497730711045, "frac_reward_zero_std": 1.0, "grad_norm": 0.021040547639131546, "kl": 1.0397081434726716, "learning_rate": 2.4284444444444447e-06, "loss": 0.001, "num_tokens": 8792553.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7040 }, { "completion_length": 12.375, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 12.375, "completions/mean_terminated_length": 12.375, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.48480264062714895, "frac_reward_zero_std": 1.0, "grad_norm": 0.004737348761409521, "kl": 1.1431108355522155, "learning_rate": 2.4240000000000004e-06, "loss": 0.0011, "num_tokens": 8804176.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7050 }, { "completion_length": 13.075, "completions/clipped_ratio": 0.0, "completions/max_length": 17.1, "completions/max_terminated_length": 17.1, "completions/mean_length": 13.075, "completions/mean_terminated_length": 13.075, "completions/min_length": 9.7, "completions/min_terminated_length": 9.7, "epoch": 0.48549030394718745, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026906414423137903, "kl": 0.9650578558444977, "learning_rate": 2.4195555555555557e-06, "loss": 0.001, "num_tokens": 8816415.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7060 }, { "completion_length": 12.225, "completions/clipped_ratio": 0.0, "completions/max_length": 14.7, "completions/max_terminated_length": 14.7, "completions/mean_length": 12.225, "completions/mean_terminated_length": 12.225, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.48617796726722595, "frac_reward_zero_std": 1.0, "grad_norm": 0.002324128057807684, "kl": 1.049161559343338, "learning_rate": 2.4151111111111114e-06, "loss": 0.001, "num_tokens": 8828372.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7070 }, { "completion_length": 11.975, "completions/clipped_ratio": 0.0, "completions/max_length": 15.2, "completions/max_terminated_length": 15.2, "completions/mean_length": 11.975, "completions/mean_terminated_length": 11.975, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.48686563058726445, "frac_reward_zero_std": 1.0, "grad_norm": 0.003440011525526643, "kl": 1.1597104012966155, "learning_rate": 2.410666666666667e-06, "loss": 0.0012, "num_tokens": 8839839.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7080 }, { "completion_length": 12.825, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 12.825, "completions/mean_terminated_length": 12.825, "completions/min_length": 10.3, "completions/min_terminated_length": 10.3, "epoch": 0.487553293907303, "frac_reward_zero_std": 1.0, "grad_norm": 0.004029946867376566, "kl": 0.98714559674263, "learning_rate": 2.4062222222222223e-06, "loss": 0.001, "num_tokens": 8852208.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7090 }, { "completion_length": 12.65, "completions/clipped_ratio": 0.0, "completions/max_length": 15.8, "completions/max_terminated_length": 15.8, "completions/mean_length": 12.65, "completions/mean_terminated_length": 12.65, "completions/min_length": 10.7, "completions/min_terminated_length": 10.7, "epoch": 0.4882409572273415, "frac_reward_zero_std": 1.0, "grad_norm": 0.004136668052524328, "kl": 0.9978300929069519, "learning_rate": 2.401777777777778e-06, "loss": 0.001, "num_tokens": 8865298.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7100 }, { "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.4, "completions/max_terminated_length": 20.4, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.48892862054738, "frac_reward_zero_std": 1.0, "grad_norm": 0.0393175333738327, "kl": 1.0882606029510498, "learning_rate": 2.3973333333333333e-06, "loss": 0.0011, "num_tokens": 8877470.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7110 }, { "completion_length": 11.55, "completions/clipped_ratio": 0.0, "completions/max_length": 15.1, "completions/max_terminated_length": 15.1, "completions/mean_length": 11.55, "completions/mean_terminated_length": 11.55, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.4896162838674185, "frac_reward_zero_std": 1.0, "grad_norm": 0.003470748895779252, "kl": 1.0130544245243072, "learning_rate": 2.392888888888889e-06, "loss": 0.001, "num_tokens": 8891472.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7120 }, { "completion_length": 43.35, "completions/clipped_ratio": 0.0, "completions/max_length": 121.6, "completions/max_terminated_length": 121.6, "completions/mean_length": 43.35, "completions/mean_terminated_length": 43.35, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.490303947187457, "frac_reward_zero_std": 0.7, "grad_norm": 0.005588744767010212, "kl": 1.0089176952838899, "learning_rate": 2.3884444444444447e-06, "loss": 0.001, "num_tokens": 8903570.0, "reward": 5.9, "reward_std": 0.15773502588272095, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.45, "rewards/check_response_quality/std": 0.07886751294136048, "rewards/match_format_approximately/mean": 0.95, "rewards/match_format_approximately/std": 0.07886751294136048, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7130 }, { "completion_length": 21.725, "completions/clipped_ratio": 0.0, "completions/max_length": 53.3, "completions/max_terminated_length": 53.3, "completions/mean_length": 21.725, "completions/mean_terminated_length": 21.725, "completions/min_length": 9.7, "completions/min_terminated_length": 9.7, "epoch": 0.4909916105074955, "frac_reward_zero_std": 0.9, "grad_norm": 0.0038998995441943407, "kl": 2.2435892522335052, "learning_rate": 2.3840000000000004e-06, "loss": 0.0022, "num_tokens": 8916203.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7140 }, { "completion_length": 12.6, "completions/clipped_ratio": 0.0, "completions/max_length": 16.3, "completions/max_terminated_length": 16.3, "completions/mean_length": 12.6, "completions/mean_terminated_length": 12.6, "completions/min_length": 9.8, "completions/min_terminated_length": 9.8, "epoch": 0.491679273827534, "frac_reward_zero_std": 1.0, "grad_norm": 0.7611438035964966, "kl": 1.965437227487564, "learning_rate": 2.3795555555555557e-06, "loss": 0.002, "num_tokens": 8927227.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7150 }, { "completion_length": 11.65, "completions/clipped_ratio": 0.0, "completions/max_length": 13.7, "completions/max_terminated_length": 13.7, "completions/mean_length": 11.65, "completions/mean_terminated_length": 11.65, "completions/min_length": 9.8, "completions/min_terminated_length": 9.8, "epoch": 0.49236693714757257, "frac_reward_zero_std": 1.0, "grad_norm": 0.02354721911251545, "kl": 1.0878031313419343, "learning_rate": 2.3751111111111113e-06, "loss": 0.0011, "num_tokens": 8940697.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7160 }, { "completion_length": 20.825, "completions/clipped_ratio": 0.0, "completions/max_length": 45.6, "completions/max_terminated_length": 45.6, "completions/mean_length": 20.825, "completions/mean_terminated_length": 20.825, "completions/min_length": 10.4, "completions/min_terminated_length": 10.4, "epoch": 0.49305460046761107, "frac_reward_zero_std": 0.9, "grad_norm": 0.005910597741603851, "kl": 1.0086887955665589, "learning_rate": 2.370666666666667e-06, "loss": 0.001, "num_tokens": 8952058.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7170 }, { "completion_length": 11.95, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 11.95, "completions/mean_terminated_length": 11.95, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.49374226378764957, "frac_reward_zero_std": 1.0, "grad_norm": 0.003222224535420537, "kl": 1.2605473756790162, "learning_rate": 2.3662222222222227e-06, "loss": 0.0013, "num_tokens": 8965632.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7180 }, { "completion_length": 23.6, "completions/clipped_ratio": 0.0, "completions/max_length": 63.1, "completions/max_terminated_length": 63.1, "completions/mean_length": 23.6, "completions/mean_terminated_length": 23.6, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.49442992710768807, "frac_reward_zero_std": 0.9, "grad_norm": 0.004417297430336475, "kl": 1.1330610036849975, "learning_rate": 2.361777777777778e-06, "loss": 0.0011, "num_tokens": 8977804.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7190 }, { "completion_length": 44.475, "completions/clipped_ratio": 0.0, "completions/max_length": 140.2, "completions/max_terminated_length": 140.2, "completions/mean_length": 44.475, "completions/mean_terminated_length": 44.475, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.4951175904277266, "frac_reward_zero_std": 0.8, "grad_norm": 0.019977256655693054, "kl": 1.0677195250988007, "learning_rate": 2.3573333333333333e-06, "loss": 0.0011, "num_tokens": 8992319.0, "reward": 5.925, "reward_std": 0.15, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7200 }, { "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 87.3, "completions/max_terminated_length": 87.3, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.4958052537477651, "frac_reward_zero_std": 0.8, "grad_norm": 0.008993498049676418, "kl": 1.0857190370559693, "learning_rate": 2.352888888888889e-06, "loss": 0.0011, "num_tokens": 9004757.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7210 }, { "completion_length": 13.55, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 13.55, "completions/mean_terminated_length": 13.55, "completions/min_length": 10.1, "completions/min_terminated_length": 10.1, "epoch": 0.49649291706780363, "frac_reward_zero_std": 1.0, "grad_norm": 0.004971934948116541, "kl": 1.0797606348991393, "learning_rate": 2.3484444444444447e-06, "loss": 0.0011, "num_tokens": 9018407.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7220 }, { "completion_length": 12.15, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 12.15, "completions/mean_terminated_length": 12.15, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.49718058038784213, "frac_reward_zero_std": 1.0, "grad_norm": 0.022533122450113297, "kl": 1.2900502800941467, "learning_rate": 2.3440000000000003e-06, "loss": 0.0013, "num_tokens": 9031465.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7230 }, { "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 8.3, "completions/min_terminated_length": 8.3, "epoch": 0.49786824370788063, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030792122706770897, "kl": 1.3528812944889068, "learning_rate": 2.3395555555555556e-06, "loss": 0.0014, "num_tokens": 9042471.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7240 }, { "completion_length": 11.475, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 11.475, "completions/mean_terminated_length": 11.475, "completions/min_length": 8.1, "completions/min_terminated_length": 8.1, "epoch": 0.49855590702791913, "frac_reward_zero_std": 1.0, "grad_norm": 0.0058341664262115955, "kl": 1.2619741141796113, "learning_rate": 2.3351111111111113e-06, "loss": 0.0013, "num_tokens": 9052594.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7250 }, { "completion_length": 20.475, "completions/clipped_ratio": 0.0, "completions/max_length": 49.1, "completions/max_terminated_length": 49.1, "completions/mean_length": 20.475, "completions/mean_terminated_length": 20.475, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.49924357034795763, "frac_reward_zero_std": 0.8, "grad_norm": 1.8776947259902954, "kl": 1.384527599811554, "learning_rate": 2.330666666666667e-06, "loss": 0.0014, "num_tokens": 9064341.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7260 }, { "completion_length": 12.875, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 12.875, "completions/mean_terminated_length": 12.875, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.49993123366799613, "frac_reward_zero_std": 1.0, "grad_norm": 0.004909676965326071, "kl": 1.2514336943626403, "learning_rate": 2.3262222222222227e-06, "loss": 0.0013, "num_tokens": 9075776.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7270 }, { "completion_length": 13.025, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 13.025, "completions/mean_terminated_length": 13.025, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.5006188969880346, "frac_reward_zero_std": 1.0, "grad_norm": 0.009278550744056702, "kl": 1.2726584792137146, "learning_rate": 2.321777777777778e-06, "loss": 0.0013, "num_tokens": 9090041.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7280 }, { "completion_length": 36.35, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 36.35, "completions/mean_terminated_length": 36.35, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.5013065603080732, "frac_reward_zero_std": 0.9, "grad_norm": 0.011710338294506073, "kl": 1.0427227377891541, "learning_rate": 2.3173333333333336e-06, "loss": 0.001, "num_tokens": 9105039.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7290 }, { "completion_length": 12.925, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 12.925, "completions/mean_terminated_length": 12.925, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.5019942236281116, "frac_reward_zero_std": 0.9, "grad_norm": 0.004796651192009449, "kl": 1.1195420622825623, "learning_rate": 2.312888888888889e-06, "loss": 0.0011, "num_tokens": 9116628.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7300 }, { "completion_length": 11.7, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 11.7, "completions/mean_terminated_length": 11.7, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.5026818869481502, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037184860557317734, "kl": 1.1613843977451324, "learning_rate": 2.3084444444444446e-06, "loss": 0.0012, "num_tokens": 9129592.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7310 }, { "completion_length": 23.125, "completions/clipped_ratio": 0.0, "completions/max_length": 59.6, "completions/max_terminated_length": 59.6, "completions/mean_length": 23.125, "completions/mean_terminated_length": 23.125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.5033695502681887, "frac_reward_zero_std": 0.9, "grad_norm": 0.004970069508999586, "kl": 1.218816888332367, "learning_rate": 2.3040000000000003e-06, "loss": 0.0012, "num_tokens": 9141525.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7320 }, { "completion_length": 14.8, "completions/clipped_ratio": 0.0, "completions/max_length": 22.9, "completions/max_terminated_length": 22.9, "completions/mean_length": 14.8, "completions/mean_terminated_length": 14.8, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.5040572135882272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029654251411557198, "kl": 1.0582695841789245, "learning_rate": 2.2995555555555556e-06, "loss": 0.0011, "num_tokens": 9154301.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7330 }, { "completion_length": 13.4, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 13.4, "completions/mean_terminated_length": 13.4, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.5047448769082657, "frac_reward_zero_std": 0.9, "grad_norm": 0.006998724769800901, "kl": 1.0817440152168274, "learning_rate": 2.2951111111111113e-06, "loss": 0.0011, "num_tokens": 9166301.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7340 }, { "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.8, "completions/max_terminated_length": 26.8, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 9.6, "completions/min_terminated_length": 9.6, "epoch": 0.5054325402283042, "frac_reward_zero_std": 0.8, "grad_norm": 2.057863235473633, "kl": 1.164160829782486, "learning_rate": 2.290666666666667e-06, "loss": 0.0012, "num_tokens": 9178547.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7350 }, { "completion_length": 12.875, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 12.875, "completions/mean_terminated_length": 12.875, "completions/min_length": 9.7, "completions/min_terminated_length": 9.7, "epoch": 0.5061202035483428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038249208591878414, "kl": 0.9877580821514129, "learning_rate": 2.2862222222222226e-06, "loss": 0.001, "num_tokens": 9190510.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7360 }, { "completion_length": 21.55, "completions/clipped_ratio": 0.0, "completions/max_length": 51.9, "completions/max_terminated_length": 51.9, "completions/mean_length": 21.55, "completions/mean_terminated_length": 21.55, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.5068078668683812, "frac_reward_zero_std": 0.9, "grad_norm": 1.4600623846054077, "kl": 1.057120794057846, "learning_rate": 2.281777777777778e-06, "loss": 0.0011, "num_tokens": 9201872.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7370 }, { "completion_length": 28.325, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 28.325, "completions/mean_terminated_length": 28.325, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.5074955301884198, "frac_reward_zero_std": 0.9, "grad_norm": 0.007701764348894358, "kl": 1.1267670691013336, "learning_rate": 2.2773333333333336e-06, "loss": 0.0011, "num_tokens": 9214445.0, "reward": 5.9625, "reward_std": 0.075, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7380 }, { "completion_length": 44.175, "completions/clipped_ratio": 0.0, "completions/max_length": 141.4, "completions/max_terminated_length": 141.4, "completions/mean_length": 44.175, "completions/mean_terminated_length": 44.175, "completions/min_length": 10.1, "completions/min_terminated_length": 10.1, "epoch": 0.5081831935084583, "frac_reward_zero_std": 0.8, "grad_norm": 1.5618634223937988, "kl": 1.123524260520935, "learning_rate": 2.2728888888888893e-06, "loss": 0.0011, "num_tokens": 9230012.0, "reward": 5.925, "reward_std": 0.15, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7390 }, { "completion_length": 13.175, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 13.175, "completions/mean_terminated_length": 13.175, "completions/min_length": 9.9, "completions/min_terminated_length": 9.9, "epoch": 0.5088708568284968, "frac_reward_zero_std": 1.0, "grad_norm": 0.005219184327870607, "kl": 1.0605081737041473, "learning_rate": 2.2684444444444446e-06, "loss": 0.0011, "num_tokens": 9242431.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7400 }, { "completion_length": 12.45, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 12.45, "completions/mean_terminated_length": 12.45, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.5095585201485353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0051316008903086185, "kl": 1.2531267821788787, "learning_rate": 2.2640000000000003e-06, "loss": 0.0013, "num_tokens": 9253669.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7410 }, { "completion_length": 12.15, "completions/clipped_ratio": 0.0, "completions/max_length": 15.3, "completions/max_terminated_length": 15.3, "completions/mean_length": 12.15, "completions/mean_terminated_length": 12.15, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.5102461834685738, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031868712976574898, "kl": 1.106162852048874, "learning_rate": 2.2595555555555555e-06, "loss": 0.0011, "num_tokens": 9265887.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7420 }, { "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.9, "completions/max_terminated_length": 25.9, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 9.9, "completions/min_terminated_length": 9.9, "epoch": 0.5109338467886123, "frac_reward_zero_std": 0.9, "grad_norm": 0.005863267928361893, "kl": 1.2646816730499268, "learning_rate": 2.2551111111111112e-06, "loss": 0.0013, "num_tokens": 9278577.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7430 }, { "completion_length": 11.525, "completions/clipped_ratio": 0.0, "completions/max_length": 14.7, "completions/max_terminated_length": 14.7, "completions/mean_length": 11.525, "completions/mean_terminated_length": 11.525, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.5116215101086508, "frac_reward_zero_std": 1.0, "grad_norm": 0.004301924724131823, "kl": 1.326978874206543, "learning_rate": 2.250666666666667e-06, "loss": 0.0013, "num_tokens": 9289242.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7440 }, { "completion_length": 15.85, "completions/clipped_ratio": 0.0, "completions/max_length": 32.2, "completions/max_terminated_length": 32.2, "completions/mean_length": 15.85, "completions/mean_terminated_length": 15.85, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.5123091734286893, "frac_reward_zero_std": 0.9, "grad_norm": 0.013630717992782593, "kl": 1.1819639325141906, "learning_rate": 2.2462222222222226e-06, "loss": 0.0012, "num_tokens": 9300732.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7450 }, { "completion_length": 11.425, "completions/clipped_ratio": 0.0, "completions/max_length": 15.7, "completions/max_terminated_length": 15.7, "completions/mean_length": 11.425, "completions/mean_terminated_length": 11.425, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.5129968367487279, "frac_reward_zero_std": 1.0, "grad_norm": 0.00520959822461009, "kl": 1.3894258320331574, "learning_rate": 2.241777777777778e-06, "loss": 0.0014, "num_tokens": 9314441.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7460 }, { "completion_length": 12.275, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 12.275, "completions/mean_terminated_length": 12.275, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.5136845000687663, "frac_reward_zero_std": 1.0, "grad_norm": 0.004836928565055132, "kl": 1.126960998773575, "learning_rate": 2.2373333333333336e-06, "loss": 0.0011, "num_tokens": 9327980.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7470 }, { "completion_length": 21.275, "completions/clipped_ratio": 0.0, "completions/max_length": 52.5, "completions/max_terminated_length": 52.5, "completions/mean_length": 21.275, "completions/mean_terminated_length": 21.275, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.5143721633888049, "frac_reward_zero_std": 0.9, "grad_norm": 0.004135287832468748, "kl": 1.2688943803310395, "learning_rate": 2.2328888888888893e-06, "loss": 0.0013, "num_tokens": 9341155.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7480 }, { "completion_length": 16.175, "completions/clipped_ratio": 0.0, "completions/max_length": 33.5, "completions/max_terminated_length": 33.5, "completions/mean_length": 16.175, "completions/mean_terminated_length": 16.175, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.5150598267088433, "frac_reward_zero_std": 0.8, "grad_norm": 0.006331494078040123, "kl": 1.248352313041687, "learning_rate": 2.228444444444445e-06, "loss": 0.0012, "num_tokens": 9353610.0, "reward": 5.925, "reward_std": 0.15, "rewards/check_coherence/mean": 1.45, "rewards/check_coherence/std": 0.1, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7490 }, { "completion_length": 12.975, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 12.975, "completions/mean_terminated_length": 12.975, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.5157474900288819, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027948105707764626, "kl": 1.1735309839248658, "learning_rate": 2.2240000000000002e-06, "loss": 0.0012, "num_tokens": 9366841.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7500 }, { "completion_length": 12.2, "completions/clipped_ratio": 0.0, "completions/max_length": 15.8, "completions/max_terminated_length": 15.8, "completions/mean_length": 12.2, "completions/mean_terminated_length": 12.2, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.5164351533489203, "frac_reward_zero_std": 1.0, "grad_norm": 0.0080268494784832, "kl": 1.3003155648708344, "learning_rate": 2.2195555555555555e-06, "loss": 0.0013, "num_tokens": 9377665.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7510 }, { "completion_length": 13.175, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 13.175, "completions/mean_terminated_length": 13.175, "completions/min_length": 10.3, "completions/min_terminated_length": 10.3, "epoch": 0.5171228166689589, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034015493001788855, "kl": 1.1480665683746338, "learning_rate": 2.215111111111111e-06, "loss": 0.0011, "num_tokens": 9390644.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7520 }, { "completion_length": 11.05, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.05, "completions/mean_terminated_length": 11.05, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.5178104799889974, "frac_reward_zero_std": 1.0, "grad_norm": 0.002507806057110429, "kl": 1.3187538743019105, "learning_rate": 2.210666666666667e-06, "loss": 0.0013, "num_tokens": 9403746.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7530 }, { "completion_length": 15.55, "completions/clipped_ratio": 0.0, "completions/max_length": 29.6, "completions/max_terminated_length": 29.6, "completions/mean_length": 15.55, "completions/mean_terminated_length": 15.55, "completions/min_length": 9.6, "completions/min_terminated_length": 9.6, "epoch": 0.5184981433090359, "frac_reward_zero_std": 0.9, "grad_norm": 0.040676817297935486, "kl": 1.3105673789978027, "learning_rate": 2.2062222222222226e-06, "loss": 0.0013, "num_tokens": 9416300.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7540 }, { "completion_length": 12.175, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 12.175, "completions/mean_terminated_length": 12.175, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.5191858066290744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030532637611031532, "kl": 1.3284153163433075, "learning_rate": 2.201777777777778e-06, "loss": 0.0013, "num_tokens": 9428207.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7550 }, { "completion_length": 11.075, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.075, "completions/mean_terminated_length": 11.075, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.5198734699491129, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032129832543432713, "kl": 1.3229759514331818, "learning_rate": 2.1973333333333335e-06, "loss": 0.0013, "num_tokens": 9442022.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7560 }, { "completion_length": 11.625, "completions/clipped_ratio": 0.0, "completions/max_length": 13.6, "completions/max_terminated_length": 13.6, "completions/mean_length": 11.625, "completions/mean_terminated_length": 11.625, "completions/min_length": 9.8, "completions/min_terminated_length": 9.8, "epoch": 0.5205611332691514, "frac_reward_zero_std": 1.0, "grad_norm": 0.005039629060775042, "kl": 1.0451285064220428, "learning_rate": 2.1928888888888892e-06, "loss": 0.001, "num_tokens": 9456135.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7570 }, { "completion_length": 12.175, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 12.175, "completions/mean_terminated_length": 12.175, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.5212487965891899, "frac_reward_zero_std": 1.0, "grad_norm": 0.011741072870790958, "kl": 1.2608206033706666, "learning_rate": 2.188444444444445e-06, "loss": 0.0013, "num_tokens": 9465490.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7580 }, { "completion_length": 12.025, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.025, "completions/mean_terminated_length": 12.025, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.5219364599092284, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034133840817958117, "kl": 1.2925956785678863, "learning_rate": 2.184e-06, "loss": 0.0013, "num_tokens": 9479079.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7590 }, { "completion_length": 13.45, "completions/clipped_ratio": 0.0, "completions/max_length": 21.4, "completions/max_terminated_length": 21.4, "completions/mean_length": 13.45, "completions/mean_terminated_length": 13.45, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.522624123229267, "frac_reward_zero_std": 0.9, "grad_norm": 0.014618651010096073, "kl": 1.281085979938507, "learning_rate": 2.179555555555556e-06, "loss": 0.0013, "num_tokens": 9490605.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7600 }, { "completion_length": 12.7, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 12.7, "completions/mean_terminated_length": 12.7, "completions/min_length": 9.9, "completions/min_terminated_length": 9.9, "epoch": 0.5233117865493054, "frac_reward_zero_std": 1.0, "grad_norm": 0.03030218929052353, "kl": 1.2633468508720398, "learning_rate": 2.175111111111111e-06, "loss": 0.0013, "num_tokens": 9503505.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7610 }, { "completion_length": 11.725, "completions/clipped_ratio": 0.0, "completions/max_length": 14.6, "completions/max_terminated_length": 14.6, "completions/mean_length": 11.725, "completions/mean_terminated_length": 11.725, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.523999449869344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029963438864797354, "kl": 1.3427102744579316, "learning_rate": 2.170666666666667e-06, "loss": 0.0013, "num_tokens": 9515646.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7620 }, { "completion_length": 11.375, "completions/clipped_ratio": 0.0, "completions/max_length": 14.8, "completions/max_terminated_length": 14.8, "completions/mean_length": 11.375, "completions/mean_terminated_length": 11.375, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.5246871131893824, "frac_reward_zero_std": 1.0, "grad_norm": 0.005397267173975706, "kl": 1.4040341019630431, "learning_rate": 2.1662222222222225e-06, "loss": 0.0014, "num_tokens": 9527161.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7630 }, { "completion_length": 11.975, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 11.975, "completions/mean_terminated_length": 11.975, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.525374776509421, "frac_reward_zero_std": 1.0, "grad_norm": 0.014865939505398273, "kl": 1.3119125723838807, "learning_rate": 2.161777777777778e-06, "loss": 0.0013, "num_tokens": 9539084.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7640 }, { "completion_length": 11.8, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 11.8, "completions/mean_terminated_length": 11.8, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.5260624398294595, "frac_reward_zero_std": 1.0, "grad_norm": 0.0044557503424584866, "kl": 1.3221172630786895, "learning_rate": 2.1573333333333335e-06, "loss": 0.0013, "num_tokens": 9550292.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7650 }, { "completion_length": 13.35, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 13.35, "completions/mean_terminated_length": 13.35, "completions/min_length": 9.6, "completions/min_terminated_length": 9.6, "epoch": 0.526750103149498, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037254090420901775, "kl": 1.1248468935489655, "learning_rate": 2.152888888888889e-06, "loss": 0.0011, "num_tokens": 9562470.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7660 }, { "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 9.9, "completions/min_terminated_length": 9.9, "epoch": 0.5274377664695366, "frac_reward_zero_std": 1.0, "grad_norm": 0.002507027005776763, "kl": 1.1201979637145996, "learning_rate": 2.148444444444445e-06, "loss": 0.0011, "num_tokens": 9577014.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7670 }, { "completion_length": 17.025, "completions/clipped_ratio": 0.0, "completions/max_length": 37.5, "completions/max_terminated_length": 37.5, "completions/mean_length": 17.025, "completions/mean_terminated_length": 17.025, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.528125429789575, "frac_reward_zero_std": 0.9, "grad_norm": 0.003964269068092108, "kl": 1.2861777245998383, "learning_rate": 2.144e-06, "loss": 0.0013, "num_tokens": 9589907.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7680 }, { "completion_length": 11.175, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.175, "completions/mean_terminated_length": 11.175, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.5288130931096136, "frac_reward_zero_std": 1.0, "grad_norm": 0.002470179693773389, "kl": 1.1973251819610595, "learning_rate": 2.139555555555556e-06, "loss": 0.0012, "num_tokens": 9602342.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7690 }, { "completion_length": 12.375, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 12.375, "completions/mean_terminated_length": 12.375, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.529500756429652, "frac_reward_zero_std": 1.0, "grad_norm": 0.012987960129976273, "kl": 1.1113298773765563, "learning_rate": 2.1351111111111115e-06, "loss": 0.0011, "num_tokens": 9614297.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7700 }, { "completion_length": 12.125, "completions/clipped_ratio": 0.0, "completions/max_length": 16.6, "completions/max_terminated_length": 16.6, "completions/mean_length": 12.125, "completions/mean_terminated_length": 12.125, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.5301884197496906, "frac_reward_zero_std": 1.0, "grad_norm": 0.004978030454367399, "kl": 1.324486207962036, "learning_rate": 2.130666666666667e-06, "loss": 0.0013, "num_tokens": 9626970.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7710 }, { "completion_length": 11.575, "completions/clipped_ratio": 0.0, "completions/max_length": 15.3, "completions/max_terminated_length": 15.3, "completions/mean_length": 11.575, "completions/mean_terminated_length": 11.575, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.5308760830697291, "frac_reward_zero_std": 1.0, "grad_norm": 0.027692638337612152, "kl": 1.2244141578674317, "learning_rate": 2.1262222222222225e-06, "loss": 0.0012, "num_tokens": 9638625.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7720 }, { "completion_length": 11.675, "completions/clipped_ratio": 0.0, "completions/max_length": 14.7, "completions/max_terminated_length": 14.7, "completions/mean_length": 11.675, "completions/mean_terminated_length": 11.675, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.5315637463897676, "frac_reward_zero_std": 1.0, "grad_norm": 0.003005094826221466, "kl": 1.3298441231250764, "learning_rate": 2.1217777777777778e-06, "loss": 0.0013, "num_tokens": 9650808.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7730 }, { "completion_length": 13.9, "completions/clipped_ratio": 0.0, "completions/max_length": 20.8, "completions/max_terminated_length": 20.8, "completions/mean_length": 13.9, "completions/mean_terminated_length": 13.9, "completions/min_length": 9.7, "completions/min_terminated_length": 9.7, "epoch": 0.5322514097098061, "frac_reward_zero_std": 0.9, "grad_norm": 0.0026819314807653427, "kl": 1.2448628067970275, "learning_rate": 2.1173333333333334e-06, "loss": 0.0012, "num_tokens": 9661696.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7740 }, { "completion_length": 17.225, "completions/clipped_ratio": 0.0, "completions/max_length": 34.7, "completions/max_terminated_length": 34.7, "completions/mean_length": 17.225, "completions/mean_terminated_length": 17.225, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.5329390730298446, "frac_reward_zero_std": 0.9, "grad_norm": 0.002824925584718585, "kl": 1.2569087088108062, "learning_rate": 2.112888888888889e-06, "loss": 0.0013, "num_tokens": 9673633.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7750 }, { "completion_length": 11.3, "completions/clipped_ratio": 0.0, "completions/max_length": 14.2, "completions/max_terminated_length": 14.2, "completions/mean_length": 11.3, "completions/mean_terminated_length": 11.3, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.5336267363498831, "frac_reward_zero_std": 1.0, "grad_norm": 0.005798778962343931, "kl": 1.2169726014137268, "learning_rate": 2.108444444444445e-06, "loss": 0.0012, "num_tokens": 9685013.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7760 }, { "completion_length": 11.925, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 11.925, "completions/mean_terminated_length": 11.925, "completions/min_length": 8.4, "completions/min_terminated_length": 8.4, "epoch": 0.5343143996699216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027706269174814224, "kl": 1.3897801697254182, "learning_rate": 2.104e-06, "loss": 0.0014, "num_tokens": 9696738.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7770 }, { "completion_length": 13.375, "completions/clipped_ratio": 0.0, "completions/max_length": 20.4, "completions/max_terminated_length": 20.4, "completions/mean_length": 13.375, "completions/mean_terminated_length": 13.375, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.5350020629899601, "frac_reward_zero_std": 1.0, "grad_norm": 0.005443184170871973, "kl": 1.1451765239238738, "learning_rate": 2.099555555555556e-06, "loss": 0.0011, "num_tokens": 9709865.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7780 }, { "completion_length": 11.85, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 11.85, "completions/mean_terminated_length": 11.85, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.5356897263099987, "frac_reward_zero_std": 1.0, "grad_norm": 0.002919774502515793, "kl": 1.372281551361084, "learning_rate": 2.0951111111111115e-06, "loss": 0.0014, "num_tokens": 9721363.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7790 }, { "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.7, "completions/max_terminated_length": 38.7, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.5363773896300371, "frac_reward_zero_std": 0.9, "grad_norm": 0.0032499770168215036, "kl": 1.1967048406600953, "learning_rate": 2.0906666666666668e-06, "loss": 0.0012, "num_tokens": 9734369.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7800 }, { "completion_length": 12.825, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 12.825, "completions/mean_terminated_length": 12.825, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.5370650529500757, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038574356585741043, "kl": 1.216644722223282, "learning_rate": 2.0862222222222224e-06, "loss": 0.0012, "num_tokens": 9745214.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7810 }, { "completion_length": 12.125, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 12.125, "completions/mean_terminated_length": 12.125, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.5377527162701141, "frac_reward_zero_std": 0.9, "grad_norm": 0.010060185566544533, "kl": 1.2828055381774903, "learning_rate": 2.0817777777777777e-06, "loss": 0.0013, "num_tokens": 9756951.0, "reward": 5.9875, "reward_std": 0.025, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7820 }, { "completion_length": 12.55, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 12.55, "completions/mean_terminated_length": 12.55, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.5384403795901527, "frac_reward_zero_std": 1.0, "grad_norm": 0.007177610415965319, "kl": 1.303609848022461, "learning_rate": 2.0773333333333334e-06, "loss": 0.0013, "num_tokens": 9769141.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7830 }, { "completion_length": 11.7, "completions/clipped_ratio": 0.0, "completions/max_length": 15.2, "completions/max_terminated_length": 15.2, "completions/mean_length": 11.7, "completions/mean_terminated_length": 11.7, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.5391280429101911, "frac_reward_zero_std": 1.0, "grad_norm": 0.00721388915553689, "kl": 1.2078775942325592, "learning_rate": 2.072888888888889e-06, "loss": 0.0012, "num_tokens": 9782721.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7840 }, { "completion_length": 12.875, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 12.875, "completions/mean_terminated_length": 12.875, "completions/min_length": 9.7, "completions/min_terminated_length": 9.7, "epoch": 0.5398157062302297, "frac_reward_zero_std": 0.9, "grad_norm": 0.007724335417151451, "kl": 1.339196938276291, "learning_rate": 2.068444444444445e-06, "loss": 0.0013, "num_tokens": 9795264.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7850 }, { "completion_length": 11.375, "completions/clipped_ratio": 0.0, "completions/max_length": 14.5, "completions/max_terminated_length": 14.5, "completions/mean_length": 11.375, "completions/mean_terminated_length": 11.375, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.5405033695502682, "frac_reward_zero_std": 0.9, "grad_norm": 0.0032738870941102505, "kl": 1.2018405854701997, "learning_rate": 2.064e-06, "loss": 0.0012, "num_tokens": 9805751.0, "reward": 5.925, "reward_std": 0.05, "rewards/check_coherence/mean": 1.425, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7860 }, { "completion_length": 12.275, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 12.275, "completions/mean_terminated_length": 12.275, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.5411910328703067, "frac_reward_zero_std": 1.0, "grad_norm": 0.003743327222764492, "kl": 1.7937437176704407, "learning_rate": 2.0595555555555558e-06, "loss": 0.0018, "num_tokens": 9818498.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7870 }, { "completion_length": 13.15, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 13.15, "completions/mean_terminated_length": 13.15, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.5418786961903452, "frac_reward_zero_std": 0.9, "grad_norm": 0.006482637953013182, "kl": 1.3055089831352233, "learning_rate": 2.0551111111111114e-06, "loss": 0.0013, "num_tokens": 9828884.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7880 }, { "completion_length": 11.9, "completions/clipped_ratio": 0.0, "completions/max_length": 15.2, "completions/max_terminated_length": 15.2, "completions/mean_length": 11.9, "completions/mean_terminated_length": 11.9, "completions/min_length": 9.8, "completions/min_terminated_length": 9.8, "epoch": 0.5425663595103837, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032682991586625576, "kl": 1.2907763242721557, "learning_rate": 2.0506666666666667e-06, "loss": 0.0013, "num_tokens": 9840316.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7890 }, { "completion_length": 13.85, "completions/clipped_ratio": 0.0, "completions/max_length": 23.5, "completions/max_terminated_length": 23.5, "completions/mean_length": 13.85, "completions/mean_terminated_length": 13.85, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.5432540228304222, "frac_reward_zero_std": 0.9, "grad_norm": 2.878571033477783, "kl": 1.3566003799438477, "learning_rate": 2.0462222222222224e-06, "loss": 0.0014, "num_tokens": 9852170.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7900 }, { "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.5439416861504607, "frac_reward_zero_std": 1.0, "grad_norm": 0.015462033450603485, "kl": 1.1428892016410828, "learning_rate": 2.041777777777778e-06, "loss": 0.0011, "num_tokens": 9865254.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7910 }, { "completion_length": 11.9, "completions/clipped_ratio": 0.0, "completions/max_length": 15.6, "completions/max_terminated_length": 15.6, "completions/mean_length": 11.9, "completions/mean_terminated_length": 11.9, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.5446293494704992, "frac_reward_zero_std": 1.0, "grad_norm": 0.004104136023670435, "kl": 1.057891947031021, "learning_rate": 2.0373333333333334e-06, "loss": 0.0011, "num_tokens": 9877294.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7920 }, { "completion_length": 32.025, "completions/clipped_ratio": 0.0, "completions/max_length": 94.1, "completions/max_terminated_length": 94.1, "completions/mean_length": 32.025, "completions/mean_terminated_length": 32.025, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.5453170127905378, "frac_reward_zero_std": 0.8, "grad_norm": 0.00987865962088108, "kl": 1.331625509262085, "learning_rate": 2.032888888888889e-06, "loss": 0.0013, "num_tokens": 9890899.0, "reward": 5.9125, "reward_std": 0.175, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4625, "rewards/check_response_quality/std": 0.075, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7930 }, { "completion_length": 13.05, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 13.05, "completions/mean_terminated_length": 13.05, "completions/min_length": 10.9, "completions/min_terminated_length": 10.9, "epoch": 0.5460046761105762, "frac_reward_zero_std": 1.0, "grad_norm": 0.002187013393267989, "kl": 1.2939410388469696, "learning_rate": 2.0284444444444447e-06, "loss": 0.0013, "num_tokens": 9903709.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7940 }, { "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.7, "completions/min_terminated_length": 10.7, "epoch": 0.5466923394306148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029057443607598543, "kl": 0.9948434948921203, "learning_rate": 2.024e-06, "loss": 0.001, "num_tokens": 9915857.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7950 }, { "completion_length": 11.85, "completions/clipped_ratio": 0.0, "completions/max_length": 13.6, "completions/max_terminated_length": 13.6, "completions/mean_length": 11.85, "completions/mean_terminated_length": 11.85, "completions/min_length": 9.9, "completions/min_terminated_length": 9.9, "epoch": 0.5473800027506532, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022877950686961412, "kl": 1.2410366594791413, "learning_rate": 2.0195555555555557e-06, "loss": 0.0012, "num_tokens": 9929511.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7960 }, { "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 46.6, "completions/max_terminated_length": 46.6, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.5480676660706918, "frac_reward_zero_std": 0.9, "grad_norm": 0.013363093137741089, "kl": 1.0715976715087892, "learning_rate": 2.0151111111111114e-06, "loss": 0.0011, "num_tokens": 9942003.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7970 }, { "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.3, "completions/min_terminated_length": 10.3, "epoch": 0.5487553293907304, "frac_reward_zero_std": 1.0, "grad_norm": 0.004301813896745443, "kl": 1.037192702293396, "learning_rate": 2.0106666666666667e-06, "loss": 0.001, "num_tokens": 9953631.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7980 }, { "completion_length": 14.875, "completions/clipped_ratio": 0.0, "completions/max_length": 25.3, "completions/max_terminated_length": 25.3, "completions/mean_length": 14.875, "completions/mean_terminated_length": 14.875, "completions/min_length": 10.2, "completions/min_terminated_length": 10.2, "epoch": 0.5494429927107688, "frac_reward_zero_std": 0.9, "grad_norm": 0.005624216981232166, "kl": 1.1227632999420165, "learning_rate": 2.0062222222222224e-06, "loss": 0.0011, "num_tokens": 9963874.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 7990 }, { "completion_length": 12.525, "completions/clipped_ratio": 0.0, "completions/max_length": 16.6, "completions/max_terminated_length": 16.6, "completions/mean_length": 12.525, "completions/mean_terminated_length": 12.525, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.5501306560308074, "frac_reward_zero_std": 1.0, "grad_norm": 0.002584875328466296, "kl": 1.117231160402298, "learning_rate": 2.001777777777778e-06, "loss": 0.0011, "num_tokens": 9976807.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8000 }, { "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.9, "completions/max_terminated_length": 29.9, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 9.8, "completions/min_terminated_length": 9.8, "epoch": 0.5508183193508458, "frac_reward_zero_std": 0.9, "grad_norm": 0.004169847350567579, "kl": 1.0605437874794006, "learning_rate": 1.9973333333333337e-06, "loss": 0.0011, "num_tokens": 9989301.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8010 }, { "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.2, "completions/max_terminated_length": 16.2, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.5515059826708844, "frac_reward_zero_std": 1.0, "grad_norm": 0.005917856469750404, "kl": 1.1970151841640473, "learning_rate": 1.992888888888889e-06, "loss": 0.0012, "num_tokens": 10001573.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8020 }, { "completion_length": 11.725, "completions/clipped_ratio": 0.0, "completions/max_length": 14.9, "completions/max_terminated_length": 14.9, "completions/mean_length": 11.725, "completions/mean_terminated_length": 11.725, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.5521936459909228, "frac_reward_zero_std": 0.9, "grad_norm": 3.603567600250244, "kl": 1.2946309864521026, "learning_rate": 1.9884444444444447e-06, "loss": 0.0013, "num_tokens": 10014162.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8030 }, { "completion_length": 12.1, "completions/clipped_ratio": 0.0, "completions/max_length": 14.9, "completions/max_terminated_length": 14.9, "completions/mean_length": 12.1, "completions/mean_terminated_length": 12.1, "completions/min_length": 10.1, "completions/min_terminated_length": 10.1, "epoch": 0.5528813093109614, "frac_reward_zero_std": 0.9, "grad_norm": 0.0021072819363325834, "kl": 1.2057231783866882, "learning_rate": 1.984e-06, "loss": 0.0012, "num_tokens": 10026470.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8040 }, { "completion_length": 12.2, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 12.2, "completions/mean_terminated_length": 12.2, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.5535689726309999, "frac_reward_zero_std": 1.0, "grad_norm": 0.18881727755069733, "kl": 1.5855478703975678, "learning_rate": 1.9795555555555557e-06, "loss": 0.0016, "num_tokens": 10038874.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8050 }, { "completion_length": 11.95, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 11.95, "completions/mean_terminated_length": 11.95, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.5542566359510384, "frac_reward_zero_std": 1.0, "grad_norm": 0.004419395700097084, "kl": 1.1470963537693024, "learning_rate": 1.9751111111111114e-06, "loss": 0.0011, "num_tokens": 10049296.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8060 }, { "completion_length": 14.3, "completions/clipped_ratio": 0.0, "completions/max_length": 20.6, "completions/max_terminated_length": 20.6, "completions/mean_length": 14.3, "completions/mean_terminated_length": 14.3, "completions/min_length": 10.2, "completions/min_terminated_length": 10.2, "epoch": 0.5549442992710769, "frac_reward_zero_std": 1.0, "grad_norm": 0.002880280837416649, "kl": 0.9816267251968384, "learning_rate": 1.9706666666666666e-06, "loss": 0.001, "num_tokens": 10062172.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8070 }, { "completion_length": 12.925, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 12.925, "completions/mean_terminated_length": 12.925, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.5556319625911154, "frac_reward_zero_std": 0.9, "grad_norm": 0.005350610241293907, "kl": 1.1550312280654906, "learning_rate": 1.9662222222222223e-06, "loss": 0.0012, "num_tokens": 10076217.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8080 }, { "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 10.3, "completions/min_terminated_length": 10.3, "epoch": 0.5563196259111539, "frac_reward_zero_std": 1.0, "grad_norm": 0.005526478867977858, "kl": 1.0993483304977416, "learning_rate": 1.961777777777778e-06, "loss": 0.0011, "num_tokens": 10089445.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8090 }, { "completion_length": 13.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 13.1, "completions/mean_terminated_length": 13.1, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.5570072892311924, "frac_reward_zero_std": 1.0, "grad_norm": 0.004046977963298559, "kl": 1.1231428921222686, "learning_rate": 1.9573333333333337e-06, "loss": 0.0011, "num_tokens": 10103149.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8100 }, { "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.5, "completions/max_terminated_length": 24.5, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 10.2, "completions/min_terminated_length": 10.2, "epoch": 0.5576949525512309, "frac_reward_zero_std": 0.8, "grad_norm": 0.0036207973025739193, "kl": 1.399965763092041, "learning_rate": 1.952888888888889e-06, "loss": 0.0014, "num_tokens": 10115033.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8110 }, { "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 10.1, "completions/min_terminated_length": 10.1, "epoch": 0.5583826158712695, "frac_reward_zero_std": 1.0, "grad_norm": 0.020446887239813805, "kl": 1.1484535217285157, "learning_rate": 1.9484444444444447e-06, "loss": 0.0011, "num_tokens": 10129707.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8120 }, { "completion_length": 14.575, "completions/clipped_ratio": 0.0, "completions/max_length": 26.9, "completions/max_terminated_length": 26.9, "completions/mean_length": 14.575, "completions/mean_terminated_length": 14.575, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.5590702791913079, "frac_reward_zero_std": 0.9, "grad_norm": 0.018497074022889137, "kl": 1.132157129049301, "learning_rate": 1.944e-06, "loss": 0.0011, "num_tokens": 10142250.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8130 }, { "completion_length": 11.85, "completions/clipped_ratio": 0.0, "completions/max_length": 15.4, "completions/max_terminated_length": 15.4, "completions/mean_length": 11.85, "completions/mean_terminated_length": 11.85, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.5597579425113465, "frac_reward_zero_std": 0.9, "grad_norm": 0.003500531194731593, "kl": 1.4015939116477967, "learning_rate": 1.9395555555555556e-06, "loss": 0.0014, "num_tokens": 10153132.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8140 }, { "completion_length": 15.225, "completions/clipped_ratio": 0.0, "completions/max_length": 27.5, "completions/max_terminated_length": 27.5, "completions/mean_length": 15.225, "completions/mean_terminated_length": 15.225, "completions/min_length": 9.9, "completions/min_terminated_length": 9.9, "epoch": 0.5604456058313849, "frac_reward_zero_std": 0.9, "grad_norm": 0.002279973356053233, "kl": 1.0312190353870392, "learning_rate": 1.9351111111111113e-06, "loss": 0.001, "num_tokens": 10165629.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8150 }, { "completion_length": 20.3, "completions/clipped_ratio": 0.0, "completions/max_length": 43.8, "completions/max_terminated_length": 43.8, "completions/mean_length": 20.3, "completions/mean_terminated_length": 20.3, "completions/min_length": 10.5, "completions/min_terminated_length": 10.5, "epoch": 0.5611332691514235, "frac_reward_zero_std": 0.9, "grad_norm": 0.009704441763460636, "kl": 0.9771228730678558, "learning_rate": 1.9306666666666666e-06, "loss": 0.001, "num_tokens": 10179037.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8160 }, { "completion_length": 13.725, "completions/clipped_ratio": 0.0, "completions/max_length": 22.1, "completions/max_terminated_length": 22.1, "completions/mean_length": 13.725, "completions/mean_terminated_length": 13.725, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.5618209324714619, "frac_reward_zero_std": 1.0, "grad_norm": 0.005785486660897732, "kl": 1.0721399009227752, "learning_rate": 1.9262222222222223e-06, "loss": 0.0011, "num_tokens": 10190790.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8170 }, { "completion_length": 13.3, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 13.3, "completions/mean_terminated_length": 13.3, "completions/min_length": 10.6, "completions/min_terminated_length": 10.6, "epoch": 0.5625085957915005, "frac_reward_zero_std": 0.9, "grad_norm": 0.006073605734854937, "kl": 1.0811134040355683, "learning_rate": 1.921777777777778e-06, "loss": 0.0011, "num_tokens": 10201646.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8180 }, { "completion_length": 21.425, "completions/clipped_ratio": 0.0, "completions/max_length": 53.7, "completions/max_terminated_length": 53.7, "completions/mean_length": 21.425, "completions/mean_terminated_length": 21.425, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.563196259111539, "frac_reward_zero_std": 0.9, "grad_norm": 0.008377310819923878, "kl": 1.2893824517726897, "learning_rate": 1.9173333333333337e-06, "loss": 0.0013, "num_tokens": 10213359.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8190 }, { "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.3, "completions/max_terminated_length": 15.3, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.5638839224315775, "frac_reward_zero_std": 0.9, "grad_norm": 0.0031006948556751013, "kl": 1.283377367258072, "learning_rate": 1.912888888888889e-06, "loss": 0.0013, "num_tokens": 10225839.0, "reward": 5.9625, "reward_std": 0.075, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8200 }, { "completion_length": 11.7, "completions/clipped_ratio": 0.0, "completions/max_length": 14.8, "completions/max_terminated_length": 14.8, "completions/mean_length": 11.7, "completions/mean_terminated_length": 11.7, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.564571585751616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019166450947523117, "kl": 1.1040916562080383, "learning_rate": 1.9084444444444446e-06, "loss": 0.0011, "num_tokens": 10238179.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8210 }, { "completion_length": 11.3, "completions/clipped_ratio": 0.0, "completions/max_length": 14.1, "completions/max_terminated_length": 14.1, "completions/mean_length": 11.3, "completions/mean_terminated_length": 11.3, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.5652592490716545, "frac_reward_zero_std": 1.0, "grad_norm": 0.008414049632847309, "kl": 1.11358762383461, "learning_rate": 1.9040000000000003e-06, "loss": 0.0011, "num_tokens": 10249411.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8220 }, { "completion_length": 11.825, "completions/clipped_ratio": 0.0, "completions/max_length": 15.1, "completions/max_terminated_length": 15.1, "completions/mean_length": 11.825, "completions/mean_terminated_length": 11.825, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.565946912391693, "frac_reward_zero_std": 1.0, "grad_norm": 0.011217906139791012, "kl": 1.1142095625400543, "learning_rate": 1.8995555555555556e-06, "loss": 0.0011, "num_tokens": 10263452.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8230 }, { "completion_length": 12.35, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 12.35, "completions/mean_terminated_length": 12.35, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.5666345757117315, "frac_reward_zero_std": 1.0, "grad_norm": 0.003973840270191431, "kl": 1.2286709368228912, "learning_rate": 1.8951111111111113e-06, "loss": 0.0012, "num_tokens": 10275398.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8240 }, { "completion_length": 11.375, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.375, "completions/mean_terminated_length": 11.375, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 0.56732223903177, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032951238099485636, "kl": 0.9890437185764313, "learning_rate": 1.8906666666666668e-06, "loss": 0.001, "num_tokens": 10287797.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8250 }, { "completion_length": 11.85, "completions/clipped_ratio": 0.0, "completions/max_length": 14.9, "completions/max_terminated_length": 14.9, "completions/mean_length": 11.85, "completions/mean_terminated_length": 11.85, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.5680099023518086, "frac_reward_zero_std": 1.0, "grad_norm": 0.008584411814808846, "kl": 1.2706584572792052, "learning_rate": 1.8862222222222222e-06, "loss": 0.0013, "num_tokens": 10300355.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8260 }, { "completion_length": 12.425, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 12.425, "completions/mean_terminated_length": 12.425, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.568697565671847, "frac_reward_zero_std": 1.0, "grad_norm": 0.027631660923361778, "kl": 1.2103368103504182, "learning_rate": 1.881777777777778e-06, "loss": 0.0012, "num_tokens": 10311192.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8270 }, { "completion_length": 12.1, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 12.1, "completions/mean_terminated_length": 12.1, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.5693852289918856, "frac_reward_zero_std": 1.0, "grad_norm": 0.002199193462729454, "kl": 1.0713283360004424, "learning_rate": 1.8773333333333334e-06, "loss": 0.0011, "num_tokens": 10324664.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8280 }, { "completion_length": 11.55, "completions/clipped_ratio": 0.0, "completions/max_length": 14.9, "completions/max_terminated_length": 14.9, "completions/mean_length": 11.55, "completions/mean_terminated_length": 11.55, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.570072892311924, "frac_reward_zero_std": 0.9, "grad_norm": 0.009694810956716537, "kl": 1.261211758852005, "learning_rate": 1.8728888888888891e-06, "loss": 0.0013, "num_tokens": 10334646.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8290 }, { "completion_length": 12.525, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 12.525, "completions/mean_terminated_length": 12.525, "completions/min_length": 9.6, "completions/min_terminated_length": 9.6, "epoch": 0.5707605556319626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036631503608077765, "kl": 1.2292134821414948, "learning_rate": 1.8684444444444446e-06, "loss": 0.0012, "num_tokens": 10347247.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8300 }, { "completion_length": 12.325, "completions/clipped_ratio": 0.0, "completions/max_length": 15.6, "completions/max_terminated_length": 15.6, "completions/mean_length": 12.325, "completions/mean_terminated_length": 12.325, "completions/min_length": 9.6, "completions/min_terminated_length": 9.6, "epoch": 0.5714482189520012, "frac_reward_zero_std": 1.0, "grad_norm": 0.003963626455515623, "kl": 4.332015436887741, "learning_rate": 1.8640000000000003e-06, "loss": 0.0043, "num_tokens": 10360732.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8310 }, { "completion_length": 11.9, "completions/clipped_ratio": 0.0, "completions/max_length": 15.3, "completions/max_terminated_length": 15.3, "completions/mean_length": 11.9, "completions/mean_terminated_length": 11.9, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.5721358822720396, "frac_reward_zero_std": 1.0, "grad_norm": 0.002724467311054468, "kl": 1.1339951932430268, "learning_rate": 1.8595555555555558e-06, "loss": 0.0011, "num_tokens": 10373472.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8320 }, { "completion_length": 11.85, "completions/clipped_ratio": 0.0, "completions/max_length": 15.2, "completions/max_terminated_length": 15.2, "completions/mean_length": 11.85, "completions/mean_terminated_length": 11.85, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.5728235455920782, "frac_reward_zero_std": 0.9, "grad_norm": 0.0034971812274307013, "kl": 1.0714882016181946, "learning_rate": 1.8551111111111112e-06, "loss": 0.0011, "num_tokens": 10386234.0, "reward": 5.9625, "reward_std": 0.075, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8330 }, { "completion_length": 24.55, "completions/clipped_ratio": 0.0, "completions/max_length": 63.6, "completions/max_terminated_length": 63.6, "completions/mean_length": 24.55, "completions/mean_terminated_length": 24.55, "completions/min_length": 9.9, "completions/min_terminated_length": 9.9, "epoch": 0.5735112089121166, "frac_reward_zero_std": 0.8, "grad_norm": 0.0032173949293792248, "kl": 1.1109222888946533, "learning_rate": 1.8506666666666667e-06, "loss": 0.0011, "num_tokens": 10399544.0, "reward": 5.925, "reward_std": 0.15, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8340 }, { "completion_length": 11.675, "completions/clipped_ratio": 0.0, "completions/max_length": 14.2, "completions/max_terminated_length": 14.2, "completions/mean_length": 11.675, "completions/mean_terminated_length": 11.675, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.5741988722321552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018772322218865156, "kl": 1.2787903010845185, "learning_rate": 1.8462222222222222e-06, "loss": 0.0013, "num_tokens": 10411335.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8350 }, { "completion_length": 11.7, "completions/clipped_ratio": 0.0, "completions/max_length": 14.7, "completions/max_terminated_length": 14.7, "completions/mean_length": 11.7, "completions/mean_terminated_length": 11.7, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.5748865355521936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0070866490714251995, "kl": 1.1854866743087769, "learning_rate": 1.8417777777777779e-06, "loss": 0.0012, "num_tokens": 10424947.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8360 }, { "completion_length": 11.6, "completions/clipped_ratio": 0.0, "completions/max_length": 13.7, "completions/max_terminated_length": 13.7, "completions/mean_length": 11.6, "completions/mean_terminated_length": 11.6, "completions/min_length": 9.9, "completions/min_terminated_length": 9.9, "epoch": 0.5755741988722322, "frac_reward_zero_std": 1.0, "grad_norm": 0.002047724789008498, "kl": 1.1531950116157532, "learning_rate": 1.8373333333333334e-06, "loss": 0.0012, "num_tokens": 10439447.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8370 }, { "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.9, "completions/max_terminated_length": 21.9, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 9.6, "completions/min_terminated_length": 9.6, "epoch": 0.5762618621922707, "frac_reward_zero_std": 0.9, "grad_norm": 0.005259161815047264, "kl": 1.24915811419487, "learning_rate": 1.832888888888889e-06, "loss": 0.0012, "num_tokens": 10452997.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8380 }, { "completion_length": 10.725, "completions/clipped_ratio": 0.0, "completions/max_length": 13.2, "completions/max_terminated_length": 13.2, "completions/mean_length": 10.725, "completions/mean_terminated_length": 10.725, "completions/min_length": 8.7, "completions/min_terminated_length": 8.7, "epoch": 0.5769495255123092, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033741919323801994, "kl": 1.3191501200199127, "learning_rate": 1.8284444444444445e-06, "loss": 0.0013, "num_tokens": 10465878.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8390 }, { "completion_length": 11.3, "completions/clipped_ratio": 0.0, "completions/max_length": 14.3, "completions/max_terminated_length": 14.3, "completions/mean_length": 11.3, "completions/mean_terminated_length": 11.3, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.5776371888323477, "frac_reward_zero_std": 1.0, "grad_norm": 0.003714599646627903, "kl": 1.2673472166061401, "learning_rate": 1.8240000000000002e-06, "loss": 0.0013, "num_tokens": 10477354.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8400 }, { "completion_length": 11.8, "completions/clipped_ratio": 0.0, "completions/max_length": 14.7, "completions/max_terminated_length": 14.7, "completions/mean_length": 11.8, "completions/mean_terminated_length": 11.8, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.5783248521523862, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022408729419112206, "kl": 1.218839818239212, "learning_rate": 1.8195555555555557e-06, "loss": 0.0012, "num_tokens": 10489222.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8410 }, { "completion_length": 11.675, "completions/clipped_ratio": 0.0, "completions/max_length": 15.1, "completions/max_terminated_length": 15.1, "completions/mean_length": 11.675, "completions/mean_terminated_length": 11.675, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.5790125154724247, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019483575597405434, "kl": 1.03574578166008, "learning_rate": 1.8151111111111114e-06, "loss": 0.001, "num_tokens": 10500381.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8420 }, { "completion_length": 11.975, "completions/clipped_ratio": 0.0, "completions/max_length": 15.2, "completions/max_terminated_length": 15.2, "completions/mean_length": 11.975, "completions/mean_terminated_length": 11.975, "completions/min_length": 9.6, "completions/min_terminated_length": 9.6, "epoch": 0.5797001787924632, "frac_reward_zero_std": 1.0, "grad_norm": 0.004584670998156071, "kl": 1.1955373942852021, "learning_rate": 1.8106666666666667e-06, "loss": 0.0012, "num_tokens": 10512416.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8430 }, { "completion_length": 18.525, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 18.525, "completions/mean_terminated_length": 18.525, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.5803878421125017, "frac_reward_zero_std": 0.9, "grad_norm": 0.0029421942308545113, "kl": 1.170852828025818, "learning_rate": 1.8062222222222222e-06, "loss": 0.0012, "num_tokens": 10526149.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8440 }, { "completion_length": 11.7, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.7, "completions/mean_terminated_length": 11.7, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.5810755054325403, "frac_reward_zero_std": 1.0, "grad_norm": 0.004435285460203886, "kl": 1.3717931151390075, "learning_rate": 1.8017777777777779e-06, "loss": 0.0014, "num_tokens": 10538289.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8450 }, { "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.5817631687525787, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038708082865923643, "kl": 1.1633994162082673, "learning_rate": 1.7973333333333333e-06, "loss": 0.0012, "num_tokens": 10550281.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8460 }, { "completion_length": 11.7, "completions/clipped_ratio": 0.0, "completions/max_length": 14.2, "completions/max_terminated_length": 14.2, "completions/mean_length": 11.7, "completions/mean_terminated_length": 11.7, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.5824508320726173, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018100917804986238, "kl": 1.089043253660202, "learning_rate": 1.792888888888889e-06, "loss": 0.0011, "num_tokens": 10563557.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8470 }, { "completion_length": 12.225, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 12.225, "completions/mean_terminated_length": 12.225, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.5831384953926557, "frac_reward_zero_std": 1.0, "grad_norm": 0.007320227101445198, "kl": 1.1034001350402831, "learning_rate": 1.7884444444444445e-06, "loss": 0.0011, "num_tokens": 10574910.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8480 }, { "completion_length": 12.45, "completions/clipped_ratio": 0.0, "completions/max_length": 15.6, "completions/max_terminated_length": 15.6, "completions/mean_length": 12.45, "completions/mean_terminated_length": 12.45, "completions/min_length": 10.3, "completions/min_terminated_length": 10.3, "epoch": 0.5838261587126943, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036800052039325237, "kl": 1.1248241066932678, "learning_rate": 1.7840000000000002e-06, "loss": 0.0011, "num_tokens": 10588092.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8490 }, { "completion_length": 12.675, "completions/clipped_ratio": 0.0, "completions/max_length": 16.6, "completions/max_terminated_length": 16.6, "completions/mean_length": 12.675, "completions/mean_terminated_length": 12.675, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.5845138220327327, "frac_reward_zero_std": 1.0, "grad_norm": 0.01451704278588295, "kl": 1.1269919991493225, "learning_rate": 1.7795555555555557e-06, "loss": 0.0011, "num_tokens": 10599079.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8500 }, { "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.5852014853527713, "frac_reward_zero_std": 1.0, "grad_norm": 0.003538082353770733, "kl": 1.2523035109043121, "learning_rate": 1.7751111111111114e-06, "loss": 0.0013, "num_tokens": 10610125.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8510 }, { "completion_length": 12.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 12.7, "completions/mean_terminated_length": 12.7, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.5858891486728098, "frac_reward_zero_std": 1.0, "grad_norm": 0.004502912051975727, "kl": 1.0592267155647277, "learning_rate": 1.7706666666666669e-06, "loss": 0.0011, "num_tokens": 10621961.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8520 }, { "completion_length": 11.9, "completions/clipped_ratio": 0.0, "completions/max_length": 14.6, "completions/max_terminated_length": 14.6, "completions/mean_length": 11.9, "completions/mean_terminated_length": 11.9, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.5865768119928483, "frac_reward_zero_std": 1.0, "grad_norm": 0.00452207587659359, "kl": 1.2047041416168214, "learning_rate": 1.7662222222222225e-06, "loss": 0.0012, "num_tokens": 10633465.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8530 }, { "completion_length": 13.925, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 13.925, "completions/mean_terminated_length": 13.925, "completions/min_length": 10.2, "completions/min_terminated_length": 10.2, "epoch": 0.5872644753128868, "frac_reward_zero_std": 1.0, "grad_norm": 0.057433757930994034, "kl": 1.1581880033016205, "learning_rate": 1.7617777777777778e-06, "loss": 0.0012, "num_tokens": 10644906.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8540 }, { "completion_length": 12.625, "completions/clipped_ratio": 0.0, "completions/max_length": 15.8, "completions/max_terminated_length": 15.8, "completions/mean_length": 12.625, "completions/mean_terminated_length": 12.625, "completions/min_length": 9.8, "completions/min_terminated_length": 9.8, "epoch": 0.5879521386329253, "frac_reward_zero_std": 1.0, "grad_norm": 0.005435035564005375, "kl": 1.1151172339916229, "learning_rate": 1.7573333333333333e-06, "loss": 0.0011, "num_tokens": 10657511.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8550 }, { "completion_length": 12.9, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 12.9, "completions/mean_terminated_length": 12.9, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.5886398019529638, "frac_reward_zero_std": 1.0, "grad_norm": 0.004669901914894581, "kl": 1.1759621858596803, "learning_rate": 1.752888888888889e-06, "loss": 0.0012, "num_tokens": 10670195.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8560 }, { "completion_length": 11.925, "completions/clipped_ratio": 0.0, "completions/max_length": 16.2, "completions/max_terminated_length": 16.2, "completions/mean_length": 11.925, "completions/mean_terminated_length": 11.925, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.5893274652730024, "frac_reward_zero_std": 1.0, "grad_norm": 0.003652728395536542, "kl": 1.1641576826572417, "learning_rate": 1.7484444444444445e-06, "loss": 0.0012, "num_tokens": 10682340.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8570 }, { "completion_length": 12.025, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.025, "completions/mean_terminated_length": 12.025, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.5900151285930408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023117121309041977, "kl": 1.1836768448352815, "learning_rate": 1.7440000000000002e-06, "loss": 0.0012, "num_tokens": 10694753.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8580 }, { "completion_length": 13.1, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 13.1, "completions/mean_terminated_length": 13.1, "completions/min_length": 10.3, "completions/min_terminated_length": 10.3, "epoch": 0.5907027919130794, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037394952960312366, "kl": 1.2706849694252014, "learning_rate": 1.7395555555555556e-06, "loss": 0.0013, "num_tokens": 10707989.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8590 }, { "completion_length": 12.6, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 12.6, "completions/mean_terminated_length": 12.6, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.5913904552331178, "frac_reward_zero_std": 1.0, "grad_norm": 0.003737551858648658, "kl": 1.1382164716720582, "learning_rate": 1.7351111111111113e-06, "loss": 0.0011, "num_tokens": 10721337.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8600 }, { "completion_length": 11.475, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.475, "completions/mean_terminated_length": 11.475, "completions/min_length": 9.6, "completions/min_terminated_length": 9.6, "epoch": 0.5920781185531564, "frac_reward_zero_std": 1.0, "grad_norm": 0.0045438166707754135, "kl": 1.2445359587669373, "learning_rate": 1.7306666666666668e-06, "loss": 0.0012, "num_tokens": 10732732.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8610 }, { "completion_length": 11.05, "completions/clipped_ratio": 0.0, "completions/max_length": 13.4, "completions/max_terminated_length": 13.4, "completions/mean_length": 11.05, "completions/mean_terminated_length": 11.05, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.5927657818731948, "frac_reward_zero_std": 1.0, "grad_norm": 0.007625124417245388, "kl": 1.1852139592170716, "learning_rate": 1.7262222222222225e-06, "loss": 0.0012, "num_tokens": 10744194.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8620 }, { "completion_length": 11.9, "completions/clipped_ratio": 0.0, "completions/max_length": 14.8, "completions/max_terminated_length": 14.8, "completions/mean_length": 11.9, "completions/mean_terminated_length": 11.9, "completions/min_length": 9.6, "completions/min_terminated_length": 9.6, "epoch": 0.5934534451932334, "frac_reward_zero_std": 1.0, "grad_norm": 0.006672825198620558, "kl": 1.1356277108192443, "learning_rate": 1.721777777777778e-06, "loss": 0.0011, "num_tokens": 10756170.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8630 }, { "completion_length": 12.65, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.65, "completions/mean_terminated_length": 12.65, "completions/min_length": 9.8, "completions/min_terminated_length": 9.8, "epoch": 0.594141108513272, "frac_reward_zero_std": 1.0, "grad_norm": 0.015049074776470661, "kl": 1.2014117240905762, "learning_rate": 1.7173333333333333e-06, "loss": 0.0012, "num_tokens": 10769332.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8640 }, { "completion_length": 12.9, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.9, "completions/mean_terminated_length": 12.9, "completions/min_length": 10.1, "completions/min_terminated_length": 10.1, "epoch": 0.5948287718333104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029658586718142033, "kl": 1.1564993917942048, "learning_rate": 1.712888888888889e-06, "loss": 0.0012, "num_tokens": 10781564.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8650 }, { "completion_length": 12.925, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 12.925, "completions/mean_terminated_length": 12.925, "completions/min_length": 9.7, "completions/min_terminated_length": 9.7, "epoch": 0.595516435153349, "frac_reward_zero_std": 0.9, "grad_norm": 0.003762275679036975, "kl": 1.100640344619751, "learning_rate": 1.7084444444444444e-06, "loss": 0.0011, "num_tokens": 10792437.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8660 }, { "completion_length": 12.175, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 12.175, "completions/mean_terminated_length": 12.175, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.5962040984733874, "frac_reward_zero_std": 1.0, "grad_norm": 0.0047178128734230995, "kl": 1.2938535451889037, "learning_rate": 1.7040000000000001e-06, "loss": 0.0013, "num_tokens": 10805384.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8670 }, { "completion_length": 12.425, "completions/clipped_ratio": 0.0, "completions/max_length": 15.4, "completions/max_terminated_length": 15.4, "completions/mean_length": 12.425, "completions/mean_terminated_length": 12.425, "completions/min_length": 9.9, "completions/min_terminated_length": 9.9, "epoch": 0.596891761793426, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027440576814115047, "kl": 1.2109251201152802, "learning_rate": 1.6995555555555556e-06, "loss": 0.0012, "num_tokens": 10818121.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8680 }, { "completion_length": 13.125, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 13.125, "completions/mean_terminated_length": 13.125, "completions/min_length": 10.3, "completions/min_terminated_length": 10.3, "epoch": 0.5975794251134644, "frac_reward_zero_std": 1.0, "grad_norm": 0.004016244318336248, "kl": 1.1241649985313416, "learning_rate": 1.6951111111111113e-06, "loss": 0.0011, "num_tokens": 10831398.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8690 }, { "completion_length": 12.275, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 12.275, "completions/mean_terminated_length": 12.275, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.598267088433503, "frac_reward_zero_std": 1.0, "grad_norm": 0.004033960402011871, "kl": 1.2029431879520416, "learning_rate": 1.6906666666666668e-06, "loss": 0.0012, "num_tokens": 10843609.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8700 }, { "completion_length": 16.525, "completions/clipped_ratio": 0.0, "completions/max_length": 29.7, "completions/max_terminated_length": 29.7, "completions/mean_length": 16.525, "completions/mean_terminated_length": 16.525, "completions/min_length": 10.7, "completions/min_terminated_length": 10.7, "epoch": 0.5989547517535415, "frac_reward_zero_std": 0.9, "grad_norm": 0.002579750493168831, "kl": 1.306799578666687, "learning_rate": 1.6862222222222225e-06, "loss": 0.0013, "num_tokens": 10855890.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8710 }, { "completion_length": 17.6, "completions/clipped_ratio": 0.0, "completions/max_length": 38.9, "completions/max_terminated_length": 38.9, "completions/mean_length": 17.6, "completions/mean_terminated_length": 17.6, "completions/min_length": 9.3, "completions/min_terminated_length": 9.3, "epoch": 0.59964241507358, "frac_reward_zero_std": 0.9, "grad_norm": 0.004552737809717655, "kl": 1.1339510440826417, "learning_rate": 1.681777777777778e-06, "loss": 0.0011, "num_tokens": 10868374.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8720 }, { "completion_length": 13.025, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 13.025, "completions/mean_terminated_length": 13.025, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.6003300783936185, "frac_reward_zero_std": 1.0, "grad_norm": 0.004149656742811203, "kl": 1.2595533192157746, "learning_rate": 1.6773333333333336e-06, "loss": 0.0013, "num_tokens": 10881307.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8730 }, { "completion_length": 11.925, "completions/clipped_ratio": 0.0, "completions/max_length": 15.3, "completions/max_terminated_length": 15.3, "completions/mean_length": 11.925, "completions/mean_terminated_length": 11.925, "completions/min_length": 9.4, "completions/min_terminated_length": 9.4, "epoch": 0.601017741713657, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024880890268832445, "kl": 1.2572630107402802, "learning_rate": 1.672888888888889e-06, "loss": 0.0013, "num_tokens": 10894100.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8740 }, { "completion_length": 13.3, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 13.3, "completions/mean_terminated_length": 13.3, "completions/min_length": 9.8, "completions/min_terminated_length": 9.8, "epoch": 0.6017054050336955, "frac_reward_zero_std": 1.0, "grad_norm": 0.004873007535934448, "kl": 1.2138256490230561, "learning_rate": 1.6684444444444444e-06, "loss": 0.0012, "num_tokens": 10907060.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8750 }, { "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.8, "completions/max_terminated_length": 38.8, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 10.4, "completions/min_terminated_length": 10.4, "epoch": 0.602393068353734, "frac_reward_zero_std": 0.9, "grad_norm": 0.0031885248608887196, "kl": 1.0743233919143678, "learning_rate": 1.664e-06, "loss": 0.0011, "num_tokens": 10918878.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8760 }, { "completion_length": 11.8, "completions/clipped_ratio": 0.0, "completions/max_length": 14.7, "completions/max_terminated_length": 14.7, "completions/mean_length": 11.8, "completions/mean_terminated_length": 11.8, "completions/min_length": 8.9, "completions/min_terminated_length": 8.9, "epoch": 0.6030807316737725, "frac_reward_zero_std": 1.0, "grad_norm": 0.013680394738912582, "kl": 1.296369630098343, "learning_rate": 1.6595555555555556e-06, "loss": 0.0013, "num_tokens": 10930078.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8770 }, { "completion_length": 12.85, "completions/clipped_ratio": 0.0, "completions/max_length": 16.6, "completions/max_terminated_length": 16.6, "completions/mean_length": 12.85, "completions/mean_terminated_length": 12.85, "completions/min_length": 10.1, "completions/min_terminated_length": 10.1, "epoch": 0.6037683949938111, "frac_reward_zero_std": 1.0, "grad_norm": 0.003447858849540353, "kl": 1.0455607414245605, "learning_rate": 1.6551111111111112e-06, "loss": 0.001, "num_tokens": 10941140.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8780 }, { "completion_length": 18.3, "completions/clipped_ratio": 0.0, "completions/max_length": 39.6, "completions/max_terminated_length": 39.6, "completions/mean_length": 18.3, "completions/mean_terminated_length": 18.3, "completions/min_length": 10.1, "completions/min_terminated_length": 10.1, "epoch": 0.6044560583138495, "frac_reward_zero_std": 0.8, "grad_norm": 0.002344973385334015, "kl": 1.1524518728256226, "learning_rate": 1.6506666666666667e-06, "loss": 0.0012, "num_tokens": 10954176.0, "reward": 5.925, "reward_std": 0.15, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.475, "rewards/check_response_quality/std": 0.05, "rewards/match_format_approximately/mean": 0.975, "rewards/match_format_approximately/std": 0.05, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8790 }, { "completion_length": 11.375, "completions/clipped_ratio": 0.0, "completions/max_length": 15.8, "completions/max_terminated_length": 15.8, "completions/mean_length": 11.375, "completions/mean_terminated_length": 11.375, "completions/min_length": 9.1, "completions/min_terminated_length": 9.1, "epoch": 0.6051437216338881, "frac_reward_zero_std": 1.0, "grad_norm": 0.004940181504935026, "kl": 1.3828946709632874, "learning_rate": 1.6462222222222224e-06, "loss": 0.0014, "num_tokens": 10966583.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8800 }, { "completion_length": 17.85, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 17.85, "completions/mean_terminated_length": 17.85, "completions/min_length": 10.1, "completions/min_terminated_length": 10.1, "epoch": 0.6058313849539265, "frac_reward_zero_std": 0.9, "grad_norm": 0.006498556584119797, "kl": 1.036690926551819, "learning_rate": 1.641777777777778e-06, "loss": 0.001, "num_tokens": 10980065.0, "reward": 5.975, "reward_std": 0.05, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8810 }, { "completion_length": 12.225, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 12.225, "completions/mean_terminated_length": 12.225, "completions/min_length": 9.9, "completions/min_terminated_length": 9.9, "epoch": 0.6065190482739651, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031990488059818745, "kl": 1.0018371641635895, "learning_rate": 1.6373333333333336e-06, "loss": 0.001, "num_tokens": 10992706.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8820 }, { "completion_length": 12.525, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 12.525, "completions/mean_terminated_length": 12.525, "completions/min_length": 9.8, "completions/min_terminated_length": 9.8, "epoch": 0.6072067115940035, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022266560699790716, "kl": 1.026029396057129, "learning_rate": 1.632888888888889e-06, "loss": 0.001, "num_tokens": 11005827.0, "reward": 6.0, "reward_std": 0.0, "rewards/check_coherence/mean": 1.5, "rewards/check_coherence/std": 0.0, "rewards/check_response_quality/mean": 2.5, "rewards/check_response_quality/std": 0.0, "rewards/match_format_approximately/mean": 1.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8830 }, { "completion_length": 23.075, "completions/clipped_ratio": 0.0, "completions/max_length": 57.9, "completions/max_terminated_length": 57.9, "completions/mean_length": 23.075, "completions/mean_terminated_length": 23.075, "completions/min_length": 9.7, "completions/min_terminated_length": 9.7, "epoch": 0.6078943749140421, "frac_reward_zero_std": 0.9, "grad_norm": 0.004883910529315472, "kl": 1.190100622177124, "learning_rate": 1.6284444444444448e-06, "loss": 0.0012, "num_tokens": 11018086.0, "reward": 5.95, "reward_std": 0.1, "rewards/check_coherence/mean": 1.475, "rewards/check_coherence/std": 0.05, "rewards/check_response_quality/mean": 2.4875, "rewards/check_response_quality/std": 0.025, "rewards/match_format_approximately/mean": 0.9875, "rewards/match_format_approximately/std": 0.025, "rewards/match_format_exactly/mean": 1.0, "rewards/match_format_exactly/std": 0.0, "step": 8840 } ], "logging_steps": 10, "max_steps": 12500, "num_input_tokens_seen": 11018086, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }